├── contributed └── models │ └── README.md ├── labs ├── vLLM │ ├── Chess │ │ ├── assets │ │ │ ├── __init__.py │ │ │ ├── agents │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ └── stockfish_agent.py │ │ │ ├── vllm-server │ │ │ │ ├── compile_model.py │ │ │ │ ├── vllm.sh │ │ │ │ ├── start_vllm_python.py │ │ │ │ └── README.md │ │ │ ├── env.example │ │ │ └── example.py │ │ ├── requirements.txt │ │ ├── .gitignore │ │ ├── README.md │ │ └── Chess-Tournament.ipynb │ └── Servers.ipynb ├── FineTuning │ └── HuggingFaceExample │ │ └── 01_finetuning │ │ ├── assets │ │ ├── requirements.txt │ │ ├── consolidate_adapter_shards_and_merge_model.py │ │ ├── finetune_model.py │ │ └── finetune_chess_model.py │ │ ├── Finetune-Qwen3-1.7B.ipynb │ │ └── FT-Qwen3-1.7B-chess.ipynb ├── NxD │ ├── generation_config.json │ └── Lab_One_NxDI.ipynb └── NKI │ ├── Lab_Three_NKI_Custom_Operators.ipynb │ ├── Lab_Four_NKI_Profiling.ipynb │ └── Lab_Two_NKI.ipynb ├── NOTICE ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── README.md ├── LICENSE └── doc └── README.md /contributed/models/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /labs/vLLM/Chess/assets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /labs/FineTuning/HuggingFaceExample/01_finetuning/assets/requirements.txt: -------------------------------------------------------------------------------- 1 | optimum-neuron==0.3.0 2 | peft==0.16.0 3 | trl==0.11.4 4 | huggingface_hub==0.33.4 5 | datasets==3.6.0 6 | -------------------------------------------------------------------------------- /labs/NxD/generation_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_from_model_config": true, 3 | "bos_token_id": 128000, 4 | "eos_token_id": 128001, 5 | "transformers_version": "4.45.0.dev0", 6 | "do_sample": true, 7 | "temperature": 0.6, 8 | "top_p": 0.9 9 | } -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 
5 | -------------------------------------------------------------------------------- /labs/vLLM/Chess/requirements.txt: -------------------------------------------------------------------------------- 1 | python-chess>=1.10.0 2 | rich>=13.0.0 3 | stockfish>=3.28.0 4 | openai>=1.0.0 5 | python-dotenv>=1.0.0 6 | p-tqdm>=1.4.0 7 | pytest>=7.0.0 8 | pytest-cov>=4.0.0 9 | black>=23.0.0 10 | flake8>=6.0.0 11 | mypy>=1.0.0 12 | huggingface_hub>=0.24.0 13 | trueskill>=0.4.5 14 | optimum-neuron[vllm]==0.3.0 15 | -------------------------------------------------------------------------------- /labs/vLLM/Chess/assets/agents/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Chess agents package. 3 | 4 | This package contains implementations of various chess-playing agents. 5 | """ 6 | 7 | from .base import ChessAgent 8 | from .stockfish_agent import StockfishAgent 9 | from .vllm_agent import VLLMAgent 10 | 11 | __all__ = [ 12 | "ChessAgent", 13 | "StockfishAgent", 14 | "VLLMAgent", 15 | ] 16 | -------------------------------------------------------------------------------- /labs/vLLM/Chess/.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | 8 | # Virtual environments 9 | venv/ 10 | env/ 11 | ENV/ 12 | 13 | # Environment variables 14 | .env 15 | .env.local 16 | 17 | # Chess game outputs 18 | *.pgn 19 | tournament_*/ 20 | games/ 21 | 22 | # vLLM logs 23 | vllm-server/vllm-server.log 24 | vllm-server/*.log 25 | 26 | # Jupyter 27 | .ipynb_checkpoints/ 28 | *.ipynb_checkpoints 29 | 30 | # IDE 31 | .vscode/ 32 | .idea/ 33 | *.swp 34 | *.swo 35 | 36 | # OS 37 | .DS_Store 38 | Thumbs.db 39 | 40 | # Model compilation artifacts (too large) 41 | *.pt 42 | *.neff 43 | 44 | # Test outputs 45 | .pytest_cache/ 46 | .coverage 47 | htmlcov/ 48 | -------------------------------------------------------------------------------- /labs/vLLM/Chess/assets/vllm-server/compile_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Compile the chess model for AWS Neuron using optimum-neuron. 
4 | """ 5 | import os 6 | from optimum.neuron import NeuronModelForCausalLM 7 | 8 | MODEL_PATH = "/home/ubuntu/chess-model-qwen/" 9 | COMPILED_MODEL_PATH = "/home/ubuntu/traced_model/sharded-Qwen3-chess-tp-2/" 10 | 11 | print(f"Compiling model from {MODEL_PATH}") 12 | print(f"Output will be saved to {COMPILED_MODEL_PATH}") 13 | print("This will take 10-30 minutes...") 14 | 15 | # Compile the model for Neuron 16 | # This exports and compiles the model for vLLM inference 17 | model = NeuronModelForCausalLM.from_pretrained( 18 | MODEL_PATH, 19 | export=True, 20 | tensor_parallel_size=2, 21 | batch_size=1, 22 | sequence_length=4096, 23 | auto_cast_type="bf16", 24 | ) 25 | 26 | # Save the compiled model 27 | model.save_pretrained(COMPILED_MODEL_PATH) 28 | 29 | print(f"✓ Model compiled and saved to {COMPILED_MODEL_PATH}") 30 | -------------------------------------------------------------------------------- /labs/vLLM/Chess/assets/vllm-server/vllm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # 1) Use the HF model ID 5 | MODEL_PATH="kunhunjon/ChessLM_Qwen3_Trainium_AWS_Format" 6 | 7 | # 2) Make sure we use NxD inference as the Neuron backend in vLLM 8 | export VLLM_NEURON_FRAMEWORK="neuronx-distributed-inference" 9 | 10 | # (Optional) Explicitly list plugins, but leaving this unset is also fine since 11 | # optimum_neuron is already being found & loaded. 12 | # export VLLM_PLUGINS="optimum_neuron" 13 | 14 | # (Optional) Where to cache compiled artifacts if/when vLLM/Optimum compiles anything new 15 | # export NEURON_COMPILED_ARTIFACTS="/home/ubuntu/neuron-compiled-artifacts/chess-qwen" 16 | 17 | VLLM_RPC_TIMEOUT=100000 python -m vllm.entrypoints.openai.api_server \ 18 | --model "$MODEL_PATH" \ 19 | --device neuron \ 20 | --tensor-parallel-size 2 \ 21 | --max-model-len 2048 \ 22 | --max-num-seqs 4 \ 23 | --dtype bfloat16 \ 24 | --port 8080 \ 25 | --task generate & # vLLM "task" (generate vs embeddings), *not* HF pipeline task 26 | 27 | PID=$! 28 | echo "vLLM server started with PID $PID" 29 | echo "Server will be available at http://localhost:8080" 30 | echo "To stop: kill $PID" 31 | -------------------------------------------------------------------------------- /labs/vLLM/Chess/assets/agents/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base chess agent abstract class. 3 | """ 4 | 5 | from abc import ABC, abstractmethod 6 | from typing import List, Tuple, Union 7 | 8 | import chess 9 | 10 | 11 | class ChessAgent(ABC): 12 | """Abstract base class for chess agents.""" 13 | 14 | @abstractmethod 15 | def choose_move( 16 | self, 17 | board: chess.Board, 18 | legal_moves: List[chess.Move], 19 | move_history: List[str], 20 | side_to_move: str, 21 | ) -> Tuple[Union[chess.Move, None], Union[str, None]]: 22 | """ 23 | Choose a move from the given legal moves. 
24 | 25 | Args: 26 | board: Current chess board state 27 | legal_moves: List of legal moves available 28 | move_history: List of moves played so far (in UCI notation) 29 | side_to_move: Which side is to move ('White' or 'Black') 30 | 31 | Returns: 32 | Tuple of (chosen_move, optional_comment) 33 | - chosen_move: The chosen chess move, or None to resign 34 | - optional_comment: Optional comment explaining the move or resignation reason 35 | """ 36 | pass 37 | -------------------------------------------------------------------------------- /labs/vLLM/Chess/assets/vllm-server/start_vllm_python.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Start vLLM server using Python API with pre-compiled Neuron model. 4 | This bypasses the CLI task inference bug. 5 | """ 6 | import os 7 | import uvicorn 8 | from vllm import AsyncEngineArgs, AsyncLLMEngine 9 | from vllm.entrypoints.openai.api_server import build_app 10 | from vllm.usage.usage_lib import UsageContext 11 | 12 | # Model configuration 13 | MODEL_PATH = "/home/ubuntu/ChessLM_Qwen3_Trainium" 14 | 15 | # Engine arguments 16 | engine_args = AsyncEngineArgs( 17 | model=MODEL_PATH, 18 | device="neuron", 19 | tensor_parallel_size=2, 20 | max_model_len=2048, 21 | max_num_seqs=1, 22 | dtype="bfloat16", 23 | trust_remote_code=False, 24 | # Override neuron config to specify task 25 | override_neuron_config={"task": "text-generation"}, 26 | ) 27 | 28 | if __name__ == "__main__": 29 | print(f"Starting vLLM server with model: {MODEL_PATH}") 30 | print(f"Engine args: {engine_args}") 31 | 32 | # Create async engine 33 | engine = AsyncLLMEngine.from_engine_args( 34 | engine_args, 35 | usage_context=UsageContext.OPENAI_API_SERVER 36 | ) 37 | 38 | # Build FastAPI app 39 | app = build_app(engine) 40 | 41 | # Start server 42 | print("Server starting on http://localhost:8000") 43 | uvicorn.run( 44 | app, 45 | host="0.0.0.0", 46 | port=8000, 47 | log_level="info", 48 | ) 49 | -------------------------------------------------------------------------------- /labs/FineTuning/HuggingFaceExample/01_finetuning/assets/consolidate_adapter_shards_and_merge_model.py: -------------------------------------------------------------------------------- 1 | from optimum.neuron.models.training import ( 2 | consolidate_model_parallel_checkpoints_to_unified_checkpoint, 3 | ) 4 | from transformers import AutoModel, AutoTokenizer 5 | from argparse import ArgumentParser 6 | from shutil import copyfile 7 | import os 8 | import peft 9 | 10 | parser = ArgumentParser() 11 | parser.add_argument( 12 | "-i", 13 | "--input_dir", 14 | help="source checkpoint directory containing sharded adapter checkpoint files", 15 | required=True, 16 | ) 17 | parser.add_argument( 18 | "-o", 19 | "--output_dir", 20 | help="destination directory for final merged model (adapters merged into base model)", 21 | required=True, 22 | ) 23 | args = parser.parse_args() 24 | 25 | consolidated_ckpt_dir = os.path.join(args.input_dir, "consolidated") 26 | 27 | # Consolidate the adapter shards into a PEFT-compatible checkpoint 28 | print("Consolidating LoRA adapter shards") 29 | consolidate_model_parallel_checkpoints_to_unified_checkpoint( 30 | args.input_dir, consolidated_ckpt_dir 31 | ) 32 | copyfile( 33 | os.path.join(args.input_dir, "adapter_default/adapter_config.json"), 34 | os.path.join(consolidated_ckpt_dir, "adapter_config.json"), 35 | ) 36 | 37 | # Load AutoPeftModel using the consolidated PEFT checkpoint 38 | peft_model = 
peft.AutoPeftModelForCausalLM.from_pretrained(consolidated_ckpt_dir) 39 | 40 | # Merge adapter weights into base model, save new pretrained model 41 | print("Merging LoRA adapter shards into base model") 42 | merged_model = peft_model.merge_and_unload() 43 | print(f"Saving merged model to {args.output_dir}") 44 | merged_model.save_pretrained(args.output_dir) 45 | 46 | print(f"Saving tokenizer to {args.output_dir}") 47 | tokenizer = AutoTokenizer.from_pretrained(args.input_dir) 48 | tokenizer.save_pretrained(args.output_dir) 49 | 50 | # Load the pretrained model and print config 51 | print("Merged model config:") 52 | model = AutoModel.from_pretrained(args.output_dir) 53 | print(model) 54 | -------------------------------------------------------------------------------- /labs/vLLM/Chess/assets/env.example: -------------------------------------------------------------------------------- 1 | # AWS Trainium Chess Workshop - Environment Configuration 2 | # Copy this file to .env and customize as needed 3 | # All settings are optional; defaults work for standard workshop environment 4 | 5 | # ============================================================ 6 | # vLLM Server Configuration (Your Fine-Tuned Model) 7 | # ============================================================ 8 | # These settings connect to your locally deployed vLLM server 9 | # See vllm-server/README.md for deployment instructions 10 | 11 | VLLM_BASE_URL=http://localhost:8080/v1 12 | VLLM_MODEL=kunhunjon/ChessLM_Qwen3_Trainium_AWS_Format 13 | VLLM_TEMPERATURE=0.1 14 | VLLM_MAX_TOKENS=50 15 | VLLM_TIMEOUT=30.0 16 | 17 | # ============================================================ 18 | # Stockfish Configuration (Baseline Opponent) 19 | # ============================================================ 20 | # Path to Stockfish binary (only needed if not in system PATH) 21 | # Default: auto-detected from PATH 22 | # Uncomment and set if Stockfish is in a custom location 23 | 24 | # STOCKFISH_PATH=/usr/local/bin/stockfish 25 | # STOCKFISH_PATH=/opt/homebrew/bin/stockfish # macOS Homebrew 26 | # STOCKFISH_PATH=C:\Program Files\Stockfish\stockfish.exe # Windows 27 | 28 | # ============================================================ 29 | # Chess Environment Settings 30 | # ============================================================ 31 | # Global game parameters (can be overridden via CLI arguments) 32 | 33 | CHESS_MAX_MOVES=100 # Max moves per game before draw 34 | CHESS_TIME_LIMIT=30.0 # Time limit per move (seconds) 35 | 36 | # ============================================================ 37 | # Workshop Notes 38 | # ============================================================ 39 | # 40 | # Quick Start: 41 | # 1. Copy this file: cp env.example .env 42 | # 2. Deploy vLLM: cd vllm-server && bash compile.sh && bash vllm.sh 43 | # 3. Test connection: python -c "from agents import VLLMAgent; print(VLLMAgent().test_connection())" 44 | # 4. Run tournament: python run_game.py --agent vllm --agent stockfish-skill5-depth10 --num-games 10 45 | # 46 | # For detailed instructions, see README.md 47 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 
5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute to. As our projects use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information, see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project, we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
60 | -------------------------------------------------------------------------------- /labs/vLLM/Chess/assets/vllm-server/README.md: -------------------------------------------------------------------------------- 1 | # vLLM Server for AWS Neuron/Trainium 2 | 3 | This directory contains scripts for deploying a chess-playing LLM on AWS Neuron hardware (Trainium/Inferentia) using vLLM with OpenAI-compatible API. 4 | 5 | ## Overview 6 | 7 | The vLLM deployment provides: 8 | - **Local inference** on AWS Neuron accelerators (no external API costs) 9 | - **OpenAI-compatible API** at `http://localhost:8000/v1` 10 | - **Optimized for Qwen3** chess model with tensor parallelism 11 | 12 | ## Prerequisites 13 | 14 | Follow the [instructions](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-launch-instance) to launch a trn2.3xlarge instance. 15 | 16 | ### Software Requirements 17 | 18 | ```bash 19 | # Install optimum-neuron with vLLM support 20 | pip install optimum-neuron[vllm]==0.3.0 21 | ``` 22 | 23 | That's it! The optimum-neuron package handles all Neuron SDK dependencies and vLLM integration automatically. 24 | 25 | ## Quick Start 26 | 27 | ### Step 1: Launch the vLLM Server 28 | 29 | ```bash 30 | # Edit paths in vllm.sh if needed 31 | bash vllm.sh 32 | ``` 33 | 34 | **What this does:** 35 | - Compiles the model on first run (if needed) 36 | - Starts OpenAI-compatible API server on port 8000 37 | - Runs in background (use `jobs` to check status) 38 | - Logs PID for easy management 39 | 40 | ### Step 2: Test the Connection 41 | 42 | ```bash 43 | # Test with curl 44 | curl http://localhost:8000/v1/models 45 | 46 | # Test with Python 47 | python -c " 48 | from openai import OpenAI 49 | client = OpenAI(base_url='http://localhost:8000/v1', api_key='dummy') 50 | response = client.chat.completions.create( 51 | model='/home/ubuntu/chess-model-qwen', 52 | messages=[{'role': 'user', 'content': 'Hello'}], 53 | max_tokens=10 54 | ) 55 | print(response.choices[0].message.content) 56 | " 57 | ``` 58 | 59 | ### Step 3: Use with Chess Agent 60 | 61 | ```python 62 | from agents import VLLMAgent 63 | from env import ChessEnvironment, StockfishAgent 64 | 65 | # Create vLLM agent (connects to localhost:8000) 66 | vllm_agent = VLLMAgent() 67 | 68 | # Test connection 69 | if vllm_agent.test_connection(): 70 | print("✓ Connected to vLLM server") 71 | 72 | # Play a game 73 | env = ChessEnvironment(vllm_agent, StockfishAgent(skill_level=5, depth=10)) 74 | result = env.play_game(verbose=True) 75 | print(f"Result: {result['result']}") 76 | else: 77 | print("✗ Failed to connect - is vLLM server running?") 78 | ``` 79 | 80 | Or run games directly: 81 | 82 | ```bash 83 | python run_game.py --agent1 vllm --agent2 stockfish-skill1-depth2 --verbose 84 | ``` 85 | 86 | ## Configuration 87 | 88 | ### Model Paths 89 | 90 | Edit these variables in `vllm.sh`: 91 | 92 | ```bash 93 | # Source model directory (HuggingFace format) 94 | MODEL_PATH="/home/ubuntu/chess-model-qwen/" 95 | 96 | # Compiled model artifacts directory (auto-generated on first run) 97 | COMPILED_MODEL_PATH="/home/ubuntu/traced_model/sharded-Qwen3-chess-tp-4/" 98 | ``` 99 | 100 | ### Tensor Parallelism 101 | 102 | Configure in `vllm.sh`: 103 | 104 | ```bash 105 | # Adjust based on available NeuronCores (trn2.3xlarge has 4 cores) 106 | --tensor-parallel-size 4 107 | ``` 108 | 109 | ### Environment Variables 110 | 111 | Set these in your `.env` file: 112 | 113 | ```bash 114 | # vLLM server configuration 115 | 
VLLM_BASE_URL=http://localhost:8000/v1 # Server endpoint 116 | VLLM_MODEL=Qwen3-chess # Model name 117 | VLLM_TEMPERATURE=0.1 # Generation temperature 118 | VLLM_MAX_TOKENS=50 # Max tokens per response 119 | VLLM_TIMEOUT=30.0 # Request timeout 120 | ``` 121 | 122 | ## 🔧 Management 123 | 124 | ### Check Server Status 125 | 126 | ```bash 127 | # Check if server is running 128 | ps aux | grep vllm 129 | 130 | # Check logs 131 | tail -f vllm-server/log 132 | 133 | # Test endpoint 134 | curl http://localhost:8000/health 135 | ``` 136 | 137 | ### Stop the Server 138 | 139 | ```bash 140 | # Find the PID 141 | ps aux | grep vllm 142 | 143 | # Kill the process 144 | kill <PID> 145 | 146 | # Or use the PID from startup 147 | # (printed as "vLLM server started with PID <PID>") 148 | ``` 149 | 150 | ### Restart the Server 151 | 152 | ```bash 153 | # Stop existing server 154 | pkill -f "vllm.entrypoints.openai.api_server" 155 | 156 | # Start new server 157 | bash vllm.sh 158 | ``` 159 | 160 | ## Integration with Chess Environment 161 | 162 | The `VLLMAgent` class automatically connects to your local vLLM server: 163 | 164 | ```python 165 | # Default configuration (localhost:8000) 166 | agent = VLLMAgent() 167 | 168 | # Custom configuration 169 | agent = VLLMAgent( 170 | base_url="http://localhost:8000/v1", 171 | model="Qwen3-chess", 172 | temperature=0.1, 173 | max_tokens=50 174 | ) 175 | 176 | # Use in games 177 | from env import ChessEnvironment 178 | env = ChessEnvironment(agent, opponent) 179 | result = env.play_game(verbose=True) 180 | ``` 181 | 182 | ## Additional Resources 183 | 184 | - [Optimum Neuron Documentation](https://huggingface.co/docs/optimum-neuron/index) 185 | - [vLLM Documentation](https://docs.vllm.ai/) 186 | - [AWS Neuron SDK](https://awsdocs-neuron.readthedocs-hosted.com/) 187 | - [OpenAI API Reference](https://platform.openai.com/docs/api-reference) 188 | 189 | ## Support 190 | 191 | For issues: 192 | 1. Check logs: `tail -f vllm-server/log` 193 | 2. Verify Neuron status: `neuron-ls` and `neuron-top` 194 | 3. Test endpoint: `curl http://localhost:8000/v1/models` 195 | 4. Review AWS Neuron documentation for hardware-specific issues 196 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Neuron Workshops 2 | 3 | In this workshop, you will learn how to develop support for a new model with [NeuronX Distributed Inference](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/nxdi-overview.html#nxdi-overview), using Llama 3.2 1B as the working example. You will also learn how to write your own kernel to directly program the accelerated hardware with the [Neuron Kernel Interface](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/index.html). These tools will help you design your research proposals and experiments on Trainium. 4 | 5 | It also includes an end-to-end example of using Hugging Face Optimum Neuron to fine-tune and host a small language model with Amazon SageMaker. 6 | 7 | 8 | ### What are AWS Trainium and Neuron? 9 | AWS Trainium is an AI chip developed by AWS to accelerate building and deploying machine learning models. Built on a specialized architecture designed for deep learning, Trainium accelerates the training and inference of complex models with high throughput and scalability, making it ideal for academic researchers looking to optimize performance and costs.
This architecture also emphasizes sustainability through energy-efficient design, reducing environmental impact. Amazon has established a dedicated Trainium research cluster featuring up to 40,000 Trainium chips, accessible via Amazon EC2 Trn1 instances. These instances are connected through a non-blocking, petabit-scale network using Amazon EC2 UltraClusters, enabling seamless high-performance ML training. The Trn1 instance family is optimized to deliver substantial compute power for cutting-edge AI research and development. This unique offering not only enhances the efficiency and affordability of model training but also presents academic researchers with opportunities to publish new papers on underrepresented compute architectures, thus advancing the field. 10 | 11 | Learn more about Trainium [here](https://aws.amazon.com/ai/machine-learning/trainium/). 12 | 13 | ### Your workshop 14 | This hands-on workshop is designed for developers, data scientists, and machine learning engineers who are getting started on their journey with the Neuron SDK. 15 | 16 | The workshop has multiple available modules: 17 | 1. Setup instructions 18 | 2. Run inference with Llama and NeuronX Distributed Inference (NxD) 19 | 3. Write your own kernel with the Neuron Kernel Interface (NKI) 20 | 4. Fine-tune and host an existing, supported model with a different dataset using SageMaker. 21 | 22 | #### Instructor-led workshop 23 | If you are participating in an instructor-led workshop, follow the guidance provided by your instructor for accessing the environment. 24 | 25 | #### Self-managed workshop 26 | If you are following the workshop steps in your own environment, you will need to take the following actions: 27 | 1. Launch a trn1.2xlarge instance on Amazon EC2, using the latest [DLAMI with Neuron packages preinstalled](https://repost.aws/articles/ARTxLi0wndTwquyl7frQYuKg) 28 | 2. Use a Python virtual environment preinstalled in that DLAMI, commonly located in `/opt/aws_`. 29 | 3. Set up and manage your own development environment on that instance, such as by using VSCode or a Jupyter Lab server. 30 | 31 | ### Background knowledge 32 | This workshop introduces developing on AWS Trainium for the academic AI research audience and technical innovators. As such, it's expected that the audience will already have a firm understanding of machine learning fundamentals. 33 | 34 | ### Workshop costs 35 | If you are participating in an instructor-led workshop hosted in an AWS-managed Workshop Studio environment, you will not incur any costs for using this environment. If you are following this workshop in your own environment, you will incur the costs associated with provisioning an Amazon EC2 instance. Please see the service pricing details [here](https://aws.amazon.com/ec2/pricing/on-demand/). 36 | 37 | At the time of writing, this workshop uses a trn1.2xlarge instance with an on-demand hourly rate in supported US regions of $1.34 per hour. The fine-tuning workshop requires less than an hour of ml.trn1.2xlarge time at $1.54 per hour, and an ml.inf2.xlarge at $0.99 per hour. Please ensure you delete the resources when you are finished. 38 | 39 | ## FAQs and known issues 40 | 1. Workshop instructions are available [here](https://catalog.us-east-1.prod.workshops.aws/workshops/bf9d80a3-5e4b-4648-bca8-1d887bb2a9ca/en-US). 41 | 2. If you use the `NousResearch` Llama 3.2 1B, please note you'll need to remove a trailing comma in the model config file. You can do this by using VIM in VSCode.
If you do not take this step, you'll get an error for invalid JSON when trying to read the model config in Lab 1. If editing the file through the terminal is a little challenging, you can also download the config file from this repository with the following command: 42 | `!wget https://github.com/aws-neuron/build-on-trainium-workshop/blob/main/labs/generation_config.json -P /home/ec2-user/environment/models/llama/` 43 | 3. Jupyter kernels can hold on to the NeuronCores as a Python process even after your cell has completed. This can then cause issues when you try to run a new notebook, and sometimes when you try to run another cell. If you encounter a `NeuronCore not found` or similar error, please just restart your Jupyter kernel and/or shut down kernels from previous sessions. You can also restart the instance through the EC2 console. Once your node is back online, you can always check the availability of the NeuronCores with `neuron-ls`. 44 | 4. Want to see how to integrate NKI with NxD? Check out our `nki-llama` [here](https://github.com/aws-samples/nki-llama). 45 | 46 | 47 | ## Security 48 | 49 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. 50 | 51 | ## License 52 | 53 | This project is licensed under the Apache-2.0 License. 54 | 55 | -------------------------------------------------------------------------------- /labs/vLLM/Chess/README.md: -------------------------------------------------------------------------------- 1 | # Chess Model Deployment and Evaluation Lab 2 | 3 | This lab demonstrates deploying a fine-tuned chess model on AWS Trainium using vLLM, and evaluating it through competitive tournaments. 4 | 5 | ## Overview 6 | 7 | Learn how to: 8 | - Deploy a fine-tuned Qwen3 chess model via vLLM on Trainium 9 | - Configure vLLM for optimal throughput with continuous batching 10 | - Run competitive chess tournaments with TrueSkill ratings 11 | - Leverage automatic request concurrency for improved performance 12 | 13 | ## Prerequisites 14 | 15 | 1. **Complete the Fine-Tuning Lab** (recommended): 16 | - [FT-Qwen3-1.7B-chess.ipynb](../../FineTuning/HuggingFaceExample/01_finetuning/FT-Qwen3-1.7B-chess.ipynb) 17 | - Or use a pre-trained chess model from HuggingFace 18 | 19 | 2. **Hardware Requirements**: 20 | - AWS Trainium instance (trn1.2xlarge or larger) 21 | - Neuron SDK 2.25 (included in Workshop Studio / Neuron DLAMI) 22 | 23 | 3. **Software Requirements**: 24 | - Python 3.10+ 25 | - optimum-neuron 0.3.0+ 26 | - Dependencies in `assets/requirements.txt` 27 | 28 | ## Lab Structure 29 | 30 | ### 1. Chess-Deployment.ipynb 31 | **Objective**: Deploy your chess model via vLLM on Trainium 32 | 33 | **What you'll learn**: 34 | - vLLM server setup with Neuron backend 35 | - Model compilation for Trainium (batch_size=4, continuous_batching) 36 | - Testing deployed model with simple games 37 | - Performance validation (latency, throughput) 38 | 39 | **Duration**: 30-40 minutes (including compilation) 40 | 41 | ### 2.
Chess-Tournament.ipynb 42 | **Objective**: Evaluate your model through competitive tournaments 43 | 44 | **What you'll learn**: 45 | - Tournament system with TrueSkill ratings 46 | - Playing multiple games in parallel for throughput 47 | - Analyzing results with metrics (win rate, ELO, ACPL) 48 | - Understanding automatic request batching benefits 49 | 50 | **Duration**: 20-30 minutes 51 | 52 | ## Quick Start 53 | 54 | ```bash 55 | # Navigate to lab directory 56 | cd /home/ubuntu/environment/neuron-workshops/labs/vLLM/Chess 57 | 58 | #install stockfish 59 | sudo apt install stockfish 60 | 61 | # Install dependencies 62 | pip install -r requirements.txt 63 | 64 | # Open and follow the deployment notebooks 65 | jupyter notebook Chess-Deployment.ipynb 66 | jupyter notebook Chess-Tournament.ipynb 67 | ``` 68 | 69 | ## Architecture 70 | 71 | ### Components 72 | 73 | 1. **ChessEnvironment** (`env.py`) 74 | - Game engine powered by python-chess 75 | - Manages turn-based gameplay between agents 76 | - Enforces chess rules and detects game termination 77 | 78 | 2. **VLLMAgent** (`agents/vllm_agent.py`) 79 | - Connects to vLLM server via OpenAI-compatible API 80 | - Parses model outputs to extract UCI chess moves 81 | - Handles retries and error recovery 82 | 83 | 3. **Tournament Scheduler** (`run_game.py`) 84 | - Runs multiple games in parallel using multiprocessing (p_map) 85 | - Implements TrueSkill rating system for agent comparison 86 | - Generates PGN files and statistics 87 | 88 | ### Concurrency Model 89 | 90 | **Two levels of parallelism work together automatically:** 91 | 92 | 1. **Process-Level Parallelism** (`p_map` in run_game.py) 93 | - Runs N games simultaneously in separate processes 94 | - Each game makes independent HTTP requests to vLLM 95 | - Controlled by `--parallelism` flag (default: 4) 96 | 97 | 2. **Request-Level Batching** (vLLM server) 98 | - vLLM server configured with `max_num_seqs=4` 99 | - Continuous batching automatically groups concurrent requests 100 | - When 4 games request moves simultaneously → 1.4x throughput improvement 101 | 102 | **Performance:** 103 | - Single request: ~0.58s latency, 1.72 moves/sec 104 | - 4 concurrent requests: ~1.86s latency per request, 2.15 total moves/sec 105 | - **Throughput improvement: 1.4x** (games complete ~40% faster overall) 106 | 107 | ## Files Overview 108 | 109 | ``` 110 | Chess/ 111 | ├── README.md # This file 112 | ├── Chess-Deployment.ipynb # Lab 1: Model deployment 113 | ├── Chess-Tournament.ipynb # Lab 2: Tournament evaluation 114 | ├── requirements.txt # Python dependencies 115 | └── assets/ 116 | ├── env.example # Environment template 117 | ├── env.py # Chess game environment 118 | ├── chess_renderer.py # Board visualization 119 | ├── run_game.py # Tournament orchestration 120 | ├── example.py # Usage examples 121 | ├── agents/ 122 | │ ├── __init__.py 123 | │ ├── base.py # Abstract agent interface 124 | │ ├── vllm_agent.py # vLLM integration 125 | │ └── stockfish_agent.py # Baseline opponent 126 | └── vllm-server/ 127 | ├── README.md # vLLM setup guide 128 | ├── vllm.sh # Server startup script 129 | ├── compile_model.py # Model compilation 130 | └── start_vllm_python.py # Python server starter 131 | ``` 132 | 133 | ### 3. 
Analyze Model Performance 134 | 135 | Tournament results are saved to `tournament.json` with detailed metrics: 136 | 137 | ```python 138 | import json 139 | 140 | # Load tournament results 141 | with open('tournament_results/tournament.json') as f: 142 | results = json.load(f) 143 | 144 | # Check your model's rating 145 | agent_stats = results['agents']['vllm'] 146 | print(f"Conservative Rating: {agent_stats['conservative']:.1f}") 147 | print(f"Win Rate: {agent_stats['wins'] / agent_stats['games'] * 100:.1f}%") 148 | 149 | # Analyze move quality 150 | metrics = results['engine_metrics']['vllm'] 151 | print(f"Move Accuracy: {metrics['accuracy_pct']:.1f}%") 152 | print(f"Avg Centipawn Loss: {metrics['acpl']:.1f}") 153 | ``` 154 | 155 | ## Troubleshooting 156 | 157 | ### vLLM Server Not Starting 158 | 159 | **Problem**: Server fails with "Neuron cores not available" 160 | 161 | **Solution**: 162 | ```bash 163 | # Check Neuron core usage 164 | neuron-ls 165 | 166 | # Kill processes using cores 167 | pkill -f vllm 168 | 169 | # Restart server 170 | cd assets/vllm-server && bash vllm.sh 171 | ``` 172 | 173 | ### Model Compilation Takes Too Long 174 | 175 | **Problem**: First-time compilation can take 10-30 minutes 176 | 177 | **Solution**: This is expected behavior. Neuron compiles the model for Trainium hardware. Subsequent runs will be fast as compiled artifacts are cached. 178 | 179 | ### Slow Inference Performance 180 | 181 | **Problem**: High latency per move (>1s) 182 | 183 | **Possible causes**: 184 | - Not using compiled model (check for `model.pt` file) 185 | - Wrong tensor parallelism setting (should match cores: tp=2 for trn1.2xlarge) 186 | - `max_num_seqs` mismatch with compiled batch_size 187 | 188 | **Minimum versions:** 189 | - `neuronx-cc`: 2.21 190 | - `optimum-neuron`: 0.3.0+ 191 | 192 | If versions are too old, see [Neuron SDK Installation Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/). 193 | 194 | ## Additional Resources 195 | 196 | - [AWS Neuron Documentation](https://awsdocs-neuron.readthedocs-hosted.com/) 197 | - [vLLM on Neuron Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/vllm-user-guide.html) 198 | - [optimum-neuron Documentation](https://huggingface.co/docs/optimum-neuron) 199 | - [Chess Fine-Tuning Lab](../../FineTuning/HuggingFaceExample/01_finetuning/FT-Qwen3-1.7B-chess.ipynb) 200 | 201 | ## Next Steps 202 | 203 | 1. Complete [Chess-Deployment.ipynb](Chess-Deployment.ipynb) to deploy your model 204 | 2. Run [Chess-Tournament.ipynb](Chess-Tournament.ipynb) to evaluate performance 205 | 3. Experiment with different opponents and tournament configurations 206 | 4. Fine-tune your model further based on tournament results 207 | 5. Deploy to production with learned configurations 208 | 209 | 210 | -------------------------------------------------------------------------------- /labs/vLLM/Servers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Install vLLM\n", 8 | "\n", 9 | "There is a prior version of the SDK that was upstreamed into the main vLLM repository. However, most of the time we want to install from source from the aws-neuron fork. 
\n", 10 | "\n", 11 | "Instructions are available here: https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/vllm-user-guide.html#nxdi-vllm-user-guide\n", 12 | "\n", 13 | "However, the steps are below. Run the next three cells. The pip installs could take 5 minutes.\n", 14 | "\n", 15 | "The AWS workshop environment deploys using a Neuron DLAMI with a recent SDK. If you are deploying this in your own environment, you may need to match the branch to your SDK version or follow the latest instructions at the link above." 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "%%bash\n", 25 | "git clone -b 2.25.0 https://github.com/aws-neuron/upstreaming-to-vllm.git\n" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "Note: you may need to restart the kernel to use updated packages.\n" 38 | ] 39 | } 40 | ], 41 | "source": [ 42 | "%pip install --quiet -r /home/ubuntu/environment/vLLM/upstreaming-to-vllm/requirements/neuron.txt\n", 43 | "#expected to produce no output for 4 or 5 minutes. Remove the --quiet flag if you want to see ALL the packages installed! Or look in the neuron.txt requirements doc." 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 4, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "!VLLM_TARGET_DEVICE=\"neuron\" pip install --quiet -e /home/ubuntu/environment/vLLM/upstreaming-to-vllm/.\n", 53 | "# expected to product no output for 5 or 6 minutes" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "# Download copies of the model to deploy\n", 61 | "We are downloading a copy of the stock Qwen3-8B model as well as the compiled version from Hugging Face.\n" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "!hf download aws-neuron/Qwen3-8BSharded --local-dir /home/ubuntu/environment/qwen3\n", 71 | "#this could take 3-4 minutes" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "!hf download Qwen/Qwen3-8B --local-dir /home/ubuntu/environment/Qwen3-8B --exclude \"*.safetensors\"\n", 81 | "#This is the stock model. It will only take seconds because we don't need to download the weights." 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "# Make sure you restart your kernel\n", 89 | "If you get an error that vllm could not be found, it is because you didn't restart your kernel after installing it above\n", 90 | "\n", 91 | "# Offline inference example\n", 92 | "\n", 93 | "In this example, we load the qwen3 precompiled model artifacts (or NEFF files) and the model presharded for two cores. We do this because of the system memory limitations of the trn1.2xlarge (32GB of system RAM). 
The trn1.2xlarge also has 32GB of device RAM on the Trainium1 device (that has two Neuron cores), but system RAM is (usually) our limiter for compiling.\n", 94 | "\n", 95 | "May take 8 minutes to run" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "import os\n", 105 | "from vllm import LLM, SamplingParams\n", 106 | "os.environ['VLLM_NEURON_FRAMEWORK'] = \"neuronx-distributed-inference\"\n", 107 | "os.environ['NEURON_COMPILED_ARTIFACTS'] = \"/home/ubuntu/environment/qwen3\"\n", 108 | "#os.environ['BASE_COMPILE_WORK_DIR'] = \"/home/ubuntu/qwen3/\"\n", 109 | "llm = LLM(\n", 110 | " model=\"/home/ubuntu/environment/Qwen3-8B\", #model weights\n", 111 | " max_num_seqs=1,\n", 112 | " max_model_len=1024,\n", 113 | " device=\"neuron\",\n", 114 | " tensor_parallel_size=2,\n", 115 | " override_neuron_config={})\n", 116 | "prompts = [\n", 117 | " \"Hello, my name is\",\n", 118 | " \"The president of the United States is\",\n", 119 | " \"The capital of France is\",\n", 120 | " \"The future of AI is\",\n", 121 | "]\n", 122 | "# note that top_k must be set to lower than the global_top_k defined in\n", 123 | "# the neuronx_distributed_inference.models.config.OnDeviceSamplingConfig\n", 124 | "sampling_params = SamplingParams(top_k=10, temperature=0.8, top_p=0.95)\n", 125 | "outputs = llm.generate(prompts, sampling_params)\n", 126 | "for output in outputs:\n", 127 | " prompt = output.prompt\n", 128 | " generated_text = output.outputs[0].text\n", 129 | " print(f\"Prompt: {prompt!r}, Generated text: {generated_text!r}\")\n", 130 | "\n", 131 | "# Free up the Neuron Cores for the next step -- in production, keep the object around to avoid load times and warmup times\n", 132 | "del llm" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "# Online inference example.\n", 140 | "In this case, we are loading the model directly from Hugging Face and compiling what we need as we go (this is a 1.1B parameter model, so it needs less system RAM to compile than the Qwen3-8B example above)\n", 141 | "It may take 5 minutes for the model to download, compile and run.\n\n", 142 | "# Restart your kernel!!\n", 143 | "Restart your kernel before you run the next cell. This will remove the python script and anything it has loaded in the devices.\n" 144 | 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "Because you are running this in a Jupyter notebook, this cell will keep running until you stop it. The server should remain available (and using the Neuron cores) until you stop it. 
\n", 152 | "\n", 153 | "You'll run this cell with different parameters your instructor will be discussing and using the guidellm tool in the Benchmark.ipynb notebook to run against this server.\n", 154 | "\n", 155 | "Run the next cell and wait until you see something like this (it should take about 5 minutes):\n", 156 | "```\n", 157 | "INFO: Started server process [21298]\n", 158 | "INFO: Waiting for application startup.\n", 159 | "INFO: Application startup complete.\n", 160 | "```" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "!VLLM_NEURON_FRAMEWORK='neuronx-distributed-inference' python -m vllm.entrypoints.openai.api_server \\\n", 170 | " --model=\"TinyLlama/TinyLlama-1.1B-Chat-v1.0\" \\\n", 171 | " --max-num-seqs=1 \\\n", 172 | " --max-model-len=1024 \\\n", 173 | " --tensor-parallel-size=2 \\\n", 174 | " --port=8080 \\\n", 175 | " --device \"neuron\" " 176 | ] 177 | } 178 | ], 179 | "metadata": { 180 | "kernelspec": { 181 | "display_name": "aws_neuronx_venv_pytorch_latest", 182 | "language": "python", 183 | "name": "python3" 184 | }, 185 | "language_info": { 186 | "codemirror_mode": { 187 | "name": "ipython", 188 | "version": 3 189 | }, 190 | "file_extension": ".py", 191 | "mimetype": "text/x-python", 192 | "name": "python", 193 | "nbconvert_exporter": "python", 194 | "pygments_lexer": "ipython3", 195 | "version": "3.10.12" 196 | } 197 | }, 198 | "nbformat": 4, 199 | "nbformat_minor": 2 200 | } 201 | -------------------------------------------------------------------------------- /labs/FineTuning/HuggingFaceExample/01_finetuning/assets/finetune_model.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from datasets import load_dataset 3 | from peft import LoraConfig 4 | from transformers import ( 5 | AutoTokenizer, 6 | set_seed, 7 | ) 8 | import os 9 | import subprocess 10 | import boto3 11 | from botocore.exceptions import ClientError 12 | from huggingface_hub import login 13 | import torch 14 | 15 | from optimum.neuron import NeuronHfArgumentParser as HfArgumentParser 16 | from optimum.neuron import NeuronSFTConfig, NeuronSFTTrainer, NeuronTrainingArguments 17 | from torch_xla.core.xla_model import is_master_ordinal 18 | from optimum.neuron.models.training import NeuronModelForCausalLM 19 | 20 | 21 | 22 | def training_function(script_args, training_args): 23 | dataset = load_dataset("b-mc2/sql-create-context", split="train") 24 | dataset = dataset.shuffle(seed=23) 25 | train_dataset = dataset.select(range(50000)) 26 | eval_dataset = dataset.select(range(50000, 50500)) 27 | 28 | def create_conversation(sample): 29 | system_message = ( 30 | "You are a text to SQL query translator. 
Users will ask you questions in English and you will generate a " 31 | "SQL query based on the provided SCHEMA.\nSCHEMA:\n{schema}" 32 | ) 33 | return { 34 | "messages": [ 35 | { 36 | "role": "system", 37 | "content": system_message.format(schema=sample["context"]), 38 | }, 39 | {"role": "user", "content": sample["question"]}, 40 | {"role": "assistant", "content": sample["answer"] + ";"}, 41 | ] 42 | } 43 | 44 | train_dataset = train_dataset.map( 45 | create_conversation, remove_columns=train_dataset.features, batched=False 46 | ) 47 | eval_dataset = eval_dataset.map( 48 | create_conversation, remove_columns=eval_dataset.features, batched=False 49 | ) 50 | 51 | tokenizer = AutoTokenizer.from_pretrained(script_args.tokenizer_id) 52 | # tokenizer.pad_token = tokenizer.eos_token 53 | # tokenizer.eos_token_id = 128001 54 | 55 | trn_config = training_args.trn_config 56 | dtype = torch.bfloat16 if training_args.bf16 else torch.float32 57 | model = NeuronModelForCausalLM.from_pretrained( 58 | script_args.model_id, 59 | trn_config, 60 | torch_dtype=dtype, 61 | # Use FlashAttention2 for better performance and to be able to use larger sequence lengths. 62 | use_flash_attention_2=False, #Because we are training a sequence lower than 2K for the workshop 63 | ) 64 | 65 | config = LoraConfig( 66 | r=script_args.lora_r, 67 | lora_alpha=script_args.lora_alpha, 68 | lora_dropout=script_args.lora_dropout, 69 | target_modules=[ 70 | "q_proj", 71 | "gate_proj", 72 | "v_proj", 73 | "o_proj", 74 | "k_proj", 75 | "up_proj", 76 | "down_proj", 77 | ], 78 | bias="none", 79 | task_type="CAUSAL_LM", 80 | ) 81 | 82 | args = training_args.to_dict() 83 | 84 | sft_config = NeuronSFTConfig( 85 | max_seq_length=1024, 86 | packing=True, 87 | **args, 88 | dataset_kwargs={ 89 | "add_special_tokens": False, 90 | "append_concat_token": True, 91 | }, 92 | ) 93 | 94 | trainer = NeuronSFTTrainer( 95 | args=sft_config, 96 | model=model, 97 | peft_config=config, 98 | tokenizer=tokenizer, 99 | train_dataset=train_dataset, 100 | eval_dataset=eval_dataset, 101 | ) 102 | 103 | # Start training 104 | trainer.train() 105 | del trainer 106 | 107 | 108 | @dataclass 109 | class ScriptArguments: 110 | model_id: str = field( 111 | default="TinyLlama/TinyLlama-1.1B-Chat-v1.0", 112 | metadata={ 113 | "help": "The model that you want to train from the Hugging Face hub." 114 | }, 115 | ) 116 | tokenizer_id: str = field( 117 | default="TinyLlama/TinyLlama-1.1B-Chat-v1.0", 118 | metadata={"help": "The tokenizer used to tokenize text for fine-tuning."}, 119 | ) 120 | lora_r: int = field( 121 | default=16, 122 | metadata={"help": "LoRA r value to be used during fine-tuning."}, 123 | ) 124 | lora_alpha: int = field( 125 | default=32, 126 | metadata={"help": "LoRA alpha value to be used during fine-tuning."}, 127 | ) 128 | lora_dropout: float = field( 129 | default=0.05, 130 | metadata={"help": "LoRA dropout value to be used during fine-tuning."}, 131 | ) 132 | secret_name: str = field( 133 | default="huggingface/token", 134 | metadata={"help": "AWS Secrets Manager secret name containing Hugging Face token."}, 135 | ) 136 | secret_region: str = field( 137 | default="us-west-2", 138 | metadata={"help": "AWS region where the secret is stored."}, 139 | ) 140 | 141 | 142 | def get_secret(secret_name, region_name): 143 | """ 144 | Retrieve a secret from AWS Secrets Manager by searching for secrets with the given name prefix. 145 | This is specific to the workshop environment. 
146 | """ 147 | try: 148 | session = boto3.session.Session() 149 | client = session.client(service_name='secretsmanager', region_name=region_name) 150 | 151 | # List secrets and find one that starts with the secret_name 152 | paginator = client.get_paginator('list_secrets') 153 | for page in paginator.paginate(): 154 | for secret in page['SecretList']: 155 | if secret['Name'].startswith(secret_name): 156 | response = client.get_secret_value(SecretId=secret['ARN']) 157 | if 'SecretString' in response: 158 | return response['SecretString'] 159 | return None 160 | except ClientError: 161 | print("Could not retrieve secret from AWS Secrets Manager") 162 | return None 163 | 164 | if __name__ == "__main__": 165 | parser = HfArgumentParser([ScriptArguments, NeuronTrainingArguments]) 166 | script_args, training_args = parser.parse_args_into_dataclasses() 167 | 168 | # Check for Hugging Face token in environment variable 169 | hf_token = os.environ.get("HF_TOKEN") 170 | 171 | # If no token in environment, try to get it from AWS Secrets Manager 172 | if not hf_token: 173 | print("No Hugging Face token found in environment, checking AWS Secrets Manager...") 174 | hf_token = get_secret(script_args.secret_name, script_args.secret_region) 175 | 176 | # Login to Hugging Face if a valid token is found 177 | if hf_token: 178 | print("Logging in to Hugging Face Hub...") 179 | login(token=hf_token) 180 | else: 181 | print("No valid Hugging Face token found, continuing without authentication") 182 | 183 | set_seed(training_args.seed) 184 | training_function(script_args, training_args) 185 | 186 | # Consolidate LoRA adapter shards, merge LoRA adapters into base model, save merged model 187 | if is_master_ordinal(): 188 | input_ckpt_dir = os.path.join( 189 | training_args.output_dir, f"checkpoint-{training_args.max_steps}" 190 | ) 191 | output_ckpt_dir = os.path.join(training_args.output_dir, "merged_model") 192 | # the spawned process expects to see 2 NeuronCores for consolidating checkpoints with a tp=2 193 | # Either the second core isn't really used or it is freed up by the other thread finishing. 194 | # Adjusting Neuron env. var to advertise 2 NeuronCores to the process. 
195 | env = os.environ.copy() 196 | env["NEURON_RT_VISIBLE_CORES"] = "0-1" 197 | subprocess.run( 198 | [ 199 | "python3", 200 | "consolidate_adapter_shards_and_merge_model.py", 201 | "-i", 202 | input_ckpt_dir, 203 | "-o", 204 | output_ckpt_dir, 205 | ], 206 | env=env 207 | ) -------------------------------------------------------------------------------- /labs/FineTuning/HuggingFaceExample/01_finetuning/assets/finetune_chess_model.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from datasets import load_dataset 3 | from peft import LoraConfig 4 | from transformers import ( 5 | AutoTokenizer, 6 | set_seed, 7 | ) 8 | import os 9 | import subprocess 10 | import boto3 11 | from botocore.exceptions import ClientError 12 | from huggingface_hub import login 13 | import torch 14 | 15 | from optimum.neuron import NeuronHfArgumentParser as HfArgumentParser 16 | from optimum.neuron import NeuronSFTConfig, NeuronSFTTrainer, NeuronTrainingArguments 17 | from torch_xla.core.xla_model import is_master_ordinal 18 | from optimum.neuron.models.training import NeuronModelForCausalLM 19 | 20 | 21 | 22 | def training_function(script_args, training_args): 23 | dataset = load_dataset("aicrowd/ChessExplained", split="train") 24 | dataset = dataset.shuffle(seed=23) 25 | 26 | # Use appropriate dataset size - adjust based on total available 27 | # Using smaller subset for workshop timing (~25 minutes training) 28 | train_dataset = dataset.select(range(min(50000, len(dataset)))) 29 | eval_dataset_end = min(50500, len(dataset)) 30 | eval_dataset_start = min(50000, len(dataset)) 31 | if eval_dataset_end > eval_dataset_start: 32 | eval_dataset = dataset.select(range(eval_dataset_start, eval_dataset_end)) 33 | else: 34 | # If dataset is too small, use a small portion of training data for eval 35 | eval_dataset = dataset.select(range(min(500, len(dataset)))) 36 | 37 | def parse_chess_conversation(sample): 38 | """ 39 | Parse the chess dataset's pre-formatted conversations. 40 | The 'text' column contains conversations with <|im_start|> and <|im_end|> tags. 41 | Assistant responses include both reasoning and output. 
42 | """ 43 | text = sample["text"] 44 | messages = [] 45 | 46 | # Split by <|im_start|> to get each message block 47 | parts = text.split("<|im_start|>") 48 | 49 | for part in parts[1:]: # Skip first empty part 50 | if "<|im_end|>" in part: 51 | content = part.split("<|im_end|>")[0] 52 | if content.startswith("user\n"): 53 | messages.append({ 54 | "role": "user", 55 | "content": content.replace("user\n", "", 1).strip() 56 | }) 57 | elif content.startswith("assistant\n"): 58 | messages.append({ 59 | "role": "assistant", 60 | "content": content.replace("assistant\n", "", 1).strip() 61 | }) 62 | elif content.startswith("system\n"): 63 | messages.append({ 64 | "role": "system", 65 | "content": content.replace("system\n", "", 1).strip() 66 | }) 67 | 68 | return {"messages": messages} 69 | 70 | train_dataset = train_dataset.map( 71 | parse_chess_conversation, remove_columns=train_dataset.features, batched=False 72 | ) 73 | eval_dataset = eval_dataset.map( 74 | parse_chess_conversation, remove_columns=eval_dataset.features, batched=False 75 | ) 76 | 77 | tokenizer = AutoTokenizer.from_pretrained(script_args.tokenizer_id) 78 | # tokenizer.pad_token = tokenizer.eos_token 79 | # tokenizer.eos_token_id = 128001 80 | 81 | trn_config = training_args.trn_config 82 | dtype = torch.bfloat16 if training_args.bf16 else torch.float32 83 | model = NeuronModelForCausalLM.from_pretrained( 84 | script_args.model_id, 85 | trn_config, 86 | torch_dtype=dtype, 87 | # Use FlashAttention2 for better performance and to be able to use larger sequence lengths. 88 | use_flash_attention_2=False, #Because we are training a sequence lower than 2K for the workshop 89 | ) 90 | 91 | config = LoraConfig( 92 | r=script_args.lora_r, 93 | lora_alpha=script_args.lora_alpha, 94 | lora_dropout=script_args.lora_dropout, 95 | target_modules=[ 96 | "q_proj", 97 | "gate_proj", 98 | "v_proj", 99 | "o_proj", 100 | "k_proj", 101 | "up_proj", 102 | "down_proj", 103 | ], 104 | bias="none", 105 | task_type="CAUSAL_LM", 106 | ) 107 | 108 | args = training_args.to_dict() 109 | 110 | sft_config = NeuronSFTConfig( 111 | max_seq_length=1024, 112 | packing=True, 113 | **args, 114 | dataset_kwargs={ 115 | "add_special_tokens": False, 116 | "append_concat_token": True, 117 | }, 118 | ) 119 | 120 | trainer = NeuronSFTTrainer( 121 | args=sft_config, 122 | model=model, 123 | peft_config=config, 124 | tokenizer=tokenizer, 125 | train_dataset=train_dataset, 126 | eval_dataset=eval_dataset, 127 | ) 128 | 129 | # Start training 130 | trainer.train() 131 | del trainer 132 | 133 | 134 | @dataclass 135 | class ScriptArguments: 136 | model_id: str = field( 137 | default="Qwen/Qwen3-1.7B", 138 | metadata={ 139 | "help": "The model that you want to train from the Hugging Face hub." 
140 | }, 141 | ) 142 | tokenizer_id: str = field( 143 | default="Qwen/Qwen3-1.7B", 144 | metadata={"help": "The tokenizer used to tokenize text for fine-tuning."}, 145 | ) 146 | lora_r: int = field( 147 | default=16, 148 | metadata={"help": "LoRA r value to be used during fine-tuning."}, 149 | ) 150 | lora_alpha: int = field( 151 | default=32, 152 | metadata={"help": "LoRA alpha value to be used during fine-tuning."}, 153 | ) 154 | lora_dropout: float = field( 155 | default=0.05, 156 | metadata={"help": "LoRA dropout value to be used during fine-tuning."}, 157 | ) 158 | secret_name: str = field( 159 | default="huggingface/token", 160 | metadata={"help": "AWS Secrets Manager secret name containing Hugging Face token."}, 161 | ) 162 | secret_region: str = field( 163 | default="us-west-2", 164 | metadata={"help": "AWS region where the secret is stored."}, 165 | ) 166 | 167 | 168 | def get_secret(secret_name, region_name): 169 | """ 170 | Retrieve a secret from AWS Secrets Manager by searching for secrets with the given name prefix. 171 | This is specific to the workshop environment. 172 | """ 173 | try: 174 | session = boto3.session.Session() 175 | client = session.client(service_name='secretsmanager', region_name=region_name) 176 | 177 | # List secrets and find one that starts with the secret_name 178 | paginator = client.get_paginator('list_secrets') 179 | for page in paginator.paginate(): 180 | for secret in page['SecretList']: 181 | if secret['Name'].startswith(secret_name): 182 | response = client.get_secret_value(SecretId=secret['ARN']) 183 | if 'SecretString' in response: 184 | return response['SecretString'] 185 | return None 186 | except ClientError: 187 | print("Could not retrieve secret from AWS Secrets Manager") 188 | return None 189 | 190 | if __name__ == "__main__": 191 | parser = HfArgumentParser([ScriptArguments, NeuronTrainingArguments]) 192 | script_args, training_args = parser.parse_args_into_dataclasses() 193 | 194 | # Check for Hugging Face token in environment variable 195 | hf_token = os.environ.get("HF_TOKEN") 196 | 197 | # If no token in environment, try to get it from AWS Secrets Manager 198 | if not hf_token: 199 | print("No Hugging Face token found in environment, checking AWS Secrets Manager...") 200 | hf_token = get_secret(script_args.secret_name, script_args.secret_region) 201 | 202 | # Login to Hugging Face if a valid token is found 203 | if hf_token: 204 | print("Logging in to Hugging Face Hub...") 205 | login(token=hf_token) 206 | else: 207 | print("No valid Hugging Face token found, continuing without authentication") 208 | 209 | set_seed(training_args.seed) 210 | training_function(script_args, training_args) 211 | 212 | # Consolidate LoRA adapter shards, merge LoRA adapters into base model, save merged model 213 | if is_master_ordinal(): 214 | input_ckpt_dir = os.path.join( 215 | training_args.output_dir, f"checkpoint-{training_args.max_steps}" 216 | ) 217 | output_ckpt_dir = os.path.join(training_args.output_dir, "merged_model") 218 | # the spawned process expects to see 2 NeuronCores for consolidating checkpoints with a tp=2 219 | # Either the second core isn't really used or it is freed up by the other thread finishing. 220 | # Adjusting Neuron env. var to advertise 2 NeuronCores to the process. 
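        # Note: consolidate_adapter_shards_and_merge_model.py consolidates the sharded LoRA adapter
        # checkpoints, merges the adapter into the base weights, and writes the result to
        # output_ckpt_dir (the merged_model/ directory that the notebooks later export with optimum-cli).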
221 | env = os.environ.copy() 222 | env["NEURON_RT_VISIBLE_CORES"] = "0-1" 223 | subprocess.run( 224 | [ 225 | "python3", 226 | "consolidate_adapter_shards_and_merge_model.py", 227 | "-i", 228 | input_ckpt_dir, 229 | "-o", 230 | output_ckpt_dir, 231 | ], 232 | env=env 233 | ) 234 | 235 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | -------------------------------------------------------------------------------- /labs/FineTuning/HuggingFaceExample/01_finetuning/Finetune-Qwen3-1.7B.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Intro\n", 8 | "In this notebook, we showcase how to fine-tune the Qwen3-1.7B model on AWS Trainium using the Hugging Face Optimum Neuron library.\n", 9 | "The goal of this task is Text-to-SQL generation — training the model to translate natural language questions into executable SQL queries.\n", 10 | "\n", 11 | "We will fine-tune the model using `optimum.neuron`, save the trained checkpoint, and then deploy it for inference with Optimum-Neuron[vllm], enabling high-performance, low-latency Text-to-SQL execution.\n", 12 | "\n", 13 | "By the end of this notebook, you’ll have a fine-tuned, Trainium-optimized Qwen3 model ready for deployment and real-time inference. 
This workflow demonstrates how to leverage the Optimum Neuron toolchain to efficiently train and serve large language models on AWS Neuron devices.\n", 14 | "\n", 15 | "For this module, you will be using the [b-mc2/sql-create-context](https://huggingface.co/datasets/b-mc2/sql-create-context) dataset which consists of thousands of examples of SQL schemas, questions about the schemas, and SQL queries intended to answer the questions.\n", 16 | "\n", 17 | "*Dataset example 1:*\n", 18 | "* *SQL schema/context:* `CREATE TABLE management (department_id VARCHAR); CREATE TABLE department (department_id VARCHAR)`\n", 19 | "* *Question:* `How many departments are led by heads who are not mentioned?`\n", 20 | "* *SQL query/answer:* `SELECT COUNT(*) FROM department WHERE NOT department_id IN (SELECT department_id FROM management)`\n", 21 | "\n", 22 | "*Dataset example 2:*\n", 23 | "* *SQL schema/context:* `CREATE TABLE courses (course_name VARCHAR, course_id VARCHAR); CREATE TABLE student_course_registrations (student_id VARCHAR, course_id VARCHAR)`\n", 24 | "* *Question:* `What are the ids of all students for courses and what are the names of those courses?`\n", 25 | "* *SQL query/answer:* `SELECT T1.student_id, T2.course_name FROM student_course_registrations AS T1 JOIN courses AS T2 ON T1.course_id = T2.course_id`\n", 26 | "\n", 27 | "By fine-tuning the model over several thousand of these text-to-SQL examples, the model will then learn how to generate an appropriate SQL query when presented with a SQL context and a free-form question.\n", 28 | "\n", 29 | "This text-to-SQL use case was selected so you can successfully fine-tune your model in a reasonably short amount of time (~25 minutes) which is appropriate for this workshop. Although this is a relatively simple use case, please keep in mind that the same techniques and components used in this module can also be applied to fine-tune LLMs for more advanced use cases such as writing code, summarizing documents, creating blog posts - the possibilities are endless!\n", 30 | "\n", 31 | "# Install requirements\n", 32 | "This notebook uses [Hugging Face Optimum Neuron](https://github.com/huggingface/optimum-neuron) which works like an interface between the Hugging Face Transformers library and AWS Accelerators including AWS Trainium and AWS Inferentia. We will also install some other libraries like peft, trl etc.\n" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "%cd /home/ubuntu/environment/FineTuning/HuggingFaceExample/01_finetuning/assets\n", 42 | "%pip install -r requirements.txt" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "# Fine-tuning\n", 50 | "\n", 51 | "In this section, we fine-tune the Qwen3-1.7B model on the Text-to-SQL task using Hugging Face Optimum Neuron. Here are the parameters we are going to pass - \n", 52 | "\n", 53 | "1. `--nnodes`:\tNumber of nodes (1 = single node)\n", 54 | "2. `--nproc_per_node`: \tProcesses per node (usually equals number of devices).\n", 55 | "3. `--model_id, --tokenizer_id`:\tModel and tokenizer identifiers (from Hugging Face or local path).\n", 56 | "4. `--output_dir`:\tDirectory for saving checkpoints and logs.\n", 57 | "5. `--bf16`:\tEnables bfloat16 precision for faster, memory-efficient training.\n", 58 | "5. `--gradient_checkpointing`:\tSaves memory by recomputing activations during backprop.\n", 59 | "6. 
`--gradient_accumulation_steps`:\tSteps to accumulate gradients before optimizer update.\n", 60 | "7. `--learning_rate`:\tInitial training learning rate.\n", 61 | "8. `--max_steps`:\tTotal number of training steps.\n", 62 | "9. `--per_device_train_batch_size`:\tBatch size per device.\n", 63 | "10. `--tensor_parallel_size`:\tNumber of devices for tensor parallelism.\n", 64 | "11. `--lora_r, --lora_alpha, --lora_dropout`:\tLoRA hyperparameters — rank, scaling, and dropout rate.\n", 65 | "12. `--dataloader_drop_last`:\tDrops last incomplete batch.\n", 66 | "13. `--disable_tqdm`: Disables progress bar.\n", 67 | "14. `--logging_steps`:\tLog interval (in steps)." 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "!torchrun \\\n", 77 | " --nnodes 1 \\\n", 78 | " --nproc_per_node 2 \\\n", 79 | " finetune_model.py \\\n", 80 | " --model_id Qwen/Qwen3-1.7B \\\n", 81 | " --tokenizer_id Qwen/Qwen3-1.7B \\\n", 82 | " --output_dir ~/environment/ml/qwen \\\n", 83 | " --bf16 True \\\n", 84 | " --gradient_checkpointing True \\\n", 85 | " --gradient_accumulation_steps 1 \\\n", 86 | " --learning_rate 5e-5 \\\n", 87 | " --max_steps 1000 \\\n", 88 | " --per_device_train_batch_size 2 \\\n", 89 | " --tensor_parallel_size 2 \\\n", 90 | " --lora_r 16 \\\n", 91 | " --lora_alpha 32 \\\n", 92 | " --lora_dropout 0.05 \\\n", 93 | " --dataloader_drop_last True \\\n", 94 | " --disable_tqdm True \\\n", 95 | " --logging_steps 10" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "# Compilation\n", 103 | "\n", 104 | "After completing the fine-tuning process, the next step is to compile the trained model for AWS Trainium inference using the Hugging Face Optimum Neuron toolchain.\n", 105 | "Neuron compilation optimizes the model graph and converts it into a Neuron Executable File Format (NEFF), enabling efficient execution on NeuronCores." 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "!optimum-cli export neuron \\\n", 115 | " --model /home/ubuntu/environment/ml/qwen/merged_model \\\n", 116 | " --task text-generation \\\n", 117 | " --sequence_length 512 \\\n", 118 | " --batch_size 1 \\\n", 119 | " /home/ubuntu/environment/ml/qwen/compiled_model" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "# Inference\n", 127 | "\n", 128 | "We will install the Optimum Neuron vllm library. Then, run inference using the compiled model." 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "%pip install optimum-neuron[vllm]\n" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "import os\n", 147 | "from vllm import LLM, SamplingParams\n", 148 | "llm = LLM(\n", 149 | " model=\"/home/ubuntu/environment/ml/qwen/compiled_model\", #local compiled model\n", 150 | " max_num_seqs=1,\n", 151 | " max_model_len=2048,\n", 152 | " device=\"neuron\",\n", 153 | " tensor_parallel_size=2,\n", 154 | " override_neuron_config={})\n", 155 | "example1=\"\"\"\n", 156 | "<|im_start|>system\n", 157 | "You are a text to SQL query translator. 
Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.\n", 158 | "SCHEMA:\n", 159 | "CREATE TABLE management (department_id VARCHAR); CREATE TABLE department (department_id VARCHAR)<|im_end|>\n", 160 | "<|im_start|>user\n", 161 | "How many departments are led by heads who are not mentioned?<|im_end|>\n", 162 | "<|im_start|>assistant\n", 163 | "\"\"\"\n", 164 | "example2=\"\"\"\n", 165 | "<|im_start|>system\n", 166 | "You are a text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.\n", 167 | "SCHEMA:\n", 168 | "CREATE TABLE courses (course_name VARCHAR, course_id VARCHAR); CREATE TABLE student_course_registrations (student_id VARCHAR, course_id VARCHAR)<|im_end|>\n", 169 | "<|im_start|>user\n", 170 | "What are the ids of all students for courses and what are the names of those courses?<|im_end|>\n", 171 | "<|im_start|>assistant\n", 172 | "\"\"\"\n", 173 | "example3=\"\"\"\n", 174 | "<|im_start|>system\n", 175 | "You are a text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.\n", 176 | "SCHEMA:\n", 177 | "CREATE TABLE table_name_9 (wins INTEGER, year VARCHAR, team VARCHAR, points VARCHAR)<|im_end|>\n", 178 | "<|im_start|>user\n", 179 | "Which highest wins number had Kawasaki as a team, 95 points, and a year prior to 1981?<|im_end|>\n", 180 | "<|im_start|>assistant\n", 181 | "\"\"\"\n", 182 | "\n", 183 | "prompts = [\n", 184 | " example1,\n", 185 | " example2,\n", 186 | " example3\n", 187 | "]\n", 188 | "\n", 189 | "sampling_params = SamplingParams(max_tokens=2048, temperature=0.8)\n", 190 | "outputs = llm.generate(prompts, sampling_params)\n", 191 | "\n", 192 | "print(\"#########################################################\")\n", 193 | "\n", 194 | "for output in outputs:\n", 195 | " prompt = output.prompt\n", 196 | " generated_text = output.outputs[0].text\n", 197 | " print(f\"Prompt: {prompt!r}, \\n\\n Generated text: {generated_text!r} \\n\")" 198 | ] 199 | } 200 | ], 201 | "metadata": { 202 | "kernelspec": { 203 | "display_name": "aws_neuronx_venv_pytorch_latest", 204 | "language": "python", 205 | "name": "python3" 206 | }, 207 | "language_info": { 208 | "codemirror_mode": { 209 | "name": "ipython", 210 | "version": 3 211 | }, 212 | "file_extension": ".py", 213 | "mimetype": "text/x-python", 214 | "name": "python", 215 | "nbconvert_exporter": "python", 216 | "pygments_lexer": "ipython3", 217 | "version": "3.10.12" 218 | } 219 | }, 220 | "nbformat": 4, 221 | "nbformat_minor": 2 222 | } 223 | -------------------------------------------------------------------------------- /labs/NKI/Lab_Three_NKI_Custom_Operators.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Writing custom PyTyorch Operators with NKI\n", 8 | "\n", 9 | "This workshop was borrowed from the AWS NKI Workshop. 
To find the full original content, see here:\n", 10 | "- Workshop: https://catalog.us-east-1.prod.workshops.aws/workshops/0d84c975-7a94-469a-b6bc-661768d303f7/en-US/lab-0\n", 11 | "- Github: https://github.com/aws-samples/ml-specialized-hardware/tree/main/workshops/03_NKIWorkshop\n", 12 | "\n", 13 | "This notebook demonstrates how to insert a NKI kernel as a custom operators into a PyTorch.\n", 14 | "\n", 15 | "## Using NKI kernels\n", 16 | "To register a NKI kernel registration, you need to call a decorated NKI function.\n", 17 | "\n", 18 | "Let’s examine a guiding example below where we randomly initialize two inputs, add them together, and then multiply the result by the two input tensors element-wise. This effectively calculates: `a * b * (a + b)`.\n", 19 | "\n", 20 | "We define a common NKI kernel for addition. For more information on the kernel, see [SPMD Tensor Addition](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/tutorials/spmd_tensor_addition.html)." 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import neuronxcc.nki as nki\n", 30 | "import neuronxcc.nki.language as nl\n", 31 | "\n", 32 | "@nki.jit\n", 33 | "def nki_tensor_add_kernel_(a_input, b_input):\n", 34 | " \"\"\"NKI kernel to compute element-wise addition of two input tensors\n", 35 | " \n", 36 | " This kernel assumes strict input/output sizes can be uniformly tiled to [128,512]\n", 37 | "\n", 38 | " Args:\n", 39 | " a_input: a first input tensor\n", 40 | " b_input: a second input tensor\n", 41 | "\n", 42 | " Returns:\n", 43 | " c_output: an output tensor\n", 44 | " \"\"\"\n", 45 | "\n", 46 | " # Create output tensor shared between all SPMD instances as result tensor\n", 47 | " c_output = nl.ndarray(a_input.shape, dtype=a_input.dtype, buffer=nl.shared_hbm)\n", 48 | "\n", 49 | " # Calculate tile offsets based on current 'program'\n", 50 | " offset_i_x = nl.program_id(0) * 128\n", 51 | " offset_i_y = nl.program_id(1) * 512\n", 52 | "\n", 53 | " # Generate tensor indices to index tensors a and b\n", 54 | " ix_, iy_ = nl.mgrid[0:128, 0:512]\n", 55 | " ix = offset_i_x + ix_\n", 56 | " iy = offset_i_y + iy_\n", 57 | "\n", 58 | " # Load input data from device memory (HBM) to on-chip memory (SBUF)\n", 59 | " # We refer to an indexed portion of a tensor as an intermediate tensor\n", 60 | " a_tile = nl.load(a_input[ix, iy])\n", 61 | " b_tile = nl.load(b_input[ix, iy])\n", 62 | "\n", 63 | " # compute a + b\n", 64 | " c_tile = a_tile + b_tile\n", 65 | "\n", 66 | " # store the addition results back to device memory (c_output)\n", 67 | " nl.store(c_output[ix, iy], value=c_tile)\n", 68 | "\n", 69 | " # Transfer the ownership of `c_output` to the caller\n", 70 | " return c_output" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "## PyTorch\n", 78 | "We can perform `(a + b) * a * b` using native PyTorch code." 
79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "import torch\n", 88 | "from torch_xla.core import xla_model as xm\n", 89 | "\n", 90 | "device = xm.xla_device()\n", 91 | "\n", 92 | "a = torch.randn(256, 1024, dtype=torch.float32).to(device)\n", 93 | "b = torch.randn(256, 1024, dtype=torch.float32).to(device)\n", 94 | "c = a + b\n", 95 | "out = a * b * c\n", 96 | "\n", 97 | "print(out)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "Now let’s replace the tensor addition (`c = a + b`) with a NKI kernel. To do this we replace the `+` operator with a call to the NKI kernel caller (`nki_tensor_add`), and everything else works as before." 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "def nki_tensor_add(a_input, b_input):\n", 114 | " \"\"\"NKI kernel caller to compute element-wise addition of two input tensors\n", 115 | "\n", 116 | " This kernel caller lifts tile-size restriction, by applying the kernel on tiles of the inputs/outputs\n", 117 | "\n", 118 | " Args:\n", 119 | " a_input: a first input tensor, of shape [N*128, M*512]\n", 120 | " b_input: a second input tensor, of shape [N*128, M*512]\n", 121 | "\n", 122 | " Returns:\n", 123 | " a tensor of shape [N*128, M*512], the result of a_input + b_input\n", 124 | " \"\"\"\n", 125 | "\n", 126 | " # The SPMD launch grid denotes the number of kernel instances.\n", 127 | " # In this case, we use a 2D grid where the size of each invocation is 128x512\n", 128 | " grid_x = a_input.shape[0] // 128\n", 129 | " grid_y = a_input.shape[1] // 512\n", 130 | "\n", 131 | " return nki_tensor_add_kernel_[grid_x, grid_y](a_input, b_input)\n", 132 | "\n", 133 | "device = xm.xla_device()\n", 134 | "a = torch.randn(256, 1024, dtype=torch.float32).to(device)\n", 135 | "b = torch.randn(256, 1024, dtype=torch.float32).to(device)\n", 136 | "c = nki_tensor_add(a, b) # calling a NKI kernel, instead of the built-in torch op\n", 137 | "out = a * b * c\n", 138 | "print(out)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "To understand what happens under the hood when we compile the above code, we can print HLO IR graph generated by XLA by setting the `NEURON_FRAMEWORK_DEBUG` environment variable. For example, you may add the following lines to your code:" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "import os\n", 155 | "os.environ['NEURON_FRAMEWORK_DEBUG'] = \"1\"" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "A `.pbtxt` file is then written in your run directory that has the corresponding human-readable HLO IR.\n", 163 | "\n", 164 | "Let’s examine the XLA output of this example. In line #5 we can identify that the tensor addition is now mapped to an HLO `custom-call` instruction, with `AwsNeuronCustomNativeKernel` as `custom_call_target`. 
The output of that `custom-call` is then consumed by the next instruction in line #6 as usual.\n", 165 | "\n", 166 | "```python\n", 167 | "ENTRY %SyncTensorsGraph.22 (p0.2: f32[256,1024], p1.2: f32[256,1024]) -> (f32[256,1024]) {\n", 168 | " %p1.2 = f32[256,1024]{1,0} parameter(1), frontend_attributes={neff_input_name=\"input1\"}\n", 169 | " %p0.2 = f32[256,1024]{1,0} parameter(0), frontend_attributes={neff_input_name=\"input0\"}\n", 170 | " %multiply = f32[256,1024]{1,0} multiply(f32[256,1024]{1,0} %p1.2, f32[256,1024]{1,0} %p0.2)\n", 171 | " %custom-call.2 = f32[256,1024]{1,0} custom-call(f32[256,1024]{1,0} %p1.2, f32[256,1024]{1,0} %p0.2), custom_call_target=\"AwsNeuronCustomNativeKernel\", api_version=API_VERSION_UNSPECIFIED, backend_config=\"...\")\n", 172 | " %multiply.1 = f32[256,1024]{1,0} multiply(f32[256,1024]{1,0} %multiply, f32[256,1024]{1,0} %custom-call.2)\n", 173 | " ROOT %tuple = (f32[256,1024]{1,0}) tuple(f32[256,1024]{1,0} %multiply.1), frontend_attributes={neff_output_names=\"output0\"}\n", 174 | "}\n", 175 | "```\n", 176 | "\n", 177 | "The Neuron compiler replaces the above custom-call with the corresponding NKI kernel implementation while optimizing the rest of the compute graph as usual. At the end of the compilation process, a single compiled binary NEFF file is generated representing the entire graph including the NKI kernel. For more information about NEFF files, see [Neuron Compiler](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/compiler/index.html)." 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "## Using NKI in training graphs\n", 185 | "\n", 186 | "If you are using NKI to implement a new operator in a training graph, you might need to make the new operator interplay with the `autograd` engine in the framework. To do this, in PyTorch, you can subclass the framework’s base operator class and implement both the `forward()` and `backward()` methods. The `autograd` engine then uses the `backward()` method when performing auto-differentiation. See Extending [torch.autograd](https://pytorch.org/docs/stable/notes/extending.html) in the PyTorch Docs for instructions on doing this in PyTorch.\n", 187 | "\n", 188 | "Let’s reuse the `nki_tensor_add` kernel from before and demonstrate how to train a simple compute graph `(a+b)*a*b` in PyTorch.\n", 189 | "\n", 190 | "## PyTorch\n", 191 | "\n", 192 | "We define a `NkiAddFunc` class, which leverages the `nki_tensor_add` kernel in its `forward()` function. The gradients of both input tensors in `y = a + b` are ones, so the `backward()` function propagates the `dy` gradients from the previous backward function." 
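Concretely, since $y = a + b$, the local derivatives are $\partial y / \partial a = \partial y / \partial b = 1$, so for any upstream gradient $dy$ the chain rule gives $\partial L / \partial a = \partial L / \partial b = dy$; this is why `backward()` in the code below simply returns `dy, dy`.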
193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "import torch\n", 202 | "import torch_xla.core.xla_model as xm\n", 203 | "device = xm.xla_device()\n", 204 | "\n", 205 | "class NkiAddFunc(torch.autograd.Function):\n", 206 | " @staticmethod\n", 207 | " def forward(ctx, a, b):\n", 208 | " return nki_tensor_add(a, b)\n", 209 | "\n", 210 | " @staticmethod\n", 211 | " def backward(ctx, dy, *args):\n", 212 | " # gradients for a and b\n", 213 | " return dy, dy\n", 214 | "\n", 215 | "# now, let's define the compute graph\n", 216 | "a = torch.randn(256, 1024, dtype=torch.float32).to(device).detach().requires_grad_()\n", 217 | "b = torch.randn(256, 1024, dtype=torch.float32).to(device).detach().requires_grad_()\n", 218 | "c = NkiAddFunc.apply(a, b)\n", 219 | "out = a * b * c\n", 220 | "\n", 221 | "# here we define a (dummy) loss-function, in prep for backward propagation\n", 222 | "loss = out.sum()\n", 223 | "\n", 224 | "# lastly, let's invoke the auto-grad engine\n", 225 | "loss.backward()\n", 226 | "\n", 227 | "xm.mark_step()" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "## Release the NeuronCore for the next notebook\n", 235 | "\n", 236 | "Before moving to the next notebook we need to release the NeuronCore. If we don't do this the next notebook will not be able resources - you can also stop the kernel via the GUI" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "import IPython\n", 246 | "IPython.Application.instance().kernel.do_shutdown(True)" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [] 255 | } 256 | ], 257 | "metadata": { 258 | "kernelspec": { 259 | "display_name": "Python 3 (ipykernel)", 260 | "language": "python", 261 | "name": "python3" 262 | }, 263 | "language_info": { 264 | "codemirror_mode": { 265 | "name": "ipython", 266 | "version": 3 267 | }, 268 | "file_extension": ".py", 269 | "mimetype": "text/x-python", 270 | "name": "python", 271 | "nbconvert_exporter": "python", 272 | "pygments_lexer": "ipython3", 273 | "version": "3.10.12" 274 | } 275 | }, 276 | "nbformat": 4, 277 | "nbformat_minor": 4 278 | } 279 | -------------------------------------------------------------------------------- /labs/vLLM/Chess/assets/example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | AWS Trainium Chess Workshop - Example Code 4 | 5 | Demonstrates practical usage patterns for deploying and testing your fine-tuned 6 | chess model with vLLM and Stockfish baselines. 
7 | 8 | For CLI usage and tournaments, see: python run_game.py --help 9 | """ 10 | 11 | import sys 12 | from agents import VLLMAgent, StockfishAgent, RandomAgent 13 | from env import ChessEnvironment 14 | 15 | 16 | def test_vllm_connection(): 17 | """Test connection to vLLM server.""" 18 | print("=== Testing vLLM Connection ===\n") 19 | 20 | agent = VLLMAgent() 21 | 22 | if agent.test_connection(): 23 | print("✓ vLLM server is running and accessible") 24 | print(f" Base URL: {agent.base_url}") 25 | print(f" Model: {agent.model}") 26 | print(f" Temperature: {agent.temperature}") 27 | print(f" Max tokens: {agent.max_tokens}") 28 | return True 29 | else: 30 | print("✗ Cannot connect to vLLM server") 31 | print("\nTroubleshooting:") 32 | print("1. Check if server is running: lsof -i :8000") 33 | print("2. Start server: cd vllm-server && bash vllm.sh") 34 | print("3. View logs: tail -f vllm-server/vllm-server.log") 35 | return False 36 | 37 | 38 | def quick_test_game(): 39 | """Run a quick test game with verbose output.""" 40 | print("\n=== Quick Test Game: vLLM vs Random ===\n") 41 | 42 | # Create agents 43 | vllm = VLLMAgent() 44 | random = RandomAgent() 45 | 46 | # Create environment with short game for testing 47 | env = ChessEnvironment(vllm, random, max_moves=20, time_limit=30.0) 48 | 49 | print(f"White: {vllm.__class__.__name__}") 50 | print(f"Black: {random.__class__.__name__}") 51 | print(f"Max moves: {env.max_moves}") 52 | print(f"Time limit: {env.time_limit}s per move\n") 53 | 54 | # Play game with verbose output 55 | result = env.play_game(verbose=True) 56 | 57 | # Display results 58 | print("\n" + "="*50) 59 | print("GAME RESULTS") 60 | print("="*50) 61 | print(f"Result: {result['result']}") 62 | print(f"Moves played: {result['moves_played']}") 63 | print(f"Game over reason: {result['game_over_reason']}") 64 | print(f"White illegal attempts: {result['white_illegal_attempts']}") 65 | print(f"Black illegal attempts: {result['black_illegal_attempts']}") 66 | 67 | # Export to PGN 68 | pgn_file = "quick_test.pgn" 69 | if env.export_pgn_file(pgn_file): 70 | print(f"\n✓ Game exported to {pgn_file}") 71 | 72 | return result 73 | 74 | 75 | def test_against_stockfish(): 76 | """Test vLLM agent against Stockfish baseline.""" 77 | print("\n=== Testing Against Stockfish Baseline ===\n") 78 | 79 | # Create agents 80 | vllm = VLLMAgent() 81 | stockfish = StockfishAgent(skill_level=5, depth=10) 82 | 83 | # Create environment 84 | env = ChessEnvironment(vllm, stockfish, max_moves=100, time_limit=30.0) 85 | 86 | print(f"White (your model): VLLMAgent") 87 | print(f"Black (baseline): StockfishAgent (skill=5, depth=10)") 88 | print(f"Max moves: {env.max_moves}") 89 | print(f"Time limit: {env.time_limit}s per move\n") 90 | 91 | # Play game 92 | print("Playing game... (this may take a few minutes)\n") 93 | result = env.play_game(verbose=False) 94 | 95 | # Display results 96 | print("\n" + "="*50) 97 | print("BASELINE TEST RESULTS") 98 | print("="*50) 99 | print(f"Result: {result['result']}") 100 | print(f"Moves played: {result['moves_played']}") 101 | print(f"Game over reason: {result['game_over_reason']}") 102 | 103 | # Interpret result 104 | if result['result'] == '1-0': 105 | print("\n✓ Your model won! Strong performance against skill 5.") 106 | elif result['result'] == '0-1': 107 | print("\n✗ Your model lost. 
Consider:") 108 | print(" - Reviewing model outputs (run with verbose=True)") 109 | print(" - Testing against weaker baseline (skill 1-3)") 110 | print(" - Checking illegal move rate") 111 | else: 112 | print("\n⚡ Draw. Your model competed well!") 113 | 114 | print(f"\nWhite illegal attempts: {result['white_illegal_attempts']}") 115 | print(f"Black illegal attempts: {result['black_illegal_attempts']}") 116 | 117 | # Export to PGN 118 | pgn_file = "baseline_test.pgn" 119 | if env.export_pgn_file(pgn_file): 120 | print(f"\n✓ Game exported to {pgn_file}") 121 | 122 | return result 123 | 124 | 125 | def multiple_baseline_tests(): 126 | """Run multiple games against Stockfish to estimate strength.""" 127 | print("\n=== Multiple Baseline Tests ===\n") 128 | 129 | num_games = 3 130 | skill_level = 5 131 | 132 | print(f"Running {num_games} games vs Stockfish (skill={skill_level})\n") 133 | 134 | # Track results 135 | wins = 0 136 | losses = 0 137 | draws = 0 138 | total_moves = 0 139 | illegal_attempts = 0 140 | 141 | for i in range(num_games): 142 | print(f"Game {i+1}/{num_games}...", end=" ", flush=True) 143 | 144 | # Create agents 145 | vllm = VLLMAgent() 146 | stockfish = StockfishAgent(skill_level=skill_level, depth=10) 147 | 148 | # Alternate colors 149 | if i % 2 == 0: 150 | env = ChessEnvironment(vllm, stockfish, max_moves=100, time_limit=30.0) 151 | white_is_vllm = True 152 | else: 153 | env = ChessEnvironment(stockfish, vllm, max_moves=100, time_limit=30.0) 154 | white_is_vllm = False 155 | 156 | # Play game 157 | result = env.play_game(verbose=False) 158 | 159 | # Update statistics 160 | total_moves += result['moves_played'] 161 | 162 | if white_is_vllm: 163 | illegal_attempts += result['white_illegal_attempts'] 164 | if result['result'] == '1-0': 165 | wins += 1 166 | elif result['result'] == '0-1': 167 | losses += 1 168 | else: 169 | draws += 1 170 | else: 171 | illegal_attempts += result['black_illegal_attempts'] 172 | if result['result'] == '0-1': 173 | wins += 1 174 | elif result['result'] == '1-0': 175 | losses += 1 176 | else: 177 | draws += 1 178 | 179 | print(f"Result: {result['result']} ({result['moves_played']} moves)") 180 | 181 | # Display summary 182 | print("\n" + "="*50) 183 | print("BASELINE TEST SUMMARY") 184 | print("="*50) 185 | print(f"Games played: {num_games}") 186 | print(f"Wins: {wins} ({wins/num_games*100:.1f}%)") 187 | print(f"Losses: {losses} ({losses/num_games*100:.1f}%)") 188 | print(f"Draws: {draws} ({draws/num_games*100:.1f}%)") 189 | print(f"Average moves per game: {total_moves/num_games:.1f}") 190 | print(f"Total illegal attempts: {illegal_attempts}") 191 | print(f"Illegal move rate: {illegal_attempts/(total_moves or 1)*100:.2f}%") 192 | 193 | # Interpretation 194 | print("\n" + "="*50) 195 | print("INTERPRETATION") 196 | print("="*50) 197 | 198 | win_rate = wins / num_games * 100 199 | if win_rate >= 60: 200 | print("✓ Excellent performance! Your model is strong.") 201 | print(" Try testing against skill 10 baseline.") 202 | elif win_rate >= 40: 203 | print("⚡ Good performance. Your model is competitive.") 204 | print(" Continue optimizing and test more games.") 205 | else: 206 | print("✗ Needs improvement. 
Consider:") 207 | print(" - Test against weaker baseline (skill 1-3)") 208 | print(" - Review model outputs for invalid moves") 209 | print(" - Check prompt template effectiveness") 210 | 211 | illegal_rate = illegal_attempts / (total_moves or 1) * 100 212 | if illegal_rate > 5: 213 | print(f"\n⚠ High illegal move rate ({illegal_rate:.1f}%)") 214 | print(" - Check model output format (UCI notation required)") 215 | print(" - Verify legal moves are provided in prompt") 216 | print(" - Consider increasing max_tokens for reasoning") 217 | 218 | 219 | def demonstrate_custom_configuration(): 220 | """Demonstrate custom agent configuration.""" 221 | print("\n=== Custom Agent Configuration ===\n") 222 | 223 | # Custom vLLM configuration 224 | custom_vllm = VLLMAgent( 225 | base_url="http://localhost:8000/v1", 226 | model="Qwen3-chess", 227 | temperature=0.05, # Lower for more deterministic play 228 | max_tokens=30, # Fewer tokens for faster inference 229 | timeout=60.0 # Longer timeout for complex positions 230 | ) 231 | 232 | print("Custom VLLMAgent configuration:") 233 | print(f" Base URL: {custom_vllm.base_url}") 234 | print(f" Model: {custom_vllm.model}") 235 | print(f" Temperature: {custom_vllm.temperature}") 236 | print(f" Max tokens: {custom_vllm.max_tokens}") 237 | print(f" Timeout: {custom_vllm.timeout}s") 238 | 239 | # Custom Stockfish configuration 240 | custom_stockfish = StockfishAgent( 241 | skill_level=3, # Weaker for testing 242 | depth=5, # Shallower search 243 | time_limit_ms=500 # Faster moves 244 | ) 245 | 246 | print("\nCustom StockfishAgent configuration:") 247 | print(f" Skill level: {custom_stockfish.skill_level}") 248 | print(f" Depth: {custom_stockfish.depth}") 249 | print(f" Time limit: {custom_stockfish.time_limit_ms}ms") 250 | 251 | # Custom environment 252 | custom_env = ChessEnvironment( 253 | custom_vllm, 254 | custom_stockfish, 255 | max_moves=50, # Shorter games for testing 256 | time_limit=15.0 # Faster games 257 | ) 258 | 259 | print("\nCustom Environment configuration:") 260 | print(f" Max moves: {custom_env.max_moves}") 261 | print(f" Time limit per move: {custom_env.time_limit}s") 262 | 263 | 264 | def debugging_example(): 265 | """Demonstrate debugging techniques.""" 266 | print("\n=== Debugging Example ===\n") 267 | 268 | print("Debugging tips:") 269 | print("\n1. Test vLLM connection:") 270 | print(" python -c \"from agents import VLLMAgent; print(VLLMAgent().test_connection())\"") 271 | 272 | print("\n2. Run game with verbose output:") 273 | print(" env.play_game(verbose=True)") 274 | 275 | print("\n3. Check server logs:") 276 | print(" tail -f vllm-server/vllm-server.log") 277 | 278 | print("\n4. Inspect game results:") 279 | print(" result = env.play_game()") 280 | print(" print(result['white_illegal_attempts'])") 281 | print(" print(result['black_illegal_attempts'])") 282 | 283 | print("\n5. Export and analyze PGN:") 284 | print(" env.export_pgn_file('debug_game.pgn')") 285 | print(" # Open in Lichess, Chess.com, or other analysis tool") 286 | 287 | print("\n6. Check Neuron runtime:") 288 | print(" neuron-ls") 289 | print(" neuron-top") 290 | 291 | print("\n7. 
Verify model output format:") 292 | print(" # Model should output: e2e4") 293 | print(" # UCI notation: source + destination (e.g., e2e4, g1f3)") 294 | 295 | 296 | def main(): 297 | """Main demonstration program.""" 298 | print("="*60) 299 | print("AWS Trainium Chess Workshop - Example Code") 300 | print("="*60) 301 | 302 | # Test vLLM connection first 303 | if not test_vllm_connection(): 304 | print("\n⚠ vLLM server is not accessible. Please start the server first.") 305 | print("\nTo start vLLM server:") 306 | print(" cd vllm-server") 307 | print(" bash vllm.sh") 308 | sys.exit(1) 309 | 310 | print("\n" + "="*60) 311 | print("Choose an example to run:") 312 | print("="*60) 313 | print("1. Quick test game (vLLM vs Random)") 314 | print("2. Test against Stockfish baseline") 315 | print("3. Multiple baseline tests (3 games)") 316 | print("4. Show custom configuration examples") 317 | print("5. Show debugging tips") 318 | print("6. Run all examples") 319 | print("0. Exit") 320 | 321 | try: 322 | choice = input("\nEnter choice (0-6): ").strip() 323 | except (KeyboardInterrupt, EOFError): 324 | print("\n\nExiting...") 325 | sys.exit(0) 326 | 327 | if choice == '1': 328 | quick_test_game() 329 | elif choice == '2': 330 | test_against_stockfish() 331 | elif choice == '3': 332 | multiple_baseline_tests() 333 | elif choice == '4': 334 | demonstrate_custom_configuration() 335 | elif choice == '5': 336 | debugging_example() 337 | elif choice == '6': 338 | quick_test_game() 339 | test_against_stockfish() 340 | multiple_baseline_tests() 341 | demonstrate_custom_configuration() 342 | debugging_example() 343 | elif choice == '0': 344 | print("Exiting...") 345 | sys.exit(0) 346 | else: 347 | print("Invalid choice. Please run again.") 348 | sys.exit(1) 349 | 350 | print("\n" + "="*60) 351 | print("For tournament mode and advanced features:") 352 | print(" python run_game.py --help") 353 | print("\nFor documentation:") 354 | print(" See README.md and docs/WORKSHOP_GUIDE.md") 355 | print("="*60) 356 | 357 | 358 | if __name__ == "__main__": 359 | main() 360 | -------------------------------------------------------------------------------- /labs/NKI/Lab_Four_NKI_Profiling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Neuron Profile \n", 8 | "\n", 9 | "This workshop was borrowed from the AWS NKI Workshop. To find the full original content, see here:\n", 10 | "- Workshop: https://catalog.us-east-1.prod.workshops.aws/workshops/0d84c975-7a94-469a-b6bc-661768d303f7/en-US/lab-0\n", 11 | "- Github: https://github.com/aws-samples/ml-specialized-hardware/tree/main/workshops/03_NKIWorkshop\n", 12 | "\n", 13 | "In this tutorial, we use Neuron Profile to view the execution trace of a NKI kernel captured on a NeuronCore. In doing so, we learn about:\n", 14 | "\n", 15 | "- Installation and usage of Neuron Profile.\n", 16 | "\n", 17 | "- Inspecting a detailed execution timeline of compute engine instructions and DMA engine activities generated from your NKI kernel.\n", 18 | "\n", 19 | "As background, [Neuron Profile](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/tools/neuron-sys-tools/neuron-profile-user-guide.html) is the tool you need to visualize where time is being spent during kernel execution on NeuronDevices, which is crucial for identifying performance bottlenecks and opportunities of your kernel. 
Neuron Profile produces runtime execution data for every instruction executed on each compute engine and also every data movement activity completed by DMA engines. Neuron Profile also reports key performance metrics such as compute engine and memory bandwidth utilization, which allows developers to quickly find out the achieved hardware efficiency of their kernel. Profiling typically has near zero overhead thanks to the dedicated on-chip profiling hardware in NeuronDevices.\n", 20 | "\n", 21 | "## Profile a NKI Kernel\n", 22 | "\n", 23 | "### Install Neuron Profile\n", 24 | "Make sure you have the latest version of the `aws-neuronx-tools`, which includes updated profiling support for NKI kernels. Neuron Profile is included within this package and is installed to `/opt/aws/neuron/bin`.\n", 25 | "\n", 26 | "The `aws-neuronx-tools` package comes pre-installed on [Neuron DLAMIs](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/dlami/index.html). For detailed installation instructions see [Neuron Profile User Guide: Installation](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/tools/neuron-sys-tools/neuron-profile-user-guide.html#installation).\n", 27 | "\n", 28 | "### Profile using `neuron-profile capture`\n", 29 | "\n", 30 | "To profile a NKI kernel the required steps are (1) enable `NEURON_FRAMEWORK_DEBUG` to tell the compiler to save the `NEFF` file, (2) execute the NKI kernel to generate the `NEFF`, and (3) run `neuron-profile capture` to generate a `NTFF` profile. Each step is described in more detail below.\n", 31 | "\n", 32 | "We will profile a NKI kernel which computes the element-wise exponential of an input tensor of any 2D shape. The rest of this tutorial will use a performance profile generated from this kernel as an example. Full code of `prof-kernel.py`:" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "%%writefile prof-kernel.py\n", 42 | "\"\"\"\n", 43 | "Example kernel used to demmonstrate Neuron Profile.\n", 44 | "\"\"\"\n", 45 | "import torch\n", 46 | "from neuronxcc import nki\n", 47 | "import neuronxcc.nki.language as nl\n", 48 | "import math\n", 49 | "import os\n", 50 | "os.environ[\"NEURON_FRAMEWORK_DEBUG\"] = \"1\"\n", 51 | "os.environ[\"NEURON_CC_FLAGS\"]= \" --disable-dge \"\n", 52 | "\n", 53 | "@nki.jit\n", 54 | "def tensor_exp_kernel_(in_tensor):\n", 55 | " \"\"\"NKI kernel to compute elementwise exponential of an input tensor\n", 56 | "\n", 57 | " Args:\n", 58 | " in_tensor: an input tensor of ANY 2D shape (up to SBUF size)\n", 59 | " Returns:\n", 60 | " out_tensor: an output tensor of ANY 2D shape (up to SBUF size)\n", 61 | " \"\"\"\n", 62 | " out_tensor = nl.ndarray(in_tensor.shape, dtype=in_tensor.dtype,\n", 63 | " buffer=nl.shared_hbm)\n", 64 | "\n", 65 | " sz_p, sz_f = in_tensor.shape\n", 66 | "\n", 67 | " i_f = nl.arange(sz_f)[None, :]\n", 68 | "\n", 69 | " for p in nl.affine_range(math.ceil(sz_p / nl.tile_size.pmax)):\n", 70 | " # Generate tensor indices for the input/output tensors\n", 71 | " # pad index to pmax, for simplicity\n", 72 | " i_p = p * nl.tile_size.pmax + nl.arange(nl.tile_size.pmax)[:, None]\n", 73 | "\n", 74 | " # Load input data from external memory to on-chip memory\n", 75 | " # only read up to sz_p\n", 76 | " in_tile = nl.load(in_tensor[i_p, i_f], mask=(i_p\n", 122 | "Use the flag `--disable-dge` to temporarily disable a new compiler feature which is interfering with DMA debugging information display in neuron-profile. 
This is highly recommended to improve the NKI performance debugging experience until we release a software fix for this issue.\n", 123 | "\n", 124 | "\n", 125 | "2. Compile your NKI kernel to create a NEFF in your current directory:" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "!python3 prof-kernel.py" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "
\n", 142 | "Find your NEFF named similarly to `MODULE_0_SyncTensorsGraph.13_12659246067793504316.neff`.\n", 143 | "
\n", 144 | "\n", 145 | "3. Profile the NEFF. This profiling step executes the NEFF on the NeuronDevice and records a raw execution trace into an Neuron Trace File Format (NTFF) artifact." 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "!neuron-profile capture -n -s profile.ntff --profile-nth-exec=2" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "This will save your NTFF profile to `profile_exec_2.ntff`.\n", 162 | "\n", 163 | "
\n", 164 | "The `--profile-nth-exec=2` option will profile your NEFF twice on the NeuronDevice and output a NTFF profile for the second iteration. This is recommended to avoid one-time warmup delays which can be seen in the first iteration of execution.\n", 165 | "
\n", 166 | "\n", 167 | "In [View Neuron Profile UI](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/neuron_profile_for_nki.html#nki-view-neuron-profile-ui), we will view the profile in a user-friendly format using the Neuron Profile UI.\n", 168 | "\n", 169 | "### Profile using nki.benchmark\n", 170 | "\n", 171 | "You may also use the [nki.benchmark](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/api/generated/nki.benchmark.html) API to generate a NEFF and NTFF programmatically. One caveat is [nki.benchmark](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/api/generated/nki.benchmark.html) runs your NEFF without an ML framework in [nki.baremetal](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/api/generated/nki.baremetal.html) mode, so the input tensors to the kernel must be NumPy arrays instead of framework tensors such as `torch.Tensor`.\n", 172 | "\n", 173 | "Below is an example NKI kernel decorated by [nki.benchmark](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/api/generated/nki.benchmark.html). Full code of `prof-kernel-benchmark.py`:" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "%%writefile prof-kernel-benchmark.py\n", 183 | "\"\"\"\n", 184 | "Example kernel used to demmonstrate Neuron Profile with nki.benchmark.\n", 185 | "\"\"\"\n", 186 | "from neuronxcc import nki\n", 187 | "from neuronxcc.nki.typing import tensor\n", 188 | "import neuronxcc.nki.language as nl\n", 189 | "import math\n", 190 | "\n", 191 | "\n", 192 | "@nki.benchmark(save_neff_name='file.neff', save_trace_name='profile.ntff')\n", 193 | "def tensor_exp_kernel_(in_tensor):\n", 194 | " \"\"\"NKI kernel to compute elementwise exponential of an input tensor\n", 195 | " Args:\n", 196 | " in_tensor: an input tensor of ANY 2D shape (up to SBUF size)\n", 197 | " Returns:\n", 198 | " out_tensor: an output tensor of ANY 2D shape (up to SBUF size)\n", 199 | " \"\"\"\n", 200 | " out_tensor = nl.ndarray(in_tensor.shape, dtype=in_tensor.dtype,\n", 201 | " buffer=nl.shared_hbm)\n", 202 | "\n", 203 | " sz_p, sz_f = in_tensor.shape\n", 204 | " i_f = nl.arange(sz_f)[None, :]\n", 205 | " for p in nl.affine_range(math.ceil(sz_p / nl.tile_size.pmax)):\n", 206 | " # Generate tensor indices for the input/output tensors\n", 207 | " # pad index to pmax, for simplicity\n", 208 | " i_p = p * nl.tile_size.pmax + nl.arange(nl.tile_size.pmax)[:, None]\n", 209 | " # Load input data from external memory to on-chip memory\n", 210 | " # only read up to sz_p\n", 211 | " in_tile = nl.load(in_tensor[i_p, i_f], mask=(i_p` tags)\n", 26 | "- Best move selection (in `` tags)\n", 27 | "\n", 28 | "**Dataset example:**\n", 29 | "\n", 30 | "*Position (FEN):* `rnbq1rk1/ppp1bpp1/4pn1p/3p4/2PP4/2N1PN2/PP1B1PPP/R2QKB1R b KQ - 0 7`\n", 31 | "\n", 32 | "*Legal moves:* `['g8h8', 'g8h7', 'f8e8', 'd8e8', 'c7c5', 'b7b6', 'a7a6', ...]`\n", 33 | "\n", 34 | "*Expert analysis:* \n", 35 | "```\n", 36 | "\n", 37 | "After Pawn moves to c5, this causes Black to attacks the pawn on d4. So c5 is the most logical. 
Position is drawish.\n", 38 | "\n", 39 | "\n", 40 | "c7c5\n", 41 | "```\n", 42 | "\n", 43 | "By fine-tuning the model over several thousand of these chess examples, the model will learn to analyze positions and generate both reasoning and optimal moves.\n", 44 | "\n", 45 | "This chess move prediction use case was selected so you can successfully fine-tune your model in a reasonably short amount of time (~25 minutes) which is appropriate for this workshop. The same techniques can be applied to more complex reasoning tasks such as strategic game playing, multi-step planning, and expert decision-making.\n", 46 | "\n", 47 | "## Install requirements\n", 48 | "This notebook uses [Hugging Face Optimum Neuron](https://github.com/huggingface/optimum-neuron) which works like an interface between the Hugging Face Transformers library and AWS Accelerators including AWS Trainium and AWS Inferentia. We will also install some other libraries like peft, trl etc. You may see some errors from the pip dependency resolver. This is expected. \n" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "%cd /home/ubuntu/environment/FineTuning/HuggingFaceExample/01_finetuning/assets\n", 58 | "%pip install -q -r requirements.txt\n" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "# Fine-tuning\n", 66 | "\n", 67 | "In this section, we fine-tune the Qwen3-1.7B model on the chess move prediction task using Hugging Face Optimum Neuron. Here are the parameters we are going to pass - \n", 68 | "\n", 69 | "1. `--nnodes`:\tNumber of nodes (1 = single node)\n", 70 | "2. `--nproc_per_node`: \tProcesses per node (usually equals number of devices).\n", 71 | "3. `--model_id, --tokenizer_id`:\tModel and tokenizer identifiers (from Hugging Face or local path).\n", 72 | "4. `--output_dir`:\tDirectory for saving checkpoints and logs.\n", 73 | "5. `--bf16`:\tEnables bfloat16 precision for faster, memory-efficient training.\n", 74 | "6. `--gradient_checkpointing`:\tSaves memory by recomputing activations during backprop.\n", 75 | "7. `--gradient_accumulation_steps`:\tSteps to accumulate gradients before optimizer update.\n", 76 | "8. `--learning_rate`:\tInitial training learning rate.\n", 77 | "9. `--max_steps`:\tTotal number of training steps.\n", 78 | "10. `--per_device_train_batch_size`:\tBatch size per device.\n", 79 | "11. `--tensor_parallel_size`:\tNumber of devices for tensor parallelism.\n", 80 | "12. `--lora_r, --lora_alpha, --lora_dropout`:\tLoRA hyperparameters — rank, scaling, and dropout rate.\n", 81 | "13. `--dataloader_drop_last`:\tDrops last incomplete batch.\n", 82 | "14. `--disable_tqdm`: Disables progress bar.\n", 83 | "15. 
`--logging_steps`:\tLog interval (in steps).\n" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "!torchrun \\\n", 93 | " --nnodes 1 \\\n", 94 | " --nproc_per_node 2 \\\n", 95 | " finetune_chess_model.py \\\n", 96 | " --model_id Qwen/Qwen3-1.7B \\\n", 97 | " --tokenizer_id Qwen/Qwen3-1.7B \\\n", 98 | " --output_dir ~/environment/ml/qwen-chess \\\n", 99 | " --bf16 True \\\n", 100 | " --gradient_checkpointing True \\\n", 101 | " --gradient_accumulation_steps 1 \\\n", 102 | " --learning_rate 5e-5 \\\n", 103 | " --max_steps 1000 \\\n", 104 | " --per_device_train_batch_size 2 \\\n", 105 | " --tensor_parallel_size 2 \\\n", 106 | " --lora_r 16 \\\n", 107 | " --lora_alpha 32 \\\n", 108 | " --lora_dropout 0.05 \\\n", 109 | " --dataloader_drop_last True \\\n", 110 | " --disable_tqdm True \\\n", 111 | " --logging_steps 10\n" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "# Compilation\n", 119 | "\n", 120 | "After completing the fine-tuning process, the next step is to compile the trained model for AWS Trainium inference using the Hugging Face Optimum Neuron toolchain.\n", 121 | "Neuron compilation optimizes the model graph and converts it into a Neuron Executable File Format (NEFF), enabling efficient execution on NeuronCores.\n" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "!optimum-cli export neuron \\\n", 131 | " --model /home/ubuntu/environment/ml/qwen-chess/merged_model \\\n", 132 | " --task text-generation \\\n", 133 | " --sequence_length 2048 \\\n", 134 | " --batch_size 4 \\\n", 135 | " /home/ubuntu/environment/ml/qwen-chess/compiled_model\n" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "# Inference\n", 143 | "\n", 144 | "We will install the Optimum Neuron vllm library. Then, run inference using the compiled model.\n" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "%pip install optimum-neuron[vllm]\n" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "import os\n", 163 | "from vllm import LLM, SamplingParams\n", 164 | "\n", 165 | "llm = LLM(\n", 166 | " model=\"/home/ubuntu/environment/ml/qwen-chess/compiled_model\", #local compiled model\n", 167 | " max_num_seqs=4,\n", 168 | " max_model_len=2048,\n", 169 | " device=\"neuron\",\n", 170 | " tensor_parallel_size=2,\n", 171 | " override_neuron_config={})\n", 172 | "\n", 173 | "example1=\"\"\"\n", 174 | "<|im_start|>user\n", 175 | "You are an expert chess player looking at the following position in FEN format:\n", 176 | "\n", 177 | "rnbq1rk1/ppp1bpp1/4pn1p/3p4/2PP4/2N1PN2/PP1B1PPP/R2QKB1R b KQ - 0 7\n", 178 | "\n", 179 | "Briefly, FEN describes chess pieces by single letters [PNBRKQ] for white and [pnbrkq] for black. 
The pieces found in each rank are specified, starting at the top of the board (a8..h8) and describing all eight ranks.\n", 180 | "\n", 181 | "Here is an additional visualization of the board (♔♕♖♗♘♙ = White pieces, ♚♛♜♝♞♟ = Black pieces):\n", 182 | "\n", 183 | "a b c d e f g h\n", 184 | "+---------------+\n", 185 | "8 | ♜ ♞ ♝ ♛ · ♜ ♚ · | 8\n", 186 | "7 | ♟ ♟ ♟ · ♝ ♟ ♟ · | 7\n", 187 | "6 | · · · · ♟ ♞ · ♟ | 6\n", 188 | "5 | · · · ♟ · · · · | 5\n", 189 | "4 | · · ♙ ♙ · · · · | 4\n", 190 | "3 | · · ♘ · ♙ ♘ · · | 3\n", 191 | "2 | ♙ ♙ · ♗ · ♙ ♙ ♙ | 2\n", 192 | "1 | ♖ · · ♕ ♔ ♗ · ♖ | 1\n", 193 | "+---------------+\n", 194 | "a b c d e f g h\n", 195 | "\n", 196 | "The current side to move is black.\n", 197 | "The possible legal moves for the side to move are: ['g8h8', 'g8h7', 'f8e8', 'd8e8', 'c7c5', 'b7b6', 'a7a6', 'h6h5', 'e6e5', 'g7g5', 'c7c6', 'b7b5', 'a7a5'].\n", 198 | "\n", 199 | "Your task is to select the best move for the side to move. Output your thinking in tags and the move in tags.<|im_end|>\n", 200 | "<|im_start|>assistant\n", 201 | "\"\"\"\n", 202 | "\n", 203 | "example2=\"\"\"\n", 204 | "<|im_start|>user\n", 205 | "You are an expert chess player. Analyze this position in FEN format:\n", 206 | "\n", 207 | "r1bqkbnr/pppp1ppp/2n5/4p3/4P3/5N2/PPPP1PPP/RNBQKB1R w KQkq - 2 3\n", 208 | "\n", 209 | "Here is the board visualization:\n", 210 | "\n", 211 | "a b c d e f g h\n", 212 | "+---------------+\n", 213 | "8 | ♜ · ♝ ♛ ♚ ♝ ♞ ♜ | 8\n", 214 | "7 | ♟ ♟ ♟ ♟ · ♟ ♟ ♟ | 7\n", 215 | "6 | · · ♞ · · · · · | 6\n", 216 | "5 | · · · · ♟ · · · | 5\n", 217 | "4 | · · · · ♙ · · · | 4\n", 218 | "3 | · · · · · ♘ · · | 3\n", 219 | "2 | ♙ ♙ ♙ ♙ · ♙ ♙ ♙ | 2\n", 220 | "1 | ♖ ♘ ♗ ♕ ♔ ♗ · ♖ | 1\n", 221 | "+---------------+\n", 222 | "a b c d e f g h\n", 223 | "\n", 224 | "The current side to move is white.\n", 225 | "Select the best move from: ['d2d4', 'f1c4', 'f1b5', 'b1c3', 'd2d3']\n", 226 | "\n", 227 | "Output your analysis in tags and your move choice in tags.<|im_end|>\n", 228 | "<|im_start|>assistant\n", 229 | "\"\"\"\n", 230 | "\n", 231 | "example3=\"\"\"\n", 232 | "<|im_start|>user\n", 233 | "Analyze this chess position in FEN format:\n", 234 | "\n", 235 | "r2qkb1r/ppp2ppp/2n2n2/3pp1B1/1b1PP3/2N2N2/PPP2PPP/R2QKB1R w KQkq - 0 6\n", 236 | "\n", 237 | "Board visualization:\n", 238 | "\n", 239 | "a b c d e f g h\n", 240 | "+---------------+\n", 241 | "8 | ♜ · · ♛ ♚ ♝ · ♜ | 8\n", 242 | "7 | ♟ ♟ ♟ · · ♟ ♟ ♟ | 7\n", 243 | "6 | · · ♞ · · ♞ · · | 6\n", 244 | "5 | · · · ♟ ♟ · ♗ · | 5\n", 245 | "4 | · ♝ · ♙ ♙ · · · | 4\n", 246 | "3 | · · ♘ · · ♘ · · | 3\n", 247 | "2 | ♙ ♙ ♙ · · ♙ ♙ ♙ | 2\n", 248 | "1 | ♖ · · ♕ ♔ ♗ · ♖ | 1\n", 249 | "+---------------+\n", 250 | "a b c d e f g h\n", 251 | "\n", 252 | "White to move. 
Legal moves: ['g5f6', 'g5e7', 'g5h6', 'g5d2', 'c3b5', 'c3d5', 'f3d4', 'f3e5', 'd1d3', 'd1d2', 'e1d2']\n", 253 | "\n", 254 | "Provide your reasoning and best move.<|im_end|>\n", 255 | "<|im_start|>assistant\n", 256 | "\"\"\"\n", 257 | "\n", 258 | "prompts = [\n", 259 | " example1,\n", 260 | " example2,\n", 261 | " example3\n", 262 | "]\n", 263 | "\n", 264 | "sampling_params = SamplingParams(max_tokens=2048, temperature=0.8)\n", 265 | "outputs = llm.generate(prompts, sampling_params)\n", 266 | "\n", 267 | "print(\"#########################################################\")\n", 268 | "\n", 269 | "for output in outputs:\n", 270 | " prompt = output.prompt\n", 271 | " generated_text = output.outputs[0].text\n", 272 | " print(f\"Prompt: {prompt!r}, \\n\\n Generated text: {generated_text!r} \\n\")\n" 273 | ] 274 | } 275 | ], 276 | "metadata": { 277 | "kernelspec": { 278 | "display_name": "aws_neuronx_venv_pytorch_latest", 279 | "language": "python", 280 | "name": "python3" 281 | }, 282 | "language_info": { 283 | "codemirror_mode": { 284 | "name": "ipython", 285 | "version": 3 286 | }, 287 | "file_extension": ".py", 288 | "mimetype": "text/x-python", 289 | "name": "python", 290 | "nbconvert_exporter": "python", 291 | "pygments_lexer": "ipython3", 292 | "version": "3.10.12" 293 | } 294 | }, 295 | "nbformat": 4, 296 | "nbformat_minor": 2 297 | } 298 | -------------------------------------------------------------------------------- /labs/NKI/Lab_Two_NKI.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "d6b1e73f-dc2c-4d66-b3ba-4fb71b5243c8", 6 | "metadata": {}, 7 | "source": [ 8 | "# Write your own kernel with the Neuron Kernel Interface (NKI)\n", 9 | "In this notebook you'll learn how to develop your own kernel with [NKI](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/index.html). A kernel is a set of user-defined functions that are executed largely as defined by the user, not by the compiler. With NKI you can write your own functions to define any operations you like, using supported APIs, and execute them on Trainium and Inferentia hardware. You have the control and lower-level access to define the data movement, computational patterns, and physical execution for the mathematics of your algorithms with NKI.\n", 10 | "\n", 11 | "The structure of the notebook is as follows:\n", 12 | "1. Brief introduction to the NeuronCore and the NKI programming model\n", 13 | "2. Your first NKI kernel - tensor addition\n", 14 | "3. Your second NKI kernel - matrix multiplication\n", 15 | "\n", 16 | "Wrap up and next steps." 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "54d843c8-b824-4896-ad23-1098dd859872", 22 | "metadata": {}, 23 | "source": [ 24 | "### 1. Introduction to the NeuronCore and NKI programming model\n", 25 | "The NeuronCore is the main acceleration unit within AWS AI chips Trainium and Inferentia. As you can see in the image below, it is composed of 4 compute engines. These engines are based on a systollic array architecture. The compute engines are fed data from the primary on-chip memory cache, SBUF. Data is moved from the HBM banks to SBUF when you call `nl.load`. You'll index into your tensors to create lower-level objects, called `tiles`. A tile is the result of `nl.load`. Once you've defined `tiles`, you can send them to various NKI mathematical APIS such as `add`, `subtract`, `matmul`, etc. 
The result of these operations are stored on the secondary on-chip memory cache, PSUM. After moving the data back to SBUF, you can then send it back to HBM with `nl.store`.\n", 26 | "\n", 27 | "" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "id": "37fc461d-1067-4b96-95a5-3da49e15723f", 33 | "metadata": {}, 34 | "source": [ 35 | "Trainium1 chips feature two NeuronCore-v2 acceleration units, 2 HBM banks, NeuronLink-v2 chip-to-chip connect, host PCIE, and dedicated engines for both data movement and collective communications. Trainium1 offers 32 GiB of device memory (sum of all 4 HBM banks), with 840 GiB/sec of bandwidth. Trainium1 instances feature 16 Trainium chips, providing a total of up to 3 petaflops of FP16 compute and 512 accelerator memory capacity. For more architectural details, see our docs [here](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/trainium.html#trainium-arch). \n", 36 | "\n", 37 | "\n", 38 | "The on-chip memory cache, SBUF, **has ~20x higher memory bandwidth than HBM**. The purpose of your kernel is to exploit as much of that compute acceleration as you can within the context of your model and workload." 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "id": "0fffc189-2875-4a42-a14a-de4ea122d2ef", 44 | "metadata": {}, 45 | "source": [ 46 | "#### Structuring data and tensors for NKI\n", 47 | "\n", 48 | "To easily move data and design our kernels on NKI, we'll want to exploit the 128 partitions built into SBUF as shown in the image below. In particular, SBUF has 128 partition lanes. Each of these lanes can execute programs in parallel on the engines. As much as possible, we'll want to align the tensors and data structures in our algorithms to follow this physical design. The benefit is that our kernels will run faster and be easier to develop!\n", 49 | "\n", 50 | "Your data movement from HBM to SBUF should be very carefully aligned with this 128-lane partition dimension, also called p-dim. Each tile needs a precise definition along the p-dim. Your second dimension is called the free dimension, or f-dim. As the name goes, this dimension is much more flexible than p-dim. Though it may surprise you, it's better not to fully saturate sbuf with extremely large tiles. This is so that the compiler can overlap data movement and collectives with compute, giving you better overall compute utilization and performance." 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "id": "0c786fce-ac8e-4549-8bf1-edaee7512211", 56 | "metadata": {}, 57 | "source": [ 58 | "" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "id": "03a9fbbe-2fa4-41c0-9cc8-7560fbc7a49f", 64 | "metadata": {}, 65 | "source": [ 66 | "### 2. Your first NKI kernel\n", 67 | "Now that you have some understanding of the compute architecture and motivation for kernels, let's write your first NKI kernel! Importing the `nki` library may take a few moments the first time you've imported it on an instance." 
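For reference, here is a hedged sketch of what the same load, compute, and store pattern looks like once the partition dimension grows beyond a single 128-row tile. We do not run this version in the lab, and the loop, indexing, and masking are assumptions modeled on the tensor-addition and exponential examples in the Neuron NKI tutorials; the kernel we actually build next keeps every input small enough to fit in one tile.

```python
# Hedged sketch only -- not executed in this lab.
# Adds two (P, F) tensors whose partition dimension P may exceed the 128-row
# tile limit by looping over 128-partition tiles and masking the final ragged tile.
import math

import neuronxcc.nki as nki
import neuronxcc.nki.language as nl


@nki.jit
def tiled_tensor_add_kernel_(a_input, b_input):
    c_output = nl.ndarray(a_input.shape, dtype=a_input.dtype, buffer=nl.shared_hbm)

    sz_p, sz_f = a_input.shape
    i_f = nl.arange(sz_f)[None, :]
    for p in nl.affine_range(math.ceil(sz_p / nl.tile_size.pmax)):
        # p-dim indices for this 128-row tile; the mask ignores rows past sz_p
        i_p = p * nl.tile_size.pmax + nl.arange(nl.tile_size.pmax)[:, None]
        a_tile = nl.load(a_input[i_p, i_f], mask=(i_p < sz_p))
        b_tile = nl.load(b_input[i_p, i_f], mask=(i_p < sz_p))
        c_tile = a_tile + b_tile
        nl.store(c_output[i_p, i_f], value=c_tile, mask=(i_p < sz_p))

    return c_output
```

We'll come back to this tiling pattern in the wrap-up at the end of the notebook.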
68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 1, 73 | "id": "2da52760-db72-403a-ade9-d8bebac40de3", 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "import numpy as np\n", 78 | "import neuronxcc.nki as nki\n", 79 | "import neuronxcc.nki.language as nl" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 2, 85 | "id": "b83039ee-1788-478f-809f-f139cb032cce", 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "@nki.jit\n", 90 | "def nki_tensor_add_kernel_(a_input, b_input):\n", 91 | " \n", 92 | " # Create output tensor \n", 93 | " c_output = nl.ndarray(a_input.shape, dtype=a_input.dtype, buffer=nl.shared_hbm)\n", 94 | "\n", 95 | " # Load input data from device memory (HBM) to on-chip memory (SBUF)\n", 96 | " a_tile = nl.load(a_input)\n", 97 | " b_tile = nl.load(b_input)\n", 98 | "\n", 99 | " # compute a + b\n", 100 | " c_tile = a_tile + b_tile\n", 101 | "\n", 102 | " # return the final tensor\n", 103 | " nl.store(c_output, value=c_tile)\n", 104 | "\n", 105 | " # Transfer the ownership of `c_output` to the caller\n", 106 | " return c_output\n" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 3, 112 | "id": "486f0e0a-6af1-4882-afe2-4ce5a1912ddc", 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "NKI and NumPy match\n" 120 | ] 121 | } 122 | ], 123 | "source": [ 124 | "a = np.random.rand(128, 512).astype(np.float16)\n", 125 | "b = np.random.rand(128, 512).astype(np.float16)\n", 126 | "\n", 127 | "output_nki = nki_tensor_add_kernel_(a, b)\n", 128 | "\n", 129 | "output_np = a + b\n", 130 | "\n", 131 | "allclose = np.allclose(output_np, output_nki, atol=1e-4, rtol=1e-2)\n", 132 | "if allclose:\n", 133 | " print(\"NKI and NumPy match\")\n", 134 | "else:\n", 135 | " print(\"NKI and NumPy differ\")\n" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "id": "35f65891-2d62-4af4-aa5d-7620c707f6bd", 141 | "metadata": {}, 142 | "source": [ 143 | "Now let's see if we can do that for matrix multiplication!" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "id": "e8a65cb0-215d-4590-8335-d53c23eef5c1", 149 | "metadata": {}, 150 | "source": [ 151 | "### 3. Your second NKI kernel\n", 152 | "Now, let's try to use PyTorch arrays and pass them to the device with XLA. Then we'll try a matrix multiplication kernel.\n", 153 | "\n", 154 | "If you get any errors, you may need to use a different python environment. Choose the Python 3.9 kernel (or create a new venv) and install these packages:\n", 155 | "\n", 156 | "%pip install neuronx-cc==2.18.121.0+9e31e41a\n", 157 | "\n", 158 | "%pip install torch torch_xla torch_neuronx\n", 159 | "\n" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 4, 165 | "id": "e4e24399-7bae-4db2-b964-b5fdcc93fb32", 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "name": "stderr", 170 | "output_type": "stream", 171 | "text": [ 172 | "WARNING:root:MASTER_ADDR environment variable is not set, defaulting to localhost\n", 173 | "WARNING:root:Found libneuronpjrt.so. 
Setting PJRT_DEVICE=NEURON.\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "import torch\n", 179 | "from torch_xla.core import xla_model as xm\n", 180 | "\n", 181 | "device = xm.xla_device()\n", 182 | "\n", 183 | "lhs_small = torch.rand((64, 128), dtype=torch.bfloat16, device=device)\n", 184 | "rhs_small = torch.rand((128, 512), dtype=torch.bfloat16, device=device)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 5, 190 | "id": "0bc1f344-6e02-4f3a-928a-9f1bccabfb12", 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "@nki.jit\n", 195 | "def nki_matmul_basic_(lhsT, rhs):\n", 196 | " \"\"\"NKI kernel to compute a 64x128x512 matrix multiplication operation\n", 197 | "\n", 198 | " Args:\n", 199 | " lhsT: an input tensor of shape [128,64], a left hand side argument of the\n", 200 | " matrix multiplication, delivered transposed for optimal performance\n", 201 | " rhs: an input tensor of shape [128,512], a right hand side argument of the\n", 202 | " matrix multiplication\n", 203 | " Returns:\n", 204 | " result: the resulting output tensor of shape [64,512]\n", 205 | " \"\"\"\n", 206 | " result = nl.ndarray((64, 512), dtype=lhsT.dtype, buffer=nl.shared_hbm)\n", 207 | "\n", 208 | " # Defining indexes for input LHS.T\n", 209 | " # - Note: here we take LayoutConstraint #1 into account:\n", 210 | " # \"For MatMult, contraction axis must be mapped to P-dim\"\n", 211 | " i_lhsT_p, i_lhsT_f = nl.mgrid[0:128, 0:64]\n", 212 | "\n", 213 | " # Defining indexes for input RHS\n", 214 | " # - Note: here we take LayoutConstraint #1 into account:\n", 215 | " # \"For MatMult, contraction axis must be mapped to P-dim\"\n", 216 | " i_rhs_p, i_rhs_f = nl.mgrid[0:128, 0:512]\n", 217 | "\n", 218 | " # Defining indexes for the output ([64,128]@[128,512] -> [64,512])\n", 219 | " i_out_p, i_out_f = nl.mgrid[0:64, 0:512]\n", 220 | "\n", 221 | " # Loading the inputs (HBM->SBUF)\n", 222 | " # Note: here we take Tile dtype definition into account,\n", 223 | " # which forces P-dim as the left most index\n", 224 | " lhs_tile = nl.load(lhsT[i_lhsT_p, i_lhsT_f])\n", 225 | " rhs_tile = nl.load(rhs[i_rhs_p, i_rhs_f])\n", 226 | "\n", 227 | " # Perform the matrix-multiplication\n", 228 | " # Note1: We set transpose_x to True, to indicate that the LHS input is transposed\n", 229 | " # Note2: A NKI matmul instruction always writes to PSUM in float32 data-type\n", 230 | " result_psum = nl.matmul(lhs_tile, rhs_tile, transpose_x=True)\n", 231 | "\n", 232 | " # Copy the result from PSUM back to SBUF, and cast to expected output data-type\n", 233 | " result_sbuf = nl.copy(result_psum, dtype=result.dtype)\n", 234 | "\n", 235 | " # The result of a [64,128] x [128,512] matrix multiplication has a shape of [64, 512].\n", 236 | " # This dictates which indices to use to address the result tile.\n", 237 | " nl.store(result[i_out_p, i_out_f], value=result_sbuf)\n", 238 | "\n", 239 | " return result" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 6, 245 | "id": "d5b2a228-0a08-42fc-9bd4-81dedba0e4d6", 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "name": "stdout", 250 | "output_type": "stream", 251 | "text": [ 252 | "Checking correctness of nki_matmul_basic\n", 253 | "2025-03-17 22:45:04.000657: 512118 INFO ||NEURON_CC_WRAPPER||: Call compiler with cmd: neuronx-cc compile --framework=XLA /tmp/ec2-user/neuroncc_compile_workdir/58a5f9b5-7dd1-4569-b58f-bae92b1f0d13/model.MODULE_6255296715421101974+e30acd3a.hlo_module.pb --output 
/tmp/ec2-user/neuroncc_compile_workdir/58a5f9b5-7dd1-4569-b58f-bae92b1f0d13/model.MODULE_6255296715421101974+e30acd3a.neff --target=trn1 --verbose=35\n", 254 | ".\n", 255 | "Compiler status PASS\n", 256 | "NKI and Torch match\n" 257 | ] 258 | } 259 | ], 260 | "source": [ 261 | "# Run NKI kernel\n", 262 | "output_small = nki_matmul_basic_(lhs_small.T, rhs_small)\n", 263 | "\n", 264 | "# Run torch reference\n", 265 | "output_small_torch = torch.matmul(lhs_small, rhs_small)\n", 266 | "\n", 267 | "# Compare results\n", 268 | "print(\"Checking correctness of nki_matmul_basic\")\n", 269 | "if torch.allclose(output_small_torch, output_small, atol=1e-4, rtol=1e-2):\n", 270 | " print(\"NKI and Torch match\")\n", 271 | "else:\n", 272 | " print(\"NKI and Torch differ\")" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "id": "801236a2-9d4d-4630-a750-dc42bb2e4514", 278 | "metadata": {}, 279 | "source": [ 280 | "### 4. Wrap up and next steps\n", 281 | "The simplicity you see in the `tensor_add` kernel above is possible because the shapes we pass in are very small. We've intentionally selected them to exactly match the shapes of tiles that NKI supports as maximum dimensions, for both the partition and free dimensions.\n", 282 | "\n", 283 | "As you saw above, the partition dimension has a maximum length of 128. This the most important dimension and shape to embrace in your kernels, because it impacts your ability to load data onto the chip. In order to exploit the parallelism of execution enabled through the 128 lanes on sbuf, you might want to develop into your kernel the ability to extract data in batches of 128 to load onto sbuf. \n", 284 | "\n", 285 | "The second dimension, also known as the free dimension, is more flexible. Once you have clean batches of 128 lanes being loaded onto sbuf, you can build in tiling on the second dimension of much more varying sizes up to 512. \n", 286 | "\n", 287 | "To learn more about tiling, and to step through the rest of the matrix multiplication tutorial, see our docs on the topic [here](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/tutorials/matrix_multiplication.html#)" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "id": "bd810a4b-2365-48a3-ad0f-23f3850ffc71", 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [] 297 | } 298 | ], 299 | "metadata": { 300 | "kernelspec": { 301 | "display_name": "Python 3 (ipykernel)", 302 | "language": "python", 303 | "name": "python3" 304 | }, 305 | "language_info": { 306 | "codemirror_mode": { 307 | "name": "ipython", 308 | "version": 3 309 | }, 310 | "file_extension": ".py", 311 | "mimetype": "text/x-python", 312 | "name": "python", 313 | "nbconvert_exporter": "python", 314 | "pygments_lexer": "ipython3", 315 | "version": "3.9.16" 316 | } 317 | }, 318 | "nbformat": 4, 319 | "nbformat_minor": 5 320 | } 321 | -------------------------------------------------------------------------------- /labs/NxD/Lab_One_NxDI.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "5a972332", 6 | "metadata": {}, 7 | "source": [ 8 | "# Develop support for a new model with NeuronX Distributed Inference\n", 9 | "\n", 10 | "In this notebook you will learn how to develop support for a new model with NeuronX Distributed Inference (NxD). 
NxD is a Python package developed by Annapurna Labs that enables you to shard, compile, train, and host PyTorch models on Trainium and Inferentia instances. We develop two key packages demonstrating how to use this, [NxD Inference](https://github.com/aws-neuron/neuronx-distributed-inference/tree/main) and [NxD Training](https://github.com/aws-neuron/neuronx-distributed-training). This notebook focuses on inference. You will learn how to develop support for a new model in NxD Inference through the context of Llama 3.2, 1B.\n", 11 | "\n", 12 | "#### Overview\n", 13 | "1. Check dependencies for AWS Neuron SDK\n", 14 | "2. Accept the Meta usage terms and download the model from Hugging Face.\n", 15 | "3. Learn how to invoke the model step-by-step\n", 16 | " - Load the model from a local path.\n", 17 | " - Shard and compile it for Trainium.\n", 18 | " - Download and tokenize the dataset\n", 19 | " - Invoke the model with prompts\n", 20 | "4. Learn how to modify the underlying APIs to work with your own models\n", 21 | "\n", 22 | "#### Prerequisites\n", 23 | "This notebook was developed on a trn1.2xlarge instance, using the latest Amazon Linux DLAMI. Both the Amazon Linux and Ubuntu Neuron DLAMI's have preinstalled Python virtual environments with all the basic software packages included. The virtual environment used to develop this notebook is located at this path in both Amazon Linux and Ubuntu DLAMIs: `/opt/aws_neuronx_venv_pytorch_2_5_nxd_inference`. " 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "id": "3652fc5a", 29 | "metadata": {}, 30 | "source": [ 31 | "### Step 1. Import NxD Inference packages\n", 32 | "\n", 33 | "If you are running this notebook in the virtual environment for NxD Inference, then the package should already be installed. Let's verify that with the following import." 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "id": "c4405a13-5431-4d29-a6a6-2eb989fb0f50", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "import neuronx_distributed_inference" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "id": "0d1970fc", 49 | "metadata": {}, 50 | "source": [ 51 | "### Step 2. Accept the Meta usage terms and download the model\n", 52 | "\n", 53 | "If you would like to use the model directly from Meta, you'll need to navigate over to the Hugging Face hub for Llama 3.2 1B [here](https://huggingface.co/meta-llama/Llama-3.2-1B). Log in to the Hub, accept the usage term, and request access to the model. Once access has been granted, copy your Hugging Face token and paste it into the download command below.\n", 54 | "\n", 55 | "If you do not have your token readily available, you can proceed with the alternative model shown below." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "id": "959fb008-a2c8-4505-8f60-42e5b2060b31", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "# helpful packages to speed up the download\n", 66 | "!pip install hf_transfer \"huggingface_hub[cli]\"" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "id": "ccff01a8-94f7-4d10-bdf7-71229ec19cb9", 72 | "metadata": {}, 73 | "source": [ 74 | "We'll download the `NousResearch/Llama3.2-1B` model here." 
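If you were granted access to the gated `meta-llama/Llama-3.2-1B` checkpoint and prefer to use it, a minimal sketch of the token-based download looks like this. The token string is a placeholder for your own Hugging Face token, and the local directory matches the path used in the rest of this notebook.

```python
# Sketch: pull the gated Meta checkpoint with your Hugging Face access token.
# "hf_xxx" is a placeholder -- substitute the token you copied from the Hub.
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="meta-llama/Llama-3.2-1B",
    local_dir="/home/ubuntu/environment/models/llama/",
    token="hf_xxx",
)
```

Otherwise, the ungated mirror in the next cell works without a token.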
75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "id": "75a2e3d1-7c1b-4d9d-b1f5-d294a1381566", 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "!hf download NousResearch/Llama-3.2-1B --local-dir /home/ubuntu/environment/models/llama/" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "id": "02214b8a", 90 | "metadata": {}, 91 | "source": [ 92 | "### Step 3. Establish model configs\n", 93 | "Next, you'll point to the local model files and establish config objects. Each of these configs are helpful in successfully invoking the model." 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "id": "77e54a5f-842f-4b2c-ab79-c0f11a6ef292", 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "# the original checkpoint\n", 104 | "model_path = '/home/ubuntu/environment/models/llama/'" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "id": "094dc24d-dd06-45c8-adec-fa997f02e6d1", 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "# where your NxD trace will go\n", 115 | "traced_model_path = '/home/ubuntu/environment/models/traced_llama'" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "id": "9f72bda4-5e04-442c-b016-f30816db54d4", 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "import torch\n", 126 | "from transformers import AutoTokenizer, GenerationConfig\n", 127 | "\n", 128 | "from neuronx_distributed_inference.models.config import NeuronConfig, OnDeviceSamplingConfig\n", 129 | "from neuronx_distributed_inference.models.llama.modeling_llama import LlamaInferenceConfig, NeuronLlamaForCausalLM\n", 130 | "from neuronx_distributed_inference.utils.hf_adapter import HuggingFaceGenerationAdapter, load_pretrained_config\n", 131 | "from neuronx_distributed_inference.modules.generation.sampling import prepare_sampling_params\n", 132 | "\n", 133 | "# torch.manual_seed(0)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "id": "812403b6", 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "# update the generation config to address a trailing comma\n", 144 | "!cp generation_config.json $model_path/" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "id": "857c6e49-ce3a-47c9-868a-520f0cd68276", 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "# Initialize configs \n", 155 | "generation_config = GenerationConfig.from_pretrained(model_path)\n", 156 | "\n", 157 | "# Some sample overrides for generation\n", 158 | "generation_config_kwargs = {\n", 159 | " \"do_sample\": True,\n", 160 | " \"top_k\": 1,\n", 161 | " \"pad_token_id\": generation_config.eos_token_id,\n", 162 | "}\n", 163 | "generation_config.update(**generation_config_kwargs)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "id": "d196acdb-d094-41c0-9638-9974cec332c4", 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "neuron_config = NeuronConfig(\n", 174 | " tp_degree=2,\n", 175 | " batch_size=2,\n", 176 | " max_context_length=32,\n", 177 | " seq_len=64,\n", 178 | " on_device_sampling_config=OnDeviceSamplingConfig(top_k=1),\n", 179 | " enable_bucketing=True,\n", 180 | " flash_decoding_enabled=False\n", 181 | ")\n", 182 | "\n", 183 | "# Build the Llama Inference config\n", 184 | "config = LlamaInferenceConfig(\n", 185 | " neuron_config,\n", 186 | " 
load_config=load_pretrained_config(model_path),\n", 187 | ")" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "id": "5269bcdd-cf8c-4b10-a428-0cd0fafd83d1", 193 | "metadata": {}, 194 | "source": [ 195 | "### Step 4. Shard and compile the model\n", 196 | "The NeuronX compiler will optimize your model for Trainium hardware, ultimately generating the assembly code that executes your operations. We will invoke that compiler now. Generally, it's suggested to compile for some of the larger input and output shapes for your model, while using bucketing to optimize performance. Both of those are handled for you automatically with NxD.\n", 197 | "\n", 198 | "With NxD, this step also shards your checkpoint for the TP degree that you defined above. Compilation can take some time, for a 1B model this should run for a few minutes." 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "id": "afd1e5d5-a989-40fb-8350-fca737470b19", 205 | "metadata": { 206 | "scrolled": true 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "model = NeuronLlamaForCausalLM(model_path, config)\n", 211 | "model.compile(traced_model_path)" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "id": "38178c7e-0f6e-41ab-9383-2942615b82ed", 217 | "metadata": {}, 218 | "source": [ 219 | "Once compilation is complete your new model is saved and ready to load! " 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "id": "63a37f02-ed94-4c3e-81cc-6d9e23c04175", 225 | "metadata": {}, 226 | "source": [ 227 | "### Step 5. Download the tokenizer" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "id": "0c5e306f-9488-4b0e-8e6a-f238a50f2cfe", 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side=\"right\")\n", 238 | "tokenizer.pad_token = tokenizer.eos_token\n", 239 | "tokenizer.save_pretrained(traced_model_path)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "id": "212cfe39-9e66-4a02-bf21-2560de065a34", 245 | "metadata": {}, 246 | "source": [ 247 | "### Step 6. Load the traced model" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "id": "c945db68-5392-406c-8dd6-9e66b9ab0a63", 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "model = NeuronLlamaForCausalLM(traced_model_path)\n", 258 | "model.load(traced_model_path)\n", 259 | "tokenizer = AutoTokenizer.from_pretrained(traced_model_path)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "id": "75b0f5f3-b12b-4ac4-883c-856604f8d44e", 265 | "metadata": {}, 266 | "source": [ 267 | "### Step 7. Define the prompts and prepare them for sampling" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "id": "f203a455-402b-4ddc-81d3-d4d1b4335c5c", 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "prompts = [\"I believe the meaning of life is\", \"The color of the sky is\"]\n", 278 | "\n", 279 | "# Example: parameter sweeps for sampling\n", 280 | "sampling_params = prepare_sampling_params(batch_size=neuron_config.batch_size,\n", 281 | " top_k=[10, 5],\n", 282 | " top_p=[0.5, 0.9],\n", 283 | " temperature=[0.9, 0.5])\n", 284 | "\n", 285 | "inputs = tokenizer(prompts, padding=True, return_tensors=\"pt\")" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "id": "108f43f8-a2a8-4986-af7c-fdc58a37f3cd", 291 | "metadata": {}, 292 | "source": [ 293 | "### Step 8. 
Create a Generation Adapter and run inference" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "id": "2f511a6f-049c-4a05-bccc-f5cce8071334", 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "generation_model = HuggingFaceGenerationAdapter(model)\n", 304 | "outputs = generation_model.generate(\n", 305 | " inputs.input_ids,\n", 306 | " generation_config=generation_config,\n", 307 | " attention_mask=inputs.attention_mask,\n", 308 | " max_length=model.config.neuron_config.max_length,\n", 309 | " sampling_params=sampling_params,\n", 310 | ")\n", 311 | "output_tokens = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False)\n", 312 | "\n", 313 | "print(\"Generated outputs:\")\n", 314 | "for i, output_token in enumerate(output_tokens):\n", 315 | " print(f\"Output {i}: {output_token}\")\n" 316 | ] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "id": "a5b840fc-dcba-428a-bcf8-c35702d144e0", 321 | "metadata": {}, 322 | "source": [ 323 | "---\n", 324 | "# Develop support for a new model with NxDI\n", 325 | "Now that you've run inference with this model, let's take a closer look at how this works. The cells you just ran are based on a script available in our repository [here](https://github.com/aws-neuron/neuronx-distributed-inference/tree/main). You can step through this repository to understand how the objects are developed, inherited, and made available for inference. The full developer guide on the topic is available [here](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/onboarding-models.html#nxdi-onboarding-models). Let's look at some of the key points!" 326 | ] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "id": "f5ec151f-ce53-4051-a9d0-957654834f51", 331 | "metadata": {}, 332 | "source": [ 333 | "#### 1/ NeuronConfig class\n", 334 | "You can inherit our base `NeuronConfig` class and extend it with your own model parameters. In the notebook you just ran, this is how we defined the following parameters:\n", 335 | "- Tensor Parallel (TP) Degree\n", 336 | "- Batch size\n", 337 | "- Max context length (input shape)\n", 338 | "- Sequence length (output shape)\n", 339 | "- On device sampling\n", 340 | "- Enabling bucketing\n", 341 | "- Flash decoding\n", 342 | "\n", 343 | "\n", 344 | "This object and these parameters will be sent to the compiler when you call `model.compile`. It's a helpful way to ensure that the compiler registers your design choices so that it can start optimizations. It also enables the model sharing with NxDI for your preferred TP degree, which lets you very quickly test a variety of TP degrees (TP=8, 32, 64, etc.)." 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "id": "ac98eb22-c02b-4c74-bd4c-3cd1bd196f54", 350 | "metadata": {}, 351 | "source": [ 352 | "#### 2/ InferenceConfig class\n", 353 | "Next, you can inherit our base `InferenceConfig` class and extend it with the rest of your modeling parameters. In the notebook you ran above, we took two important steps with this config.\n", 354 | "1. Passed into it the base `NeuronConfig`.\n", 355 | "2. 
Passed the rest of the model config from the HuggingFace pretrained config.\n", 356 | "\n", 357 | "Your inference class is where you define modeling parameters like the following:\n", 358 | "- hidden size\n", 359 | "- num attention heads\n", 360 | "- num hidden layers\n", 361 | "- num key value heads\n", 362 | "- vocab size\n", 363 | "\n", 364 | "You'll use this `config` object to save and compile your model. Let's learn how!" 365 | ] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "id": "71016dc5-112d-470f-a1ee-ce1855a5487d", 370 | "metadata": {}, 371 | "source": [ 372 | "#### 3/ NeuronModel\n", 373 | "This is how you fundamentally integrate your modeling code into the Neuron SDK. If you'd like to simply reuse our `NeuronAttentionBase`, you can inherit this directly through the library and simply pass your parameters through the `InferenceConfig` you defined above. This is how the example code in our notebook works. This is also the fastest way of getting your model online with NxD I.\n", 374 | "\n", 375 | "In the example code you ran, you also used our code for `NeuronLlamaMLP`. This is a layer in the network which inherits from `nn.Module` directly, and it's where you can define the structure of your computations. The `NeuronLlamaMLP` uses a predefined `ColumnParallelLinear` object for both the gate and up projections, while using a predefined `RowParallelLinear` object for the down projection. It also defines a forward pass on that layer.\n", 376 | "\n", 377 | "The rest of the model is defined similarly: either you inherit from our base objects and just passing in your `InferenceConfig`, or you define a new layer inheriting from `nn.Module` and write those layers as either `RowParallelLinear`, `ColumnParallelLinear`, or something else. The benefit of writing your layers into the `Row` and `Column` parallel layers as presented here is that we can handle the distribution of your model for you. \n", 378 | "\n", 379 | "For a more complete guide check out our documentation on the subject [here](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed/api_guide.html#api-guide)." 
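To make the row/column split concrete, here is a minimal sketch of a tensor-parallel MLP block written with NxD's parallel layers. It is illustrative only, not the actual `NeuronLlamaMLP` source: the class name and sizes are placeholders, and the `gather_output`/`input_is_parallel` settings follow the usual Megatron-style convention these layers implement.

```python
# Illustrative sketch of a tensor-parallel MLP using NxD's parallel linear layers.
# Not the NeuronLlamaMLP source; names and sizes are placeholders.
import torch
import torch.nn as nn
from neuronx_distributed.parallel_layers.layers import ColumnParallelLinear, RowParallelLinear


class SketchParallelMLP(nn.Module):
    def __init__(self, hidden_size: int, intermediate_size: int):
        super().__init__()
        # Column-parallel: output features are sharded across the TP group,
        # and we keep them sharded (gather_output=False) for the next layer.
        self.gate_proj = ColumnParallelLinear(hidden_size, intermediate_size,
                                              bias=False, gather_output=False)
        self.up_proj = ColumnParallelLinear(hidden_size, intermediate_size,
                                            bias=False, gather_output=False)
        # Row-parallel: consumes the sharded activations (input_is_parallel=True)
        # and all-reduces the partial results back to the full hidden size.
        self.down_proj = RowParallelLinear(intermediate_size, hidden_size,
                                           bias=False, input_is_parallel=True)
        self.act_fn = nn.SiLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
```

Keeping the column-parallel outputs sharded and feeding them straight into the row-parallel projection means only one collective per MLP block (the all-reduce inside `RowParallelLinear`), which is the main reason this pairing is used.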
380 | ] 381 | }, 382 | { 383 | "cell_type": "markdown", 384 | "id": "8e98a0d4", 385 | "metadata": {}, 386 | "source": [ 387 | "### Notebook Wrap-Up\n", 388 | "\n", 389 | "For more advanced topics:\n", 390 | "- **Profiling**: See [Neuron Profiling Tools](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/tools/neuron-profile/index.html).\n", 391 | "- **Distributed Serving**: Explore vLLM or other serving frameworks.\n", 392 | "- **Performance Benchmarking**: Use `llmperf` or custom scripts.\n", 393 | "\n", 394 | "Thank you for using AWS Trainium, and happy LLM experimentation!\n" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "id": "3bfc3c62-08a4-49ae-adef-5c0d661f2712", 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [] 404 | } 405 | ], 406 | "metadata": { 407 | "kernelspec": { 408 | "display_name": "Python 3 (ipykernel)", 409 | "language": "python", 410 | "name": "python3" 411 | }, 412 | "language_info": { 413 | "codemirror_mode": { 414 | "name": "ipython", 415 | "version": 3 416 | }, 417 | "file_extension": ".py", 418 | "mimetype": "text/x-python", 419 | "name": "python", 420 | "nbconvert_exporter": "python", 421 | "pygments_lexer": "ipython3", 422 | "version": "3.9.21" 423 | } 424 | }, 425 | "nbformat": 4, 426 | "nbformat_minor": 5 427 | } 428 | -------------------------------------------------------------------------------- /labs/vLLM/Chess/assets/agents/stockfish_agent.py: -------------------------------------------------------------------------------- 1 | """ 2 | Stockfish chess agent implementation. 3 | 4 | This agent uses the Stockfish chess engine to make strong chess moves. 5 | """ 6 | 7 | import os 8 | import platform 9 | import subprocess 10 | from typing import Any, Dict, List, Optional 11 | 12 | import chess 13 | from dotenv import load_dotenv 14 | 15 | from .base import ChessAgent 16 | 17 | # Load environment variables from .env file 18 | load_dotenv() 19 | 20 | 21 | class StockfishAgent(ChessAgent): 22 | """ 23 | Chess agent that uses the Stockfish chess engine. 24 | 25 | This agent requires the Stockfish binary to be installed on the system. 26 | It automatically detects common installation paths or can be configured 27 | with a custom path via environment variable STOCKFISH_PATH. 
28 | 29 | The agent provides robust cleanup and can be used as a context manager: 30 | 31 | ```python 32 | # Automatic cleanup with context manager 33 | with StockfishAgent() as agent: 34 | move = agent.choose_move(board, legal_moves, [], "White") 35 | 36 | # Manual cleanup 37 | agent = StockfishAgent() 38 | try: 39 | move = agent.choose_move(board, legal_moves, [], "White") 40 | finally: 41 | agent.close() 42 | ``` 43 | """ 44 | 45 | # Common Stockfish binary paths for different operating systems 46 | COMMON_PATHS = { 47 | "darwin": [ # macOS 48 | "/usr/local/bin/stockfish", 49 | "/opt/homebrew/bin/stockfish", 50 | "/usr/bin/stockfish", 51 | ], 52 | "linux": [ # Linux 53 | "/usr/local/bin/stockfish", 54 | "/usr/bin/stockfish", 55 | "/usr/games/stockfish", 56 | ], 57 | "win32": [ # Windows 58 | "C:\\Program Files\\Stockfish\\stockfish.exe", 59 | "C:\\Program Files (x86)\\Stockfish\\stockfish.exe", 60 | "stockfish.exe", # If in PATH 61 | ], 62 | } 63 | 64 | def __init__( 65 | self, 66 | stockfish_path: Optional[str] = None, 67 | depth: int = 15, 68 | skill_level: int = 20, 69 | elo_rating: Optional[int] = None, 70 | parameters: Optional[Dict[str, Any]] = None, 71 | time_limit_ms: Optional[int] = None, 72 | hash_size_mb: int = 128, 73 | threads: int = 1, 74 | ): 75 | """ 76 | Initialize the Stockfish agent. 77 | 78 | Args: 79 | stockfish_path: Path to Stockfish binary. If None, will auto-detect. 80 | depth: Search depth for Stockfish (default: 15) 81 | skill_level: Skill level 0-20 (default: 20, highest) 82 | elo_rating: ELO rating to limit strength (ignores skill_level if set) 83 | parameters: Additional Stockfish parameters as dict 84 | time_limit_ms: Time limit per move in milliseconds 85 | hash_size_mb: Hash table size in MB (default: 128) 86 | threads: Number of threads to use (default: 1) 87 | 88 | Raises: 89 | RuntimeError: If Stockfish binary cannot be found or started 90 | """ 91 | self.depth = depth 92 | self.skill_level = skill_level 93 | self.elo_rating = elo_rating 94 | self.time_limit_ms = time_limit_ms 95 | self.hash_size_mb = hash_size_mb 96 | self.threads = threads 97 | 98 | # Find Stockfish binary 99 | self.stockfish_path = self._find_stockfish_binary(stockfish_path) 100 | 101 | # Initialize Stockfish process 102 | self._stockfish = None 103 | self._initialize_stockfish() 104 | 105 | # Set initial parameters 106 | self._set_parameters(parameters or {}) 107 | 108 | def _find_stockfish_binary(self, custom_path: Optional[str]) -> str: 109 | """ 110 | Find the Stockfish binary path. 
111 | 112 | Args: 113 | custom_path: Custom path provided by user 114 | 115 | Returns: 116 | Path to Stockfish binary 117 | 118 | Raises: 119 | RuntimeError: If Stockfish binary cannot be found 120 | """ 121 | # Check custom path first 122 | if custom_path: 123 | if os.path.isfile(custom_path) and os.access(custom_path, os.X_OK): 124 | return custom_path 125 | else: 126 | raise RuntimeError(f"Custom Stockfish path is not executable: {custom_path}") 127 | 128 | # Check environment variable 129 | env_path = os.environ.get("STOCKFISH_PATH") 130 | if env_path: 131 | if os.path.isfile(env_path) and os.access(env_path, os.X_OK): 132 | return env_path 133 | else: 134 | raise RuntimeError(f"STOCKFISH_PATH environment variable points to non-executable file: {env_path}") 135 | 136 | # Auto-detect based on platform 137 | system = platform.system().lower() 138 | if system == "darwin": 139 | paths = self.COMMON_PATHS["darwin"] 140 | elif system == "linux": 141 | paths = self.COMMON_PATHS["linux"] 142 | elif system == "windows": 143 | paths = self.COMMON_PATHS["win32"] 144 | else: 145 | paths = [] 146 | 147 | # Check common paths 148 | for path in paths: 149 | if os.path.isfile(path) and os.access(path, os.X_OK): 150 | return path 151 | 152 | # Try to find in PATH 153 | try: 154 | result = subprocess.run( 155 | ["which", "stockfish"], 156 | capture_output=True, 157 | text=True, 158 | check=False 159 | ) 160 | if result.returncode == 0: 161 | path = result.stdout.strip() 162 | if os.path.isfile(path) and os.access(path, os.X_OK): 163 | return path 164 | except (subprocess.SubprocessError, FileNotFoundError): 165 | pass 166 | 167 | # If we get here, Stockfish was not found 168 | error_msg = ( 169 | "Stockfish binary not found. Please install Stockfish or set the STOCKFISH_PATH " 170 | "environment variable.\n\n" 171 | "Installation instructions:\n" 172 | " macOS: brew install stockfish\n" 173 | " Ubuntu/Debian: sudo apt install stockfish\n" 174 | " Windows: Download from https://stockfishchess.org/download/\n" 175 | " Or set STOCKFISH_PATH environment variable to point to your Stockfish binary." 
176 | ) 177 | raise RuntimeError(error_msg) 178 | 179 | def _initialize_stockfish(self): 180 | """Initialize the Stockfish process.""" 181 | try: 182 | self._stockfish = subprocess.Popen( 183 | [self.stockfish_path], 184 | stdin=subprocess.PIPE, 185 | stdout=subprocess.PIPE, 186 | stderr=subprocess.PIPE, 187 | text=True, 188 | bufsize=1, 189 | universal_newlines=True 190 | ) 191 | 192 | # Test if Stockfish is working 193 | self._send_command("uci") 194 | self._send_command("isready") 195 | 196 | # Wait for readyok with timeout 197 | response = self._read_response(timeout=5.0) # 5 second timeout for initialization 198 | if "readyok" not in response: 199 | raise RuntimeError("Stockfish did not respond with 'readyok'") 200 | 201 | # Start a new game context to clear internal engine state 202 | self._send_command("ucinewgame") 203 | 204 | except Exception as e: 205 | if hasattr(self, '_stockfish') and self._stockfish: 206 | try: 207 | self._stockfish.terminate() 208 | except Exception: 209 | pass # Ignore errors during cleanup 210 | self._stockfish = None 211 | raise RuntimeError(f"Failed to initialize Stockfish: {e}") 212 | 213 | def _send_command(self, command: str): 214 | """Send a command to Stockfish.""" 215 | if not hasattr(self, '_stockfish') or not self._stockfish: 216 | raise RuntimeError("Stockfish process not initialized") 217 | 218 | self._stockfish.stdin.write(command + "\n") 219 | self._stockfish.stdin.flush() 220 | 221 | def _read_response(self, timeout: float = 1.0) -> str: 222 | """Read response from Stockfish with timeout.""" 223 | if not hasattr(self, '_stockfish') or not self._stockfish: 224 | raise RuntimeError("Stockfish process not initialized") 225 | 226 | response = "" 227 | try: 228 | # Simple timeout-based reading 229 | import time 230 | 231 | start_time = time.time() 232 | while time.time() - start_time < timeout: 233 | # Try to read a line 234 | if self._stockfish.stdout.readable(): 235 | line = self._stockfish.stdout.readline() 236 | if line: 237 | response += line 238 | if "readyok" in line or "bestmove" in line: 239 | break 240 | else: 241 | # No more data available 242 | break 243 | else: 244 | # Small delay to avoid busy waiting 245 | time.sleep(0.01) 246 | 247 | except Exception as e: 248 | print(f"Warning: Error reading from Stockfish: {e}") 249 | 250 | return response 251 | 252 | def _set_parameters(self, additional_params: Dict[str, Any]): 253 | """Set Stockfish engine parameters.""" 254 | # Set basic parameters 255 | params = { 256 | "Hash": self.hash_size_mb, 257 | "Threads": self.threads, 258 | "Skill Level": self.skill_level, 259 | } 260 | 261 | # Add additional parameters 262 | params.update(additional_params) 263 | 264 | # Apply parameters 265 | for param, value in params.items(): 266 | self._send_command(f"setoption name {param} value {value}") 267 | 268 | # Set ELO rating if specified 269 | if self.elo_rating is not None: 270 | self._send_command(f"setoption name UCI_LimitStrength value true") 271 | self._send_command(f"setoption name UCI_Elo value {self.elo_rating}") 272 | 273 | def _set_position(self, board: chess.Board): 274 | """Set the current position in Stockfish using exact FEN and wait for readiness.""" 275 | # If it's a fresh game (no moves yet), inform engine explicitly 276 | if not board.move_stack: 277 | self._send_command("ucinewgame") 278 | # Use exact FEN to avoid any discrepancy in castling/en passant rights 279 | fen = board.fen() 280 | self._send_command(f"position fen {fen}") 281 | # Ensure engine processed position before 
searching 282 | self._send_command("isready") 283 | self._read_response(timeout=1.0) 284 | 285 | def _get_best_move(self) -> str: 286 | """Get the best move from Stockfish.""" 287 | if self.time_limit_ms: 288 | self._send_command(f"go movetime {self.time_limit_ms}") 289 | else: 290 | self._send_command(f"go depth {self.depth}") 291 | 292 | # Use a reasonable timeout for move calculation 293 | if self.time_limit_ms: 294 | timeout = max(0.1, (self.time_limit_ms / 1000.0) * 2.0) # 2x the time limit, minimum 0.1 second 295 | else: 296 | timeout = max(0.1, (self.depth / 20.0)) # Reasonable timeout based on depth 297 | response = self._read_response(timeout=timeout) 298 | 299 | # Parse best move from response 300 | for line in response.split('\n'): 301 | if line.startswith('bestmove'): 302 | parts = line.split() 303 | if len(parts) >= 2: 304 | return parts[1] 305 | 306 | raise RuntimeError("Stockfish did not return a best move") 307 | 308 | def choose_move( 309 | self, 310 | board: chess.Board, 311 | legal_moves: List[chess.Move], 312 | move_history: List[str], 313 | side_to_move: str, 314 | ) -> tuple[chess.Move | None, str | None]: 315 | """ 316 | Choose the best move using Stockfish engine. 317 | 318 | Args: 319 | board: Current chess board state 320 | legal_moves: List of legal moves available 321 | move_history: List of moves played so far (in UCI notation) 322 | side_to_move: Which side is to move ('White' or 'Black') 323 | 324 | Returns: 325 | Tuple of (chosen_move, optional_comment) 326 | - chosen_move: The best move according to Stockfish, or None to resign 327 | - optional_comment: Comment describing the move evaluation or resignation 328 | 329 | Raises: 330 | RuntimeError: If Stockfish fails to provide a move 331 | """ 332 | if not legal_moves: 333 | raise ValueError("No legal moves available") 334 | 335 | try: 336 | # Set the current position 337 | self._set_position(board) 338 | 339 | # Get best move from Stockfish 340 | best_move_uci = self._get_best_move() 341 | 342 | # Convert UCI string to chess.Move 343 | best_move = chess.Move.from_uci(best_move_uci) 344 | 345 | # Verify the move is legal 346 | if best_move not in legal_moves: 347 | # If Stockfish suggests an illegal move, fall back to first legal move 348 | print(f"Warning: Stockfish suggested illegal move {best_move_uci}, using first legal move") 349 | return legal_moves[0], "FALLBACK MOVE - Stockfish suggested illegal move" 350 | 351 | # Create a comment about the move 352 | comment = f"Stockfish engine move (depth: {self.depth}, skill: {self.skill_level})" 353 | if self.elo_rating: 354 | comment += f", ELO limited to {self.elo_rating}" 355 | 356 | return best_move, comment 357 | 358 | except Exception as e: 359 | # Fallback to first legal move if Stockfish fails 360 | print(f"Warning: Stockfish failed: {e}, using first legal move") 361 | return legal_moves[0], f"FALLBACK MOVE - Stockfish failed: {e}" 362 | 363 | def update_parameters(self, parameters: Dict[str, Any]): 364 | """ 365 | Update Stockfish engine parameters. 366 | 367 | Args: 368 | parameters: Dictionary of parameter names and values 369 | """ 370 | self._set_parameters(parameters) 371 | 372 | def set_skill_level(self, skill_level: int): 373 | """ 374 | Set Stockfish skill level (0-20). 
375 | 376 | Args: 377 | skill_level: Skill level from 0 (weakest) to 20 (strongest) 378 | """ 379 | if not 0 <= skill_level <= 20: 380 | raise ValueError("Skill level must be between 0 and 20") 381 | 382 | self.skill_level = skill_level 383 | self._send_command(f"setoption name Skill Level value {skill_level}") 384 | 385 | def set_elo_rating(self, elo_rating: int): 386 | """ 387 | Set Stockfish ELO rating limit. 388 | 389 | Args: 390 | elo_rating: ELO rating to limit strength 391 | """ 392 | self.elo_rating = elo_rating 393 | self._send_command("setoption name UCI_LimitStrength value true") 394 | self._send_command(f"setoption name UCI_Elo value {elo_rating}") 395 | 396 | def set_depth(self, depth: int): 397 | """ 398 | Set search depth for Stockfish. 399 | 400 | Args: 401 | depth: Search depth 402 | """ 403 | self.depth = depth 404 | 405 | def set_time_limit(self, time_limit_ms: int): 406 | """ 407 | Set time limit per move. 408 | 409 | Args: 410 | time_limit_ms: Time limit in milliseconds 411 | """ 412 | self.time_limit_ms = time_limit_ms 413 | 414 | def is_initialized(self) -> bool: 415 | """ 416 | Check if the Stockfish agent is properly initialized. 417 | 418 | Returns: 419 | True if Stockfish process is running, False otherwise 420 | """ 421 | return hasattr(self, '_stockfish') and self._stockfish is not None and self._stockfish.poll() is None 422 | 423 | def __del__(self): 424 | """Clean up Stockfish process on deletion.""" 425 | # Use hasattr to safely check if the attribute exists 426 | if hasattr(self, '_stockfish') and self._stockfish: 427 | try: 428 | self._stockfish.terminate() 429 | self._stockfish.wait(timeout=1) 430 | except (subprocess.TimeoutExpired, Exception): 431 | try: 432 | self._stockfish.kill() 433 | except Exception: 434 | pass # Ignore errors during cleanup 435 | 436 | def close(self): 437 | """Explicitly close the Stockfish process.""" 438 | if hasattr(self, '_stockfish') and self._stockfish: 439 | try: 440 | self._stockfish.terminate() 441 | self._stockfish.wait(timeout=1) 442 | except (subprocess.TimeoutExpired, Exception): 443 | try: 444 | self._stockfish.kill() 445 | except Exception: 446 | pass # Ignore errors during cleanup 447 | finally: 448 | self._stockfish = None 449 | 450 | def __enter__(self): 451 | """Context manager entry.""" 452 | return self 453 | 454 | def __exit__(self, exc_type, exc_val, exc_tb): 455 | """Context manager exit - ensures cleanup.""" 456 | self.close() 457 | -------------------------------------------------------------------------------- /doc/README.md: -------------------------------------------------------------------------------- 1 | # Build On Trainium Resources 2 | **Purpose:** 3 | 4 | Collection of resources (documentation, examples, tutorials and workshops) to help onboard new students and researchers. This set of resources will need to updated and maintained as new resources become available. 5 | 6 | # Resources 7 | 8 | This section contains links to various documentation sources and is a helpful index when working on Neuron. It is organized into several sections based on workload and relevance. 
9 | 10 | ## Getting Started with Neuron 11 | 12 | |Title |Description |Link | 13 | |--- |--- |--- | 14 | |Getting Started with AWS |Getting started resource for AWS, generally, including AWS environment provisioning, budget alarms, CLI, instance setup and best practices for working in an AWS environment |[BoT Getting Started on AWS](https://github.com/scttfrdmn/aws-101-for-tranium) | 15 | |Neuron Documentation |The Neuron Official Product Documentation. This contains details on our software libraries and hardware. |[Neuron Docs](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html) | 16 | |Inf2 Instance Details |Helpful overview links for the Inferentia2 Instance and associated accelerators |
  • [AWS Landing Page](https://aws.amazon.com/ai/machine-learning/inferentia/)
  • [Instance Details](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/inf2-arch.html#aws-inf2-arch)
  • [Chip Details](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/inferentia2.html#inferentia2-arch)
  • [Core Details](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/neuron-core-v2.html#neuroncores-v2-arch)
| 17 | |Trn1 Instance Details |Similar overview links for Trn1 instances and accelerators |
  • [AWS Landing Page](https://aws.amazon.com/ai/machine-learning/trainium/)
  • [Instance Details](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/trn1-arch.html#aws-trn1-arch)
  • [Chip Details](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/trainium.html#trainium-arch)
  • [Core Details](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/neuron-core-v2.html#neuroncores-v2-arch)
| 18 | |Trn2 Instance Details |Similar overview links for Trn2 instances and accelerators |
  • [YouTube Launch Video](https://www.youtube.com/watch?v=Bteba8KLeGc)
  • [Instance Details](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/trn2-arch.html#aws-trn2-arch)
  • [Chip Details](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/trainium2.html#trainium2-arch)
  • [Core Details](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/neuron-core-v3.html#neuroncores-v3-arch)
| 19 | | Instance Service Quotas | Understand what service quotas are, how they apply to Inferentia and Trainium instances and endpoints, and have an example of what quotas would be appropriate for a POC. |[Inferentia and Trainium Service Quotas](https://repost.aws/articles/ARgmEMvbR6Re200FQs8rTduA/inferentia-and-trainium-service-quotas) | 20 | |Software Overview - General |Overview Video of Trainium Software Stack |[Video](https://www.youtube.com/watch?v=vaqj8XQfqwM&t=806s) | 21 | |Software Overview - Framework |Application Frameworks for developing on Neuron. Torch-NeuronX for small model inference and training, NxD for Distributed modeling primitives, NxDI - a higher abstraction library for inference and NxDT a corresponding abstraction for training. |
  • Torch-NeuronX ([Training](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/programming-guide/training/pytorch-neuron-programming-guide.html#pytorch-neuronx-programming-guide), [Inference](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/programming-guide/inference/trace-vs-xla-lazytensor.html))
  • [NxD](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed/developer-guide.html)
  • [NxD-T](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-training/overview.html#nxd-training-overview)
  • [NxD-I](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/nxdi-overview.html#nxdi-overview)
| 22 | |Software Overview - ML Libraries |ML libraries which offer another interface for deploying to trn/inf. Optimum-Neuron provides an interface between transformers and AWS Accelerators. AXLearn is a training library built on top of JAX and XLA. |[Optimum Neuron](https://huggingface.co/docs/optimum-neuron/index) [AXLearn](https://github.com/apple/axlearn) | 23 | |Environment Setup |A set of resources on provisioning instances and setting up development environments with the appropriate Neuron Software. |
  • [Instance Guide](https://repost.aws/articles/ARTxLi0wndTwquyl7frQYuKg)
  • [Remote Development Guide](https://repost.aws/articles/ARmgDHboGkRKmaEyfBzyVP4w)
  • [AMIs](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/dlami/index.html)
  • [Containers](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/containers/index.html)
  • [Manual Setup](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/pytorch/neuronx/ubuntu/torch-neuronx-ubuntu22.html#setup-torch-neuronx-ubuntu22)
| 24 | |Release Versions |Index of the latest release versions and their semantic version information. |
  • [Latest Release Version](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/release-notes/index.html#latest-neuron-release)
  • [Component Package Versions](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/release-notes/releasecontent.html#latest-neuron-release-artifacts)
| 25 | 26 | ## Training Resources 27 | 28 | |Title |Description |Link | 29 | |--- |--- |--- | 30 | |Torch-NeuronX Docs |Torch-NeuronX docs on the XLA flow, and constructing a simple training loop on Trainium/Inferentia. |[Torch-NeuronX Training Docs](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/programming-guide/training/pytorch-neuron-programming-guide.html#pytorch-neuronx-programming-guide) | 31 | |NxD Docs |Details on NxD, as well as the Distributed Layer Primitives (Tensor Parallelism, Pipeline Parallelism, etc.) |[NxD Developer Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed/developer-guide-training.html#neuronx-distributed-developer-guide-training) | 32 | |NxD Docs + PyTorch Lightning |PyTorch Lightning Docs for NxD Training |[PTL Developer Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed/ptl_developer_guide.html#ptl-developer-guide) | 33 | |NxD-T Developer Guide |NxD-Training, A higher level abstraction library on NxD for training specific workloads. |[NxD Training Developer Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-training/overview.html#nxd-training-overview) | 34 | |PreTraining |Pre-Training samples within various different libraries above |
  • [Torch-NeuronX](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/tutorials/training/mlp.html#neuronx-mlp-training-tutorial)
  • [NXD](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed/tutorials/training_llama_tp_zero1.html#llama2-7b-tp-zero1-tutorial)
  • [NxD-T](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-training/tutorials/hf_llama3_8B_pretraining.html#hf-llama3-8b-pretraining)
  • [Optimum Neuron](https://huggingface.co/docs/optimum-neuron/training_tutorials/pretraining_hyperpod_llm)
| 35 | |LoRA Fine Tuning |LoRA Samples within the various libraries for Neuron |
  • [NxD-T](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-training/tutorials/hf_llama3_8B_SFT_LORA.html#hf-llama3-8b-sft-lora)
  • [Optimum Neuron](https://huggingface.co/docs/optimum-neuron/training_tutorials/sft_lora_finetune_llm)
| 36 | |Preference Alignment |Preference Alignment Samples within the various libraries for Neuron |[NxD-T](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-training/tutorials/hf_llama3_8B_DPO_ORPO.html#hf-llama3-8b-dpo-orpo) | 37 | |Awsome Distributed Training |Reference Distributed Training Examples on AWS |[Awsome-distributed-training](https://github.com/aws-samples/awsome-distributed-training) | 38 | 39 | ## Inference Resources 40 | 41 | |Title |Description |Link | 42 | |--- |--- |--- | 43 | |Torch-NeuronX Docs |Torch-NeuronX docs on the XLA flow, and tracing models for Inference on a single core. Samples of various common models as well. |[Torch-NeuronX Docs](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/api-reference-guide/inference/api-torch-neuronx-trace.html#torch-neuronx-trace-api) 44 | [Samples](https://github.com/aws-neuron/aws-neuron-samples/tree/master/torch-neuronx/inference) | 45 | |NxD-I Developer Guide |NxD-Inference, A higher level abstraction library on NxD for inference specific workloads. |[NxD-I Docs](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/tutorials/index.html) | 46 | |Deployment vLLM |Guide for vLLM development with NxDI |[vLLM Docs](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/vllm-user-guide.html) | 47 | |TGI |Guide on how to use HuggingFace Text Generation Inference (TGI) with Neuron |[TGI Docs](https://huggingface.co/docs/optimum-neuron/en/guides/neuronx_tgi) | 48 | 49 | ## Kernel Resources 50 | 51 | |Title |Description |Link | 52 | |--- |--- |--- | 53 | |NKI Docs |General NKI docs |[NKI Docs](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/index.html) | 54 | |Getting Started With NKI |Getting started writing NKI Kernels |[Getting Started With NKI](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/getting_started.html#nki-getting-started) | 55 | |Performant Kernels with NKI |Understanding NKI kernel performance |[Performant Kernels with NKI](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/nki_arch_guides.html#nki-arch-guides) | 56 | |NKI - Sample Kernels |Sample Kernel Repository with reference implementation |[NKI - Sample Kernels](https://github.com/aws-neuron/nki-samples/tree/main) | 57 | 58 | ## Tools Resources 59 | 60 | |Title |Description |Link | 61 | |--- |--- |--- | 62 | |Profiler |Neuron Profiler User Guide |[Profiler Docs](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/tools/neuron-sys-tools/neuron-profiler-2-0-beta-user-guide.html) | 63 | |Monitoring Tools and CLI |Monitoring and CLI tools for working with Neuron Hardware. |[Monitoring Tools and CLI](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/tools/neuron-sys-tools/neuron-monitor-user-guide.html) | 64 | 65 | # Learning Paths 66 | 67 | Learning Paths are a list of organized exercises. 68 | 69 | ## Training 70 | 71 | |Title |Description |Link |Minimum Instance Required | 72 | |--- |--- |--- |--- | 73 | |Setup an Instance/Developer Environment |This section contains resources to provision a developer Environment. This is a great starting place if you need a clean environment for development, or for starting any of the following exercises. |
  • [Instance Setup](https://repost.aws/articles/ARTxLi0wndTwquyl7frQYuKg)
  • [DLAMIs](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/dlami/index.html)
|trn1.2xlarge | 74 | |Construct a simple Training Loop with torch-neuronx |This is a sample of how to construct a training loop using torch-neuronx. Relevant for getting started with XLA flows, as well as models which require a single core/DP. |[MLP Tutorial](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/tutorials/training/mlp.html#neuronx-mlp-training-tutorial) |trn1.2xlarge | 75 | |Implement Tensor Parallelism with NeuronX Distributed |Implement Tensor Parallel for a model to shard training across accelerators. |[BERT Pretraining Tutorial](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed/tutorials/training.html#tp-training-tutorial) |trn1.32xlarge | 76 | |Pre-training Llama with TP, PP and ZeRO-1 |Train a model using multiple forms of parallelism (Tensor Parallelism, Pipeline Parallelism, and ZeRO-1). This uses the NxD Core Library and should give a good view of the parallel primatives. |[Llama Pretraining Tutorial](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed/tutorials/training_llama_tp_zero1.html) |4x trn1.32xlarge cluster | 77 | |LoRA Fine Tuning with Optimum Neuron |Fine-Tune a model with LoRA on Optimum Neuron. Optimum Neuron is a library developed by HF and allows for simple modifications to transformers code to port to Neuron. |[Qwen LoRA Optimum Neuron](https://huggingface.co/docs/optimum-neuron/training_tutorials/qwen3-fine-tuning) |trn1.32xlarge | 78 | |LoRA Fine-Tuning with NxDT |LoRA based Fine-tune a model using NxD-T, our higher level training library built on top of NxD core. |[LoRA NxDT Tutorial](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-training/tutorials/hf_llama3_8B_SFT_LORA.html#hf-llama3-8b-sft-lora) |trn1.32xlarge | 79 | |DPO/ORPO Fine-Tuning with NxDT |Preference Alignment for a model using NxD-T, our higher level training library built on top of NxD core. |[DPO/ORPO Tutorial](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-training/tutorials/hf_llama3_8B_DPO_ORPO.html) |trn1.32xlarge | 80 | 81 | 82 | 83 | ## Inference Path 84 | 85 | |Title |Description |Link |Minimum Instance Required | 86 | |--- |--- |--- |--- | 87 | |Setup an Instance/Developer Environment |This section contains resources to provision a developer Environment. This is a great starting place if you need a clean environment for development, or for starting any of the following exercises. |
  • [Instance Setup](https://repost.aws/articles/ARTxLi0wndTwquyl7frQYuKg)
  • [DLAMIs](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/dlami/index.html)
|trn1.2xlarge | 88 | |Trace Models with Torch-NeuronX |Trace small models without model parallelism for inference with torch-neuronx. |[Torch-NeuronX Tutorials](https://github.com/aws-neuron/aws-neuron-samples/blob/master/torch-neuronx/README.md#inference) |trn1.2xlarge | 89 | |Deploy Various Models with Optimum Neuron |Optimum Neuron allows for popular models in diffusers and transformers to easily be deployed to Neuron devices. |[Optimum Neuron Tutorials](https://huggingface.co/docs/optimum-neuron/inference_tutorials/notebooks) |trn1.32xlarge | 90 | |Deploy LLM with NxD |NxD is our library with model sharding primitives. This guide serves as a good jumping-off point for common LLMs |[NxD-I Production Models](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/model-reference.html) |trn1.32xlarge | 91 | |vLLM Integration |This guide walks through how to run models with vLLM on Neuron devices. This uses the previously mentioned NxDI back-end for the model deployments. |[vLLM User Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/vllm-user-guide.html) |trn1.32xlarge | 92 | |Deploy a DiT with NxD |This guide walks through a non-LLM model architecture to be sharded and deployed on Neuron. In this case it is a Diffusion Transformer architecture for image generation |[PixArt Sigma on Neuron](https://aws.amazon.com/blogs/machine-learning/cost-effective-ai-image-generation-with-pixart-sigma-inference-on-aws-trainium-and-aws-inferentia/) |trn1.32xlarge | 93 | |Onboard a new Model to NxD-I |This guide walks through how to onboard a new model to NxD |[Model Onboarding Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/onboarding-models.html) |trn1.32xlarge | 94 | |Explore Additional features of NxD-I |Here are a few additional references for NxD-I features that may be relevant for your specific use case (Multi-LoRA, Quantization, Spec. decode) |
  • [Quantization](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/custom-quantization.html)
  • [Spec. Decode](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/tutorials/trn2-llama3.3-70b-tutorial.html#nxdi-trn2-llama3-3-70b-tutorial)
  • [Multi-LoRA](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/tutorials/trn2-llama3.1-8b-multi-lora-tutorial.html#nxdi-trn2-llama3-1-8b-multi-lora-tutorial)
|trn1.32xlarge | 95 | 96 | ## Kernel/Compiler Path 97 | 98 | |Title |Description |Link |Minimum Instance Required | 99 | |--- |--- |--- |--- | 100 | |Setup an Instance/Developer Environment |This section contains resources to provision a developer Environment. This is a great starting place if you need a clean environment for development, or for starting any of the following exercises. |
  • [Instance Setup](https://repost.aws/articles/ARTxLi0wndTwquyl7frQYuKg)
  • [DLAMIs](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/dlami/index.html)
|trn1.2xlarge | 101 | |Writing Functional Kernels |This Getting Started Guide will demonstrate how to write a Hello World, element-wise tensor add kernel. This will give you a good foundation for reading and understanding the other kernels. |[Getting Started with NKI](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/getting_started.html#nki-getting-started) |trn1.2xlarge | 102 | |NKI workshop |This workshop walks through how to build, profile and integrate a kernel into PyTorch modelling. |[NKI Workshop](https://github.com/aws-samples/ml-specialized-hardware/tree/main/workshops/03_NKIWorkshop) |trn1.2xlarge | 103 | |Walkthrough NKI Tutorials |These tutorials walkthrough popular kernels and the associated optimizations applied. This is a good set of kernels to show how to iteratively write and optimize kernels. |[NKI Tutorial](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/tutorials.html) |trn1.2xlarge | 104 | |Review NKI Samples |This repository contains the implementations of optimized reference kernels, used within our serving libraries and implementations. |[NKI Samples](https://github.com/aws-neuron/nki-samples/) |trn1.2xlarge | 105 | |Profiling NKI Kernels |This guide walks through how to profile kernels and use the Neuron Profiler |[Profiling NKI Kernels](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/nki/neuron_profile_for_nki.html#neuron-profile-for-nki) |trn1.2xlarge | 106 | 107 | # Appendix 108 | 109 | ## Other Resources 110 | 111 | |Title |Description |Link | 112 | |--- |--- |--- | 113 | |Re:Invent 2024 Recap |REcap Post from Re:Invent, which includes links to workshops and sessions on Neuron |[RePost Article](https://repost.aws/articles/ARuhbPQliOSqKn74zJpGmMYQ) | 114 | |AI on EKS |Reference implementation for AI workloads on EKS including hosting on Trainium |[AI on EKS](https://github.com/awslabs/ai-on-eks) | 115 | -------------------------------------------------------------------------------- /labs/vLLM/Chess/Chess-Tournament.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Chess Tournament Evaluation\n", 8 | "\n", 9 | "## Overview\n", 10 | "\n", 11 | "This notebook demonstrates running competitive chess tournaments to evaluate your fine-tuned model. You'll learn how to:\n", 12 | "\n", 13 | "- Run tournaments with TrueSkill ratings\n", 14 | "- Leverage automatic request batching for throughput\n", 15 | "- Analyze model performance with detailed metrics\n", 16 | "- Compare against multiple Stockfish baselines\n", 17 | "\n", 18 | "**Prerequisites:**\n", 19 | "- Complete Chess-Deployment.ipynb\n", 20 | "- vLLM server running with your chess model\n", 21 | "\n", 22 | "**Duration:** 20-30 minutes" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## Step 1: Verify vLLM Server is Running\n", 30 | "\n", 31 | "Before starting the tournament, let's verify the vLLM server is still running." 
32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import openai\n", 41 | "\n", 42 | "MODEL_PATH = \"kunhunjon/ChessLM_Qwen3_Trainium_AWS_Format\"\n", 43 | "\n", 44 | "client = openai.OpenAI(\n", 45 | " base_url=\"http://localhost:8080/v1\",\n", 46 | " api_key=\"not-needed\"\n", 47 | ")\n", 48 | "\n", 49 | "try:\n", 50 | " response = client.chat.completions.create(\n", 51 | " model=MODEL_PATH,\n", 52 | " messages=[{\"role\": \"user\", \"content\": \"Test\"}],\n", 53 | " max_tokens=5,\n", 54 | " extra_body={\"chat_template_kwargs\": {\"enable_thinking\": False}}\n", 55 | " )\n", 56 | " print(\" vLLM server is running and ready!\")\n", 57 | "except Exception as e:\n", 58 | " print(f\" Server not responding: {e}\")\n", 59 | " print(\"\\nPlease start the server from Chess-Deployment.ipynb first.\")" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "Note: prior to running any games, ensure your `.env` file is properly configured for vLLM and game parameters, or pass this to the agents directly." 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "# copy your env vars in\n", 76 | "%cd /home/ubuntu/environment/vLLM/Chess/assets/\n", 77 | "! cp env.example .env\n" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "# 1. Load the IPython extension that defines %dotenv\n", 87 | "%load_ext dotenv\n", 88 | "\n", 89 | "# 2. Load your .env file (adjust path if needed!)\n", 90 | "%dotenv /home/ubuntu/environment/vLLM/Chess/assets/.env" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "**Note:** If you update env vars after running the above cell, make sure to restart the kernel before moving ahead with the below section" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "# sanity check to see if they were sourced in\n", 107 | "import os\n", 108 | "print(os.getenv(\"VLLM_MODEL\"))\n" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "## Step 2: Play a Single Game\n", 116 | "\n", 117 | "Let's start with a single game to verify everything works." 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "%cd /home/ubuntu/environment/vLLM/Chess/\n", 127 | "\n", 128 | "# Play single game: vLLM vs Stockfish (skill 5)\n", 129 | "!python -m assets.run_game \\\n", 130 | " --agent1 vllm \\\n", 131 | " --agent2 stockfish-skill5-depth10 \\\n", 132 | " --num-games 1 \\\n", 133 | " --verbose" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "## Step 3: Run a Small Tournament (Sequential)\n", 141 | "\n", 142 | "Now let's run a small tournament with **parallelism=1** (sequential games) to establish a baseline." 
143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "import time\n", 152 | "\n", 153 | "print(\"Running 4 games sequentially (parallelism=1)...\\n\")\n", 154 | "\n", 155 | "start_time = time.time()\n", 156 | "\n", 157 | "!python -m assets.run_game \\\n", 158 | " --agent vllm \\\n", 159 | " --agent stockfish-skill1-depth1 \\\n", 160 | " --num-games 4 \\\n", 161 | " --parallelism 1 \\\n", 162 | " --output-dir tournament_sequential\n", 163 | "\n", 164 | "sequential_time = time.time() - start_time\n", 165 | "\n", 166 | "print(f\"\\n Sequential tournament completed in {sequential_time:.1f} seconds\")" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "## Step 4: Run Tournament with Concurrency\n", 174 | "\n", 175 | "Now let's run the same tournament with **parallelism=4** to see the throughput improvement from automatic request batching." 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "print(\"Running 4 games in parallel (parallelism=4)...\\n\")\n", 185 | "\n", 186 | "start_time = time.time()\n", 187 | "\n", 188 | "!python -m assets.run_game \\\n", 189 | " --agent vllm \\\n", 190 | " --agent stockfish-skill1-depth1 \\\n", 191 | " --num-games 4 \\\n", 192 | " --parallelism 4 \\\n", 193 | " --output-dir tournament_parallel\n", 194 | "\n", 195 | "parallel_time = time.time() - start_time\n", 196 | "\n", 197 | "print(f\"\\n Parallel tournament completed in {parallel_time:.1f} seconds\")\n", 198 | "print(f\"\\nSpeedup: {sequential_time/parallel_time:.2f}x faster with parallelism=4\")\n", 199 | "print(f\"Time saved: {sequential_time - parallel_time:.1f} seconds ({(sequential_time - parallel_time)/sequential_time*100:.1f}%)\")" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "### Understanding the Speedup\n", 207 | "\n", 208 | "**How it works:**\n", 209 | "\n", 210 | "1. **Process-level parallelism** (`--parallelism 4`):\n", 211 | " - Tournament scheduler runs 4 games simultaneously in separate processes\n", 212 | " - Each game makes HTTP requests to the vLLM server independently\n", 213 | "\n", 214 | "2. **Request-level batching** (vLLM server):\n", 215 | " - Server configured with `max_num_seqs=4` and `continuous_batching=true`\n", 216 | " - When 4 games request moves at similar times, vLLM automatically batches them\n", 217 | " - Batched requests are processed together on Neuron cores\n", 218 | "\n", 219 | "**Expected results:**\n", 220 | "- Sequential (parallelism=1): ~0.65s per move, games run one after another\n", 221 | "- Parallel (parallelism=4): ~1.86s per move, but 4 games run simultaneously\n", 222 | "- **Throughput improvement: ~1.4x** (40% faster overall)\n", 223 | "\n", 224 | "**Why not 4x speedup?**\n", 225 | "- Individual request latency increases due to batching overhead\n", 226 | "- Not all requests arrive at exactly the same time (timing variance)\n", 227 | "- Server batch efficiency: ~35% of theoretical maximum\n", 228 | "- Still significant savings: games complete much faster overall" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "## Step 5: Run Full Tournament\n", 236 | "\n", 237 | "Now let's run a comprehensive tournament against multiple opponents to evaluate model strength." 
238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "!python -m assets.run_game \\\n", 247 | " --agent vllm \\\n", 248 | " --agent stockfish-skill1-depth2 \\\n", 249 | " --agent stockfish-skill5-depth10 \\\n", 250 | " --agent stockfish-skill10-depth15 \\\n", 251 | " --num-games 20 \\\n", 252 | " --parallelism 4 \\\n", 253 | " --output-dir tournament_full\n", 254 | "\n", 255 | "print(\"\\n Tournament complete! Results saved to tournament_full/\")" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "## Step 6: Analyze Results\n", 263 | "\n", 264 | "Let's load and analyze the tournament results." 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "import json\n", 274 | "import pandas as pd\n", 275 | "\n", 276 | "# Load tournament results\n", 277 | "with open('tournament_full/tournament.json') as f:\n", 278 | " results = json.load(f)\n", 279 | "\n", 280 | "agents_data = []\n", 281 | "for agent_name, stats in results['agents'].items():\n", 282 | " totals = stats['totals']\n", 283 | " rating = stats['rating']\n", 284 | "\n", 285 | " games = totals.get('games', 0)\n", 286 | " wins = totals.get('wins', 0)\n", 287 | " losses = totals.get('losses', 0)\n", 288 | " draws = totals.get('draws', 0)\n", 289 | "\n", 290 | " win_rate = (wins + 0.5 * draws) / games * 100 if games > 0 else 0.0\n", 291 | "\n", 292 | " agents_data.append({\n", 293 | " 'Agent': agent_name,\n", 294 | " 'Games': games,\n", 295 | " 'Wins': wins,\n", 296 | " 'Losses': losses,\n", 297 | " 'Draws': draws,\n", 298 | " 'Win Rate': f\"{win_rate:.1f}%\",\n", 299 | " 'Rating': f\"{rating['conservative']:.1f}\",\n", 300 | " 'Mu': f\"{rating['mu']:.2f}\",\n", 301 | " 'Sigma': f\"{rating['sigma']:.2f}\",\n", 302 | " })\n", 303 | "\n", 304 | "df = pd.DataFrame(agents_data)\n", 305 | "df = df.sort_values('Rating', ascending=False)\n", 306 | "\n", 307 | "print(\"Tournament Standings:\")\n", 308 | "print(\"=\" * 80)\n", 309 | "print(df.to_string(index=False))\n", 310 | "print(\"\\nNote: Rating = mu - 3*sigma (conservative estimate)\")\n" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "## Step 7: Analyze Your Model's Performance\n", 318 | "\n", 319 | "Let's look at detailed metrics for your vLLM model." 
320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "import json\n", 329 | "\n", 330 | "with open('tournament_full/tournament.json') as f:\n", 331 | " results = json.load(f)\n", 332 | "\n", 333 | "# Get vLLM model statistics\n", 334 | "vllm_stats = results['agents']['vllm']\n", 335 | "totals = vllm_stats['totals']\n", 336 | "rating = vllm_stats['rating']\n", 337 | "engine = vllm_stats.get('engine_metrics_avg', {})\n", 338 | "illegal = vllm_stats.get('illegal_metrics', {})\n", 339 | "\n", 340 | "games = totals.get('games', 0)\n", 341 | "wins = totals.get('wins', 0)\n", 342 | "losses = totals.get('losses', 0)\n", 343 | "draws = totals.get('draws', 0)\n", 344 | "as_white = totals.get('as_white', 0)\n", 345 | "as_black = totals.get('as_black', 0)\n", 346 | "\n", 347 | "win_rate = (wins + 0.5 * draws) / games * 100 if games > 0 else 0.0\n", 348 | "\n", 349 | "print(\"Your Model Performance:\")\n", 350 | "print(\"=\" * 60)\n", 351 | "print(f\"\\nGames Played: {games}\")\n", 352 | "print(f\"Record: {wins}-{losses}-{draws}\")\n", 353 | "print(f\"Win Rate: {win_rate:.1f}%\")\n", 354 | "\n", 355 | "print(f\"\\nTrueSkill Rating:\")\n", 356 | "print(f\" Mu (skill estimate): {rating['mu']:.2f}\")\n", 357 | "print(f\" Sigma (uncertainty): {rating['sigma']:.2f}\")\n", 358 | "print(f\" Conservative rating (mu - 3σ): {rating['conservative']:.1f}\")\n", 359 | "\n", 360 | "# Engine metrics (per-game averages from Stockfish analysis)\n", 361 | "if engine:\n", 362 | " print(f\"\\nMove Quality (engine-based):\")\n", 363 | " print(f\" Accuracy: {engine.get('accuracy_pct', 0.0):.1f}% (matches Stockfish top move)\")\n", 364 | " print(f\" Avg Centipawn Loss: {engine.get('acpl', 0.0):.1f}\")\n", 365 | "\n", 366 | "# Illegal move stats\n", 367 | "if illegal:\n", 368 | " print(f\"\\nIllegal Move Metrics:\")\n", 369 | " print(f\" Illegal attempts: {illegal.get('attempts', 0)}\")\n", 370 | " print(f\" Total move attempts: {illegal.get('move_attempts', 0)}\")\n", 371 | " print(f\" Illegal %: {illegal.get('illegal_pct', 0.0):.2f}%\")\n", 372 | "\n", 373 | "print(f\"\\nGames as White: {as_white}\")\n", 374 | "print(f\"Games as Black: {as_black}\")\n" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [ 381 | "### Interpreting Results\n", 382 | "\n", 383 | "**TrueSkill Rating:**\n", 384 | "- Conservative rating 15-20: ~1200-1400 ELO (beginner)\n", 385 | "- Conservative rating 20-25: ~1400-1600 ELO (intermediate)\n", 386 | "- Conservative rating 25-30: ~1600-1800 ELO (advanced)\n", 387 | "- Conservative rating 30+: ~1800+ ELO (expert)\n", 388 | "\n", 389 | "**Move Quality:**\n", 390 | "- Accuracy >60%: Model frequently finds Stockfish's top move\n", 391 | "- ACPL <50: Good move quality (average mistake < half a pawn)\n", 392 | "- ACPL <30: Excellent move quality (near-optimal play)" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": {}, 398 | "source": [ 399 | "## Step 8: View Sample Game\n", 400 | "\n", 401 | "Let's look at a sample game from the tournament." 
402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "# Get first game involving vLLM (either side)\n", 411 | "vllm_game = None\n", 412 | "for game in results['games']:\n", 413 | " if 'vllm' in [game['white_agent_spec'], game['black_agent_spec']]:\n", 414 | " vllm_game = game\n", 415 | " break\n", 416 | "\n", 417 | "if vllm_game:\n", 418 | " print(\"Sample Game:\")\n", 419 | " print(\"=\" * 60)\n", 420 | " print(f\"Game ID: {vllm_game['id']}\")\n", 421 | " print(f\"White: {vllm_game['white_agent_spec']}\")\n", 422 | " print(f\"Black: {vllm_game['black_agent_spec']}\")\n", 423 | " print(f\"Result: {vllm_game['result']}\") # \"1-0\", \"0-1\", \"1-1\", or \"*\"\n", 424 | " print(f\"Moves: {vllm_game['moves_played']}\")\n", 425 | " print(f\"Reason: {vllm_game['game_over_reason']}\")\n", 426 | " print(f\"Final FEN: {vllm_game['final_fen']}\")\n", 427 | " \n", 428 | " # PGN path on disk\n", 429 | " pgn_path = vllm_game.get('pgn_path', '')\n", 430 | " if pgn_path:\n", 431 | " print(f\"\\nPGN file: {pgn_path}\")\n", 432 | " print(\"You can open this file and paste the PGN into: https://lichess.org/paste\")\n", 433 | " else:\n", 434 | " print(\"\\nNo PGN path recorded for this game.\")\n", 435 | "else:\n", 436 | " print(\"No vLLM games found in results.\")\n" 437 | ] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "metadata": {}, 442 | "source": [ 443 | "## Step 9: Compare Different Parallelism Levels\n", 444 | "\n", 445 | "Let's visualize how concurrency affects performance." 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [ 454 | "# This data comes from the concurrency benchmark we ran\n", 455 | "concurrency_data = {\n", 456 | " 'Concurrency': [1, 2, 4, 8],\n", 457 | " 'Throughput (req/s)': [1.53, 1.89, 2.15, 2.17],\n", 458 | " 'Mean Latency (s)': [0.653, 1.057, 1.860, 3.315],\n", 459 | " 'Speedup': [1.00, 1.23, 1.40, 1.42]\n", 460 | "}\n", 461 | "\n", 462 | "df_concurrency = pd.DataFrame(concurrency_data)\n", 463 | "\n", 464 | "print(\"Concurrency Performance:\")\n", 465 | "print(\"=\" * 70)\n", 466 | "print(df_concurrency.to_string(index=False))\n", 467 | "print(\"\\nKey Insights:\")\n", 468 | "print(\"- Best performance at parallelism=4 (matches server batch_size)\")\n", 469 | "print(\"- 1.4x throughput improvement with automatic batching\")\n", 470 | "print(\"- Individual latency increases, but total time decreases\")\n", 471 | "print(\"- Minimal benefit beyond parallelism=4 (server saturated)\")" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": {}, 477 | "source": [ 478 | "## Summary\n", 479 | "\n", 480 | "Congratulations! You've successfully:\n", 481 | "\n", 482 | "- Run chess tournaments with TrueSkill ratings \n", 483 | "- Leveraged automatic request batching for 1.4x throughput \n", 484 | "- Evaluated model strength against multiple baselines \n", 485 | "- Analyzed performance with detailed metrics \n", 486 | "- Understood concurrency trade-offs \n", 487 | "\n", 488 | "**Key Takeaways:**\n", 489 | "\n", 490 | "1. **Automatic Concurrency**: Process-level parallelism (`p_map`) + vLLM continuous batching work together automatically\n", 491 | "2. **Optimal Parallelism**: Set `--parallelism` to match `max_num_seqs` (typically 4) for best throughput\n", 492 | "3. **Throughput vs Latency**: Individual requests slower, but total throughput higher\n", 493 | "4. 
**Real-world Benefit**: Tournament games complete ~40% faster with parallelism=4\n", 494 | "\n", 495 | "**Next Steps:**\n", 496 | "\n", 497 | "- Fine-tune your model further based on tournament weaknesses\n", 498 | "- Experiment with different training datasets\n", 499 | "- Test against stronger opponents (Stockfish skill 15-20)\n", 500 | "- Deploy to production with learned configurations\n", 501 | "- Share your results with the community!" 502 | ] 503 | } 504 | ], 505 | "metadata": { 506 | "kernelspec": { 507 | "display_name": "aws_neuronx_venv_pytorch_latest", 508 | "language": "python", 509 | "name": "python3" 510 | }, 511 | "language_info": { 512 | "codemirror_mode": { 513 | "name": "ipython", 514 | "version": 3 515 | }, 516 | "file_extension": ".py", 517 | "mimetype": "text/x-python", 518 | "name": "python", 519 | "nbconvert_exporter": "python", 520 | "pygments_lexer": "ipython3", 521 | "version": "3.10.12" 522 | } 523 | }, 524 | "nbformat": 4, 525 | "nbformat_minor": 4 526 | } 527 | --------------------------------------------------------------------------------