├── opentinker
│   ├── client
│   │   ├── utils
│   │   │   └── __init__.py
│   │   ├── __init__.py
│   │   ├── client_config
│   │   │   ├── math_inference_config.yaml
│   │   │   ├── opentinker_param.yaml
│   │   │   ├── math_inference_scheduler_config.yaml
│   │   │   ├── gomoku_inference_config.yaml
│   │   │   ├── math_code_interpreter_inference_config.yaml
│   │   │   ├── gomoku_inference_scheduler_config.yaml
│   │   │   ├── geo3k_param.yaml
│   │   │   ├── math_code_interpreter_param.yaml
│   │   │   ├── geo3k_tool_param.yaml
│   │   │   ├── math_param.yaml
│   │   │   ├── generic_env_param.yaml
│   │   │   └── gomoku_param.yaml
│   │   ├── legacy
│   │   │   ├── math_inference.py
│   │   │   └── gomoku_inference.py
│   │   ├── geo3k_rl.py
│   │   ├── math_rl.py
│   │   ├── math_inference.py
│   │   ├── math_tool_rl.py
│   │   ├── gomoku_inference.py
│   │   ├── math_tool_inference.py
│   │   └── geo3k_tool_rl.py
│   ├── server
│   │   ├── __init__.py
│   │   ├── agent.yaml
│   │   ├── config
│   │   │   ├── tool_config_math.json
│   │   │   ├── evaluation.yaml
│   │   │   ├── actor
│   │   │   │   ├── megatron_actor.yaml
│   │   │   │   └── dp_actor.yaml
│   │   │   ├── ref
│   │   │   │   ├── megatron_ref.yaml
│   │   │   │   ├── dp_ref.yaml
│   │   │   │   └── ref.yaml
│   │   │   ├── __init__.py
│   │   │   ├── npu_profile
│   │   │   │   └── npu_profile.yaml
│   │   │   ├── optim
│   │   │   │   ├── megatron.yaml
│   │   │   │   └── fsdp.yaml
│   │   │   ├── critic
│   │   │   │   ├── megatron_critic.yaml
│   │   │   │   └── dp_critic.yaml
│   │   │   ├── engine
│   │   │   │   ├── fsdp.yaml
│   │   │   │   └── megatron.yaml
│   │   │   ├── reward_model
│   │   │   │   ├── dp_reward_model.yaml
│   │   │   │   ├── megatron_reward_model.yaml
│   │   │   │   └── reward_model.yaml
│   │   │   ├── generation.yaml
│   │   │   ├── model
│   │   │   │   └── hf_model.yaml
│   │   │   ├── sft_trainer_engine.yaml
│   │   │   ├── sft_trainer.yaml
│   │   │   ├── config.py
│   │   │   ├── data
│   │   │   │   └── legacy_data.yaml
│   │   │   └── algorithm.py
│   │   └── sandbox_tool.py
│   ├── scheduler
│   │   ├── __init__.py
│   │   ├── config
│   │   │   └── scheduler.yaml
│   │   ├── register_user_example.py
│   │   ├── web_dashboard.py
│   │   └── SCHEDULER_GUIDE.md
│   ├── data_preprocess
│   │   ├── __init__.py
│   │   ├── math.py
│   │   ├── geo3k.py
│   │   ├── math_multiturn_w_interaction.py
│   │   └── math_dataset.py
│   ├── reward_functions
│   │   ├── __init__.py
│   │   └── math_reward_server.py
│   ├── docs
│   │   ├── images
│   │   │   └── opentinker_arch.jpeg
│   │   ├── CORS_FIX.md
│   │   └── SERVER_CONNECTION_FIX.md
│   ├── __init__.py
│   ├── backend_patch
│   │   └── note.md
│   ├── environment
│   │   ├── geo3k
│   │   │   ├── __init__.py
│   │   │   ├── geo3k_env.py
│   │   │   ├── geo3k_tool_server.py
│   │   │   ├── geo3k_server.py
│   │   │   └── geo3k_tool_env.py
│   │   ├── math
│   │   │   ├── __init__.py
│   │   │   ├── math_server.py
│   │   │   ├── math_env.py
│   │   │   ├── math_tool_server.py
│   │   │   └── math_tool_env.py
│   │   ├── legacy
│   │   │   ├── example
│   │   │   │   └── interaction_config.yaml
│   │   │   └── generic
│   │   │       └── README.md
│   │   ├── gomoku
│   │   │   ├── __init__.py
│   │   │   └── gomoku_server.py
│   │   ├── __init__.py
│   │   ├── static_data_generator_vl.py
│   │   └── environment.py
│   ├── utils
│   │   └── __init__.py
│   ├── requirements.txt
│   ├── scripts
│   │   └── launch_scheduler.sh
│   ├── setup_cross_node.sh
│   └── test_geo3k_data.py
├── assets
│   ├── reallogo.png
│   └── README.md
├── scheduler_users.db
├── .gitmodules
├── data
│   ├── read.py
│   └── math
│       ├── test_example.json
│       └── train_example.json
├── .gitignore
├── setup.py
└── docs
    └── geo3k_quickstart.md

/opentinker/client/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/opentinker/client/__init__.py:
--------------------------------------------------------------------------------
1 | """OpenTinker client module."""
2 | 
--------------------------------------------------------------------------------
/opentinker/server/__init__.py:
--------------------------------------------------------------------------------
1 | """OpenTinker server module."""
2 | 
--------------------------------------------------------------------------------
/opentinker/scheduler/__init__.py:
--------------------------------------------------------------------------------
1 | """OpenTinker scheduler module."""
2 | 
--------------------------------------------------------------------------------
/opentinker/data_preprocess/__init__.py:
--------------------------------------------------------------------------------
1 | """OpenTinker data preprocessing module."""
2 | 
--------------------------------------------------------------------------------
/opentinker/reward_functions/__init__.py:
--------------------------------------------------------------------------------
1 | """OpenTinker reward functions module."""
2 | 
--------------------------------------------------------------------------------
/assets/reallogo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open-tinker/OpenTinker/HEAD/assets/reallogo.png
--------------------------------------------------------------------------------
/scheduler_users.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open-tinker/OpenTinker/HEAD/scheduler_users.db
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "verl"]
2 | path = verl
3 | url = https://github.com/volcengine/verl.git
--------------------------------------------------------------------------------
/opentinker/server/agent.yaml:
--------------------------------------------------------------------------------
1 | - name: generic_agent
2 |   _target_: opentinker.server.generic_agent_loop.GenericAgentLoop
--------------------------------------------------------------------------------
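The `_target_` entry in agent.yaml follows the Hydra instantiation convention used by the verl-based configs throughout this repository. A minimal sketch of resolving such an entry with the standard Hydra/OmegaConf APIs — the server's actual loader may differ:

```python
# Illustrative only: resolve the dotted class path from agent.yaml.
from hydra.utils import get_class
from omegaconf import OmegaConf

agents = OmegaConf.load("opentinker/server/agent.yaml")
for entry in agents:
    cls = get_class(entry["_target_"])  # -> the GenericAgentLoop class object
    print(entry["name"], "->", cls)
```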
/opentinker/docs/images/opentinker_arch.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open-tinker/OpenTinker/HEAD/opentinker/docs/images/opentinker_arch.jpeg
--------------------------------------------------------------------------------
/opentinker/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | OpenTinker: A framework for training and inference with interactive environments.
3 | """
4 | 
5 | __version__ = "0.1.0"
6 | 
--------------------------------------------------------------------------------
/data/read.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset
2 | import pandas as pd
3 | 
4 | # Load your data
5 | ds = load_dataset("parquet", data_files="./data/geo3k/test.parquet")
6 | print("Dataset columns:", ds['train'].column_names)
7 | print("\nFirst row:")
8 | print(ds['train'][0])
--------------------------------------------------------------------------------
/opentinker/backend_patch/note.md:
--------------------------------------------------------------------------------
1 | This is a patch for verl, making it work with OpenTinker.
2 | 
3 | The patch is based on verl 0.7.0.dev0:
4 | 
5 | git clone https://github.com/volcengine/verl.git
6 | cd verl
7 | git checkout 418f964ab84d2b7c49aa4404f65774917501b092
--------------------------------------------------------------------------------
/opentinker/server/config/tool_config_math.json:
--------------------------------------------------------------------------------
1 | {
2 |   "tools": [
3 |     {
4 |       "class_name": "opentinker.server.sandbox_tool.SandboxTool",
5 |       "config": {
6 |         "type": "native",
7 |         "sandbox_fusion_url": "http://localhost:8000/run_code"
8 |       }
9 |     }
10 |   ]
11 | }
12 | 
--------------------------------------------------------------------------------
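The `class_name` entries in tool_config_math.json are dotted import paths. A sketch of how such a registry can be resolved with only the standard library; passing the config dict as a `config=` keyword is an assumption, not SandboxTool's confirmed signature:

```python
import importlib
import json

with open("opentinker/server/config/tool_config_math.json") as f:
    spec = json.load(f)

for tool in spec["tools"]:
    # Split "pkg.module.Class" into module path and class name.
    module_path, _, cls_name = tool["class_name"].rpartition(".")
    tool_cls = getattr(importlib.import_module(module_path), cls_name)
    tool_instance = tool_cls(config=tool["config"])  # hypothetical constructor
```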
/opentinker/environment/geo3k/__init__.py:
--------------------------------------------------------------------------------
1 | """Geo3K geometry problem-solving game for OpenTinker."""
2 | 
3 | from .geo3k_game import Geo3KGame
4 | from .geo3k_env import Geo3KGameEnvironment
5 | from .geo3k_tool_game import Geo3KToolGame
6 | from .geo3k_tool_env import Geo3KToolEnvironment
7 | 
8 | __all__ = [
9 |     "Geo3KGame",
10 |     "Geo3KGameEnvironment",
11 |     "Geo3KToolGame",
12 |     "Geo3KToolEnvironment",
13 | ]
--------------------------------------------------------------------------------
/assets/README.md:
--------------------------------------------------------------------------------
1 | # Assets Directory
2 | 
3 | This directory contains visual assets for the README and documentation.
4 | 
5 | ## Required Files
6 | 
7 | | File | Description |
8 | |------|-------------|
9 | | `logo.png` | OpenTinker logo (recommended: 200x200 px) |
10 | | `demo.gif` | Demo animation showing OpenTinker in action |
11 | 
12 | ## Optional Files
13 | 
14 | Add any additional screenshots, diagrams, or visual materials here.
--------------------------------------------------------------------------------
/opentinker/server/config/evaluation.yaml:
--------------------------------------------------------------------------------
1 | data:
2 |   path: /tmp/math_Qwen2-7B-Instruct.parquet
3 |   prompt_key: prompt
4 |   response_key: responses
5 |   data_source_key: data_source
6 |   reward_model_key: reward_model
7 | 
8 | custom_reward_function:
9 |   path: null
10 |   name: compute_score
11 | 
12 | ray_kwargs:
13 |   ray_init:
14 |     num_cpus: null # `None` means using all CPUs, which might cause a hang if CPUs are limited in systems like SLURM. Set it to an allowed number in that case.
15 |   timeline_json_file: null
16 | 
--------------------------------------------------------------------------------
/opentinker/environment/math/__init__.py:
--------------------------------------------------------------------------------
1 | """Math Environment Package.
2 | 
3 | Provides MathGame and related components for math problem solving:
4 | - MathGame: Single-turn math problem solving with rewards computed in step()
5 | - CodeInterpreterMathGame: Multi-turn math with code interpreter tool support
6 | """
7 | 
8 | from opentinker.environment.math.math_game import MathGame
9 | from opentinker.environment.math.math_tool_game import CodeInterpreterMathGame
10 | 
11 | __all__ = ["MathGame", "CodeInterpreterMathGame"]
12 | 
--------------------------------------------------------------------------------
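A minimal usage sketch for MathGame, mirroring the GameEnvironment pattern documented in the gomoku package docstring later in this dump; treating `math_param.yaml` as a suitable config object here is an assumption:

```python
from omegaconf import OmegaConf

from opentinker.environment.base_game_environment import GameEnvironment
from opentinker.environment.math import MathGame

# Hypothetical config choice; any OmegaConf config with the expected fields works.
config = OmegaConf.load("opentinker/client/client_config/math_param.yaml")
env = GameEnvironment(game_class=MathGame, config=config)
```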
/opentinker/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Utility modules for OpenTinker
2 | from opentinker.utils.rollout_trace_saver import (
3 |     RolloutTraceSaver,
4 |     RolloutTrace,
5 |     init_weave_tracing,
6 |     init_mlflow_tracing,
7 |     get_global_saver,
8 |     set_global_saver,
9 |     init_global_saver,
10 | )
11 | 
12 | __all__ = [
13 |     "RolloutTraceSaver",
14 |     "RolloutTrace",
15 |     "init_weave_tracing",
16 |     "init_mlflow_tracing",
17 |     "get_global_saver",
18 |     "set_global_saver",
19 |     "init_global_saver",
20 | ]
--------------------------------------------------------------------------------
/opentinker/server/config/actor/megatron_actor.yaml:
--------------------------------------------------------------------------------
1 | # megatron actor config, inheriting from trainer/config/actor/actor.yaml
2 | defaults:
3 |   # megatron optimizer config
4 |   - ../optim@optim: megatron
5 | 
6 |   # megatron engine config
7 |   - ../engine@megatron: megatron
8 | 
9 |   - actor
10 | 
11 |   # load the reference default config, then apply the fields in the current yaml
12 |   - _self_
13 | 
14 | _target_: verl.workers.config.McoreActorConfig
15 | 
16 | strategy: megatron
17 | 
18 | data_loader_seed: null
19 | 
20 | load_weight: True
--------------------------------------------------------------------------------
/data/math/test_example.json:
--------------------------------------------------------------------------------
1 | {
2 |     "level": "Level 3",
3 |     "type": "Algebra",
4 |     "data_source": "DigitalLearningGmbH/MATH-lighteval",
5 |     "prompt": [
6 |         {
7 |             "content": "How many vertical asymptotes does the graph of $y=\\frac{2}{x^2+x-6}$ have? Let's think step by step and output the final answer within \\boxed{}.",
8 |             "role": "user"
9 |         }
10 |     ],
11 |     "ability": "math",
12 |     "reward_model": {
13 |         "ground_truth": "2",
14 |         "style": "rule"
15 |     },
16 |     "extra_info": {
17 |         "index": 0,
18 |         "split": "test"
19 |     }
20 | }
--------------------------------------------------------------------------------
/opentinker/requirements.txt:
--------------------------------------------------------------------------------
1 | # OpenTinker Requirements
2 | # Python 3.8+
3 | 
4 | # Core dependencies
5 | ray>=2.9.0
6 | torch>=2.0.0
7 | transformers>=4.35.0
8 | 
9 | # Web framework
10 | fastapi>=0.104.0
11 | uvicorn>=0.24.0
12 | pydantic>=2.0.0
13 | 
14 | # Configuration
15 | omegaconf>=2.3.0
16 | hydra-core>=1.3.0
17 | pyyaml>=6.0
18 | 
19 | # Data processing
20 | pandas>=2.0.0
21 | pyarrow>=14.0.0
22 | datasets>=2.14.0
23 | 
24 | # Utilities
25 | requests>=2.31.0
26 | aiohttp>=3.9.0
27 | 
28 | # Optional: Logging and monitoring
29 | wandb>=0.16.0
30 | 
31 | # Optional: Development tools
32 | pytest>=7.4.0
33 | black>=23.0.0
34 | flake8>=6.1.0
--------------------------------------------------------------------------------
/opentinker/server/config/ref/megatron_ref.yaml:
--------------------------------------------------------------------------------
1 | # megatron ref config, inheriting from trainer/config/ref/ref.yaml
2 | defaults:
3 |   - ref
4 | 
5 |   # megatron engine config
6 |   - ../engine@megatron: megatron
7 | 
8 |   # load the reference default config, then apply the fields in the current yaml
9 |   - _self_
10 | 
11 | strategy: megatron
12 | 
13 | megatron:
14 |   _target_: verl.workers.config.MegatronEngineConfig
15 |   seed: ${oc.select:actor_rollout_ref.actor.megatron.seed,42}
16 |   override_transformer_config: ${oc.select:actor_rollout_ref.actor.megatron.override_transformer_config,{}}
17 |   use_mbridge: ${oc.select:actor_rollout_ref.actor.megatron.use_mbridge,False}
18 | 
19 | load_weight: True
--------------------------------------------------------------------------------
/opentinker/scheduler/config/scheduler.yaml:
--------------------------------------------------------------------------------
1 | # Job Scheduler Configuration
2 | 
3 | # GPU IDs available for job allocation
4 | available_gpus: [0, 1, 2, 3]
5 | 
6 | # Number of GPUs to allocate per job
7 | gpus_per_job: 4
8 | 
9 | # Port range for spawned training servers [min, max]
10 | # Set to null to auto-detect available ports
11 | port_range: null # or [38564, 38600] for manual range
12 | 
13 | # Number of ports to auto-detect if port_range is null
14 | num_ports: 50
15 | 
16 | # Port for the scheduler server itself
17 | scheduler_port: 8765
18 | 
19 | # Authentication settings
20 | enable_auth: false # Set to false to disable authentication
21 | 
22 | # Path to SQLite user database
23 | user_db_path: "scheduler_users.db"
--------------------------------------------------------------------------------
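A sketch of querying the scheduler from Python. The `/list_jobs` endpoint and the Authorization header are referenced by the dashboard notes in `opentinker/docs/CORS_FIX.md`; the exact header scheme shown here is an assumption:

```python
import requests

SCHEDULER_URL = "http://localhost:8765"  # scheduler_port from scheduler.yaml
API_KEY = "otk_..."  # placeholder; only needed when enable_auth is true

resp = requests.get(
    f"{SCHEDULER_URL}/list_jobs",
    headers={"Authorization": f"Bearer {API_KEY}"},  # assumed Bearer scheme
    timeout=10,
)
resp.raise_for_status()
print(resp.json())
```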
/data/math/train_example.json:
--------------------------------------------------------------------------------
1 | {
2 |     "level": "Level 5",
3 |     "type": "Algebra",
4 |     "data_source": "DigitalLearningGmbH/MATH-lighteval",
5 |     "prompt": [
6 |         {
7 |             "content": "Let \\[f(x) = \\left\\{\n\\begin{array}{cl} ax+3, &\\text{ if }x>2, \\\\\nx-5 &\\text{ if } -2 \\le x \\le 2, \\\\\n2x-b &\\text{ if } x <-2.\n\\end{array}\n\\right.\\]Find $a+b$ if the piecewise function is continuous (which means that its graph can be drawn without lifting your pencil from the paper). Let's think step by step and output the final answer within \\boxed{}.",
8 |             "role": "user"
9 |         }
10 |     ],
11 |     "ability": "math",
12 |     "reward_model": {
13 |         "ground_truth": "0",
14 |         "style": "rule"
15 |     },
16 |     "extra_info": {
17 |         "index": 0,
18 |         "split": "train"
19 |     }
20 | }
--------------------------------------------------------------------------------
/opentinker/environment/legacy/example/interaction_config.yaml:
--------------------------------------------------------------------------------
1 | # Generic Environment Interaction Configuration
2 | # This file configures interactions for use with GenericAgentLoop.
3 | # Each entry defines an interaction that can be used during training.
4 | 
5 | # Gym Environment Interaction
6 | # Connects to an external Gym-like environment via HTTP API
7 | - name: gym_env
8 |   class: opentinker.environment.gym_environment_interaction.GymEnvironmentInteraction
9 |   config:
10 |     # HTTP endpoint for the environment server
11 |     env_endpoint: "http://localhost:8084"
12 |     # Maximum steps per episode
13 |     max_steps: 100
14 |     # Template for formatting observations as messages
15 |     # Available variables: {observation}, {reward}, {step}, {cumulative_reward}
16 |     observation_template: "Environment observation: {observation}"
--------------------------------------------------------------------------------
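The `observation_template` above uses Python `str.format`-style placeholders; the variable names come from the comment in interaction_config.yaml, while the surrounding wiring is assumed:

```python
template = "Environment observation: {observation}"

# Extra keyword arguments not referenced by the template are ignored by str.format.
message = template.format(
    observation="board state ...",
    reward=0.0,
    step=3,
    cumulative_reward=1.5,
)
```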
/opentinker/server/config/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from . import algorithm, config
16 | from .algorithm import *  # noqa: F401
17 | from .config import *  # noqa: F401
18 | 
19 | __all__ = config.__all__ + algorithm.__all__
--------------------------------------------------------------------------------
/opentinker/environment/gomoku/__init__.py:
--------------------------------------------------------------------------------
1 | """Gomoku Environment Module for LLM Training.
2 | 
3 | Usage:
4 |     from opentinker.environment.base_game_environment import GameEnvironment
5 |     from opentinker.environment.gomoku import GomokuGame
6 |     from opentinker.environment.game_stats_client import GameStatsClient
7 | 
8 |     env = GameEnvironment(game_class=GomokuGame, config=config)
9 |     stats_client = GameStatsClient(env_endpoint)
10 | 
11 |     # Optional: GomokuGameStats for server-side metrics
12 |     from opentinker.environment.gomoku import GomokuGameStats  # may be None
13 | """
14 | 
15 | from .gomoku_game import GomokuGame
16 | 
17 | # GomokuGameStats is optional - only available if gomoku_stats.py exists
18 | try:
19 |     from .gomoku_stats import GomokuGameStats
20 | except ImportError:
21 |     GomokuGameStats = None
22 | 
23 | __all__ = [
24 |     "GomokuGame",
25 |     "GomokuGameStats",
26 | ]
--------------------------------------------------------------------------------
/opentinker/client/client_config/math_inference_config.yaml:
--------------------------------------------------------------------------------
1 | # OpenTinker Inference Configuration
2 | # Use with: python math_inference.py
3 | 
4 | # Model settings
5 | model_path: null # Path to trained checkpoint (HuggingFace format)
6 | tokenizer_path: null # Tokenizer path (defaults to model_path if null)
7 | 
8 | # GPU settings
9 | tensor_parallel_size: 1 # Number of GPUs for tensor parallelism
10 | gpu_memory_utilization: 0.9
11 | 
12 | # Generation parameters (greedy by default for inference)
13 | temperature: 0.0 # 0.0 = greedy decoding
14 | top_p: 1.0
15 | max_new_tokens: 4096
16 | 
17 | # Data settings
18 | data_path: null # Input data file (parquet/jsonl)
19 | output_path: null # Output results file (jsonl)
20 | max_samples: null # Limit samples (null = all)
21 | 
22 | # Environment settings
23 | env_endpoint: http://localhost:8088
24 | 
25 | # Multi-turn settings (same as training config)
26 | multi_turn:
27 |   max_user_turns: 0
28 |   max_assistant_turns: 1
--------------------------------------------------------------------------------
/opentinker/server/config/ref/dp_ref.yaml:
--------------------------------------------------------------------------------
1 | # defaults specify the default config from each component
2 | defaults:
3 | 
4 |   # dp ref config, inheriting from trainer/config/ref/ref.yaml
5 |   - ref
6 | 
7 |   # fsdp engine config
8 |   - ../engine@fsdp_config: fsdp
9 | 
10 |   # load the reference default config, then apply the fields in the current yaml
11 |   - _self_
12 | 
13 | # ref model is assumed to be identical to actor model. Specify model.path for using a different ref model.
14 | # Potential use case involves on policy distillation where we calculate KL divergence between student actor
15 | # and teacher ref
16 | model: null
17 | 
18 | # sequence parallel size
19 | # same as actor_rollout_ref.actor.ulysses_sequence_parallel_size if it exists, otherwise 1
20 | ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
21 | 
22 | # calculate entropy with chunking to reduce memory peak
23 | entropy_from_logits_with_chunking: False
24 | 
25 | # recompute entropy
26 | entropy_checkpointing: False
27 | 
--------------------------------------------------------------------------------
/opentinker/server/config/npu_profile/npu_profile.yaml:
--------------------------------------------------------------------------------
1 | # Options for the npu profiler
2 | options:
3 | 
4 |   # Storage path of collected data.
5 |   save_path: ./profiler_data
6 | 
7 |   # The roles that will be profiled. Only takes effect in discrete mode.
8 |   # optional values: all, rollout_generate, actor_compute_log_prob, actor_update and ref_compute_log_prob.
9 |   # "all" means all roles will be profiled.
10 |   roles: ["all"]
11 | 
12 |   # Collection level, optional values: level_none, level0, level1, level2.
13 |   level: level1
14 | 
15 |   # Whether to enable memory analysis.
16 |   with_memory: False
17 | 
18 |   # Whether to record tensor shape.
19 |   record_shapes: False
20 | 
21 |   # Whether to record Device-side performance data.
22 |   with_npu: True
23 | 
24 |   # Whether to record Host-side performance data.
25 |   with_cpu: True
26 | 
27 |   # Whether to record Python call stack information.
28 |   with_module: False
29 | 
30 |   # Whether to record operator call stack information.
31 |   with_stack: False
32 | 
33 |   # Whether to automatically parse the data.
34 |   analysis: True
--------------------------------------------------------------------------------
/opentinker/server/config/optim/megatron.yaml:
--------------------------------------------------------------------------------
1 | _target_: verl.workers.config.McoreOptimizerConfig
2 | 
3 | # Learning rate
4 | lr: 1e-3
5 | 
6 | # LR warmup steps ratio
7 | lr_warmup_steps_ratio: 0.0
8 | 
9 | # Total training steps
10 | total_training_steps: -1
11 | 
12 | # Weight decay
13 | weight_decay: 0.01
14 | 
15 | # LR warmup steps
16 | lr_warmup_steps: -1
17 | 
18 | # Betas for Adam optimizer
19 | betas: [0.9, 0.999]
20 | 
21 | # Clip gradient
22 | clip_grad: 1.0
23 | 
24 | # optimizer type
25 | optimizer: adam
26 | 
27 | # initial learning rate for warmup, default to 0.0
28 | lr_warmup_init: 0.0
29 | 
30 | lr_decay_steps: null
31 | 
32 | # select from constant/linear/cosine/inverse_square_root
33 | lr_decay_style: constant
34 | 
35 | # minimum learning rate, default to 0.0
36 | min_lr: 0.0
37 | 
38 | # select from constant/linear/cosine
39 | weight_decay_incr_style: constant
40 | 
41 | # select from constant/exponential/cosine
42 | lr_wsd_decay_style: exponential
43 | 
44 | lr_wsd_decay_steps: null
45 | 
46 | # use checkpoint optimizer parameter scheduler
47 | use_checkpoint_opt_param_scheduler: False
48 | 
49 | override_optimizer_config: {}
--------------------------------------------------------------------------------
/opentinker/server/config/optim/fsdp.yaml:
--------------------------------------------------------------------------------
1 | # Target class for this configuration
2 | _target_: verl.workers.config.FSDPOptimizerConfig
3 | 
4 | # Optimizer class name (e.g., "AdamW", "AdamW8bit", "_AdamW", "Adam")
5 | optimizer: AdamW
6 | 
7 | # Module path to import optimizer
8 | # Examples: "torch.optim", "torchao.optim", "bitsandbytes.optim"
9 | optimizer_impl: torch.optim
10 | 
11 | # Learning rate
12 | lr: 1e-3
13 | 
14 | # LR warmup steps ratio
15 | lr_warmup_steps_ratio: 0.0
16 | 
17 | # Total training steps
18 | total_training_steps: -1
19 | 
20 | # Weight decay
21 | weight_decay: 0.01
22 | 
23 | # LR warmup steps
24 | lr_warmup_steps: -1
25 | 
26 | # Betas for Adam optimizer
27 | betas: [0.9, 0.999]
28 | 
29 | # Clip gradient
30 | clip_grad: 1.0
31 | 
32 | # Minimum LR ratio for cosine schedule
33 | min_lr_ratio: 0.0
34 | 
35 | # Number of cosine cycles in LR schedule
36 | num_cycles: 0.5
37 | 
38 | # LR scheduler type: "constant" or "cosine"
39 | lr_scheduler_type: constant
40 | 
41 | # deprecated
42 | warmup_style: null
43 | 
44 | # Additional optimizer-specific keyword arguments
45 | # Example for torchao with bf16 stochastic rounding:
46 | #   optimizer_impl: torchao.optim
47 | #   optimizer: _AdamW
48 | #   override_optimizer_config:
49 | #     bf16_stochastic_round: true
50 | override_optimizer_config: null
--------------------------------------------------------------------------------
/opentinker/client/client_config/opentinker_param.yaml:
--------------------------------------------------------------------------------
1 | server_url: "http://localhost:8000"
2 | scheduler_api_key: "otk_98b8db24ccd64c92e1fdd9a232e209fa" # Your API key for scheduler authentication
3 | 
4 | # GPU allocation
5 | num_gpus: 4 # Number of GPUs to request from scheduler (default: 4)
6 | 
7 | data_path: null
8 | val_data_path: null
9 | tokenizer_path: null
10 | batch_size: 64
11 | val_batch_size: 100
12 | # Training duration - set ONE of these (num_steps takes precedence if both set)
13 | num_epochs: 10 # Number of epochs (null = use num_steps)
14 | num_steps: null # Total training steps (null = use num_epochs)
15 | num_workers: 0
16 | 
17 | # reward function
18 | ## api reward function
19 | reward:
20 |   type: "remote"
21 |   remote:
22 |     reward_ip: "localhost"
23 |     reward_port: null
24 |     remote_api_key: null
25 |     auto_start: true # Enable auto-start of reward server
26 |   ## code reward function
27 |   code:
28 |     code_function: null
29 | 
30 | project_name: "agent_loop_training"
31 | experiment_name: "math_with_tools"
32 | save_freq: 100
33 | test_freq: 50
34 | 
35 | temperature: 1
36 | top_p: 1
37 | max_new_tokens: 4096
38 | max_prompt_tokens: 4096
39 | 
40 | algorithm: "toolcall"
41 | 
42 | logger_backends: ["console"] # options: ["console", "wandb"]
43 | wandb_key: null
--------------------------------------------------------------------------------
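A standalone sketch of loading a client config such as opentinker_param.yaml with OmegaConf (pinned in requirements.txt). The client scripts themselves go through Hydra decorators, so this is illustration only:

```python
from omegaconf import OmegaConf

cfg = OmegaConf.load("opentinker/client/client_config/opentinker_param.yaml")
print(cfg.batch_size)   # 64
print(cfg.reward.type)  # "remote"
```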
/opentinker/server/config/critic/megatron_critic.yaml:
--------------------------------------------------------------------------------
1 | # defaults specify the default config from each component
2 | defaults:
3 | 
4 |   # megatron optimizer config
5 |   - ../optim@optim: megatron
6 | 
7 |   # megatron engine config
8 |   - ../engine@megatron: megatron
9 | 
10 |   # dp actor config, inheriting from trainer/config/critic/critic.yaml
11 |   - critic
12 | 
13 |   # load the reference default config, then apply the fields in the current yaml
14 |   - _self_
15 | 
16 | # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
17 | _target_: verl.workers.config.McoreCriticConfig
18 | 
19 | strategy: megatron
20 | 
21 | # seconds, default is 10 minutes for torch, you can set it to a larger value if you have long-running operations like 32B or 72B model using megatron
22 | nccl_timeout: 600
23 | 
24 | # model config for the critic
25 | model:
26 | 
27 |   # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
28 |   _target_: verl.trainer.config.BaseModelConfig
29 | 
30 |   # override default empty mapping
31 |   override_config:
32 | 
33 |     model_config: {}
34 | 
35 |     moe_config:
36 | 
37 |       freeze_moe_router: False
38 | 
39 | # Whether to load initial weights
40 | load_weight: True
41 | 
42 | # seed for data loader
43 | data_loader_seed: ${oc.select:actor_rollout_ref.actor.data_loader_seed,null}
--------------------------------------------------------------------------------
/opentinker/client/client_config/math_inference_scheduler_config.yaml:
--------------------------------------------------------------------------------
1 | # Math Inference with Scheduler Configuration
2 | # Use with: python math_inference_with_scheduler.py
3 | 
4 | # Scheduler settings
5 | scheduler_url: http://0.0.0.0:8789
6 | scheduler_api_key: otk_98b8db24ccd64c92e1fdd9a232e209fa # Optional API key for authentication
7 | 
8 | # Model settings
9 | model_path: null # Path to trained checkpoint (HuggingFace format)
10 | tokenizer_path: null # Tokenizer path (defaults to model_path if null)
11 | 
12 | # GPU settings for vLLM server
13 | tensor_parallel_size: 1 # Number of GPUs for tensor parallelism
14 | num_gpus: null # Override number of GPUs (defaults to tensor_parallel_size)
15 | gpu_memory_utilization: 0.9
16 | max_model_len: null # Max model context length (optional)
17 | trust_remote_code: true
18 | 
19 | # Generation parameters (greedy by default for inference)
20 | temperature: 0.0 # 0.0 = greedy decoding
21 | top_p: 1.0
22 | max_new_tokens: 4096
23 | 
24 | # Data settings
25 | data_path: null # Input data file (parquet/jsonl)
26 | output_path: null # Output results file (jsonl)
27 | max_samples: null # Limit samples (null = all)
28 | 
29 | # Environment settings
30 | env_endpoint: http://0.0.0.0:8088
31 | 
32 | # Multi-turn settings (same as training config)
33 | multi_turn:
34 |   max_user_turns: 0
35 |   max_assistant_turns: 1
36 | 
--------------------------------------------------------------------------------
/opentinker/server/config/actor/dp_actor.yaml:
--------------------------------------------------------------------------------
1 | # Format checks enforced on CI:
2 | # 1. Comments must appear above each field.
3 | # 2. There must be a blank line between each field.
4 | # 3. Inline comments (after a field on the same line) are not allowed.
5 | # 4. Indentation level is respected for nested fields.
6 | 
7 | # defaults specify the default config from each component
8 | defaults:
9 | 
10 |   # fsdp optimizer config
11 |   - ../optim@optim: fsdp
12 | 
13 |   # fsdp engine config
14 |   - ../engine@fsdp_config: fsdp
15 | 
16 |   # dp actor config, inheriting from trainer/config/actor/actor.yaml
17 |   - actor
18 | 
19 |   # load the reference default config, then apply the fields in the current yaml
20 |   - _self_
21 | 
22 | # Target class for this configuration
23 | _target_: verl.workers.config.FSDPActorConfig
24 | 
25 | # TODO(haibin.lin): switch to fsdp2
26 | strategy: fsdp
27 | 
28 | # Gradient clipping for actor updates, specific to the strategy.
29 | grad_clip: 1.0
30 | 
31 | # Sequence parallelism size for Ulysses-style model parallelism
32 | # oc.select: the default val for ref.ulysses_sequence_parallel_size
33 | ulysses_sequence_parallel_size: 1
34 | 
35 | # calculate entropy with chunking to reduce memory peak
36 | entropy_from_logits_with_chunking: False
37 | 
38 | # recompute entropy
39 | entropy_checkpointing: False
40 | 
41 | # Whether to remove padding tokens in inputs during training
42 | use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
--------------------------------------------------------------------------------
/opentinker/client/client_config/gomoku_inference_config.yaml:
--------------------------------------------------------------------------------
1 | # Gomoku Inference Configuration
2 | # Use with: python gomoku_inference.py
3 | 
4 | # Model settings
5 | model_path: null # Path to trained checkpoint (HuggingFace format)
6 | tokenizer_path: null # Tokenizer path (defaults to model_path if null)
7 | vllm_server_url: null # vLLM server URL for server mode (e.g., "http://localhost:8000")
8 | 
9 | # GPU settings (offline mode only)
10 | tensor_parallel_size: 1 # Number of GPUs for tensor parallelism
11 | gpu_memory_utilization: 0.9
12 | 
13 | # Generation parameters
14 | temperature: 0.0 # 0.0 = greedy decoding
15 | top_p: 1.0
16 | max_new_tokens: 8192 # TOTAL response budget for entire multi-turn trajectory (NOT per-turn!)
17 | max_prompt_tokens: 4096
18 | max_context_length: 30000 # Max context before ending game (< model max 32768)
19 | 
20 | # Data settings (Gomoku uses dynamic generation, no data_path needed)
21 | data_path: null # Not needed for Gomoku (uses dynamic generation)
22 | output_path: null # Output results file (jsonl)
23 | max_samples: 10 # Number of games to play
24 | 
25 | # Environment settings
26 | env_endpoint: http://localhost:8091
27 | 
28 | # Multi-turn settings (Gomoku is multi-turn game)
29 | multi_turn:
30 |   max_user_turns: 39 # Max environment turns (moves)
31 |   max_assistant_turns: 39 # Max model response turns
32 |   max_tokens_per_turn: 256 # Per-turn response limit (optional, null for no limit)
33 | 
34 | 
35 | # Game-specific settings
36 | board_size: 9 # Gomoku board size (9x9)
--------------------------------------------------------------------------------
/opentinker/environment/geo3k/geo3k_env.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """Geo3K Game Environment using VL components."""
3 | 
4 | from opentinker.environment.vl_game_environment import VLGameEnvironment
5 | from opentinker.environment.geo3k.geo3k_game import Geo3KGame
6 | 
7 | 
8 | class Geo3KGameEnvironment(VLGameEnvironment):
9 |     """GameEnvironment for Geo3K geometry problems with vision-language models.
10 | 
11 |     This environment uses:
12 |     - VLGameEnvironment for multimodal data processing
13 |     - StaticDatasetGeneratorVL for loading Geo3K parquet data with images
14 |     - Geo3KGame for geometry problem logic
15 | 
16 |     Args:
17 |         config: Configuration object
18 |         data_paths: Training data paths (parquet files)
19 |         val_data_paths: Validation data paths (optional)
20 |         job_id: Job identifier
21 | 
22 |     Example:
23 |         env = Geo3KGameEnvironment(
24 |             config=config,
25 |             data_paths=["~/data/geo3k/train.parquet"],
26 |             val_data_paths=["~/data/geo3k/test.parquet"],
27 |             job_id="geo3k_training_001",
28 |         )
29 |     """
30 | 
31 |     def __init__(self, config, data_paths, val_data_paths=None, job_id=None):
32 |         # Initialize with Geo3K game and VL environment
33 |         super().__init__(
34 |             game_class=Geo3KGame,
35 |             config=config,
36 |             data_paths=data_paths,
37 |             val_data_paths=val_data_paths,
38 |             game_kwargs={},
39 |             job_id=job_id,
40 |             image_key="images",  # Geo3K uses "images" field
41 |         )
42 | 
--------------------------------------------------------------------------------
/opentinker/server/config/engine/fsdp.yaml:
--------------------------------------------------------------------------------
1 | # Target class for this configuration
2 | _target_: verl.workers.config.FSDPEngineConfig
3 | 
4 | # policy for wrapping the model
5 | wrap_policy:
6 | 
7 |   # Minimum number of parameters to trigger wrapping a layer with FSDP
8 |   min_num_params: 0
9 | 
10 | # Whether to offload model parameters to CPU (trades speed for memory)
11 | # Note that this differs from the offload_policy in FSDP
12 | param_offload: false
13 | 
14 | # Whether to offload optimizer state to CPU
15 | # Note that this differs from the offload_policy in FSDP
16 | optimizer_offload: false
17 | 
18 | # Only for FSDP2: offload param/grad/optimizer during train
19 | offload_policy: false
20 | 
21 | # Only for FSDP2: Reshard after forward pass to reduce memory footprint
22 | reshard_after_forward: true
23 | 
24 | # Number of GPUs in each FSDP shard group; -1 means auto
25 | fsdp_size: -1
26 | 
27 | # Only for FSDP1: FSDP1 configuration, prefetch the next forward-pass all-gather
28 | # before the current forward computation.
29 | forward_prefetch: False
30 | 
31 | # model dtype of fsdp
32 | model_dtype: fp32
33 | 
34 | # Whether to use original parameters in fsdp. Only available in fsdp1
35 | use_orig_params: false
36 | 
37 | # ulysses sequence parallel size
38 | ulysses_sequence_parallel_size: 1
39 | 
40 | # Whether to use entropy_from_logits_with_chunking in fsdp.
41 | entropy_from_logits_with_chunking: false
42 | 
43 | # Whether to use torch compile in fsdp.
44 | use_torch_compile: true
45 | 
46 | # Whether to use entropy checkpointing in fsdp.
47 | entropy_checkpointing: false
48 | 
49 | # Whether to use forward only in fsdp.
50 | forward_only: false
51 | 
52 | # fsdp or fsdp2
53 | strategy: fsdp
54 | 
--------------------------------------------------------------------------------
/opentinker/client/client_config/math_code_interpreter_inference_config.yaml:
--------------------------------------------------------------------------------
1 | # Math Code Interpreter Inference Configuration
2 | # Use with: python math_code_interpreter_inference.py
3 | # This config supports multi-turn code execution for math problem solving
4 | 
5 | # Scheduler settings
6 | scheduler_url: http://0.0.0.0:8780
7 | scheduler_api_key: null # Optional API key for authentication
8 | 
9 | # Model settings
10 | model_path: null # Path to trained checkpoint (HuggingFace format)
11 | tokenizer_path: null # Tokenizer path (defaults to model_path if null)
12 | 
13 | # GPU settings for vLLM server
14 | tensor_parallel_size: 1 # Number of GPUs for tensor parallelism
15 | num_gpus: null # Override number of GPUs (defaults to tensor_parallel_size)
16 | gpu_memory_utilization: 0.9
17 | max_model_len: null # Max model context length (optional)
18 | trust_remote_code: true
19 | 
20 | # Generation parameters (greedy by default for inference)
21 | temperature: 0.0 # 0.0 = greedy decoding
22 | top_p: 1.0
23 | max_new_tokens: 8192 # Total response budget for entire trajectory
24 | max_tokens_per_turn: 1024 # Per-turn response limit
25 | 
26 | # Data settings
27 | data_path: null # Input data file (parquet/jsonl)
28 | output_path: null # Output results file (jsonl)
29 | max_samples: null # Limit samples (null = all)
30 | 
31 | # Environment settings (code interpreter math server)
32 | env_endpoint: http://0.0.0.0:8088
33 | 
34 | # Multi-turn settings (allow code execution iterations)
35 | multi_turn:
36 |   max_user_turns: 5 # Max environment responses (code execution results)
37 |   max_assistant_turns: 5 # Max LLM responses for iterative solving
38 | 
--------------------------------------------------------------------------------
/opentinker/environment/geo3k/geo3k_tool_server.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """Geo3K Multi-Turn Environment Server.
3 | 
4 | This script starts a Geo3K geometry problem server using Geo3KToolGame
5 | for multi-turn verification-based interactions.
6 | 
7 | Usage:
8 |     python geo3k_tool_server.py
9 |     # Or with custom config:
10 |     python geo3k_tool_server.py --port 8088 --max_retries 3
11 | """
12 | 
13 | import argparse
14 | from opentinker.environment.base_game_server import run_game_server
15 | from opentinker.environment.geo3k.geo3k_tool_game import Geo3KToolGame
16 | 
17 | 
18 | def main():
19 |     parser = argparse.ArgumentParser(description="Geo3K Multi-Turn Server")
20 |     parser.add_argument("--host", default="0.0.0.0", help="Server host")
21 |     parser.add_argument("--port", type=int, default=8088, help="Server port")
22 |     parser.add_argument("--max_retries", type=int, default=3, help="Max verification attempts")
23 |     args = parser.parse_args()
24 | 
25 |     print(f"\nGeo3K Multi-Turn Server Configuration:")
26 |     print(f"  Max retries: {args.max_retries}")
27 |     print(f"\nFeedback format (verl-compatible):")
28 |     print(f"  'Current parsed answer={{answer}} reward={{0.0|1.0}}'")
29 |     print(f"\nReward structure:")
30 |     print(f"  Correct: +{Geo3KToolGame.REWARD_CORRECT}")
31 |     print(f"  Incorrect: {Geo3KToolGame.REWARD_INCORRECT}")
32 |     print(f"  No improvement penalty: {Geo3KToolGame.PENALTY_NO_IMPROVEMENT}")
33 | 
34 |     run_game_server(
35 |         game_class=Geo3KToolGame,
36 |         host=args.host,
37 |         port=args.port,
38 |         stats_class=None,  # Use BaseGameStats
39 |         max_retries=args.max_retries,
40 |     )
41 | 
42 | 
43 | if __name__ == "__main__":
44 |     main()
45 | 
--------------------------------------------------------------------------------
/opentinker/client/client_config/gomoku_inference_scheduler_config.yaml:
--------------------------------------------------------------------------------
1 | # Gomoku Inference with Scheduler Configuration
2 | # Use with: python gomoku_inference_with_scheduler.py
3 | 
4 | # Scheduler settings
5 | scheduler_url: http://0.0.0.0:8780
6 | scheduler_api_key: otk_98b8db24ccd64c92e1fdd9a232e209fa # Optional API key for authentication
7 | 
8 | # Model settings
9 | model_path: null # Path to trained checkpoint (HuggingFace format)
10 | tokenizer_path: null # Tokenizer path (defaults to model_path if null)
11 | 
12 | # GPU settings for vLLM server
13 | tensor_parallel_size: 1 # Number of GPUs for tensor parallelism
14 | num_gpus: null # Override number of GPUs (defaults to tensor_parallel_size)
15 | gpu_memory_utilization: 0.9
16 | max_model_len: null # Max model context length (optional)
17 | trust_remote_code: true
18 | 
19 | # Generation parameters
20 | temperature: 0.0 # 0.0 = greedy decoding
21 | top_p: 1.0
22 | max_new_tokens: 8192 # TOTAL response budget for entire multi-turn trajectory
23 | max_prompt_tokens: 4096
24 | max_context_length: 30000 # Max context before ending game
25 | 
26 | # Data settings (Gomoku uses dynamic generation, no data_path needed)
27 | data_path: null # Not needed for Gomoku (uses dynamic generation)
28 | output_path: null # Output results file (jsonl)
29 | max_samples: 10 # Number of games to play
30 | 
31 | # Environment settings
32 | env_endpoint: http://0.0.0.0:8091
33 | 
34 | # Multi-turn settings (Gomoku is multi-turn game)
35 | multi_turn:
36 |   max_user_turns: 39 # Max environment turns (moves)
37 |   max_assistant_turns: 39 # Max model response turns
38 |   max_tokens_per_turn: 256 # Per-turn response limit
39 | 
40 | # Game-specific settings
41 | board_size: 9 # Gomoku board size (9x9)
42 | 
--------------------------------------------------------------------------------
/opentinker/server/config/reward_model/dp_reward_model.yaml:
--------------------------------------------------------------------------------
1 | # Format checks enforced on CI:
2 | # 1. Comments must appear above each field.
3 | # 2. There must be a blank line between each field.
4 | # 3. Inline comments (after a field on the same line) are not allowed.
5 | # 4. Indentation level is respected for nested fields.
6 | 
7 | # defaults specify the default config from each component
8 | defaults:
9 | 
10 |   # dp actor config, inheriting from trainer/config/reward_model/reward_model.yaml
11 |   - reward_model
12 | 
13 |   # load the reference default config, then apply the fields in the current yaml
14 |   - _self_
15 | 
16 | strategy: fsdp
17 | 
18 | model:
19 | 
20 |   # Whether to use shared memory for loading the model
21 |   use_shm: False
22 | 
23 |   # Use remove padding optimization (saves compute)
24 |   use_remove_padding: False
25 | 
26 |   # Whether to use fused reward kernels for speedup
27 |   use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
28 | 
29 |   # FSDP-specific config
30 |   fsdp_config:
31 | 
32 |     # Target configuration dataclass
33 |     _target_: verl.workers.config.FSDPEngineConfig
34 | 
35 |     # Policy for wrapping layers with FSDP
36 |     wrap_policy:
37 | 
38 |       # Minimum number of parameters to trigger wrapping
39 |       min_num_params: 0
40 | 
41 |     # Whether to offload model parameters to CPU
42 |     param_offload: False
43 | 
44 |     # Only for FSDP2: Reshard after forward pass to reduce memory footprint
45 |     reshard_after_forward: True
46 | 
47 |     # Number of GPUs in each FSDP shard group; -1 means auto
48 |     fsdp_size: -1
49 | 
50 |     # Only for FSDP1: FSDP1 configuration, prefetch the next forward-pass all-gather
51 |     # before the current forward computation.
52 |     forward_prefetch: False
53 | 
54 | # Sequence parallelism size for Ulysses-style model parallelism
55 | ulysses_sequence_parallel_size: 1
--------------------------------------------------------------------------------
/opentinker/client/legacy/math_inference.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Math Environment Inference Script
4 | 
5 | Uses the shared InferencePipeline to run inference on math problems.
6 | 
7 | Usage:
8 |     1. Start the game server:
9 |        python opentinker/environment/math/math_server.py
10 | 
11 |     2. Run inference:
12 |        python math_inference.py \
13 |            model_path=/path/to/checkpoint \
14 |            data_path=/ \
15 |            output_path=/tmp/results.jsonl
16 | 
17 | 
18 | """
19 | 
20 | import hydra
21 | from opentinker.environment.inference_pipeline import run_inference
22 | from opentinker.environment.math import MathGame
23 | 
24 | 
25 | @hydra.main(config_path="client_config", config_name="math_inference_config.yaml", version_base=None)
26 | def main(args):
27 |     """Run inference on math problems."""
28 |     print("=" * 60)
29 |     print("Math Environment Inference")
30 |     print("=" * 60)
31 | 
32 |     if not args.model_path:
33 |         raise ValueError("model_path is required")
34 |     if not args.data_path:
35 |         raise ValueError("data_path is required")
36 | 
37 |     results = run_inference(
38 |         model_path=args.model_path,
39 |         data_path=args.data_path,
40 |         game_class=MathGame,
41 |         env_endpoint=args.env_endpoint,
42 |         output_path=args.get("output_path"),
43 |         temperature=args.temperature,
44 |         top_p=args.top_p,
45 |         max_tokens=args.max_new_tokens,
46 |         max_samples=args.get("max_samples"),
47 |         max_user_turns=args.multi_turn.max_user_turns,
48 |         max_assistant_turns=args.multi_turn.max_assistant_turns,
49 |         tensor_parallel_size=args.get("tensor_parallel_size", 1),
50 |     )
51 | 
52 |     if args.get("output_path"):
53 |         print(f"\nResults saved to: {args.output_path}")
54 | 
55 | 
56 | if __name__ == "__main__":
57 |     main()
--------------------------------------------------------------------------------
/opentinker/environment/geo3k/geo3k_server.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """Geo3K Environment Server - Simplified launcher.
3 | 
4 | This script starts a Geo3K geometry problem server using the generic base_game_server.
5 | 
6 | Usage:
7 |     python geo3k_server.py
8 |     # Or with custom config:
9 |     python geo3k_server.py --port 8082 --max_retries 0
10 | """
11 | 
12 | import argparse
13 | from opentinker.environment.base_game_server import run_game_server
14 | from opentinker.environment.geo3k.geo3k_game import Geo3KGame
15 | 
16 | # Geo3KGameStats is optional - falls back to BaseGameStats if not available
17 | try:
18 |     from opentinker.environment.geo3k.geo3k_stats import Geo3KGameStats
19 | except ImportError:
20 |     Geo3KGameStats = None
21 | 
22 | 
23 | def main():
24 |     parser = argparse.ArgumentParser(description="Geo3K Geometry Problem Server")
25 |     parser.add_argument("--host", default="0.0.0.0", help="Server host")
26 |     parser.add_argument("--port", type=int, default=8082, help="Server port")
27 |     parser.add_argument("--max_retries", type=int, default=0, help="Max retry attempts (0 = single turn)")
28 |     args = parser.parse_args()
29 | 
30 |     print(f"\nGeo3K Game Configuration:")
31 |     print(f"  Max retries: {args.max_retries}")
32 |     print(f"\nReward structure:")
33 |     print(f"  Correct: +{Geo3KGame.REWARD_CORRECT}")
34 |     print(f"  Incorrect: {Geo3KGame.REWARD_INCORRECT}")
35 | 
36 |     if Geo3KGameStats:
37 |         print(f"\nUsing Geo3KGameStats for tracking")
38 |     else:
39 |         print(f"\nUsing BaseGameStats (Geo3KGameStats not available)")
40 | 
41 |     run_game_server(
42 |         game_class=Geo3KGame,
43 |         host=args.host,
44 |         port=args.port,
45 |         stats_class=Geo3KGameStats,  # None falls back to BaseGameStats
46 |         max_retries=args.max_retries,
47 |     )
48 | 
49 | 
50 | if __name__ == "__main__":
51 |     main()
52 | 
--------------------------------------------------------------------------------
/opentinker/server/config/generation.yaml:
--------------------------------------------------------------------------------
1 | trainer:
2 |   nnodes: 1
3 |   n_gpus_per_node: 8
4 |   device: cuda
5 | 
6 | data:
7 |   path: ~/data/rlhf/math/test.parquet
8 |   prompt_key: prompt
9 |   n_samples: 5
10 |   output_path: /opt/tiger/math_Qwen2-7B-Instruct.parquet
11 |   batch_size: 128
12 | 
13 | model:
14 |   path: ~/models/Qwen2-7B-Instruct
15 |   external_lib: null
16 | rollout:
17 |   _target_: verl.workers.config.RolloutConfig
18 |   name: vllm
19 |   mode: sync # sync: LLM, async: AsyncLLM
20 |   temperature: 1.0
21 |   top_k: 50 # 0 for hf rollout, -1 for vllm rollout
22 |   top_p: 0.7
23 |   prompt_length: 1536
24 |   response_length: 512
25 |   # for vllm rollout
26 |   dtype: bfloat16 # should align with FSDP
27 |   gpu_memory_utilization: 0.5
28 |   ignore_eos: False
29 |   enforce_eager: True
30 |   free_cache_engine: True
31 |   load_format: auto
32 |   tensor_model_parallel_size: 1
33 |   data_parallel_size: 1
34 |   max_num_batched_tokens: 8192
35 |   max_model_len: null
36 |   max_num_seqs: 1024
37 |   log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
38 |   log_prob_micro_batch_size_per_gpu: 8
39 |   # for hf rollout
40 |   do_sample: True
41 |   disable_log_stats: True
42 |   enable_chunked_prefill: True
43 |   n: 1
44 |   # support logging rollout prob for debugging purpose
45 |   calculate_log_probs: False
46 | actor:
47 |   strategy: fsdp # This is for backward-compatibility
48 |   ulysses_sequence_parallel_size: 1 # sp size
49 |   entropy_from_logits_with_chunking: False # calculate entropy with chunking to reduce memory peak
50 |   entropy_checkpointing: False # recompute entropy
51 |   fsdp_config:
52 |     fsdp_size: -1
53 |     forward_prefetch: False # FSDP1 forward_prefetch configuration
54 | 
55 | ray_kwargs:
56 |   ray_init:
57 |     num_cpus: null # `None` means using all CPUs, which might cause a hang if CPUs are limited in systems like SLURM. Set it to an allowed number in that case.
58 |   timeline_json_file: null
59 | 
--------------------------------------------------------------------------------
/opentinker/docs/CORS_FIX.md:
--------------------------------------------------------------------------------
1 | # CORS Error Fix
2 | 
3 | ## Problem
4 | 
5 | The web dashboard shows this error:
6 | ```
7 | "OPTIONS /list_jobs HTTP/1.1" 405 Method Not Allowed
8 | ```
9 | 
10 | ## Cause
11 | 
12 | This is a **CORS (Cross-Origin Resource Sharing)** problem:
13 | 
14 | 1. The web dashboard runs in the browser (`http://localhost:8081`)
15 | 2. It tries to call the scheduler API (`http://localhost:8765`)
16 | 3. Because the ports differ, the browser treats this as a cross-origin request
17 | 4. When the request carries an `Authorization` header, the browser first sends an OPTIONS preflight request
18 | 5. The scheduler had no CORS configuration and rejected the OPTIONS request
19 | 
20 | ## Solution
21 | 
22 | CORS middleware support has been added to the scheduler!
23 | 
24 | ### Changes
25 | 
26 | File: `scheduler/job_scheduler.py`
27 | 
28 | 1. Import the CORS middleware
29 | 2. Allow cross-origin requests from all origins
30 | 3. Allow all HTTP methods (including OPTIONS)
31 | 4. Allow all request headers (including Authorization)
32 | 
33 | ### Applying the fix
34 | 
35 | **Restart the scheduler**:
36 | 
37 | ```bash
38 | # 1. Stop the currently running scheduler (Ctrl+C)
39 | 
40 | # 2. Restart the scheduler
41 | python scheduler/launch_scheduler.py \
42 |     available_gpus=[0,1,2,3] \
43 |     scheduler_port=8765
44 | ```
45 | 
46 | ### Verifying the fix
47 | 
48 | 1. **Start the web dashboard**:
49 |    ```bash
50 |    python scheduler/web_dashboard.py --port 8081
51 |    ```
52 | 
53 | 2. **Refresh the browser**: open `http://localhost:8081/web_dashboard.html`
54 | 
55 | 3. **Enter the API key** and save it
56 | 
57 | 4. **Check the results**:
58 |    - ✅ The job list should be visible
59 |    - ✅ No more 405 errors
60 |    - ✅ OPTIONS requests now return 200
61 | 
62 | ### Technical details
63 | 
64 | The added CORS configuration:
65 | ```python
66 | app.add_middleware(
67 |     CORSMiddleware,
68 |     allow_origins=["*"],  # allow all origins
69 |     allow_credentials=True,  # allow credentials
70 |     allow_methods=["*"],  # allow all methods
71 |     allow_headers=["*"],  # allow all request headers
72 | )
73 | ```
74 | 
75 | **Production note**:
76 | In production, `allow_origins` should be restricted to specific domains:
77 | ```python
78 | allow_origins=["https://your-dashboard-domain.com"]
79 | ```
80 | 
81 | ### Full flow
82 | 
83 | The complete web dashboard workflow is now:
84 | 
85 | 1. 🌐 **Browser**: open the dashboard → `http://localhost:8081/web_dashboard.html`
86 | 2. 🔑 **Enter the API key**: saved to localStorage
87 | 3. 📡 **OPTIONS request**: the browser sends the preflight request → the scheduler allows it
88 | 4. 📊 **GET request**: data is requested with the Authorization header → the scheduler returns the job list
89 | 5. ✅ **Display data**: the dashboard shows all jobs
90 | 
91 | The whole flow now works correctly!
92 | 
--------------------------------------------------------------------------------
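A quick way to check the preflight from Python instead of a browser. `requests.options` sends a bare OPTIONS request; the extra headers below imitate what a browser preflight includes (header names per the CORS spec; the scheduler setup is assumed from the fix above):

```python
import requests

resp = requests.options(
    "http://localhost:8765/list_jobs",
    headers={
        "Origin": "http://localhost:8081",
        "Access-Control-Request-Method": "GET",
        "Access-Control-Request-Headers": "authorization",
    },
)
print(resp.status_code)  # expect 200 after the fix, not 405
print(resp.headers.get("access-control-allow-origin"))
```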
/opentinker/scripts/launch_scheduler.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Convenience script to launch the job scheduler
3 | 
4 | # Default configuration
5 | AVAILABLE_GPUS="[4,5,6,7,8,9]"
6 | PORT_RANGE="null" # Set to null for auto-detection
7 | NUM_PORTS=200
8 | SCHEDULER_PORT=8780
9 | 
10 | # Parse command line arguments (optional)
11 | while [[ $# -gt 0 ]]; do
12 |     case $1 in
13 |         --gpus)
14 |             AVAILABLE_GPUS="$2"
15 |             shift 2
16 |             ;;
17 |         --ports)
18 |             PORT_RANGE="$2"
19 |             shift 2
20 |             ;;
21 |         --num-ports)
22 |             NUM_PORTS="$2"
23 |             shift 2
24 |             ;;
25 |         --scheduler-port)
26 |             SCHEDULER_PORT="$2"
27 |             shift 2
28 |             ;;
29 |         --auto-ports)
30 |             PORT_RANGE="null"
31 |             shift 1
32 |             ;;
33 |         *)
34 |             echo "Unknown option: $1"
35 |             echo "Usage: $0 [--gpus '[0,1,2,3]'] [--ports '[38564,38600]' | --auto-ports] [--num-ports 50] [--scheduler-port 8765]"
36 |             exit 1
37 |             ;;
38 |     esac
39 | done
40 | 
41 | echo "========================================"
42 | echo "Launching Job Scheduler"
43 | echo "========================================"
44 | echo "Available GPUs: $AVAILABLE_GPUS"
45 | if [ "$PORT_RANGE" = "null" ]; then
46 |     echo "Port mode: Auto-detect ($NUM_PORTS ports)"
47 | else
48 |     echo "Port range: $PORT_RANGE"
49 | fi
50 | echo "Scheduler port: $SCHEDULER_PORT"
51 | echo "========================================"
52 | echo ""
53 | 
54 | # Launch scheduler
55 | if [ "$PORT_RANGE" = "null" ]; then
56 |     python opentinker/scheduler/launch_scheduler_kill.py \
57 |         available_gpus=$AVAILABLE_GPUS \
58 |         port_range=null \
59 |         num_ports=$NUM_PORTS \
60 |         scheduler_port=$SCHEDULER_PORT
61 | else
62 |     python opentinker/scheduler/launch_scheduler_kill.py \
63 |         available_gpus=$AVAILABLE_GPUS \
64 |         port_range=$PORT_RANGE \
65 |         scheduler_port=$SCHEDULER_PORT
66 | fi
--------------------------------------------------------------------------------
/opentinker/server/config/model/hf_model.yaml:
--------------------------------------------------------------------------------
1 | # Format checks enforced on CI:
2 | # 1. Comments must appear above each field.
3 | # 2. There must be a blank line between each field.
4 | # 3. Inline comments (after a field on the same line) are not allowed.
5 | # 4. Indentation level is respected for nested fields.
6 | 
7 | _target_: verl.workers.config.HFModelConfig
8 | 
9 | # path to the huggingface model
10 | path: ~/models/deepseek-llm-7b-chat
11 | 
12 | # config to the huggingface config. In case it is not the same as path
13 | hf_config_path: null
14 | 
15 | # path to the huggingface tokenizer. In case it is not the same as path
16 | tokenizer_path: null
17 | 
18 | # whether to use shared memory for model loading
19 | use_shm: False
20 | 
21 | # whether to trust remote code.
22 | trust_remote_code: False
23 | 
24 | # custom chat template for the model
25 | custom_chat_template: null
26 | 
27 | # whether to use external libs for the model
28 | external_lib: null
29 | 
30 | # override hf config
31 | override_config: {}
32 | 
33 | # whether to enable gradient checkpointing. Only valid when we use hf model definition
34 | enable_gradient_checkpointing: True
35 | 
36 | # whether to enable activation offload. Only valid when we use hf model definition
37 | enable_activation_offload: False
38 | 
39 | # whether to use remove padding. Only valid when we use hf model definition
40 | use_remove_padding: False
41 | 
42 | # Set to positive value to enable LoRA (e.g., 32)
43 | lora_rank: 0
44 | 
45 | # LoRA scaling factor
46 | lora_alpha: 16
47 | 
48 | # Target modules for LoRA adaptation
49 | target_modules: all-linear
50 | 
51 | # Exclude modules from LoRA adaptation
52 | exclude_modules: null
53 | 
54 | # Path to pre-trained LoRA adapter to load for continued training
55 | lora_adapter_path: null
56 | 
57 | # whether to use liger. Only valid when we use hf model definition
58 | use_liger: False
59 | 
60 | # whether to use fused kernels.
61 | use_fused_kernels: False
62 | 
63 | # fused kernel options.
64 | fused_kernel_options:
65 | 
66 |   # the implementation backend for fused kernels.
67 |   impl_backend: torch
68 | 
--------------------------------------------------------------------------------
/opentinker/server/config/reward_model/megatron_reward_model.yaml:
--------------------------------------------------------------------------------
1 | # defaults specify the default config from each component
2 | defaults:
3 | 
4 |   # dp actor config, inheriting from trainer/config/reward_model/reward_model.yaml
5 |   - reward_model
6 | 
7 |   # load the reference default config, then apply the fields in the current yaml
8 |   - _self_
9 | 
10 | strategy: megatron
11 | 
12 | # seconds, default is 10 minutes for torch, you can set it to a larger value
13 | # if you have long-running operations like 32B or 72B model using megatron
14 | nccl_timeout: 600
15 | 
16 | # Megatron parallelism & checkpointing config
17 | megatron:
18 | 
19 |   # Target configuration dataclass
20 |   _target_: verl.workers.config.MegatronEngineConfig
21 | 
22 |   # Whether to offload model parameters to CPU
23 |   param_offload: False
24 | 
25 |   # Number of GPUs in tensor model parallel group
26 |   tensor_model_parallel_size: 1
27 | 
28 |   # Number of GPUs in expert model parallel group
29 |   expert_model_parallel_size: 1
30 | 
31 |   # Expert tensor parallel size
32 |   expert_tensor_parallel_size: 1
33 | 
34 |   # Number of pipeline model parallel stages
35 |   pipeline_model_parallel_size: 1
36 | 
37 |   # change VPP interface for parallelism tests
38 |   virtual_pipeline_model_parallel_size: null
39 | 
40 |   # Context parallel size
41 |   context_parallel_size: 1
42 | 
43 |   # Whether to use sequence parallelism
44 |   sequence_parallel: True
45 | 
46 |   # Whether to use distributed optimizer
47 |   use_distributed_optimizer: False
48 | 
49 |   # Whether to enable distributed checkpointing
50 |   use_dist_checkpointing: False
51 | 
52 |   # Path for distributed checkpoints
53 |   dist_checkpointing_path: null
54 | 
55 |   # RNG seed for megatron
56 |   seed: ${oc.select:actor_rollout_ref.actor.megatron.seed,42}
57 | 
58 |   # Any overrides to transformer config
59 |   override_transformer_config: ${oc.select:actor_rollout_ref.actor.megatron.override_transformer_config,{}}
60 | 
61 |   # Whether to use mbridge for faster comms
62 |   use_mbridge: ${oc.select:actor_rollout_ref.actor.megatron.use_mbridge,False}
63 | 
64 | # Whether to load weights (default True)
65 | load_weight: True
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | **/*.pt
3 | **/checkpoints
4 | **/wget-log
5 | **/_build/
6 | **/*.ckpt
7 | **/outputs
8 | **/*.tar.gz
9 | **/playground
10 | **/wandb
11 | **/tensorboard_log
12 | **/evaluation_results
13 | **/verl_debug
14 | **/tool
15 | .api_key_siqizhu4
16 | 
17 | # Byte-compiled / optimized / DLL files
18 | __pycache__/
19 | *.py[cod]
20 | *$py.class
21 | dataset/*
22 | tensorflow/my_graph/*
23 | .idea/
24 | # C extensions
25 | *.so
26 | 
27 | # Distribution / packaging
28 | .Python
29 | env/
30 | build/
31 | develop-eggs/
32 | dist/
33 | downloads/
34 | eggs/
35 | .eggs/
36 | lib/
37 | lib64/
38 | parts/
39 | sdist/
40 | var/
41 | tmp/
42 | *.egg-info/
43 | .installed.cfg
44 | *.egg
45 | 
46 | # PyInstaller
47 | # Usually these files are written by a python script from a template
48 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
49 | *.manifest
50 | *.spec
51 | 
52 | # Installer logs
53 | pip-log.txt
54 | pip-delete-this-directory.txt
55 | 
56 | # Unit test / coverage reports
57 | htmlcov/
58 | .tox/
59 | .coverage
60 | .coverage.*
61 | .cache
62 | nosetests.xml
63 | coverage.xml
64 | *,cover
65 | .hypothesis/
66 | pytest.ini
67 | output.txt
68 | 
69 | # Translations
70 | *.mo
71 | *.pot
72 | 
73 | # Django stuff:
74 | *.log
75 | local_settings.py
76 | 
77 | # Flask stuff:
78 | instance/
79 | .webassets-cache
80 | 
81 | # Scrapy stuff:
82 | .scrapy
83 | 
84 | # Sphinx documentation
85 | docs/_build/
86 | 
87 | # PyBuilder
88 | target/
89 | 
90 | # IPython Notebook
91 | .ipynb_checkpoints
92 | 
93 | # pyenv
94 | .python-version
95 | 
96 | # celery beat schedule file
97 | celerybeat-schedule
98 | 
99 | # dotenv
100 | .env
101 | 
102 | # virtualenv
103 | venv/
104 | .venv/
105 | ENV/
106 | 
107 | # Spyder project settings
108 | .spyderproject
109 | 
110 | # Rope project settings
111 | .ropeproject
112 | 
113 | # vscode
114 | .vscode
115 | 
116 | # Mac
117 | .DS_Store
118 | 
119 | # vim
120 | *.swp
121 | 
122 | # ckpt
123 | *.lock
124 | 
125 | # data
126 | *.parquet
127 | 
128 | 
129 | # local logs
130 | logs
131 | log
132 | outputs
133 | .history
--------------------------------------------------------------------------------
/opentinker/environment/__init__.py:
--------------------------------------------------------------------------------
1 | """OpenTinker Environment Module.
2 | 
3 | This module provides the environment framework for LLM training, including:
4 | - BaseEnvironment: Abstract base class for all environments
5 | - GameEnvironment: For multi-turn game environments (Gomoku, etc.)
6 | - StaticDataEnvironment: For single-turn static datasets (Math, etc.)
7 | - Data generators and utilities
8 | """
9 | 
10 | # Base classes
11 | from opentinker.environment.environment import BaseEnvironment, RewardFunctionSpec
12 | from opentinker.environment.base_game import AbstractGame, StepResult, GameDataGenerator
13 | from opentinker.environment.base_game_environment import GameEnvironment, InteractionSpec
14 | from opentinker.environment.base_data_generator import (
15 |     AbstractGameDataGenerator,
16 |     DynamicGameDataset,
17 |     collate_fn,
18 | )
19 | 
20 | # Static data support
21 | from opentinker.environment.static_data_generator import StaticDatasetGenerator
22 | # from opentinker.environment.static_data_environment import StaticDataEnvironment
23 | 
24 | # Server utilities
25 | from opentinker.environment.base_game_server import (
26 |     BaseGameStats,
27 |     GameStats,
28 |     create_game_server,
29 |     run_game_server,
30 | )
31 | 
32 | from opentinker.environment.inference_pipeline import (
33 |     InferencePipeline,
34 |     InferenceResult,
35 |     RemoteEnvironmentClient,
36 |     run_inference,
37 |     load_samples,
38 |     generate_samples,
39 | )
40 | 
41 | __all__ = [
42 |     # Base
43 |     "BaseEnvironment",
44 |     "RewardFunctionSpec",
45 |     # Game
46 |     "AbstractGame",
47 |     "StepResult",
48 |     "GameDataGenerator",
49 |     "GameEnvironment",
50 |     "InteractionSpec",
51 |     # Data
52 |     "AbstractGameDataGenerator",
53 |     "DynamicGameDataset",
54 |     "collate_fn",
55 |     # Static
56 |     "StaticDatasetGenerator",
57 |     # Inference
58 |     "InferencePipeline",
59 |     "InferenceResult",
60 |     "RemoteEnvironmentClient",
61 |     "run_inference",
62 |     "load_samples",
63 |     "generate_samples",
64 |     # "StaticDataEnvironment",
65 |     # Server
66 |     "BaseGameStats",
67 |     "GameStats",
68 |     "create_game_server",
69 |     "run_game_server",
70 | ]
--------------------------------------------------------------------------------
/opentinker/scheduler/register_user_example.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Example: User Registration and Authentication
4 | 
5 | This script demonstrates how to register a user and use the API key
6 | for authenticated scheduler operations.
7 | """ 8 | 9 | import requests 10 | 11 | SCHEDULER_URL = "http://localhost:8765" 12 | 13 | def main(): 14 | print("=" * 60) 15 | print("User Registration Example") 16 | print("=" * 60) 17 | 18 | # Step 1: Register a new user 19 | username = input("Enter username to register: ") 20 | 21 | print(f"\n📝 Registering user '{username}'...") 22 | 23 | response = requests.post( 24 | f"{SCHEDULER_URL}/register", 25 | params={"username": username} 26 | ) 27 | 28 | if response.status_code == 200: 29 | result = response.json() 30 | print("\n✅ Registration successful!") 31 | print("=" * 60) 32 | print("🔑 YOUR API KEY (save this - cannot be retrieved later!):") 33 | print("") 34 | print(f" {result['api_key']}") 35 | print("") 36 | print("=" * 60) 37 | print(f"User ID: {result['user_id']}") 38 | print(f"Username: {result['username']}") 39 | 40 | # Step 2: Test authentication with the API key 41 | api_key = result['api_key'] 42 | print(f"\n✅ Testing authentication...") 43 | 44 | # Try to list jobs with the API key 45 | headers = {"Authorization": f"Bearer {api_key}"} 46 | jobs_response = requests.get(f"{SCHEDULER_URL}/list_jobs", headers=headers) 47 | 48 | if jobs_response.status_code == 200: 49 | print("✅ Authentication successful!") 50 | jobs = jobs_response.json() 51 | print(f"Current jobs: {len(jobs['jobs'])}") 52 | else: 53 | print(f"❌ Failed to list jobs: {jobs_response.text}") 54 | 55 | # Save to file for easy reference 56 | with open(f".api_key_{username}", "w") as f: 57 | f.write(api_key) 58 | print(f"\n💾 API key saved to .api_key_{username}") 59 | 60 | else: 61 | print(f"❌ Registration failed: {response.text}") 62 | 63 | 64 | if __name__ == "__main__": 65 | main() 66 | -------------------------------------------------------------------------------- /opentinker/server/config/critic/dp_critic.yaml: -------------------------------------------------------------------------------- 1 | # Format checks enforced on CI: 2 | # 1. Comments must appear above each field. 3 | # 2. There must be a blank line between each field. 4 | # 3. Inline comments (after a field on the same line) are not allowed. 5 | # 4. Indentation level is respected for nested fields. 6 | 7 | # defaults specify the default config from each component 8 | defaults: 9 | 10 | # fsdp optimizer config 11 | - ../optim@optim: fsdp 12 | 13 | # fsdp engine config 14 | - ../engine@model.fsdp_config: fsdp 15 | 16 | # dp actor config, inheriting from trainer/config/critic/critic.yaml 17 | - critic 18 | 19 | # load the reference default config, then apply the fields in the current yaml 20 | - _self_ 21 | 22 | # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs 23 | _target_: verl.workers.config.FSDPCriticConfig 24 | 25 | # distribution strategy. 
Options: fsdp (being deprecated), fsdp2 26 | strategy: fsdp 27 | 28 | # model config for the critic 29 | model: 30 | 31 | # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs 32 | _target_: verl.workers.config.FSDPCriticModelCfg 33 | 34 | # Whether to use shared memory for loading the model 35 | use_shm: False 36 | 37 | # Enable gradient checkpointing to save memory 38 | enable_gradient_checkpointing: True 39 | 40 | # Offload activations to CPU to reduce GPU memory usage 41 | enable_activation_offload: False 42 | 43 | # Use remove padding optimization (saves compute) 44 | use_remove_padding: False 45 | 46 | # Set to positive value to enable LoRA (e.g., 32) 47 | lora_rank: 0 48 | 49 | # LoRA scaling factor 50 | lora_alpha: 16 51 | 52 | # LoRA target modules: "all-linear" or list of linear projection layers 53 | target_modules: all-linear 54 | 55 | # Forward-only batch size during inference (global) 56 | forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} 57 | 58 | # Forward-only batch size during inference (per GPU) 59 | forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} 60 | 61 | # Sequence parallelism size for Ulysses-style model parallelism 62 | ulysses_sequence_parallel_size: 1 63 | 64 | # Gradient clipping for critic updates 65 | grad_clip: 1.0 66 | -------------------------------------------------------------------------------- /opentinker/environment/math/math_server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Math Environment Server - HTTP server for math problem solving. 3 | 4 | This script starts a math game server using the generic base_game_server. 5 | For single-turn math problems, the server: 6 | - Receives reset() with ground_truth 7 | - Receives step() with model's answer 8 | - Returns reward computed by MathGame 9 | 10 | Usage: 11 | python math_server.py 12 | # Or with custom port: 13 | python math_server.py --port 8082 14 | 15 | # For multi-worker mode (faster handling of concurrent requests): 16 | uvicorn opentinker.environment.math.math_server:app --host 0.0.0.0 --port 8082 --workers 4 17 | """ 18 | 19 | import argparse 20 | from opentinker.environment.base_game_server import run_game_server, create_game_app 21 | from opentinker.environment.math.math_game import MathGame 22 | 23 | # Pre-import reward function to avoid first-request latency 24 | # (The first import of verl.utils.reward_score can be slow) 25 | try: 26 | from verl.utils.reward_score import default_compute_score 27 | except ImportError: 28 | pass 29 | 30 | # # Module-level app for uvicorn multi-worker mode 31 | # # Usage: uvicorn opentinker.environment.math.math_server:app --host 0.0.0.0 --port 8082 --workers 4 32 | # app = create_game_app(game_class=MathGame) 33 | 34 | 35 | def main(): 36 | parser = argparse.ArgumentParser(description="Math Game Server") 37 | parser.add_argument("--host", default="0.0.0.0", help="Server host") 38 | parser.add_argument("--port", type=int, default=8082, help="Server port") 39 | parser.add_argument("--max_retries", type=int, default=0, help="Max retry attempts (0=single turn)") 40 | args = parser.parse_args() 41 | 42 | print(f"\nMath Game Server Configuration:") 43 | print(f" Single-turn mode: {'Yes' if args.max_retries == 0 else 'No'}") 44 | print(f" Max retries: {args.max_retries}") 45 | print(f"\nReward structure:") 46 | print(f" Correct answer: +{MathGame.REWARD_CORRECT}") 47 | print(f" Incorrect answer:
{MathGame.REWARD_INCORRECT}") 48 | 49 | run_game_server( 50 | game_class=MathGame, 51 | host=args.host, 52 | port=args.port, 53 | max_retries=args.max_retries, 54 | ) 55 | 56 | 57 | if __name__ == "__main__": 58 | main() 59 | -------------------------------------------------------------------------------- /opentinker/scheduler/web_dashboard.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Web Dashboard Server for OpenTinker 4 | 5 | Serves the HTML dashboard and provides a simple HTTP server. 6 | """ 7 | 8 | import argparse 9 | import http.server 10 | import socketserver 11 | import os 12 | from pathlib import Path 13 | 14 | 15 | class CORSRequestHandler(http.server.SimpleHTTPRequestHandler): 16 | """HTTP request handler with CORS enabled""" 17 | 18 | def end_headers(self): 19 | """Add CORS headers""" 20 | self.send_header('Access-Control-Allow-Origin', '*') 21 | self.send_header('Access-Control-Allow-Methods', 'GET, POST, OPTIONS') 22 | self.send_header('Access-Control-Allow-Headers', 'Content-Type, Authorization') 23 | super().end_headers() 24 | 25 | def do_OPTIONS(self): 26 | """Handle OPTIONS request for CORS preflight""" 27 | self.send_response(200) 28 | self.end_headers() 29 | 30 | 31 | def main(): 32 | parser = argparse.ArgumentParser(description='OpenTinker Web Dashboard Server') 33 | parser.add_argument('--port', type=int, default=8081, help='Port to run the server on (default: 8080)') 34 | parser.add_argument('--scheduler-url', default='http://localhost:8767', 35 | help='Scheduler API URL (default: http://localhost:8767)') 36 | args = parser.parse_args() 37 | 38 | # Change to the directory containing the HTML file 39 | dashboard_dir = Path(__file__).parent 40 | os.chdir(dashboard_dir) 41 | 42 | print("="*70) 43 | print("🌐 OpenTinker Web Dashboard") 44 | print("="*70) 45 | print(f"\n📍 Dashboard URL: http://localhost:{args.port}") 46 | print(f"🔗 Scheduler URL: {args.scheduler_url}") 47 | print(f"📁 Serving from: {dashboard_dir}") 48 | print("\n💡 Press Ctrl+C to stop the server\n") 49 | print("="*70 + "\n") 50 | 51 | # Start server 52 | with socketserver.TCPServer(("", args.port), CORSRequestHandler) as httpd: 53 | try: 54 | print(f"✅ Server running on port {args.port}") 55 | print(f"\n🚀 Open http://localhost:{args.port}/web_dashboard.html in your browser\n") 56 | httpd.serve_forever() 57 | except KeyboardInterrupt: 58 | print("\n\n👋 Shutting down server...") 59 | 60 | 61 | if __name__ == '__main__': 62 | main() 63 | -------------------------------------------------------------------------------- /opentinker/client/client_config/geo3k_param.yaml: -------------------------------------------------------------------------------- 1 | # Geo3K Vision-Language Training Configuration 2 | # Use with: python opentinker/client/geo3k_rl.py 3 | 4 | # Project settings 5 | project_name: opentinker 6 | experiment_name: geo3k_vl_training 7 | 8 | # Logging 9 | logger_backends: ["console", "wandb"] 10 | 11 | # Tracing (optional) 12 | enable_tracing: true 13 | weave_project: null 14 | 15 | # WandB (optional) 16 | wandb_key: 2ed6f8544ac3e30d5c08879166cc10d9c6232448 17 | 18 | # Model and processor paths 19 | # For VL models, both tokenizer_path and processor_path should point to the same model 20 | tokenizer_path: Qwen/Qwen2.5-VL-7B-Instruct 21 | processor_path: Qwen/Qwen2.5-VL-7B-Instruct # AutoProcessor for VL models 22 | 23 | # Data paths - use Geo3K parquet files 24 | data_path: ./data/geo3k/train.parquet 25 | val_data_path: 
./data/geo3k/test.parquet 26 | 27 | # Training parameters 28 | batch_size: 16 29 | num_workers: 0 30 | num_epochs: 5 # Total epochs 31 | num_steps: null # Or set num_steps for step-based training 32 | save_freq: -1 # Save checkpoint every N steps 33 | test_freq: 5 # Validate every N steps 34 | 35 | # Validation parameters 36 | val_batch_size: 32 # Total validation samples 37 | 38 | # Generation parameters 39 | temperature: 1.0 40 | top_p: 1.0 41 | max_new_tokens: 2048 # Max tokens per response 42 | max_prompt_tokens: 1024 # Max prompt length (shorter for VL due to image tokens) 43 | 44 | # Algorithm 45 | algorithm: "agent_loop" 46 | 47 | # RL Algorithm settings 48 | # GRPO is recommended for VL tasks 49 | adv_estimator: "grpo" 50 | rollout_n: 5 # Number of samples per prompt for GRPO 51 | 52 | # Interaction configuration 53 | interaction: 54 | name: geo3k 55 | class_path: opentinker.environment.gym_environment_interaction.GymEnvironmentInteraction 56 | config: 57 | env_host: 0.0.0.0 58 | env_port: 8088 59 | env_endpoint: http://${interaction.config.env_host}:${interaction.config.env_port} 60 | max_steps: 1 # Single-turn geometry problem solving 61 | 62 | # Multi-turn settings (single-turn for Geo3K) 63 | multi_turn: 64 | max_user_turns: 0 65 | max_assistant_turns: 1 66 | max_tokens_per_turn: 2048 67 | weave_project: null 68 | experiment_name: "geo3k_vl_interaction" 69 | 70 | # Scheduler settings 71 | scheduler_url: "http://0.0.0.0:8780" 72 | scheduler_api_key: null 73 | 74 | # GPU settings 75 | num_gpus: 4 # Adjust based on your setup 76 | -------------------------------------------------------------------------------- /opentinker/client/legacy/gomoku_inference.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Gomoku Environment Inference Script 4 | 5 | Uses the shared InferencePipeline to run inference on Gomoku games. 6 | 7 | Usage: 8 | 1. Start the game server: 9 | python opentinker/environment/gomoku/gomoku_server.py 10 | 11 | 2. 
Run inference: 12 | python gomoku_inference.py \ 13 | model_path=/path/to/checkpoint \ 14 | env_endpoint=http://localhost:8089 15 | """ 16 | 17 | import hydra 18 | from opentinker.environment.inference_pipeline import ( 19 | InferencePipeline, load_samples, run_inference 20 | ) 21 | from opentinker.environment.gomoku import GomokuGame 22 | 23 | 24 | @hydra.main(config_path="client_config", config_name="gomoku_inference_config.yaml", version_base=None) 25 | def main(args): 26 | """Run inference on Gomoku games.""" 27 | print("=" * 60) 28 | print("Gomoku Environment Inference") 29 | print("=" * 60) 30 | 31 | if not args.model_path and not args.get("vllm_server_url"): 32 | raise ValueError("model_path or vllm_server_url is required") 33 | 34 | # Gomoku is multi-turn: max_user_turns should be > 0 35 | max_user_turns = args.multi_turn.get("max_user_turns", 50) 36 | max_assistant_turns = args.multi_turn.get("max_assistant_turns", 50) 37 | 38 | results = run_inference( 39 | model_path=args.get("model_path"), 40 | vllm_server_url=args.get("vllm_server_url"), 41 | tokenizer_path=args.get("tokenizer_path"), 42 | data_path=args.get("data_path"), # None for dynamic generation 43 | game_class=GomokuGame, 44 | env_endpoint=args.env_endpoint, 45 | output_path=args.get("output_path"), 46 | temperature=args.temperature, 47 | top_p=args.top_p, 48 | max_tokens=args.max_new_tokens, 49 | max_tokens_per_turn=args.multi_turn.get("max_tokens_per_turn"), 50 | max_samples=args.get("max_samples", 10), 51 | max_user_turns=max_user_turns, 52 | max_assistant_turns=max_assistant_turns, 53 | max_context_length=args.get("max_context_length", 30000), 54 | tensor_parallel_size=args.get("tensor_parallel_size", 1), 55 | # GomokuGame kwargs 56 | board_size=args.get("board_size", 9), 57 | ) 58 | 59 | if args.get("output_path"): 60 | print(f"\nResults saved to: {args.output_path}") 61 | 62 | 63 | if __name__ == "__main__": 64 | main() -------------------------------------------------------------------------------- /opentinker/client/client_config/math_code_interpreter_param.yaml: -------------------------------------------------------------------------------- 1 | # Math Code Interpreter Training Configuration 2 | # Multi-turn agent with code interpreter using agent_loop algorithm 3 | # Use with: python math_code_interpreter_client.py 4 | 5 | # Project settings 6 | project_name: opentinker 7 | experiment_name: math_code_interpreter 8 | 9 | # Logging 10 | logger_backends: ["console", "wandb"] 11 | 12 | # Tracing (optional) 13 | enable_tracing: true 14 | weave_project: null 15 | 16 | # WandB (optional) 17 | wandb_key: null 18 | 19 | # Model and tokenizer 20 | tokenizer_path: null 21 | 22 | # Data paths 23 | data_path: null # Path to training data (parquet/JSON/JSONL) 24 | val_data_path: null # Path to validation data (parquet/JSON/JSONL) 25 | 26 | # Training parameters 27 | batch_size: 32 28 | num_workers: 0 29 | # Training duration - set ONE of these (num_steps takes precedence if both set) 30 | num_epochs: 10 # Number of epochs (null = use num_steps) 31 | num_steps: null # Total training steps (null = use num_epochs) 32 | save_freq: 100 33 | test_freq: 50 # Validation frequency (every N steps) 34 | 35 | # Validation parameters 36 | val_batch_size: 64 # Total validation samples 37 | 38 | # Generation parameters 39 | temperature: 1 40 | top_p: 1 41 | max_new_tokens: 8192 # TOTAL response budget for entire trajectory 42 | max_prompt_tokens: 2048 43 | 44 | # Algorithm - agent_loop for multi-turn with GymEnvironmentInteraction 45 | 
algorithm: "agent_loop" 46 | 47 | # RL Algorithm settings (passed to server via scheduler) 48 | # adv_estimator: "grpo" or "gae" (for PPO) 49 | adv_estimator: "grpo" 50 | # rollout_n: number of samples per prompt for GRPO (only used when adv_estimator=grpo) 51 | rollout_n: 8 52 | 53 | # Multi-turn configuration 54 | multi_turn: 55 | max_user_turns: 5 # Max environment responses (code execution results) 56 | max_assistant_turns: 5 # Max LLM responses 57 | max_tokens_per_turn: 1024 # Per-turn response limit 58 | weave_project: null 59 | experiment_name: "math_code_interpreter" 60 | 61 | interaction: 62 | name: math_code_interpreter 63 | class_path: opentinker.environment.gym_environment_interaction.GymEnvironmentInteraction 64 | config: 65 | env_host: 0.0.0.0 66 | env_port: 8088 67 | env_endpoint: http://${interaction.config.env_host}:${interaction.config.env_port} 68 | max_steps: 5 # Max interaction steps (code executions) 69 | 70 | # Scheduler settings 71 | scheduler_url: "http://0.0.0.0:8780" 72 | scheduler_api_key: null 73 | 74 | # GPU settings 75 | num_gpus: 4 76 | 77 | -------------------------------------------------------------------------------- /opentinker/client/client_config/geo3k_tool_param.yaml: -------------------------------------------------------------------------------- 1 | # Geo3K Multi-Turn Training Configuration 2 | # Multi-turn agent with answer verification using agent_loop algorithm 3 | # Use with: python opentinker/client/geo3k_tool_rl.py 4 | 5 | # Project settings 6 | project_name: opentinker 7 | experiment_name: geo3k_multiturn 8 | 9 | # Logging 10 | logger_backends: ["console", "wandb"] 11 | 12 | # Tracing (optional) 13 | enable_tracing: true 14 | weave_project: null 15 | 16 | # WandB (optional) 17 | wandb_key: 2ed6f8544ac3e30d5c08879166cc10d9c6232448 18 | 19 | # Model and processor paths (VL model) 20 | tokenizer_path: Qwen/Qwen2.5-VL-7B-Instruct 21 | processor_path: Qwen/Qwen2.5-VL-7B-Instruct 22 | 23 | # Data paths - use preprocessed Geo3K multi-turn data 24 | data_path: ./data/geo3k_multiturn/train.parquet 25 | val_data_path: ./data/geo3k_multiturn/test.parquet 26 | 27 | # Training parameters 28 | batch_size: 4 # Reduced from 16 for VL model 29 | num_workers: 0 30 | num_epochs: 5 # Total epochs 31 | num_steps: null # Or set num_steps for step-based training 32 | save_freq: -1 # Save checkpoint every N steps 33 | test_freq: 5 # Validate every N steps 34 | 35 | # Validation parameters 36 | val_batch_size: 8 # Reduced from 32 for VL model 37 | 38 | # Generation parameters 39 | temperature: 1.0 40 | top_p: 1.0 41 | max_new_tokens: 2048 # Reduced from 4096 42 | max_prompt_tokens: 1024 # Max prompt length 43 | 44 | # Algorithm - agent_loop for multi-turn with GymEnvironmentInteraction 45 | algorithm: "agent_loop" 46 | 47 | # RL Algorithm settings (passed to server via scheduler) 48 | adv_estimator: "grpo" 49 | rollout_n: 4 # Reduced from 5 for VL model 50 | 51 | # Multi-turn configuration 52 | multi_turn: 53 | max_user_turns: 3 # Max environment responses (verification feedback) 54 | max_assistant_turns: 4 # Max LLM responses (initial + refinements) 55 | max_tokens_per_turn: 1024 # Per-turn response limit 56 | weave_project: null 57 | experiment_name: "geo3k_multiturn" 58 | 59 | # Interaction configuration - uses GymEnvironmentInteraction 60 | interaction: 61 | name: geo3k_tool 62 | class_path: opentinker.environment.gym_environment_interaction.GymEnvironmentInteraction 63 | config: 64 | env_host: 0.0.0.0 65 | env_port: 8088 66 | env_endpoint: 
http://${interaction.config.env_host}:${interaction.config.env_port} 67 | max_steps: 3 # Max verification steps 68 | 69 | # Scheduler settings 70 | scheduler_url: "http://0.0.0.0:8780" 71 | scheduler_api_key: null 72 | 73 | # GPU settings 74 | num_gpus: 4 # Adjust based on your setup 75 | -------------------------------------------------------------------------------- /opentinker/client/client_config/math_param.yaml: -------------------------------------------------------------------------------- 1 | # Math Training Configuration (GameEnvironment Pattern) 2 | # Use with: python math_client_unified.py 3 | 4 | # Project settings 5 | project_name: opentinker 6 | experiment_name: math_training 7 | 8 | # Logging 9 | logger_backends: ["console", "wandb"] 10 | 11 | # Tracing (optional) 12 | enable_tracing: true 13 | weave_project: null 14 | 15 | # WandB (optional) 16 | wandb_key: null 17 | 18 | # Model and tokenizer 19 | tokenizer_path: null 20 | 21 | # Data paths 22 | data_path: null # Path to training data (JSON/JSONL) 23 | val_data_path: null # Path to validation data (JSON/JSONL) 24 | 25 | # Training parameters 26 | batch_size: 64 27 | num_workers: 0 28 | # Training duration - set ONE of these (num_steps takes precedence if both set) 29 | num_epochs: 10 # Number of epochs (null = use num_steps) 30 | num_steps: null # Total training steps (null = use num_epochs) 31 | save_freq: 100 32 | test_freq: 50 # Validation frequency (every N steps) 33 | 34 | # Validation parameters 35 | val_batch_size: 100 # Total validation samples 36 | 37 | # Model parameters 38 | # Generation parameters 39 | temperature: 1 40 | top_p: 1 41 | max_new_tokens: 4098 # TOTAL response budget for entire trajectory 42 | max_prompt_tokens: 4096 43 | 44 | # Algorithm - toolcall for math with tool use 45 | algorithm: "agent_loop" 46 | 47 | 48 | # RL Algorithm settings (passed to server via scheduler) 49 | # adv_estimator options: 50 | # - "grpo" : Standard GRPO (outcome-only advantage) 51 | # - "grpo_per_step" : Per-step GRPO with return-based advantages (for multi-turn tasks) 52 | # - "gae" : Generalized Advantage Estimation (for PPO, requires critic) 53 | adv_estimator: "grpo" 54 | # rollout_n: number of samples per prompt for GRPO/grpo_per_step (only used when adv_estimator=grpo or grpo_per_step) 55 | rollout_n: 8 56 | 57 | 58 | # Interaction configuration 59 | interaction: 60 | name: math 61 | class_path: opentinker.environment.gym_environment_interaction.GymEnvironmentInteraction 62 | config: 63 | env_host: 0.0.0.0 64 | env_port: 8088 65 | env_endpoint: http://${interaction.config.env_host}:${interaction.config.env_port} 66 | max_steps: 1 # Max interaction steps 67 | 68 | multi_turn: 69 | max_user_turns: 0 70 | max_assistant_turns: 1 71 | max_tokens_per_turn: 4096 # Per-turn response limit 72 | weave_project: null 73 | experiment_name: "math_interaction" 74 | 75 | # Scheduler settings 76 | scheduler_url: "http://0.0.0.0:8780" 77 | scheduler_api_key: otk_98b8db24ccd64c92e1fdd9a232e209fa 78 | 79 | # GPU settings 80 | num_gpus: 4 81 | -------------------------------------------------------------------------------- /opentinker/environment/gomoku/gomoku_server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Gomoku Environment Server - Simplified launcher. 3 | 4 | This script starts a Gomoku game server using the generic base_game_server. 
5 | 6 | Usage: 7 | python gomoku_server.py 8 | # Or with custom config: 9 | python gomoku_server.py --port 8081 --board_size 9 10 | """ 11 | 12 | import argparse 13 | from opentinker.environment.base_game_server import run_game_server 14 | from opentinker.environment.gomoku.gomoku_game import GomokuGame 15 | 16 | # GomokuGameStats is optional - falls back to BaseGameStats if not available 17 | try: 18 | from opentinker.environment.gomoku.gomoku_stats import GomokuGameStats 19 | except ImportError: 20 | GomokuGameStats = None 21 | 22 | 23 | def main(): 24 | parser = argparse.ArgumentParser(description="Gomoku Game Server") 25 | parser.add_argument("--host", default="0.0.0.0", help="Server host") 26 | parser.add_argument("--port", type=int, default=8081, help="Server port") 27 | parser.add_argument("--board_size", type=int, default=9, help="Board size") 28 | parser.add_argument("--max_total_steps", type=int, default=40, help="Max steps") 29 | parser.add_argument("--max_initial_moves", type=int, default=6, help="Max initial moves (0-6)") 30 | parser.add_argument("--empty_board_prob", type=float, default=0.2, help="Probability of empty board (0.0-1.0)") 31 | args = parser.parse_args() 32 | 33 | print(f"\nGomoku Game Configuration:") 34 | print(f" Board size: {args.board_size}x{args.board_size}") 35 | print(f" Max steps: {args.max_total_steps}") 36 | print(f" Max initial moves: {args.max_initial_moves}") 37 | print(f" Empty board prob: {args.empty_board_prob}") 38 | print(f"\nReward structure:") 39 | print(f" Win: +{GomokuGame.REWARD_WIN}") 40 | print(f" Loss: {GomokuGame.REWARD_LOSS}") 41 | print(f" Invalid format: {GomokuGame.REWARD_INVALID_FORMAT}") 42 | print(f" Timeout: {GomokuGame.REWARD_TIMEOUT}") 43 | 44 | if GomokuGameStats: 45 | print(f"\nUsing GomokuGameStats for win/loss/draw tracking") 46 | else: 47 | print(f"\nUsing BaseGameStats (GomokuGameStats not available)") 48 | 49 | run_game_server( 50 | game_class=GomokuGame, 51 | host=args.host, 52 | port=args.port, 53 | stats_class=GomokuGameStats, # None falls back to BaseGameStats 54 | board_size=args.board_size, 55 | max_total_steps=args.max_total_steps, 56 | max_initial_moves=args.max_initial_moves, 57 | empty_board_prob=args.empty_board_prob, 58 | ) 59 | 60 | 61 | if __name__ == "__main__": 62 | main() 63 | -------------------------------------------------------------------------------- /opentinker/server/config/sft_trainer_engine.yaml: -------------------------------------------------------------------------------- 1 | # Format checks enforced on CI: 2 | # 1. Comments must appear above each field. 3 | # 2. There must be a blank line between each field. 4 | # 3. Inline comments (after a field on the same line) are not allowed. 5 | # 4. Indentation level is respected for nested fields. 
6 | 7 | 8 | 9 | defaults: 10 | - model@model: hf_model 11 | - engine@engine: fsdp 12 | - optim@optim: fsdp 13 | - _self_ 14 | 15 | data: 16 | train_batch_size: 256 # global batch size 17 | micro_batch_size_per_gpu: 4 # this is also val batch size 18 | max_token_len_per_gpu: 8192 19 | use_dynamic_bsz: True 20 | train_files: ~/data/gsm8k/train.parquet 21 | val_files: null 22 | train_max_samples: -1 # set to -1 to use full dataset 23 | val_max_samples: -1 # set to -1 to use full dataset 24 | # Multi-turn settings 25 | messages_key: messages # Key for messages list in multi-turn mode 26 | tools_key: tools # Key for tools list in multi-turn mode 27 | enable_thinking_key: enable_thinking # Whether to enable thinking in multi-turn mode 28 | pad_mode: no_padding 29 | # for right padding 30 | max_length: 1024 31 | truncation: error 32 | balance_dp_token: False # to be implemented 33 | custom_cls: 34 | path: null 35 | name: null 36 | use_shm: False 37 | apply_chat_template_kwargs: {} 38 | 39 | # Checkpoint configuration 40 | checkpoint: 41 | _target_: verl.trainer.config.CheckpointConfig 42 | # What to include in saved checkpoints 43 | # with 'hf_model' you can save whole model as hf format, now only use sharded model checkpoint to save space 44 | save_contents: ["model", "optimizer", "extra"] 45 | 46 | # For more flexibility, you can specify the contents to load from the checkpoint. 47 | load_contents: ${checkpoint.save_contents} 48 | 49 | trainer: 50 | default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} 51 | default_hdfs_dir: null 52 | project_name: gsm8k-sft 53 | experiment_name: test 54 | total_epochs: 4 55 | total_training_steps: null 56 | logger: [ 'console', 'wandb' ] 57 | seed: 1 58 | save_freq: -1 59 | test_freq: -1 60 | max_ckpt_to_keep: null # Maximum number of checkpoints to keep, set to null to keep all 61 | 62 | # Resume mode: "auto", "disable", or "resume_path" 63 | # "auto": resume from last checkpoint if available 64 | # "disable": start from scratch 65 | # "resume_path": resume from a user-defined path 66 | resume_mode: auto 67 | 68 | # Path to resume training from (used when resume_mode is "resume_path" or "auto") 69 | resume_from_path: null 70 | device: cuda 71 | -------------------------------------------------------------------------------- /opentinker/server/config/engine/megatron.yaml: -------------------------------------------------------------------------------- 1 | # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs 2 | _target_: verl.workers.config.McoreEngineConfig 3 | 4 | # Whether to offload model parameters to CPU 5 | param_offload: False 6 | 7 | # Whether to offload gradients to CPU 8 | grad_offload: False 9 | 10 | # Whether to offload optimizer state to CPU 11 | optimizer_offload: False 12 | 13 | # tensor model parallel size 14 | tensor_model_parallel_size: 1 15 | 16 | # expert model parallel size 17 | expert_model_parallel_size: 1 18 | 19 | # expert tensor parallel size 20 | expert_tensor_parallel_size: 1 21 | 22 | # pipeline model parallel size 23 | pipeline_model_parallel_size: 1 24 | 25 | # virtual pipeline model parallel size 26 | virtual_pipeline_model_parallel_size: null 27 | 28 | # context parallel size 29 | context_parallel_size: 1 30 | 31 | # sequence parallel 32 | sequence_parallel: True 33 | 34 | # Whether to use distributed optimizer 35 | use_distributed_optimizer: True 36 | 37 | # Whether to use distributed checkpointing 38 | use_dist_checkpointing: False 39 | 40 | # distributed checkpointing path 41 | dist_checkpointing_path: null 42 | 43 | # oc.select: default val for ref.megatron.seed 44 | seed: 42 45 | 46 | # Allow to override Distributed Data Parallel (DDP) config 47 | override_ddp_config: {} 48 | 49 | # additional transformer config like: num_layers_in_first(/last)_pipeline_stage 50 | # oc.select: default val for ref.megatron.override_transformer_config 51 | override_transformer_config: 52 | # Recompute configuration, same as in megatron.training.arguments 53 | # default use minimal performance-interference recompute methods 54 | # Recompute granularity, choices: ["full", "selective"]; 'full' recomputes the entire transformer layer, 'selective' only the memory-intensive parts of attention 55 | recompute_granularity: null 56 | 57 | # Recompute modules, multiple choices: ["core_attn", "moe_act", "layernorm", "mla_up_proj", "mlp", "moe"] 58 | # Please use correct module in matched model 59 | recompute_modules: ["core_attn"] 60 | 61 | # 'uniform', 'block' 62 | # 'uniform' divides the total number of transformer layers and checkpoints the input activation of each chunk 63 | # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity 64 | recompute_method: null 65 | 66 | # Number of transformer layers to recompute (interpretation depends on the recompute method above) 67 | recompute_num_layers: null 68 | 69 | # Attention backend to use (flash,fused,unfused,local,auto). Defaults to auto in mcore, flash in verl 70 | attention_backend: flash 71 | 72 | override_mcore_model_config: {} 73 | 74 | # oc.select: default val for ref.megatron.use_mbridge 75 | use_mbridge: False 76 | 77 | # whether to use forward only 78 | forward_only: False 79 | -------------------------------------------------------------------------------- /opentinker/environment/geo3k/geo3k_tool_env.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2025 OpenTinker 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Geo3K Multi-Turn VL Environment. 16 | 17 | This environment supports multi-turn geometry problem solving with vision-language models. 18 | It uses Geo3KToolGame for verification-based interactions. 19 | """ 20 | 21 | from opentinker.environment.vl_game_environment import VLGameEnvironment 22 | from opentinker.environment.geo3k.geo3k_tool_game import Geo3KToolGame 23 | 24 | 25 | class Geo3KToolEnvironment(VLGameEnvironment): 26 | """Multi-turn VL environment for Geo3K geometry problems.
27 | 28 | This environment uses: 29 | - Geo3KToolGame for multi-turn verification logic 30 | - StaticDatasetGeneratorVL for image handling 31 | - GymEnvironmentInteraction for HTTP communication 32 | 33 | The model can submit answers multiple times and receive feedback 34 | in verl-compatible format: "Current parsed answer={answer} reward={0.0|1.0}" 35 | 36 | Args: 37 | config: Configuration object 38 | data_paths: Training data paths (parquet files) 39 | val_data_paths: Validation data paths (optional) 40 | job_id: Job identifier 41 | max_retries: Max verification attempts per problem (default: 3) 42 | 43 | Example: 44 | env = Geo3KToolEnvironment( 45 | config=config, 46 | data_paths=["~/data/geo3k_multiturn/train.parquet"], 47 | val_data_paths=["~/data/geo3k_multiturn/test.parquet"], 48 | job_id="geo3k_tool_training_001", 49 | ) 50 | """ 51 | 52 | def __init__( 53 | self, 54 | config, 55 | data_paths, 56 | val_data_paths=None, 57 | job_id=None, 58 | max_retries: int = 3, 59 | ): 60 | # Initialize with multi-turn Geo3K game and VL environment 61 | super().__init__( 62 | game_class=Geo3KToolGame, 63 | config=config, 64 | data_paths=data_paths, 65 | val_data_paths=val_data_paths, 66 | game_kwargs={"max_retries": max_retries}, 67 | job_id=job_id, 68 | image_key="images", # Geo3K uses "images" field 69 | ) 70 | -------------------------------------------------------------------------------- /opentinker/client/client_config/generic_env_param.yaml: -------------------------------------------------------------------------------- 1 | # Configuration for Generic Environment Training (LLM-Environment Interaction) 2 | # 3 | # Key differences from opentinker_param.yaml: 4 | # - algorithm: "agent_loop" (uses GenericAgentLoop) 5 | # - No reward function (environment provides rewards) 6 | # - Interaction configuration for Gym environment 7 | 8 | server_url: "http://localhost:8000" # Not needed when running via the scheduler 9 | scheduler_url: "http://localhost:8766" 10 | scheduler_api_key: "otk_98b8db24ccd64c92e1fdd9a232e209fa" 11 | 12 | # GPU allocation 13 | num_gpus: 4 14 | 15 | # Data configuration 16 | data_path: null # Path to training data (parquet file) 17 | val_data_path: null # Optional validation data 18 | tokenizer_path: null # Path to tokenizer/model 19 | batch_size: 16 # Smaller batch for multi-turn 20 | val_batch_size: 50 # Validation batch size (also controls dataset size if val_max_samples not set) 21 | # Training duration - set ONE of these (num_steps takes precedence if both set) 22 | num_epochs: 3 # Number of epochs (null = use num_steps) 23 | num_steps: null # Total training steps (null = use num_epochs) 24 | num_workers: 0 25 | 26 | # Interaction configuration (for GenericAgentLoop) 27 | # This replaces the reward function - environment provides rewards 28 | interaction: 29 | name: "gym_env" # Name referenced in dataset's interaction_kwargs 30 | class_path: opentinker.environment.gym_environment_interaction.GymEnvironmentInteraction 31 | config: 32 | env_endpoint: "http://localhost:8084" # Mock environment server 33 | max_steps: 50 34 | observation_template: "Environment: {observation}" 35 | 36 | # Multi-turn configuration 37 | multi_turn: 38 | max_user_turns: 12 39 | max_assistant_turns: 12 40 | max_tokens_per_turn: 512 # Per-turn response limit (optional, null for no limit) 41 | # Weave tracing (optional - runs on SERVER side) 42 | weave_project: "zsqzz/generic-env-test" # W&B project for tracing 43 | experiment_name:
"gym_interaction" # Experiment name in Weave 44 | 45 | # Project tracking 46 | project_name: "generic_env_training" 47 | experiment_name: "gym_interaction" 48 | save_freq: 50000 49 | test_freq: 5 50 | 51 | # Generation parameters 52 | temperature: 1 # Lower temperature for more focused responses 53 | top_p: 1 54 | max_new_tokens: 8192 # TOTAL response budget for entire multi-turn trajectory (NOT per-turn!) 55 | max_prompt_tokens: 2048 56 | 57 | # Algorithm selection 58 | # IMPORTANT: Must be "agent_loop" for GenericEnvironment 59 | algorithm: "agent_loop" 60 | 61 | # Logging 62 | logger_backends: ["console","wandb"] # Add "wandb" if needed 63 | wandb_key: null 64 | enable_tracing: true 65 | # No reward configuration needed! 66 | # GenericEnvironment gets rewards from the interaction 67 | -------------------------------------------------------------------------------- /opentinker/client/client_config/gomoku_param.yaml: -------------------------------------------------------------------------------- 1 | # Gomoku Training Configuration 2 | # Use with: python gomoku_client.py 3 | 4 | # Project settings 5 | project_name: opentinker 6 | experiment_name: gomoku_training 7 | 8 | # Logging 9 | logger_backends: ["console","wandb"] 10 | 11 | # Tracing (optional) 12 | enable_tracing: true 13 | weave_project: null 14 | 15 | # WandB (optional) 16 | wandb_key: 2ed6f8544ac3e30d5c08879166cc10d9c6232448 17 | 18 | 19 | # Model and tokenizer 20 | tokenizer_path: null 21 | 22 | 23 | # Training parameters 24 | batch_size: 4 25 | num_workers: 4 26 | # Training duration - set ONE of these (num_steps takes precedence if both set) 27 | num_epochs: null # Number of epochs (null = use num_steps) 28 | num_steps: 1000 # Total training steps (null = use num_epochs) 29 | save_freq: 20000 30 | test_freq: 10 # Validation frequency (every N steps) 31 | 32 | # Validation parameters 33 | val_batch_size: 50 # Total validation samples (null = 50) 34 | 35 | # Model parameters 36 | # Generation parameters 37 | temperature: 1 # Lower temperature for more focused responses 38 | top_p: 1 39 | max_new_tokens: 8192 # TOTAL response budget for entire multi-turn trajectory (NOT per-turn!) 
40 | max_prompt_tokens: 4096 41 | 42 | # Algorithm (must be agent_loop for multi-turn) 43 | algorithm: "agent_loop" 44 | 45 | # RL Algorithm settings (passed to server via scheduler) 46 | # adv_estimator options: 47 | # - "grpo" : Standard GRPO (outcome-only advantage) 48 | # - "grpo_per_step" : Per-step GRPO with return-based advantages (for multi-turn tasks) 49 | # - "gae" : Generalized Advantage Estimation (for PPO, requires critic) 50 | adv_estimator: "grpo_per_step" 51 | # rollout_n: number of samples per prompt for GRPO/grpo_per_step (only used when adv_estimator=grpo or grpo_per_step) 52 | rollout_n: 16 53 | 54 | # Interaction configuration 55 | interaction: 56 | name: gomoku 57 | class_path: opentinker.environment.gym_environment_interaction.GymEnvironmentInteraction 58 | config: 59 | env_host: 0.0.0.0 60 | env_port: 8088 61 | env_endpoint: http://${interaction.config.env_host}:${interaction.config.env_port} 62 | max_steps: 81 # Parameter of the backend GymEnvironmentInteraction 63 | max_total_steps: 39 # Hard cap on env.step() calls (prevents invalid-move hacking) 64 | max_initial_moves: 0 65 | board_size: 9 66 | observation_template: "{observation}" 67 | empty_board_prob: 1.0 68 | 69 | 70 | multi_turn: 71 | max_user_turns: ${interaction.config.max_total_steps} # vLLM-side parameter 72 | max_assistant_turns: ${interaction.config.max_total_steps} # vLLM-side parameter 73 | max_tokens_per_turn: 256 # Per-turn response limit (optional, null for no limit) 74 | # Weave tracing (optional - runs on SERVER side) 75 | weave_project: "zsqzz/gomoku-env-test" # W&B project for tracing 76 | experiment_name: "gomoku_interaction" # Experiment name in Weave 77 | 78 | 79 | # Scheduler settings 80 | scheduler_url: "http://0.0.0.0:8780" 81 | scheduler_api_key: otk_98b8db24ccd64c92e1fdd9a232e209fa # User API key 82 | 83 | # GPU settings 84 | num_gpus: 4 85 | 86 | -------------------------------------------------------------------------------- /opentinker/server/config/sft_trainer.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - optim: fsdp 3 | - _self_ 4 | 5 | data: 6 | train_batch_size: 256 7 | micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu 8 | micro_batch_size_per_gpu: 4 # this is also val batch size 9 | train_files: ~/data/gsm8k/train.parquet 10 | val_files: ~/data/gsm8k/test.parquet 11 | train_max_samples: -1 # set to -1 to use full dataset 12 | val_max_samples: -1 # set to -1 to use full dataset 13 | # Single-turn settings 14 | prompt_key: question 15 | response_key: answer 16 | prompt_dict_keys: null 17 | response_dict_keys: null 18 | # Multi-turn settings 19 | multiturn: 20 | enable: false # Set to true to use multi-turn dataset 21 | messages_key: messages # Key for messages list in multi-turn mode 22 | tools_key: tools # Key for tools list in multi-turn mode 23 | enable_thinking_key: enable_thinking # Whether to enable thinking in multi-turn mode 24 | max_length: 1024 25 | truncation: error 26 | balance_dp_token: False 27 | chat_template: null 28 | custom_cls: 29 | path: null 30 | name: null 31 | use_shm: False 32 | apply_chat_template_kwargs: {} 33 | model: 34 | partial_pretrain: ~/models/gemma-1.1-7b-it 35 | use_shm: False 36 | fsdp_config: 37 | model_dtype: fp32 38 | wrap_policy: 39 | min_num_params: 0 40 | cpu_offload: False 41 | offload_params: False 42 | external_lib: null 43 | enable_gradient_checkpointing: True 44 | trust_remote_code: False 45 | lora_rank: 0 # Set to positive value to
enable LoRA (e.g., 32) 46 | lora_alpha: 16 # LoRA scaling factor 47 | target_modules: all-linear # Target modules for LoRA adaptation 48 | use_liger: False 49 | strategy: fsdp2 50 | optim: 51 | lr: 1e-5 52 | betas: [0.9, 0.95] 53 | weight_decay: 0.01 54 | lr_warmup_steps_ratio: 0.1 55 | clip_grad: 1.0 56 | lr_scheduler: cosine 57 | ulysses_sequence_parallel_size: 1 58 | use_remove_padding: False 59 | trainer: 60 | default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} 61 | default_hdfs_dir: null 62 | project_name: gsm8k-sft 63 | experiment_name: test 64 | total_epochs: 4 65 | total_training_steps: null 66 | logger: [ 'console', 'wandb' ] 67 | seed: 1 68 | save_freq: -1 69 | test_freq: -1 70 | nnodes: 1 71 | n_gpus_per_node: 8 72 | max_ckpt_to_keep: null # Maximum number of checkpoints to keep, set to null to keep all 73 | 74 | # Resume mode: "auto", "disable", or "resume_path" 75 | # "auto": resume from last checkpoint if available 76 | # "disable": start from scratch 77 | # "resume_path": resume from a user-defined path 78 | resume_mode: auto 79 | 80 | # Path to resume training from (used when resume_mode is "resume_path" or "auto") 81 | resume_from_path: null 82 | 83 | # Checkpoint configuration 84 | checkpoint: 85 | # What to include in saved checkpoints 86 | # with 'hf_model' you can save whole model as hf format, now only use sharded model checkpoint to save space 87 | save_contents: ["model", "optimizer", "extra"] 88 | 89 | # For more flexibility, you can specify the contents to load from the checkpoint. 90 | load_contents: ${trainer.checkpoint.save_contents} 91 | device: cuda 92 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Setup script for OpenTinker. 4 | 5 | This allows the package to be installed in development mode with: 6 | pip install -e . 
7 | """ 8 | 9 | from setuptools import setup, find_packages 10 | from pathlib import Path 11 | 12 | # Read requirements 13 | def read_requirements(filename): 14 | """Read requirements from file, ignoring comments and empty lines.""" 15 | requirements = [] 16 | filepath = Path(__file__).parent / filename 17 | if filepath.exists(): 18 | with open(filepath, "r") as f: 19 | for line in f: 20 | line = line.strip() 21 | if line and not line.startswith("#"): 22 | requirements.append(line) 23 | return requirements 24 | 25 | 26 | # Read README for long description 27 | readme_path = Path(__file__).parent / "README.md" 28 | long_description = "" 29 | if readme_path.exists(): 30 | with open(readme_path, "r", encoding="utf-8") as f: 31 | long_description = f.read() 32 | 33 | 34 | setup( 35 | name="opentinker", 36 | version="0.1.0", 37 | description="OpenTinker: A distributedframework for training and inference with interactive environments", 38 | long_description=long_description, 39 | long_description_content_type="text/markdown", 40 | author="Siqi Zhu, Jiaxuan You", 41 | author_email="siqizhu4@illinois.edu, jiaxuan@illinois.edu", 42 | url="https://github.com/open-tinker/OpenTinker", 43 | python_requires=">=3.8", 44 | packages=find_packages(include=["opentinker", "opentinker.*"]), 45 | install_requires=[ 46 | # Core dependencies 47 | "ray>=2.9.0", 48 | "torch>=2.0.0", 49 | "transformers>=4.35.0", 50 | # Web framework 51 | "fastapi>=0.104.0", 52 | "uvicorn>=0.24.0", 53 | "pydantic>=2.0.0", 54 | # Configuration 55 | "omegaconf>=2.3.0", 56 | "hydra-core>=1.3.0", 57 | "pyyaml>=6.0", 58 | # Data processing 59 | "pandas>=2.0.0", 60 | "pyarrow>=14.0.0", 61 | "datasets>=2.14.0", 62 | # Utilities 63 | "requests>=2.31.0", 64 | "aiohttp>=3.9.0", 65 | ], 66 | extras_require={ 67 | "dev": [ 68 | "pytest>=7.4.0", 69 | "black>=23.0.0", 70 | "flake8>=6.1.0", 71 | ], 72 | "logging": [ 73 | "wandb>=0.16.0", 74 | ], 75 | }, 76 | entry_points={ 77 | # "console_scripts": [ 78 | # "opentinker-scheduler=opentinker.scheduler.launch_scheduler_kill:main", 79 | # ], 80 | }, 81 | classifiers=[ 82 | "Development Status :: 3 - Alpha", 83 | "Intended Audience :: Developers", 84 | "Intended Audience :: Science/Research", 85 | "License :: OSI Approved :: Apache Software License", 86 | "Programming Language :: Python :: 3", 87 | "Programming Language :: Python :: 3.8", 88 | "Programming Language :: Python :: 3.9", 89 | "Programming Language :: Python :: 3.10", 90 | "Programming Language :: Python :: 3.11", 91 | "Programming Language :: Python :: 3.12", 92 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 93 | ], 94 | ) 95 | -------------------------------------------------------------------------------- /opentinker/server/config/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from dataclasses import dataclass, field 16 | from typing import Any, Optional 17 | 18 | from verl.base_config import BaseConfig 19 | 20 | __all__ = ["CheckpointConfig", "ProfileConfig", "BaseModelConfig"] 21 | 22 | 23 | @dataclass 24 | class CheckpointConfig(BaseConfig): 25 | """Configuration for model checkpointing. 26 | 27 | The inheritance from BaseConfig provides omegaconf.DictConfig-like interface for a dataclass config. 28 | 29 | Args: 30 | save_contents (list[str]): What to include in saved checkpoints. 31 | Options: 'model', 'optimizer', 'extra', 'hf_model'. 32 | load_contents (list[str]): Contents to load from checkpoint. Defaults to same as save_contents. 33 | async_save (bool): Whether to save checkpoints asynchronously. Only implemented for Megatron as of now. 34 | """ 35 | 36 | save_contents: list[str] = field(default_factory=lambda: ["model", "optimizer", "extra"]) 37 | load_contents: list[str] = field(default_factory=lambda: ["model", "optimizer", "extra"]) 38 | async_save: bool = False 39 | 40 | 41 | @dataclass 42 | class ProfileConfig(BaseConfig): 43 | """Configuration for profiling. 44 | 45 | The inheritance from BaseConfig provides omegaconf.DictConfig-like interface for a dataclass config. 46 | 47 | Args: 48 | profile_ranks (Optional[list[int]]): List of ranks to profile. None means all ranks. 49 | step_start (int): Starting step for profiling. 50 | step_end (int): Ending step for profiling. 51 | save_path (Optional[str]): Path to save profiling results. 52 | """ 53 | 54 | profile_ranks: Optional[list[int]] = None 55 | step_start: int = -1 56 | step_end: int = -1 57 | save_path: Optional[str] = None 58 | 59 | 60 | @dataclass 61 | class BaseModelConfig(BaseConfig): 62 | """Base configuration for a model. 63 | Contains core settings for loading and initializing a pretrained model checkpoint. 64 | 65 | Args: 66 | path (str): Path to pretrained model weights. 67 | tokenizer_path (Optional[str]): Tokenizer path (defaults to actor's model path if not set). 68 | override_config (dict): Hugging Face config override. 69 | external_lib (Optional[str]): External model implementation (optional). 70 | trust_remote_code (bool): Whether to trust remote code from Hugging Face models. 71 | """ 72 | 73 | path: str = "~/models/deepseek-llm-7b-chat" 74 | tokenizer_path: Optional[str] = None 75 | override_config: dict[str, Any] = field(default_factory=dict) 76 | external_lib: Optional[str] = None 77 | trust_remote_code: bool = False 78 | -------------------------------------------------------------------------------- /opentinker/data_preprocess/math.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. 
2 | # Licensed under the Apache License, Version 2.0 3 | 4 | import argparse 5 | import json 6 | import os 7 | 8 | import datasets 9 | 10 | from verl.utils.hdfs_io import copy, makedirs 11 | from verl.utils.reward_score.math_reward import last_boxed_only_string, remove_boxed 12 | 13 | 14 | def extract_solution(solution_str): 15 | return remove_boxed(last_boxed_only_string(solution_str)) 16 | 17 | 18 | if __name__ == "__main__": 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("--local_dir", default="./data", help="The local directory for the preprocessed dataset.") 21 | parser.add_argument("--hdfs_dir", default=None) 22 | parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.") 23 | parser.add_argument( 24 | "--local_save_dir", default="./data", help="The save directory for the preprocessed dataset." 25 | ) 26 | 27 | args = parser.parse_args() 28 | local_dataset_path = args.local_dataset_path 29 | 30 | data_source = "DigitalLearningGmbH/MATH-lighteval" 31 | print(f"Loading the {data_source} dataset from huggingface...", flush=True) 32 | 33 | if local_dataset_path is not None: 34 | dataset = datasets.load_dataset(local_dataset_path) 35 | else: 36 | dataset = datasets.load_dataset(data_source) 37 | 38 | train_dataset = dataset["train"] 39 | test_dataset = dataset["test"] 40 | 41 | instruction_following = "Let's think step by step and output the final answer within \\boxed{}." 42 | 43 | # build map function 44 | def make_map_fn(): 45 | def process_fn(example, idx): 46 | question = example.pop("problem") + " " + instruction_following 47 | 48 | answer = example.pop("solution") 49 | solution = extract_solution(answer) 50 | 51 | return { 52 | "prompt": [{"role": "user", "content": question}], 53 | "ground_truth": solution, 54 | "data_source": data_source, 55 | } 56 | return process_fn 57 | 58 | train_dataset = train_dataset.map(make_map_fn(), with_indices=True) 59 | test_dataset = test_dataset.map(make_map_fn(), with_indices=True) 60 | 61 | local_save_dir = args.local_dir or args.local_save_dir 62 | local_dir = os.path.expanduser(local_save_dir) 63 | os.makedirs(local_dir, exist_ok=True) 64 | 65 | # -------------------- 66 | # Write JSONL files 67 | # -------------------- 68 | def write_jsonl(ds, path): 69 | with open(path, "w") as f: 70 | for item in ds: 71 | json.dump(item, f) 72 | f.write("\n") 73 | 74 | train_jsonl = os.path.join(local_dir, "train.jsonl") 75 | test_jsonl = os.path.join(local_dir, "test.jsonl") 76 | 77 | print(f"Writing JSONL to {train_jsonl} and {test_jsonl}") 78 | write_jsonl(train_dataset, train_jsonl) 79 | write_jsonl(test_dataset, test_jsonl) 80 | 81 | # Save first example for reference 82 | with open(os.path.join(local_dir, "train_example.json"), "w") as f: 83 | json.dump(train_dataset[0], f, indent=2) 84 | with open(os.path.join(local_dir, "test_example.json"), "w") as f: 85 | json.dump(test_dataset[0], f, indent=2) 86 | 87 | if args.hdfs_dir is not None: 88 | makedirs(args.hdfs_dir) 89 | copy(src=local_dir, dst=args.hdfs_dir) 90 | -------------------------------------------------------------------------------- /docs/geo3k_quickstart.md: -------------------------------------------------------------------------------- 1 | # Geo3K Vision-Language Training Quick Start 2 | 3 | ## Overview 4 | 5 | This guide shows how to train vision-language models (like Qwen2.5-VL) on Geo3K geometry problems using OpenTinker. 
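
Once the data has been prepared (see Prerequisites below), a quick way to sanity-check it is to read the parquet directly. A minimal sketch, assuming `pandas` and `pyarrow` are installed; the column names are the ones this repo's Geo3K preprocessing produces, with images stored under an `images` field:

```python
import os
import pandas as pd

# Load the prepared Geo3K training split
df = pd.read_parquet(os.path.expanduser("~/data/geo3k/train.parquet"))

print(df.columns.tolist())   # expect at least "prompt" and "images" columns
print(df.iloc[0]["prompt"])  # chat-style messages for the first problem
```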
6 | 
7 | ## Prerequisites
8 | 
9 | ```bash
10 | # Install required packages (quote the version spec so the shell does not treat ">" as a redirect)
11 | pip install "transformers>=4.37.0" pillow
12 | 
13 | # Prepare Geo3K data (if not already done)
14 | cd verl/examples/data_preprocess
15 | python geo3k.py --local_save_dir ~/data/geo3k
16 | ```
17 | 
18 | ## Quick Start
19 | 
20 | ### 1. Test Data Loading (Optional)
21 | 
22 | ```bash
23 | python opentinker/test_geo3k_data.py --data_path ~/data/geo3k/train.parquet
24 | ```
25 | 
26 | ### 2. Configure Training
27 | 
28 | Edit `opentinker/client/client_config/geo3k_param.yaml`:
29 | 
30 | ```yaml
31 | # Model paths
32 | tokenizer_path: Qwen/Qwen2.5-VL-7B-Instruct
33 | processor_path: Qwen/Qwen2.5-VL-7B-Instruct
34 | 
35 | # Data
36 | data_path: ~/data/geo3k/train.parquet
37 | val_data_path: ~/data/geo3k/test.parquet
38 | 
39 | # GRPO settings
40 | adv_estimator: "grpo"
41 | rollout_n: 5
42 | 
43 | # Resources
44 | num_gpus: 8
45 | batch_size: 64
46 | ```
47 | 
48 | ### 3. Launch Training
49 | 
50 | ```bash
51 | python opentinker/client/geo3k_rl.py
52 | ```
53 | 
54 | Or with custom parameters:
55 | 
56 | ```bash
57 | python opentinker/client/geo3k_rl.py \
58 |     tokenizer_path=Qwen/Qwen2.5-VL-7B-Instruct \
59 |     batch_size=32 \
60 |     num_epochs=15 \
61 |     num_gpus=4
62 | ```
63 | 
64 | ## Architecture Components
65 | 
66 | - **Data Generator**: `StaticDatasetGeneratorVL` - loads images from parquet
67 | - **Dataset**: `DynamicGameDatasetVL` - processes text + images with AutoProcessor
68 | - **Environment**: `VLGameEnvironment` - VL-aware training environment
69 | - **Game**: `Geo3KGame` - geometry problem logic with reward computation
70 | - **Client**: `geo3k_rl.py` - training launcher
71 | 
72 | ## Key Differences from Text-Only Training
73 | 
74 | | Aspect | Text-Only | Vision-Language |
75 | |--------|-----------|-----------------|
76 | | Processor | AutoTokenizer | AutoProcessor |
77 | | Data Generator | StaticDatasetGenerator | StaticDatasetGeneratorVL |
78 | | Dataset | DynamicGameDataset | DynamicGameDatasetVL |
79 | | Environment | GameEnvironment | VLGameEnvironment |
80 | | Data Fields | prompt | prompt + images |
81 | | Model Input | input_ids, attention_mask | + pixel_values, image_grid_thw |
82 | 
83 | ## Next Steps
84 | 
85 | ### Add Multi-Turn Support
86 | 
87 | Create a multi-turn version that allows reasoning refinement:
88 | 
89 | 1. Extend `Geo3KGame` for multi-turn interactions
90 | 2. Update config: `max_user_turns: 2`, `max_assistant_turns: 3`
91 | 3. Optionally add tools for intermediate verification
92 | 
93 | ### Add Other VL Tasks
94 | 
95 | Follow the Geo3K pattern for:
96 | - MathVista (math with diagrams)
97 | - ChartQA (chart understanding)
98 | - DocVQA (document QA)
99 | 
100 | ## Troubleshooting
101 | 
102 | ### "No module named transformers"
103 | ```bash
104 | pip install "transformers>=4.37.0"
105 | ```
106 | 
107 | ### "Data file not found"
108 | ```bash
109 | python verl/examples/data_preprocess/geo3k.py --local_save_dir ~/data/geo3k
110 | ```
111 | 
112 | ### "AutoProcessor not found"
113 | Ensure you're using a VL model path (e.g., Qwen2.5-VL, not Qwen2.5).
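
A quick way to check this is to load the processor directly. This is a minimal sanity check, assuming `transformers` is installed and using the model name from the config above; a VL checkpoint should yield a multimodal processor, while a text-only checkpoint typically yields a plain tokenizer:

```python
from transformers import AutoProcessor

# Expect a processor class (with image handling), not a plain tokenizer
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
print(type(processor).__name__)
```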
114 | 115 | ## References 116 | 117 | - Implementation Plan: `implementation_plan.md` 118 | - Walkthrough: `walkthrough.md` 119 | - verl Geo3K Example: `verl/examples/grpo_trainer/run_qwen2_5_vl-7b.sh` 120 | -------------------------------------------------------------------------------- /opentinker/client/geo3k_rl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Geo3K Vision-Language RL Training Client. 3 | 4 | This script launches Geo3K geometry problem training using vision-language models. 5 | It follows the same pattern as math_rl.py but uses VL-specific components. 6 | """ 7 | 8 | import hydra 9 | from omegaconf import OmegaConf 10 | 11 | from opentinker.client.utils.http_training_client import ServiceClient, SchedulerClient 12 | from opentinker.environment.geo3k import Geo3KGameEnvironment 13 | from opentinker.environment.game_stats_client import GameStatsClient 14 | from opentinker.client.utils.utils import resolve_paths_in_config 15 | from opentinker.client.utils.scheduler_client_lifecycle import get_lifecycle_manager 16 | 17 | 18 | @hydra.main(config_path="client_config", config_name="geo3k_param.yaml") 19 | def main(args): 20 | args = resolve_paths_in_config(args) 21 | lifecycle = get_lifecycle_manager() 22 | 23 | print("=" * 60) 24 | print("Geo3K Vision-Language Training with OpenTinker") 25 | print("=" * 60) 26 | 27 | # 1. Submit job to scheduler 28 | scheduler_client = SchedulerClient( 29 | scheduler_url=args.get("scheduler_url", "http://localhost:8780"), 30 | api_key=args.get("scheduler_api_key") 31 | ) 32 | 33 | job_result = scheduler_client.submit_job( 34 | config=OmegaConf.to_container(args, resolve=True), 35 | enable_agent_loop=True, 36 | wandb_key=args.get("wandb_key"), 37 | num_gpus=args.get("num_gpus"), 38 | ) 39 | 40 | job_id = job_result["job_id"] 41 | server_url = job_result["server_url"] 42 | lifecycle.register_job(scheduler_client, job_id) 43 | 44 | print(f"✓ Job {job_id} allocated at {server_url}") 45 | 46 | # 2. Setup Geo3K VL environment 47 | env_endpoint = args.interaction.config.env_endpoint 48 | env = Geo3KGameEnvironment( 49 | config=args, 50 | data_paths=[args.data_path], 51 | val_data_paths=[args.val_data_path] if args.val_data_path else None, 52 | job_id=job_id, 53 | ) 54 | print(f"✓ Geo3K VL environment created, interaction config: {env.get_interaction_config_path()}") 55 | 56 | # 3. Setup game stats client (optional) 57 | game_stats = GameStatsClient(env_endpoint, job_id=env.job_id) 58 | if game_stats.health_check(): 59 | game_stats.reset_all() 60 | print(f"✓ Connected to game server at {env_endpoint}") 61 | else: 62 | game_stats = None 63 | print(f"⚠ Game server not responding at {env_endpoint}") 64 | 65 | # 4. Connect to training server 66 | client = ServiceClient( 67 | server_url=server_url, 68 | project_name=args.project_name, 69 | experiment_name=args.experiment_name, 70 | logger_backends=args.logger_backends, 71 | ) 72 | client.set_config(args, env) 73 | 74 | # 5. Train 75 | print(f"Starting Geo3K training: steps={args.get('num_steps')}, epochs={args.get('num_epochs')}") 76 | 77 | try: 78 | final_metrics = client.fit( 79 | env=env, 80 | num_epochs=args.get("num_epochs"), 81 | num_steps=args.get("num_steps"), 82 | save_freq=args.save_freq, 83 | test_freq=args.test_freq, 84 | verbose=True, 85 | validate_before_training=True, 86 | game_stats_client=game_stats, 87 | ) 88 | print(f"Training completed! 
Metrics: {final_metrics}") 89 | finally: 90 | env.cleanup() 91 | 92 | 93 | if __name__ == "__main__": 94 | main() 95 | -------------------------------------------------------------------------------- /opentinker/environment/static_data_generator_vl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Vision-Language Static Data Generator for OpenTinker. 3 | 4 | This module extends StaticDatasetGenerator to support vision-language models 5 | by loading and processing images from parquet files. 6 | """ 7 | 8 | import logging 9 | from typing import Any, Dict, List, Optional 10 | 11 | from opentinker.environment.static_data_generator import StaticDatasetGenerator 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class StaticDatasetGeneratorVL(StaticDatasetGenerator): 17 | """Static dataset generator with vision-language support. 18 | 19 | This generator extends StaticDatasetGenerator to handle image data 20 | from parquet files. Images are typically stored as lists of PIL images 21 | or image paths in the dataset. 22 | 23 | Args: 24 | data_paths: List of parquet file paths 25 | interaction_name: Name of the interaction handler 26 | prompt_key: Key for prompt field in data (default: "prompt") 27 | ground_truth_key: Key for ground truth answer (default: "ground_truth") 28 | image_key: Key for image field in data (default: "images") 29 | shuffle: Whether to shuffle data 30 | seed: Random seed for shuffling 31 | system_prompt: Optional system prompt to prepend 32 | 33 | Example: 34 | generator = StaticDatasetGeneratorVL( 35 | data_paths=["~/data/geo3k/train.parquet"], 36 | interaction_name="game", 37 | image_key="images", 38 | ) 39 | """ 40 | 41 | def __init__( 42 | self, 43 | data_paths: List[str], 44 | interaction_name: str = "game", 45 | prompt_key: str = "prompt", 46 | ground_truth_key: str = "ground_truth", 47 | image_key: str = "images", 48 | shuffle: bool = False, 49 | seed: Optional[int] = None, 50 | system_prompt: Optional[str] = None, 51 | ): 52 | super().__init__( 53 | data_paths=data_paths, 54 | interaction_name=interaction_name, 55 | prompt_key=prompt_key, 56 | ground_truth_key=ground_truth_key, 57 | shuffle=shuffle, 58 | seed=seed, 59 | system_prompt=system_prompt, 60 | ) 61 | self.image_key = image_key 62 | logger.info(f"StaticDatasetGeneratorVL initialized with image_key='{image_key}'") 63 | 64 | def generate_sample(self, index: int) -> Dict[str, Any]: 65 | """Generate a sample with vision-language data. 
66 | 
67 |         Args:
68 |             index: Sample index
69 | 
70 |         Returns:
71 |             Dict with keys:
72 |             - prompt: List of message dicts
73 |             - env_kwargs: Dict with ground_truth
74 |             - images: List of images (if present)
75 |             - data_source: Data source identifier
76 |         """
77 |         # Get base sample from parent class
78 |         sample = super().generate_sample(index)
79 | 
80 |         # Add images if present in the data
81 |         actual_idx = self._indices[index % len(self._samples)]
82 |         row = self._samples[actual_idx]
83 |         if self.image_key in row:
84 |             images = row[self.image_key]
85 |             # Ensure images is a list
86 |             if not isinstance(images, list):
87 |                 images = [images] if images is not None else []
88 |             sample["images"] = images
89 |         else:
90 |             sample["images"] = []
91 | 
92 |         return sample
--------------------------------------------------------------------------------
/opentinker/reward_functions/math_reward_server.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Example Remote Reward API Server
4 | 
5 | This is a simple FastAPI server that implements the reward computation endpoint.
6 | Use this as a template for creating your own remote reward services.
7 | 
8 | Start the server:
9 |     python opentinker/reward_functions/math_reward_server.py
10 | 
11 | The server listens on http://localhost:30001 by default.
12 | """
13 | 
14 | from fastapi import FastAPI, HTTPException
15 | from pydantic import BaseModel
16 | from typing import Any, Dict, Optional
17 | import uvicorn
18 | from verl.utils.reward_score import default_compute_score
19 | from transformers import PreTrainedTokenizer
20 | 
21 | 
22 | 
23 | app = FastAPI(
24 |     title="Remote Reward API",
25 |     description="Example reward computation service",
26 |     version="1.0.0"
27 | )
28 | 
29 | 
30 | class ComputeRewardRequest(BaseModel):
31 |     """Request model for reward computation"""
32 |     data_source: str
33 |     solution_str: str
34 |     ground_truth: str
35 |     extra_info: Dict[str, Any]
36 |     sandbox_fusion_url: Optional[str] = None
37 |     concurrent_semaphore: Optional[int] = None
38 |     memory_limit_mb: Optional[int] = None
39 |     reward_router_address: Optional[str] = None
40 |     # reward_model_tokenizer: PreTrainedTokenizer = None
41 |     reward_model_tokenizer: Any = None
42 | 
43 | class ComputeRewardResponse(BaseModel):
44 |     """Response model for reward computation"""
45 |     reward: float
46 | 
47 | 
48 | @app.get("/health")
49 | async def health_check():
50 |     """Health check endpoint"""
51 |     return {"status": "healthy", "service": "remote_reward_api"}
52 | 
53 | 
54 | @app.post("/compute_reward", response_model=ComputeRewardResponse)
55 | async def compute_reward(request: ComputeRewardRequest):
56 |     """
57 |     Compute reward for a single solution.
58 | 
59 |     This is a simple example implementation. Replace with your own logic.
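
    Illustrative request/response pair (the values are hypothetical; the field
    names come from ComputeRewardRequest / ComputeRewardResponse above):

        POST /compute_reward
        {
            "data_source": "DigitalLearningGmbH/MATH-lighteval",
            "solution_str": "... so the answer is \\boxed{42}.",
            "ground_truth": "42",
            "extra_info": {}
        }

        -> {"reward": 1.0}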
60 | """ 61 | func_rm_score = default_compute_score( 62 | request.data_source, 63 | request.solution_str, 64 | request.ground_truth, 65 | request.extra_info, 66 | # request.sandbox_fusion_url, 67 | # request.concurrent_semaphore, 68 | # request.memory_limit_mb, 69 | ) 70 | 71 | # Handle both dict and scalar return values 72 | # default_compute_score may return dict with {"score": ..., other_keys: ...} 73 | if isinstance(func_rm_score, dict): 74 | reward = float(func_rm_score.get("score", 0.0)) 75 | else: 76 | reward = float(func_rm_score) 77 | 78 | return ComputeRewardResponse(reward=reward) 79 | 80 | 81 | def main(): 82 | """Start the remote reward API server""" 83 | import argparse 84 | 85 | parser = argparse.ArgumentParser(description="Remote Reward API Server") 86 | parser.add_argument("--port", type=int, default=30001, help="Port to listen on (default: 30001)") 87 | parser.add_argument("--host", type=str, default="0.0.0.0", help="Host to bind to (default: 0.0.0.0)") 88 | args = parser.parse_args() 89 | 90 | print("="*60) 91 | print("Starting Remote Reward API Server") 92 | print("="*60) 93 | print("Endpoints:") 94 | print(f" - Health: http://localhost:{args.port}/health") 95 | print(f" - Compute reward: http://localhost:{args.port}/compute_reward") 96 | print("="*60) 97 | 98 | uvicorn.run( 99 | app, 100 | host=args.host, 101 | port=args.port, 102 | log_level="info" 103 | ) 104 | 105 | 106 | if __name__ == "__main__": 107 | main() 108 | -------------------------------------------------------------------------------- /opentinker/server/config/reward_model/reward_model.yaml: -------------------------------------------------------------------------------- 1 | # configs for the reward model 2 | 3 | # Whether to enable reward model. If False, we compute the reward only with the user-defined reward functions. 4 | # In GSM8K and Math examples, we disable reward model. 5 | # For RLHF alignment example using full_hh_rlhf, we utilize reward model to assess the responses. 6 | # If False, the following parameters are not effective 7 | enable: False 8 | 9 | # Whether to deploy the model to a separate resource pool. 10 | # If true, n_gpus_per_node & nnodes will be used to determine the resource node. 11 | enable_resource_pool: False 12 | n_gpus_per_node: 0 13 | nnodes: 0 14 | 15 | # FSDP strategy: "fsdp" or "fsdp2" 16 | strategy: ??? 17 | 18 | # model config for reward scoring 19 | model: 20 | 21 | # Input tokenizer. If the reward model's chat template is inconsistent with the policy, 22 | # we need to first decode to plaintext, then apply the rm's chat_template. 23 | # Then score with RM. If chat_templates are consistent, it can be set to null. 24 | # set this to null if the chat template is identical 25 | input_tokenizer: ${actor_rollout_ref.model.path} 26 | 27 | # RM’s HDFS path or local path. Note that RM only supports AutoModelForSequenceClassification. 28 | # Other model types need to define their own RewardModelWorker and pass it from the code. 
29 |   path: ~/models/FsfairX-LLaMA3-RM-v0.1
30 | 
31 |   # External model implementation (optional)
32 |   external_lib: ${actor_rollout_ref.model.external_lib}
33 | 
34 |   # Whether to enable loading a remote code model, default to False
35 |   trust_remote_code: False
36 | 
37 | # [Deprecated] Global micro batch size
38 | # will be deprecated, use micro_batch_size_per_gpu
39 | micro_batch_size: null
40 | 
41 | # Local per-GPU micro batch size
42 | micro_batch_size_per_gpu: null
43 | 
44 | # Maximum sequence length to process for scoring
45 | max_length: null
46 | 
47 | # Whether to dynamically adjust batch size at runtime
48 | use_dynamic_bsz: ${critic.use_dynamic_bsz}
49 | 
50 | # Maximum number of tokens per GPU in one forward pass
51 | forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
52 | 
53 | # Reward Manager. This defines the mechanism of computing rule-based reward and handling different reward sources.
54 | # Default is naive. If all verification functions are multiprocessing-safe,
55 | # the reward manager can be set to prime for parallel verification.
56 | reward_manager: naive
57 | 
58 | # Whether to launch custom reward function asynchronously during log_prob
59 | # custom reward function executed async on CPU, during log_prob
60 | launch_reward_fn_async: False
61 | 
62 | # Cloud/local sandbox fusion configuration for custom reward logic
63 | sandbox_fusion:
64 | 
65 |   # Cloud/local function URL for sandbox execution
66 |   url: null
67 | 
68 |   # Max concurrent requests allowed to sandbox
69 |   max_concurrent: 64
70 | 
71 |   # Max memory limit for each sandbox process in MB
72 |   memory_limit_mb: 1024
73 | 
74 | # profile the reward model in `compute_reward`
75 | profiler:
76 | 
77 |   # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
78 |   _target_: verl.utils.profiler.ProfilerConfig
79 | 
80 |   # profiler tool, default same as profiler.tool in global config
81 |   # choices: nsys, npu, torch
82 |   tool: ${oc.select:global_profiler.tool,null}
83 | 
84 |   # Whether to enable profiling of the reward model
85 |   enable: False
86 | 
87 |   # Whether to profile all ranks.
88 |   all_ranks: False
89 | 
90 |   # The ranks that will be profiled. [] or [0,1,...]
91 |   ranks: []
92 | 
93 |   # profile results saving path
94 |   save_path: ${oc.select:global_profiler.save_path,null}
95 | 
96 |   # specific tool config
97 |   tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
--------------------------------------------------------------------------------
/opentinker/server/sandbox_tool.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
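"""Client-side tool for executing model-generated code against a sandbox server.

Request/response contract assumed by this module (see ``code_interpreter``
below): POST ``{"code": ...}`` to the configured ``sandbox_fusion_url`` and
read ``run_result.stdout`` / ``run_result.stderr`` from the JSON response.
"""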
14 | 
15 | import re
16 | import aiohttp
17 | from transformers.utils import get_json_schema
18 | from verl.tools.base_tool import BaseTool, OpenAIFunctionToolSchema, ToolResponse
19 | 
20 | 
21 | class SandboxTool(BaseTool):
22 |     """Client tool to interact with the Sandbox server."""
23 | 
24 |     def __init__(self, config: dict, tool_schema: OpenAIFunctionToolSchema):
25 |         super().__init__(config, tool_schema)
26 |         # Different models may use different code-fence tags, e.g. ```python, ```py, etc.
27 |         self.code_pattern = re.compile(r"```py(.*?)```", re.DOTALL)
28 | 
29 |     async def code_interpreter(self, code: str) -> str:
30 |         """Execute the code in the sandbox.
31 | 
32 |         Args:
33 |             code: The code to be executed.
34 | 
35 |         Returns:
36 |             str: The output of the code execution.
37 |         """
38 |         async with aiohttp.ClientSession() as session:
39 |             async with session.post(
40 |                 self.config.get("sandbox_fusion_url"),
41 |                 json={"code": code},
42 |             ) as resp:
43 |                 resp.raise_for_status()
44 |                 result = await resp.json()
45 |                 stdout, stderr = result["run_result"]["stdout"], result["run_result"]["stderr"]
46 |                 return stdout + stderr
47 | 
48 |     def get_openai_tool_schema(self) -> OpenAIFunctionToolSchema:
49 |         schema = get_json_schema(self.code_interpreter)
50 |         return OpenAIFunctionToolSchema(**schema)
51 | 
52 |     async def execute(self, instance_id: str, parameters: dict, **kwargs) -> tuple[ToolResponse, float, dict]:
53 |         code = parameters["code"]
54 |         matches = self.code_pattern.findall(code)
55 |         if matches:
56 |             code = matches[0].strip()
57 | 
58 |         # NOTE: Some scripts may not explicitly print a result, so we wrap the last non-empty line in a print statement.
59 |         # A better approach is to SFT the model to print results by default; we skip the SFT stage in this tutorial.
60 |         lines = code.split("\n")
61 |         for i, line in reversed(list(enumerate(lines))):
62 |             if line == "":
63 |                 continue
64 |             if not lines[i].startswith("print"):
65 |                 lines[i] = f"print({line})"
66 |             break
67 |         code = "\n".join(lines)
68 | 
69 |         result = await self.code_interpreter(code)
70 |         return ToolResponse(text=result), 0.0, {}
71 | 
72 | 
73 | if __name__ == "__main__":
74 |     # Example usage - assumes sandbox server is running
75 |     import asyncio
76 | 
77 |     async def test_sandbox_tool():
78 |         sandbox_tool = SandboxTool(
79 |             config={"sandbox_fusion_url": "http://localhost:8000/run_code"},
80 |             tool_schema=None
81 |         )
82 | 
83 |         # Test code execution (the trailing expression gets wrapped in print())
84 |         test_code = "result = 2 + 2\nresult"
85 |         response, reward, info = await sandbox_tool.execute(
86 |             instance_id="test",
87 |             parameters={"code": test_code}
88 |         )
89 |         print(f"Response: {response.text}")
90 | 
91 |     asyncio.run(test_sandbox_tool())
92 | 
--------------------------------------------------------------------------------
/opentinker/client/math_rl.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import hydra
3 | from omegaconf import OmegaConf
4 | from torch.utils.data import DataLoader
5 | from transformers import AutoTokenizer
6 | from torchdata.stateful_dataloader import StatefulDataLoader
7 | 
8 | from utils.http_training_client import ServiceClient, SchedulerClient
9 | from opentinker.environment.base_game_environment import GameEnvironment
10 | from opentinker.environment.base_data_generator import DynamicGameDataset, collate_fn
11 | from opentinker.environment.math import MathGame
12 | from opentinker.environment.static_data_generator import StaticDatasetGenerator
13 | from
opentinker.environment.game_stats_client import GameStatsClient 14 | from utils.utils import resolve_paths_in_config 15 | from utils.scheduler_client_lifecycle import get_lifecycle_manager 16 | from verl.trainer.main_ppo import create_rl_sampler 17 | from opentinker.environment.math.math_env import MathGameEnvironment 18 | 19 | 20 | @hydra.main(config_path="client_config", config_name="math_param.yaml") 21 | def main(args): 22 | args = resolve_paths_in_config(args) 23 | lifecycle = get_lifecycle_manager() 24 | 25 | print("=" * 60) 26 | print("Math Training with GameEnvironment Pattern") 27 | print("=" * 60) 28 | 29 | # 1. Submit job to scheduler 30 | scheduler_client = SchedulerClient( 31 | scheduler_url=args.get("scheduler_url", "http://localhost:8780"), 32 | api_key=args.get("scheduler_api_key") 33 | ) 34 | 35 | job_result = scheduler_client.submit_job( 36 | config=OmegaConf.to_container(args, resolve=True), 37 | enable_agent_loop=True, 38 | wandb_key=args.get("wandb_key"), 39 | num_gpus=args.get("num_gpus"), 40 | ) 41 | 42 | job_id = job_result["job_id"] 43 | server_url = job_result["server_url"] 44 | lifecycle.register_job(scheduler_client, job_id) 45 | 46 | print(f"✓ Job {job_id} allocated at {server_url}") 47 | 48 | # 2. Setup environment (job_id is automatically handled) 49 | env_endpoint = args.interaction.config.env_endpoint 50 | env = MathGameEnvironment( 51 | game_class=MathGame, 52 | config=args, 53 | data_paths=[args.data_path], 54 | val_data_paths=[args.val_data_path] if args.val_data_path else None, 55 | job_id=job_id, # Pass job_id directly 56 | ) 57 | print(f"✓ Environment created, interaction config: {env.get_interaction_config_path()}") 58 | 59 | # 3. Setup game stats client (use env.job_id for consistency) 60 | game_stats = GameStatsClient(env_endpoint, job_id=env.job_id) 61 | if game_stats.health_check(): 62 | game_stats.reset_all() 63 | print(f"✓ Connected to math server at {env_endpoint}") 64 | else: 65 | game_stats = None 66 | print(f"⚠ Math server not responding at {env_endpoint}") 67 | 68 | # 4. Connect to training server 69 | client = ServiceClient( 70 | server_url=server_url, 71 | project_name=args.project_name, 72 | experiment_name=args.experiment_name, 73 | logger_backends=args.logger_backends, 74 | ) 75 | client.set_config(args, env) 76 | 77 | # 5. Train 78 | print(f"Starting training: steps={args.get('num_steps')}, epochs={args.get('num_epochs')}") 79 | 80 | try: 81 | final_metrics = client.fit( 82 | env=env, 83 | num_epochs=args.get("num_epochs"), 84 | num_steps=args.get("num_steps"), 85 | save_freq=args.save_freq, 86 | test_freq=args.test_freq, 87 | verbose=True, 88 | validate_before_training=True, 89 | game_stats_client=game_stats, 90 | ) 91 | print(f"Training completed! Metrics: {final_metrics}") 92 | finally: 93 | env.cleanup() 94 | 95 | 96 | if __name__ == "__main__": 97 | main() 98 | -------------------------------------------------------------------------------- /opentinker/data_preprocess/geo3k.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | Preprocess the Geometry3k dataset to parquet format
16 | """
17 | 
18 | import argparse
19 | import os
20 | 
21 | import datasets
22 | 
23 | from verl.utils.hdfs_io import copy, makedirs
24 | 
25 | if __name__ == "__main__":
26 |     parser = argparse.ArgumentParser()
27 |     parser.add_argument("--local_dir", default=None)
28 |     parser.add_argument("--hdfs_dir", default=None)
29 |     parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
30 |     parser.add_argument(
31 |         "--local_save_dir", default="./data/geo3k", help="The save directory for the preprocessed dataset."
32 |     )
33 | 
34 |     args = parser.parse_args()
35 |     local_dataset_path = args.local_dataset_path
36 | 
37 |     data_source = "hiyouga/geometry3k"
38 | 
39 |     if local_dataset_path is not None:
40 |         dataset = datasets.load_dataset(
41 |             local_dataset_path,
42 |         )
43 |     else:
44 |         dataset = datasets.load_dataset(
45 |             data_source,
46 |         )
47 | 
48 |     train_dataset = dataset["train"]
49 |     test_dataset = dataset["test"]
50 | 
51 |     instruction_following = (
52 |         r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. "
53 |         r"The reasoning process MUST BE enclosed within <think> </think> tags. "
54 |         r"The final answer MUST BE put in \boxed{}."
55 |     )
56 | 
57 |     # build a map function that reformats each example for training
58 |     def make_map_fn(split):
59 |         def process_fn(example, idx):
60 |             problem = example.pop("problem")
61 |             prompt = problem + " " + instruction_following
62 |             answer = example.pop("answer")
63 |             images = example.pop("images")
64 | 
65 |             data = {
66 |                 "data_source": data_source,
67 |                 "prompt": [
68 |                     {
69 |                         "role": "user",
70 |                         "content": prompt,
71 |                     }
72 |                 ],
73 |                 "images": images,
74 |                 "ability": "math",
75 |                 "reward_model": {"style": "rule", "ground_truth": answer},
76 |                 "extra_info": {
77 |                     "split": split,
78 |                     "index": idx,
79 |                     "answer": answer,
80 |                     "question": problem,
81 |                 },
82 |             }
83 |             return data
84 | 
85 |         return process_fn
86 | 
87 |     train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True, num_proc=8)
88 |     test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True, num_proc=8)
89 | 
90 |     hdfs_dir = args.hdfs_dir
91 |     local_save_dir = args.local_dir
92 |     if local_save_dir is not None:
93 |         print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
94 |     else:
95 |         local_save_dir = args.local_save_dir
96 | 
97 |     train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
98 |     test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))
99 | 
100 |     if hdfs_dir is not None:
101 |         makedirs(hdfs_dir)
102 |         copy(src=local_save_dir, dst=hdfs_dir)
--------------------------------------------------------------------------------
/opentinker/scheduler/SCHEDULER_GUIDE.md:
--------------------------------------------------------------------------------
1 | # Scheduler & Web Dashboard Guide
2 | 
3 | This guide covers configuration and usage for the OpenTinker Job Scheduler and Web Dashboard.
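
Before any of the steps below, the scheduler itself must be running. A typical invocation is sketched here; the exact entrypoint (the `launch_scheduler.sh` helper and its location) is an assumption, so adjust it to your checkout:

```bash
# Start the scheduler with the default scheduler.yaml config (hypothetical path)
bash opentinker/scripts/launch_scheduler.sh
```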
4 | 
5 | ## Configuration
6 | 
7 | The scheduler is configured via `opentinker/scheduler/config/scheduler.yaml`.
8 | 
9 | ### Key Settings
10 | 
11 | ```yaml
12 | # Authentication
13 | enable_auth: true  # Set to true to require API keys
14 | user_db_path: "scheduler_users.db"
15 | 
16 | # Resources
17 | available_gpus: [0, 1, 2, 3]  # GPUs to manage
18 | port_range: null  # null for auto-detect, or [min, max]
19 | num_ports: 50  # Number of ports to auto-detect
20 | scheduler_port: 8765  # Main API port
21 | ```
22 | 
23 | ## Authentication
24 | 
25 | ### 1. Registering Users
26 | 
27 | **Method 1: Interactive Script (Recommended)**
28 | ```bash
29 | python opentinker/scheduler/register_user_example.py
30 | ```
31 | This script prompts for a username, registers the user, and saves the API key to a local file.
32 | 
33 | **Method 2: REST API**
34 | ```bash
35 | # Register a new user
36 | curl -X POST "http://<scheduler_host>:<scheduler_port>/register?username=<username>"
37 | ```
38 | **Response:**
39 | ```json
40 | {
41 |   "user_id": "user_abc123",
42 |   "username": "your_username",
43 |   "api_key": "otk_98b8db24ccd64c92e1fdd9a232e209fa",
44 |   "message": "User registered successfully..."
45 | }
46 | ```
47 | > ⚠️ **Important**: Save your API key immediately! It cannot be retrieved after registration.
48 | 
49 | ### 2. Using the API Key
50 | 
51 | Include the API key in the `Authorization` header for all requests:
52 | 
53 | **cURL**:
54 | ```bash
55 | curl -H "Authorization: Bearer <api_key>" http://<scheduler_host>:<scheduler_port>/list_jobs
56 | ```
57 | 
58 | **Python**:
59 | ```python
60 | import requests
61 | headers = {"Authorization": f"Bearer {api_key}"}
62 | response = requests.get(f"{scheduler_url}/list_jobs", headers=headers)
63 | ```
64 | ## Web Dashboard
65 | 
66 | The Web Dashboard provides a real-time view of job status and resource usage.
67 | 
68 | ### 1. Start the Dashboard
69 | 
70 | ```bash
71 | python opentinker/scheduler/web_dashboard.py --port 8081
72 | ```
73 | 
74 | ### 2. Access
75 | 
76 | Open [http://localhost:8081/web_dashboard.html](http://localhost:8081/web_dashboard.html) in your browser.
77 | 
78 | ### 3. Authentication
79 | 
80 | If `enable_auth` is true in the scheduler config, you must provide an API Key.
81 | 
82 | 1. **Get your API Key**:
83 |    - Run: `python opentinker/scheduler/register_user_example.py`
84 |    - Or check your client config: `cat client/client_config/opentinker_param.yaml | grep scheduler_api_key`
85 | 2. **Enter in Dashboard**:
86 |    - Go to the "Settings" section at the top of the dashboard.
87 |    - Paste your key into the "API Key" field.
88 |    - The key is automatically saved to your browser's local storage.
89 | 
90 | ## Scheduler API Reference
91 | 
92 | Base URL: `http://localhost:<scheduler_port>`
93 | 
94 | | Method | Endpoint | Description |
95 | |--------|----------|-------------|
96 | | POST | `/submit_job` | Submit a new training job |
97 | | GET | `/list_jobs` | List all jobs and their status |
98 | | GET | `/job_status/{job_id}` | Get details for a specific job |
99 | | DELETE | `/cancel_job/{job_id}` | Cancel a running or queued job |
100 | | POST | `/complete_job/{job_id}` | Mark a job as completed (called by client) |
101 | | POST | `/register` | Register a new user (if auth enabled) |
102 | 
103 | ## Troubleshooting
104 | 
105 | ### Job stuck in QUEUED
106 | - Check GPU availability with `nvidia-smi`.
107 | - Verify the scheduler has free ports in its range.
108 | 
109 | ### 401 Unauthorized Errors
110 | - Ensure you are providing a valid `Authorization: Bearer <api_key>` header (API) or have entered the key in the dashboard.
110 | - If running locally without need for auth, set `enable_auth: false` in `scheduler.yaml`. 111 | 112 | ### Server Launch Failures 113 | - Check the scheduler console logs for Python tracebacks. 114 | - Ensure all dependencies are installed in the environment where the scheduler runs. 115 | -------------------------------------------------------------------------------- /opentinker/client/math_inference.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import hydra 3 | from omegaconf import OmegaConf 4 | 5 | from utils.http_training_client import InferenceSchedulerClient 6 | from utils.scheduler_client_lifecycle import get_lifecycle_manager 7 | from opentinker.environment.inference_pipeline import run_inference 8 | from opentinker.environment.math import MathGame 9 | from opentinker.environment.game_stats_client import GameStatsClient 10 | 11 | 12 | @hydra.main(config_path="client_config", config_name="math_inference_scheduler_config.yaml", version_base=None) 13 | def main(args): 14 | """Run math inference with scheduler-managed vLLM server.""" 15 | lifecycle = get_lifecycle_manager() 16 | 17 | print("=" * 60) 18 | print("Math Inference with Scheduler") 19 | print("=" * 60) 20 | 21 | if not args.model_path: 22 | raise ValueError("model_path is required") 23 | if not args.data_path: 24 | raise ValueError("data_path is required") 25 | 26 | # 1. Submit inference job to scheduler 27 | scheduler_client = InferenceSchedulerClient( 28 | scheduler_url=args.get("scheduler_url", "http://localhost:8780"), 29 | api_key=args.get("scheduler_api_key"), 30 | ) 31 | 32 | print(f"Submitting inference job to scheduler...") 33 | job_result = scheduler_client.submit_inference_job( 34 | model_path=args.model_path, 35 | tokenizer_path=args.get("tokenizer_path"), 36 | tensor_parallel_size=args.get("tensor_parallel_size", 1), 37 | num_gpus=args.get("num_gpus"), 38 | gpu_memory_utilization=args.get("gpu_memory_utilization", 0.9), 39 | max_model_len=args.get("max_model_len"), 40 | trust_remote_code=args.get("trust_remote_code", True), 41 | ) 42 | 43 | job_id = job_result["job_id"] 44 | vllm_server_url = job_result["vllm_server_url"] 45 | 46 | # Register job for lifecycle cleanup 47 | lifecycle.register_job(scheduler_client, job_id) 48 | 49 | print(f"✓ Inference job {job_id} started at {vllm_server_url}") 50 | 51 | # 2. Setup GameStatsClient for per-step metrics (with job_id isolation) 52 | game_stats = GameStatsClient(args.env_endpoint, job_id=job_id) 53 | if game_stats.health_check(): 54 | print(f"✓ Connected to game server at {args.env_endpoint}") 55 | game_stats.reset_all() # Reset stats for this job before inference 56 | else: 57 | print(f"⚠ Game server not available at {args.env_endpoint}, continuing without stats") 58 | game_stats = None 59 | 60 | # 3. 
Run inference using the remote vLLM server 61 | print(f"Running inference on {args.data_path}...") 62 | 63 | results = run_inference( 64 | model_path=None, # Not needed when using vllm_server_url 65 | vllm_server_url=vllm_server_url, 66 | tokenizer_path=args.get("tokenizer_path") or args.model_path, 67 | data_path=args.data_path, 68 | game_class=MathGame, 69 | env_endpoint=args.env_endpoint, 70 | job_id=job_id, # Pass job_id for stats isolation 71 | output_path=args.get("output_path"), 72 | temperature=args.temperature, 73 | top_p=args.top_p, 74 | max_tokens=args.max_new_tokens, 75 | max_samples=args.get("max_samples"), 76 | max_user_turns=args.multi_turn.max_user_turns, 77 | max_assistant_turns=args.multi_turn.max_assistant_turns, 78 | ) 79 | 80 | # 4. Log game stats after inference 81 | if game_stats: 82 | stats = game_stats.get_all_stats() 83 | print(f"\nGame Server Stats (job_id={job_id}):") 84 | print(f" Total samples: {stats.get('total_samples', 0)}") 85 | print(f" Games completed: {stats.get('games_in_step', 0)}") 86 | print(f" Mean reward: {stats.get('mean_final_reward', 0):.4f}") 87 | 88 | if args.get("output_path"): 89 | print(f"\nResults saved to: {args.output_path}") 90 | 91 | print(f"\n{'='*60}") 92 | print("Inference completed! vLLM server will be automatically cleaned up.") 93 | print(f"{'='*60}") 94 | 95 | 96 | if __name__ == "__main__": 97 | main() 98 | 99 | -------------------------------------------------------------------------------- /opentinker/server/config/ref/ref.yaml: -------------------------------------------------------------------------------- 1 | # actor_rollout_ref.ref: FSDP config same as actor. For models larger than 7B, it’s recommended to turn on offload for ref by default 2 | strategy: ${actor_rollout_ref.actor.strategy} 3 | 4 | # whether to enable torch.compile 5 | # same as actor_rollout_ref.actor.use_torch_compile if it exists, otherwise 1 6 | use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} 7 | 8 | # [Will be deprecated, use log_prob_micro_batch_size_per_gpu] 9 | # The batch size for one forward pass in the computation of log_prob. Global batch size. 10 | log_prob_micro_batch_size: null 11 | 12 | # The batch size for one forward pass in the computation of log_prob. Local batch size per GPU. 13 | log_prob_micro_batch_size_per_gpu: null 14 | 15 | # enable dynamic batch size (sequence packing) for log_prob computation 16 | # same as actor_rollout_ref.actor.use_dynamic_bsz if it exists, otherwise false 17 | log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} 18 | 19 | # the max token length per GPU 20 | # same as actor_rollout_ref.actor.ppo_max_token_len_per_gpu if it exists, otherwise 16384 21 | log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} 22 | 23 | # profile the ref model in `compute_log_prob` 24 | profiler: 25 | 26 | # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs 27 | _target_: verl.utils.profiler.ProfilerConfig 28 | 29 | # choices: nsys, npu, torch, torch_memory 30 | tool: ${oc.select:global_profiler.tool,null} 31 | 32 | # whether enable profile on Ref 33 | enable: False 34 | 35 | # Whether to profile all ranks. 36 | all_ranks: False 37 | 38 | # The ranks that will be profiled. [] or [0,1,...] 
39 | ranks: [] 40 | 41 | # profile results saving path 42 | save_path: ${oc.select:global_profiler.save_path,null} 43 | 44 | # specific tool config which only related to the role 45 | tool_config: 46 | 47 | # nsys tool config 48 | nsys: 49 | 50 | # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs 51 | _target_: verl.utils.profiler.config.NsightToolConfig 52 | 53 | # True for each task has its own database, False for all tasks in one training step share one database. 54 | discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} 55 | 56 | # npu config 57 | npu: 58 | 59 | # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs 60 | _target_: verl.utils.profiler.config.NPUToolConfig 61 | 62 | # Contents to profile, can be empty 63 | # options: npu, cpu, memory, shapes, module, stack 64 | contents: [] 65 | 66 | # Collection level, optional values: level_none, level0, level1, level2. 67 | level: "level1" 68 | 69 | # Whether to automatically parse the data. 70 | analysis: True 71 | 72 | # True for each task has its own database, False for all tasks in one training step share one database. 73 | discrete: False 74 | 75 | # torch profiler config 76 | torch: 77 | 78 | # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs 79 | _target_: verl.utils.profiler.config.TorchProfilerToolConfig 80 | 81 | # start profile mini-batch in training 82 | # NOTICE: different with global steps config which refers to iteration 83 | # This field only related with mini-batch 84 | step_start: 0 85 | 86 | # stop profile mini-batch in training 87 | step_end: null 88 | 89 | # torch memory profiler config 90 | torch_memory: 91 | 92 | # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs 93 | _target_: verl.utils.profiler.config.TorchMemoryToolConfig 94 | 95 | # Maximum number of memory allocation entries to track 96 | trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} 97 | 98 | # Stack trace depth for memory allocations 99 | stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} -------------------------------------------------------------------------------- /opentinker/data_preprocess/math_multiturn_w_interaction.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # Copyright 2023-2024 SGLang Team 3 | # Copyright 2025 ModelBest Inc. and/or its affiliates 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """
17 | Preprocess the MATH-lighteval dataset to parquet format, with multi-turn interaction kwargs
18 | """
19 | 
20 | import argparse
21 | import os
22 | import re
23 | 
24 | import datasets
25 | 
26 | from verl.utils.hdfs_io import copy, makedirs
27 | from verl.utils.reward_score.math_reward import last_boxed_only_string, remove_boxed
28 | 
29 | 
30 | def extract_solution(solution_str):
31 |     return remove_boxed(last_boxed_only_string(solution_str))
32 | 
33 | 
34 | if __name__ == "__main__":
35 |     parser = argparse.ArgumentParser()
36 |     parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.")
37 |     parser.add_argument("--hdfs_dir", default=None)
38 |     parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
39 |     parser.add_argument(
40 |         "--local_save_dir", default="data/math_agentloop", help="The save directory for the preprocessed dataset."
41 |     )
42 | 
43 |     args = parser.parse_args()
44 |     local_dataset_path = args.local_dataset_path
45 | 
46 |     data_source = "DigitalLearningGmbH/MATH-lighteval"
47 | 
48 |     dataset = datasets.load_dataset(
49 |         local_dataset_path if local_dataset_path is not None else data_source,
50 |     )
51 |     train_dataset = dataset["train"]
52 |     test_dataset = dataset["test"]
53 | 
54 |     instruction_following = "Let's think step by step and output the final answer within \\boxed{}."
55 | 
56 |     # build a map function that reformats each example for training
57 |     def make_map_fn(split):
58 |         def process_fn(example, idx):
59 |             question_raw = example.pop("problem")
60 | 
61 |             question = question_raw + " " + instruction_following
62 | 
63 |             answer_raw = example.pop("solution")
64 |             solution = extract_solution(answer_raw)
65 |             data = {
66 |                 "data_source": data_source,
67 |                 "prompt": [
68 |                     {
69 |                         "role": "user",
70 |                         "content": question,
71 |                     },
72 |                 ],
73 |                 "ability": "math",
74 |                 "reward_model": {"style": "rule", "ground_truth": solution},
75 |                 "extra_info": {
76 |                     "split": split,
77 |                     "index": idx,
78 |                     "answer": answer_raw,
79 |                     "question": question_raw,
80 |                     "interaction_kwargs": {
81 |                         "name": "math",
82 |                         "query": question,
83 |                         "ground_truth": solution,
84 |                     },
85 |                 },
86 |             }
87 |             return data
88 | 
89 |         return process_fn
90 | 
91 |     train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
92 |     test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)
93 | 
94 |     hdfs_dir = args.hdfs_dir
95 |     local_save_dir = args.local_dir
96 |     if local_save_dir is not None:
97 |         print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
98 |     else:
99 |         local_save_dir = args.local_save_dir
100 | 
101 |     train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
102 |     test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))
103 | 
104 |     if hdfs_dir is not None:
105 |         makedirs(hdfs_dir)
106 |         copy(src=local_save_dir, dst=hdfs_dir)
107 | 
--------------------------------------------------------------------------------
/opentinker/data_preprocess/math_dataset.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Preprocess the MATH-lighteval dataset to parquet format 16 | """ 17 | 18 | import argparse 19 | import json 20 | import os 21 | 22 | import datasets 23 | 24 | from verl.utils.hdfs_io import copy, makedirs 25 | from verl.utils.reward_score.math_reward import last_boxed_only_string, remove_boxed 26 | 27 | 28 | def extract_solution(solution_str): 29 | return remove_boxed(last_boxed_only_string(solution_str)) 30 | 31 | 32 | if __name__ == "__main__": 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument("--local_dir", default=None) 35 | parser.add_argument("--hdfs_dir", default=None) 36 | parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.") 37 | parser.add_argument( 38 | "--local_save_dir", default="~/data/math", help="The save directory for the preprocessed dataset." 39 | ) 40 | 41 | args = parser.parse_args() 42 | local_dataset_path = args.local_dataset_path 43 | 44 | # 'lighteval/MATH' is no longer available on huggingface. 45 | # Use mirror repo: DigitalLearningGmbH/MATH-lighteval 46 | data_source = "DigitalLearningGmbH/MATH-lighteval" 47 | print(f"Loading the {data_source} dataset from huggingface...", flush=True) 48 | if local_dataset_path is not None: 49 | dataset = datasets.load_dataset( 50 | local_dataset_path, 51 | ) 52 | else: 53 | dataset = datasets.load_dataset( 54 | data_source, 55 | ) 56 | 57 | train_dataset = dataset["train"] 58 | test_dataset = dataset["test"] 59 | 60 | instruction_following = "Let's think step by step and output the final answer within \\boxed{}." 61 | 62 | 63 | # add a row to each data item that represents a unique id 64 | def make_map_fn(split): 65 | def process_fn(example, idx): 66 | question = example.pop("problem") 67 | 68 | question = question + " " + instruction_following 69 | 70 | answer = example.pop("solution") 71 | solution = extract_solution(answer) 72 | data = { 73 | "data_source": data_source, 74 | "prompt": [{"role": "user", "content": question}], 75 | "ability": "math", 76 | "reward_model": {"style": "rule", "ground_truth": solution}, 77 | "extra_info": {"split": split, "index": idx}, 78 | } 79 | return data 80 | 81 | return process_fn 82 | 83 | train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True) 84 | test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True) 85 | 86 | local_save_dir = args.local_dir 87 | if local_save_dir is not None: 88 | print("Warning: Argument 'local_dir' is deprecated. 
Please use 'local_save_dir' instead.") 89 | else: 90 | local_save_dir = args.local_save_dir 91 | 92 | local_dir = os.path.expanduser(local_save_dir) 93 | hdfs_dir = args.hdfs_dir 94 | 95 | train_dataset.to_parquet(os.path.join(local_dir, "train.parquet")) 96 | test_dataset.to_parquet(os.path.join(local_dir, "test.parquet")) 97 | # Save one example as JSON for reference 98 | example = train_dataset[0] 99 | with open(os.path.join(local_dir, "train_example.json"), "w") as f: 100 | json.dump(example, f, indent=2) 101 | example = test_dataset[0] 102 | with open(os.path.join(local_dir, "test_example.json"), "w") as f: 103 | json.dump(example, f, indent=2) 104 | if hdfs_dir is not None: 105 | makedirs(hdfs_dir) 106 | 107 | copy(src=local_dir, dst=hdfs_dir) 108 | -------------------------------------------------------------------------------- /opentinker/environment/legacy/generic/README.md: -------------------------------------------------------------------------------- 1 | # Generic Environment for LLM-Environment Interaction 2 | 3 | This directory contains the generic environment implementation for training LLMs 4 | to interact with external environments (like OpenAI Gym). 5 | 6 | ## Architecture 7 | 8 | ``` 9 | ┌─────────────────────────────────────────────────────────────────────┐ 10 | │ Training Pipeline │ 11 | ├─────────────────────────────────────────────────────────────────────┤ 12 | │ │ 13 | │ ┌─────────────────┐ ┌─────────────────────────────────────┐ │ 14 | │ │ GenericEnvironment│───────▶│ GenericAgentLoop │ │ 15 | │ │ (BaseEnvironment) │ │ (verl/experimental/agent_loop/) │ │ 16 | │ │ │ │ │ │ 17 | │ │ - Dataloader │ │ PENDING → GENERATING → INTERACTING │ │ 18 | │ │ - InteractionSpec │ │ │ │ │ │ 19 | │ └─────────────────┘ │ ▼ ▼ │ │ 20 | │ │ LLM Server Environment │ │ 21 | │ │ (mask=1) (mask=0) │ │ 22 | │ └─────────────────────────────────────┘ │ 23 | │ │ │ 24 | │ ┌───────────────┴───────────────┐ │ 25 | │ │ BaseInteraction │ │ 26 | │ │ (verl/interactions/) │ │ 27 | │ │ │ │ 28 | │ │ - GymEnvironmentInteraction │ │ 29 | │ │ - SimpleTextEnvironment │ │ 30 | │ │ - Gsm8kInteraction │ │ 31 | │ └───────────────────────────────┘ │ 32 | └─────────────────────────────────────────────────────────────────────┘ 33 | ``` 34 | 35 | ## Key Concept: Environment Provides Rewards 36 | 37 | Unlike standard PPO training where a separate reward function evaluates completions, 38 | in environment interaction: 39 | 40 | - **Reward comes from the environment** via `interaction.generate_response()` 41 | - **No external `reward_function` is needed** 42 | - `response_mask` ensures only LLM tokens contribute to the loss 43 | 44 | ## Quick Start 45 | 46 | ```python 47 | from omegaconf import OmegaConf 48 | from opentinker.environment.generic.generic_env import ( 49 | GenericEnvironment, 50 | InteractionSpec, 51 | ) 52 | 53 | # 1. Configure environment 54 | config = OmegaConf.create({ 55 | "tokenizer_path": "meta-llama/Llama-2-7b-chat-hf", 56 | "data_path": "data/train.parquet", 57 | "max_prompt_tokens": 1024, 58 | "max_new_tokens": 512, 59 | "batch_size": 4, 60 | "num_workers": 4, 61 | "algorithm": "agent_loop", 62 | }) 63 | 64 | # 2. Define interaction with Gym environment 65 | interaction_specs = [ 66 | InteractionSpec( 67 | name="my_env", 68 | class_path="verl.interactions.gym_environment_interaction.GymEnvironmentInteraction", 69 | config={"env_endpoint": "http://localhost:8080", "max_steps": 100} 70 | ) 71 | ] 72 | 73 | # 3. 
Create environment 74 | env = GenericEnvironment(config, interaction_specs) 75 | 76 | # 4. Use with training client 77 | train_dl, val_dl = env.get_dataloader() 78 | env_config = env.setup(client) 79 | ``` 80 | 81 | ## Files 82 | 83 | | File | Description | 84 | |------|-------------| 85 | | `generic_env.py` | Main GenericEnvironment class | 86 | | `example_usage.py` | Usage examples | 87 | 88 | ## Dataset Format 89 | 90 | Your training data should include `interaction_kwargs` to specify which interaction to use: 91 | 92 | ```json 93 | { 94 | "prompt": [ 95 | {"role": "system", "content": "You are playing a text adventure..."}, 96 | {"role": "user", "content": "You are in a cave. What do you do?"} 97 | ], 98 | "extra_info": { 99 | "interaction_kwargs": {"name": "my_env"} 100 | } 101 | } 102 | ``` 103 | 104 | ## Related Files 105 | 106 | - Agent Loop: `verl/experimental/agent_loop/generic_agent_loop.py` 107 | - Interactions: `verl/interactions/gym_environment_interaction.py` 108 | - Mock Server: `opentinker/environment/example/mock_env_server.py` 109 | -------------------------------------------------------------------------------- /opentinker/client/math_tool_rl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import hydra 3 | from omegaconf import OmegaConf 4 | from transformers import AutoTokenizer 5 | from torchdata.stateful_dataloader import StatefulDataLoader 6 | 7 | from utils.http_training_client import ServiceClient, SchedulerClient 8 | from opentinker.environment.base_game_environment import GameEnvironment 9 | from opentinker.environment.base_data_generator import DynamicGameDataset, collate_fn 10 | from opentinker.environment.math.math_tool_game import CodeInterpreterMathGame 11 | from opentinker.environment.static_data_generator import StaticDatasetGenerator 12 | from opentinker.environment.game_stats_client import GameStatsClient 13 | from utils.utils import resolve_paths_in_config 14 | from utils.scheduler_client_lifecycle import get_lifecycle_manager 15 | from verl.trainer.main_ppo import create_rl_sampler 16 | from opentinker.environment.math.math_tool_env import MathCodeInterpreterEnvironment 17 | 18 | @hydra.main(config_path="client_config", config_name="math_code_interpreter_param.yaml") 19 | def main(args): 20 | args = resolve_paths_in_config(args) 21 | lifecycle = get_lifecycle_manager() 22 | 23 | print("=" * 60) 24 | print("Math Training with Code Interpreter (Agent Loop)") 25 | print("=" * 60) 26 | 27 | # 1. Submit job to scheduler 28 | print("\n[1/4] Submitting job to scheduler...") 29 | scheduler_client = SchedulerClient( 30 | scheduler_url=args.get("scheduler_url", "http://localhost:8780"), 31 | api_key=args.get("scheduler_api_key") 32 | ) 33 | 34 | job_result = scheduler_client.submit_job( 35 | config=OmegaConf.to_container(args, resolve=True), 36 | enable_agent_loop=True, 37 | wandb_key=args.get("wandb_key"), 38 | num_gpus=args.get("num_gpus"), 39 | ) 40 | 41 | job_id = job_result["job_id"] 42 | server_url = job_result["server_url"] 43 | lifecycle.register_job(scheduler_client, job_id) 44 | 45 | print(f"✓ Job {job_id} allocated at {server_url}") 46 | 47 | # 2. 
Setup environment 48 | print("\n[2/4] Setting up environment...") 49 | env_endpoint = args.interaction.config.env_endpoint 50 | env = MathCodeInterpreterEnvironment( 51 | game_class=CodeInterpreterMathGame, 52 | config=args, 53 | data_paths=[args.data_path], 54 | val_data_paths=[args.val_data_path] if args.val_data_path else None, 55 | job_id=job_id, 56 | ) 57 | print(f"✓ Environment created") 58 | print(f" - Interaction config: {env.get_interaction_config_path()}") 59 | print(f" - Game server endpoint: {env_endpoint}") 60 | 61 | # 3. Setup game stats client 62 | print("\n[3/4] Connecting to game server...") 63 | game_stats = GameStatsClient(env_endpoint, job_id=env.job_id) 64 | if game_stats.health_check(): 65 | game_stats.reset_all() 66 | print(f"✓ Connected to game server at {env_endpoint}") 67 | else: 68 | game_stats = None 69 | print(f"⚠ Game server not responding at {env_endpoint}") 70 | print(f" Make sure to start: python opentinker/environment/math/code_interpreter_math_server.py --port {args.interaction.config.env_port}") 71 | 72 | # 4. Connect to training server and train 73 | print("\n[4/4] Starting training...") 74 | client = ServiceClient( 75 | server_url=server_url, 76 | project_name=args.project_name, 77 | experiment_name=args.experiment_name, 78 | logger_backends=args.logger_backends, 79 | ) 80 | client.set_config(args, env) 81 | 82 | print(f"\nTraining configuration:") 83 | print(f" - Algorithm: {args.algorithm}") 84 | print(f" - Steps: {args.get('num_steps')}") 85 | print(f" - Epochs: {args.get('num_epochs')}") 86 | print(f" - Batch size: {args.batch_size}") 87 | print(f" - Max turns: {args.multi_turn.max_assistant_turns}") 88 | 89 | try: 90 | final_metrics = client.fit( 91 | env=env, 92 | num_epochs=args.get("num_epochs"), 93 | num_steps=args.get("num_steps"), 94 | save_freq=args.save_freq, 95 | test_freq=args.test_freq, 96 | verbose=True, 97 | validate_before_training=True, 98 | game_stats_client=game_stats, 99 | ) 100 | print(f"\n✓ Training completed!") 101 | print(f"Final metrics: {final_metrics}") 102 | finally: 103 | env.cleanup() 104 | 105 | 106 | if __name__ == "__main__": 107 | main() 108 | -------------------------------------------------------------------------------- /opentinker/client/gomoku_inference.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import hydra 3 | from omegaconf import OmegaConf 4 | 5 | from utils.http_training_client import InferenceSchedulerClient 6 | from utils.scheduler_client_lifecycle import get_lifecycle_manager 7 | from opentinker.environment.inference_pipeline import run_inference 8 | from opentinker.environment.gomoku import GomokuGame 9 | from opentinker.environment.game_stats_client import GameStatsClient 10 | 11 | 12 | @hydra.main(config_path="client_config", config_name="gomoku_inference_scheduler_config.yaml", version_base=None) 13 | def main(args): 14 | """Run gomoku inference with scheduler-managed vLLM server.""" 15 | lifecycle = get_lifecycle_manager() 16 | 17 | print("=" * 60) 18 | print("Gomoku Inference with Scheduler") 19 | print("=" * 60) 20 | 21 | if not args.model_path: 22 | raise ValueError("model_path is required") 23 | 24 | # 1. 
Submit inference job to scheduler 25 | scheduler_client = InferenceSchedulerClient( 26 | scheduler_url=args.get("scheduler_url", "http://localhost:8780"), 27 | api_key=args.get("scheduler_api_key"), 28 | ) 29 | 30 | print(f"Submitting inference job to scheduler...") 31 | job_result = scheduler_client.submit_inference_job( 32 | model_path=args.model_path, 33 | tokenizer_path=args.get("tokenizer_path"), 34 | tensor_parallel_size=args.get("tensor_parallel_size", 1), 35 | num_gpus=args.get("num_gpus"), 36 | gpu_memory_utilization=args.get("gpu_memory_utilization", 0.9), 37 | max_model_len=args.get("max_model_len"), 38 | trust_remote_code=args.get("trust_remote_code", True), 39 | ) 40 | 41 | job_id = job_result["job_id"] 42 | vllm_server_url = job_result["vllm_server_url"] 43 | 44 | # Register job for lifecycle cleanup 45 | lifecycle.register_job(scheduler_client, job_id) 46 | 47 | print(f"✓ Inference job {job_id} started at {vllm_server_url}") 48 | 49 | # 2. Setup GameStatsClient for per-step metrics (with job_id isolation) 50 | game_stats = GameStatsClient(args.env_endpoint, job_id=job_id) 51 | if game_stats.health_check(): 52 | print(f"✓ Connected to game server at {args.env_endpoint}") 53 | game_stats.reset_all() # Reset stats for this job before inference 54 | else: 55 | print(f"⚠ Game server not available at {args.env_endpoint}, continuing without stats") 56 | game_stats = None 57 | 58 | # 3. Run inference using the remote vLLM server 59 | max_user_turns = args.multi_turn.get("max_user_turns", 50) 60 | max_assistant_turns = args.multi_turn.get("max_assistant_turns", 50) 61 | 62 | print(f"Running inference with max_samples={args.get('max_samples', 10)}...") 63 | 64 | results = run_inference( 65 | model_path=None, # Not needed when using vllm_server_url 66 | vllm_server_url=vllm_server_url, 67 | tokenizer_path=args.get("tokenizer_path") or args.model_path, 68 | data_path=args.get("data_path"), # None for dynamic generation 69 | game_class=GomokuGame, 70 | env_endpoint=args.env_endpoint, 71 | job_id=job_id, # Pass job_id for stats isolation 72 | output_path=args.get("output_path"), 73 | temperature=args.temperature, 74 | top_p=args.top_p, 75 | max_tokens=args.max_new_tokens, 76 | max_tokens_per_turn=args.multi_turn.get("max_tokens_per_turn"), 77 | max_samples=args.get("max_samples", 10), 78 | max_user_turns=max_user_turns, 79 | max_assistant_turns=max_assistant_turns, 80 | max_context_length=args.get("max_context_length", 30000), 81 | # GomokuGame kwargs 82 | board_size=args.get("board_size", 9), 83 | ) 84 | 85 | # 4. Log game stats after inference 86 | if game_stats: 87 | stats = game_stats.get_all_stats() 88 | print(f"\nGame Server Stats (job_id={job_id}):") 89 | print(f" Total samples: {stats.get('total_samples', 0)}") 90 | print(f" Games completed: {stats.get('games_in_step', 0)}") 91 | print(f" Win rate: {stats.get('win_rate', 0):.2%}") 92 | print(f" Mean reward: {stats.get('mean_final_reward', 0):.4f}") 93 | 94 | if args.get("output_path"): 95 | print(f"\nResults saved to: {args.output_path}") 96 | 97 | print(f"\n{'='*60}") 98 | print("Inference completed! 
vLLM server will be automatically cleaned up.")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()

-------------------------------------------------------------------------------- /opentinker/docs/SERVER_CONNECTION_FIX.md: --------------------------------------------------------------------------------
# Server Connection Timeout - Solution

## Problem Description

The client fails when trying to connect to the HTTP training server, with the error:
```
RuntimeError: Failed to complete request to set_generation_config after 3 attempts
```

## Root Cause

**Server initialization takes too long**: the HTTP training server needs time to:
1. Start Ray actors
2. Load the large language model onto the GPU
3. Initialize each component (actor, critic, reference model, etc.)

Meanwhile, the client's retry logic is too "impatient":
- Only 3 retries
- Short retry intervals (2s, 4s, 8s)
- About 14 seconds of total waiting

For a server that has to load a large model, this is nowhere near enough!

---

## Implemented Fixes

### 1. ✅ Increase the HTTP client's retry count and delay

**Modified file**: `client/http_training_client.py`

**Changes**:
- `max_retries`: 3 → **10** (more retries)
- `retry_delay`: 2.0s → **5.0s** (longer base delay)
- Added a cap on the exponential backoff: wait at most 60 seconds

**New retry schedule**:
```
Attempt 1:  0s
Attempt 2:  5s   → wait 5s
Attempt 3:  10s  → wait 5s
Attempt 4:  20s  → wait 10s
Attempt 5:  40s  → wait 20s
Attempt 6:  80s  → wait 40s
Attempt 7:  140s → wait 60s (cap)
Attempt 8:  200s → wait 60s
Attempt 9:  260s → wait 60s
Attempt 10: 320s → wait 60s
```

**Total wait time**: ~5 minutes (enough for the server to fully start)
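
A quick sanity check of that schedule (a sketch; the waits are read straight off the table above):

```python
# Waits between consecutive attempts, per the schedule above:
# 5s base, doubling, capped at 60s.
waits = [5, 5, 10, 20, 40, 60, 60, 60, 60]
print(sum(waits))  # 320 -> attempt 10 fires ~320s after attempt 1, about 5 minutes
```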
### 2. ✅ Add a server-readiness wait tool

**New file**: `scripts/wait_for_server.py`

A standalone tool that checks whether the server is ready before the client connects.

**Usage**:
```bash
# Wait for the server to become ready (default timeout: 5 minutes)
python scripts/wait_for_server.py http://localhost:38001

# Custom timeout and polling interval
python scripts/wait_for_server.py http://localhost:38001 600 10
```

**Example output**:
```
⏳ Waiting for server: http://localhost:38001
   Timeout: 300s
   Health endpoint: /api/v1/health

⏳ Attempt 1: connection failed (elapsed: 0s)
⏳ Attempt 2: HTTP 404 (elapsed: 5s)
⏳ Attempt 3: HTTP 200 ✓

✅ Server is ready! (took 47.2s)
```

---

## How to Use

### Method 1: Just run it (recommended)

Since `http_training_client.py` has already been updated, the client now waits much longer automatically:

```bash
python client/custom_client_with_scheduler.py \
    data_path=data/math/train.parquet \
    val_data_path=data/math/test.parquet \
    tokenizer_path=/path/to/tokenizer \
    num_gpus=4
```

The client will automatically:
- Retry 10 times (instead of 3)
- Use an exponential backoff strategy
- Wait up to about 5 minutes

### Method 2: Wait manually (safest)

If the server is particularly slow, wait for it explicitly first:

```bash
# Step 1: Submit the job to the scheduler
# (get server_url from the client output, e.g. http://localhost:38001)

# Step 2: Wait for the server to become ready
python scripts/wait_for_server.py http://localhost:38001 600

# Step 3: Once the server is ready, the client connects automatically
```

### Method 3: Increase the scheduler's server-startup wait

**Modified file**: `scheduler/job_scheduler.py`

Find line 394:
```python
time.sleep(0.5)  # Current: 0.5 seconds
```

Change it to:
```python
time.sleep(10.0)  # Wait 10 seconds for server to initialize
```

This makes the scheduler wait longer after launching a server before returning it to the client.

---

## Troubleshooting

### Check that the server is actually running

```bash
ps aux | grep launch_http_server
```

You should see something like:
```
root 3283256 8.6 0.0 python .../launch_http_server.py server.port=38001 ...
```

### Check that the server port is reachable

```bash
curl http://localhost:38001/api/v1/health
```

**Expected responses**:
- If the server is still initializing: `{"detail":"Not Found"}` or a connection error
- If the server is ready: `{"status": "healthy", ...}`

### Inspect the server logs

Server logs are saved under `/workspace/logs/`:
```bash
# Find the most recent log files
ls -lth /workspace/logs/ | head -5

# Check stderr (the error log)
tail -100 /workspace/logs/job_*_stderr.log
```

### Extend the client timeout

If the server is extremely slow (for example, loading a very large model for the first time), increase the timeouts when creating the client:

```python
# In custom_client_with_scheduler.py
client = ServiceClient(
    server_url=server_url,
    timeout=10000.0,   # Increase to 10000 seconds
    max_retries=15,    # Increase to 15 retries
    retry_delay=10.0   # Increase the base delay to 10 seconds
)
```

---

## Prevention

### 1. Warm up the server

Before submitting the first job, start a server and let it load the model:

```bash
# Start a "warm-up" server
python server/launch_http_server.py server.port=38000

# Wait for the model to finish loading (watch GPU memory grow)
watch -n 1 nvidia-smi

# Later jobs will start faster (the model is already in the cache)
```

### 2. Use a faster startup configuration

If you don't need every component, simplify the configuration to speed up startup:
- Use fewer GPUs
- Use a smaller model
- Disable features you don't need

### 3. Monitor server startup progress

Add a log-watching command:
```bash
# Follow the latest server log in real time
tail -f /workspace/logs/job_*_stderr.log
```

---

## Summary

**Problem**: the client times out while connecting (3 retries, ~14 seconds)
**Cause**: server initialization takes longer than that (loading the model onto the GPU)
**Fix**:
1. ✅ Increase the retry count to 10
2. ✅ Increase the retry delay to 5 seconds
3. ✅ Add the wait_for_server.py tool
4. ✅ Use exponential backoff with a 60-second cap

**The client now waits up to about 5 minutes**, which should be enough for most servers to finish initializing.

-------------------------------------------------------------------------------- /opentinker/setup_cross_node.sh: --------------------------------------------------------------------------------
#!/bin/bash
# Quick setup script for cross-node configuration
# Usage: ./setup_cross_node.sh <scheduler_ip> <env_ip> [scheduler_port] [env_port]

set -e

# Colored output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Check arguments
if [ $# -lt 2 ]; then
    echo -e "${RED}Error: not enough arguments${NC}"
    echo "Usage: $0 <scheduler_ip> <env_ip> [scheduler_port] [env_port]"
    echo ""
    echo "Examples:"
    echo "  $0 192.168.1.100 192.168.1.101"
    echo "  $0 192.168.1.100 192.168.1.101 8766 8084"
    exit 1
fi

SCHEDULER_IP=$1
ENV_IP=$2
SCHEDULER_PORT=${3:-8766}  # Default: 8766
ENV_PORT=${4:-8084}        # Default: 8084

echo -e "${GREEN}========================================${NC}"
echo -e "${GREEN}OpenTinker Cross-Node Setup Tool${NC}"
echo -e "${GREEN}========================================${NC}"
echo ""
echo -e "${YELLOW}Configuration:${NC}"
echo "  Scheduler:   http://${SCHEDULER_IP}:${SCHEDULER_PORT}"
echo "  Environment: http://${ENV_IP}:${ENV_PORT}"
echo ""

# Locate the directory containing this script
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
CLIENT_CONFIG_DIR="${SCRIPT_DIR}/client/client_config"

# Make sure the config directory exists
if [ ! -d "$CLIENT_CONFIG_DIR" ]; then
    echo -e "${RED}Error: config directory does not exist: ${CLIENT_CONFIG_DIR}${NC}"
    echo "Make sure this script is located in the OpenTinker project root"
    exit 1
fi

# Config files to update
CONFIG_FILES=(
    "generic_env_param.yaml"
    "gomoku_param.yaml"
)

# Back up and update each config file
for CONFIG_FILE in "${CONFIG_FILES[@]}"; do
    CONFIG_PATH="${CLIENT_CONFIG_DIR}/${CONFIG_FILE}"

    if [ ! -f "$CONFIG_PATH" ]; then
        echo -e "${YELLOW}Skipping: ${CONFIG_FILE} (file not found)${NC}"
        continue
    fi

    echo -e "${GREEN}Processing: ${CONFIG_FILE}${NC}"

    # Create a backup
    BACKUP_PATH="${CONFIG_PATH}.backup.$(date +%Y%m%d_%H%M%S)"
    cp "$CONFIG_PATH" "$BACKUP_PATH"
    echo "  ✓ Backup created: $(basename $BACKUP_PATH)"

    # Update scheduler_url
    sed -i.tmp "s|scheduler_url:.*|scheduler_url: \"http://${SCHEDULER_IP}:${SCHEDULER_PORT}\"|" "$CONFIG_PATH"

    # Update env_endpoint
    sed -i.tmp "s|env_endpoint:.*|env_endpoint: \"http://${ENV_IP}:${ENV_PORT}\" # Modified by setup script|" "$CONFIG_PATH"

    # Remove sed temp files
    rm -f "${CONFIG_PATH}.tmp"

    echo "  ✓ Config updated"
    echo ""
done

# Verify the changes
echo -e "${GREEN}========================================${NC}"
echo -e "${GREEN}Configuration Check${NC}"
echo -e "${GREEN}========================================${NC}"
echo ""

for CONFIG_FILE in "${CONFIG_FILES[@]}"; do
    CONFIG_PATH="${CLIENT_CONFIG_DIR}/${CONFIG_FILE}"

    if [ ! -f "$CONFIG_PATH" ]; then
        continue
    fi

    echo -e "${YELLOW}${CONFIG_FILE}:${NC}"

    # Show scheduler_url
    SCHEDULER_LINE=$(grep "scheduler_url:" "$CONFIG_PATH" | head -1)
    echo "  ${SCHEDULER_LINE}"

    # Show env_endpoint
    ENV_LINE=$(grep "env_endpoint:" "$CONFIG_PATH" | head -1)
    echo "  ${ENV_LINE}"
    echo ""
done

# Network connectivity test
echo -e "${GREEN}========================================${NC}"
echo -e "${GREEN}Network Connectivity Test${NC}"
echo -e "${GREEN}========================================${NC}"
echo ""

echo -e "${YELLOW}Testing scheduler connectivity...${NC}"
if ping -c 1 -W 2 "$SCHEDULER_IP" &> /dev/null; then
    echo -e "  ${GREEN}✓ Ping ${SCHEDULER_IP} succeeded${NC}"
else
    echo -e "  ${RED}✗ Ping ${SCHEDULER_IP} failed${NC}"
fi

echo ""
echo -e "${YELLOW}Testing environment connectivity...${NC}"
if ping -c 1 -W 2 "$ENV_IP" &> /dev/null; then
    echo -e "  ${GREEN}✓ Ping ${ENV_IP} succeeded${NC}"
else
    echo -e "  ${RED}✗ Ping ${ENV_IP} failed${NC}"
fi

echo ""
echo -e "${GREEN}========================================${NC}"
echo -e "${GREEN}Setup complete!${NC}"
echo -e "${GREEN}========================================${NC}"
echo ""
echo "Next steps:"
echo "  1. Start the scheduler (on node ${SCHEDULER_IP}):"
echo "     cd scheduler && python launch_scheduler.py"
echo ""
echo "  2. Start the environment server (on node ${ENV_IP}):"
echo "     cd environment/example && python mock_env_server.py --port ${ENV_PORT}"
echo ""
echo "  3. Run the client (on the current node):"
echo "     cd client && python generic_env_client.py"
echo ""
echo -e "${YELLOW}Note: remember to fill in the correct scheduler_api_key in generic_env_param.yaml!${NC}"

-------------------------------------------------------------------------------- /opentinker/test_geo3k_data.py: --------------------------------------------------------------------------------
#!/usr/bin/env python3
"""Quick test script to verify Geo3K data loading with vision-language support.

This script tests:
1. Loading parquet data with images
2. VL data generator functionality
3.
Image tensor processing 8 | 9 | Usage: 10 | python test_geo3k_data.py --data_path ~/data/geo3k/train.parquet 11 | """ 12 | 13 | import argparse 14 | from transformers import AutoProcessor 15 | from omegaconf import OmegaConf 16 | 17 | from opentinker.environment.static_data_generator_vl import StaticDatasetGeneratorVL 18 | from opentinker.environment.base_data_generator_vl import DynamicGameDatasetVL 19 | 20 | 21 | def test_geo3k_data(data_path: str, num_samples: int = 5): 22 | """Test Geo3K data loading and processing. 23 | 24 | Args: 25 | data_path: Path to Geo3K parquet file 26 | num_samples: Number of samples to test 27 | """ 28 | print("=" * 60) 29 | print("Testing Geo3K Vision-Language Data Loading") 30 | print("=" * 60) 31 | 32 | # 1. Test static data generator 33 | print("\n1. Testing StaticDatasetGeneratorVL...") 34 | generator = StaticDatasetGeneratorVL( 35 | data_paths=[data_path], 36 | interaction_name="game", 37 | image_key="images", 38 | shuffle=False, 39 | ) 40 | print(f" ✓ Loaded dataset with {len(generator)} samples") 41 | 42 | # Check first sample 43 | sample = generator.generate_sample(0) 44 | print(f" ✓ Sample keys: {sample.keys()}") 45 | print(f" ✓ Prompt type: {type(sample['prompt'])}") 46 | print(f" ✓ Images: {len(sample.get('images', []))} image(s)") 47 | if sample.get('images'): 48 | print(f" ✓ First image type: {type(sample['images'][0])}") 49 | 50 | # 2. Test processor loading 51 | print("\n2. Testing AutoProcessor...") 52 | processor = AutoProcessor.from_pretrained( 53 | "Qwen/Qwen2.5-VL-7B-Instruct", 54 | trust_remote_code=True 55 | ) 56 | print(f" ✓ Loaded processor: {type(processor).__name__}") 57 | 58 | # 3. Test dynamic dataset 59 | print("\n3. Testing DynamicGameDatasetVL...") 60 | config = OmegaConf.create({ 61 | "max_prompt_length": 1024, 62 | "truncation": "right", 63 | "return_raw_chat": True, 64 | }) 65 | 66 | dataset = DynamicGameDatasetVL( 67 | data_generator=generator, 68 | tokenizer=None, 69 | processor=processor, 70 | config=config, 71 | virtual_size=num_samples, 72 | ) 73 | print(f" ✓ Created dataset with {len(dataset)} samples") 74 | 75 | # Test sample fetching 76 | print(f"\n4. 
Testing sample processing (first {num_samples} samples)...") 77 | for i in range(min(num_samples, len(dataset))): 78 | sample = dataset[i] 79 | print(f"\n Sample {i}:") 80 | print(f" - input_ids shape: {sample['input_ids'].shape}") 81 | print(f" - attention_mask shape: {sample['attention_mask'].shape}") 82 | 83 | # Check for image tensors 84 | image_keys = [k for k in sample.keys() if 'pixel' in k or 'image' in k] 85 | if image_keys: 86 | print(f" - Image tensor keys: {image_keys}") 87 | for key in image_keys: 88 | print(f" - {key} shape: {sample[key].shape}") 89 | else: 90 | print(f" - No image tensors found") 91 | 92 | print(f" - data_source: {sample.get('data_source')}") 93 | print(f" - interaction_kwargs: {sample.get('interaction_kwargs', {}).get('name')}") 94 | 95 | print("\n" + "=" * 60) 96 | print("✓ All tests passed!") 97 | print("=" * 60) 98 | 99 | return True 100 | 101 | 102 | def main(): 103 | parser = argparse.ArgumentParser(description="Test Geo3K VL data loading") 104 | parser.add_argument( 105 | "--data_path", 106 | type=str, 107 | default="~/data/geo3k/train.parquet", 108 | help="Path to Geo3K parquet file" 109 | ) 110 | parser.add_argument( 111 | "--num_samples", 112 | type=int, 113 | default=3, 114 | help="Number of samples to test" 115 | ) 116 | 117 | args = parser.parse_args() 118 | 119 | # Expand path 120 | import os 121 | data_path = os.path.expanduser(args.data_path) 122 | 123 | if not os.path.exists(data_path): 124 | print(f"Error: Data file not found: {data_path}") 125 | print(f"\nPlease prepare Geo3K data first:") 126 | print(f" python verl/examples/data_preprocess/geo3k.py --local_save_dir ~/data/geo3k") 127 | return False 128 | 129 | test_geo3k_data(data_path, args.num_samples) 130 | 131 | 132 | if __name__ == "__main__": 133 | main() 134 | -------------------------------------------------------------------------------- /opentinker/client/math_tool_inference.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import hydra 3 | from omegaconf import OmegaConf 4 | 5 | from utils.http_training_client import InferenceSchedulerClient 6 | from utils.scheduler_client_lifecycle import get_lifecycle_manager 7 | from opentinker.environment.inference_pipeline import run_inference 8 | from opentinker.environment.math.math_tool_game import CodeInterpreterMathGame 9 | from opentinker.environment.game_stats_client import GameStatsClient 10 | 11 | 12 | @hydra.main(config_path="client_config", config_name="math_code_interpreter_inference_config.yaml", version_base=None) 13 | def main(args): 14 | """Run math code interpreter inference with scheduler-managed vLLM server.""" 15 | lifecycle = get_lifecycle_manager() 16 | 17 | print("=" * 60) 18 | print("Math Code Interpreter Inference with Scheduler") 19 | print("=" * 60) 20 | 21 | if not args.model_path: 22 | raise ValueError("model_path is required") 23 | if not args.data_path: 24 | raise ValueError("data_path is required") 25 | 26 | # 1. 
Submit inference job to scheduler 27 | scheduler_client = InferenceSchedulerClient( 28 | scheduler_url=args.get("scheduler_url", "http://localhost:8780"), 29 | api_key=args.get("scheduler_api_key"), 30 | ) 31 | 32 | print(f"Submitting inference job to scheduler...") 33 | job_result = scheduler_client.submit_inference_job( 34 | model_path=args.model_path, 35 | tokenizer_path=args.get("tokenizer_path"), 36 | tensor_parallel_size=args.get("tensor_parallel_size", 1), 37 | num_gpus=args.get("num_gpus"), 38 | gpu_memory_utilization=args.get("gpu_memory_utilization", 0.9), 39 | max_model_len=args.get("max_model_len"), 40 | trust_remote_code=args.get("trust_remote_code", True), 41 | ) 42 | 43 | job_id = job_result["job_id"] 44 | vllm_server_url = job_result["vllm_server_url"] 45 | 46 | # Register job for lifecycle cleanup 47 | lifecycle.register_job(scheduler_client, job_id) 48 | 49 | print(f"✓ Inference job {job_id} started at {vllm_server_url}") 50 | 51 | # 2. Setup GameStatsClient for per-step metrics (with job_id isolation) 52 | game_stats = GameStatsClient(args.env_endpoint, job_id=job_id) 53 | if game_stats.health_check(): 54 | print(f"✓ Connected to code interpreter game server at {args.env_endpoint}") 55 | game_stats.reset_all() # Reset stats for this job before inference 56 | else: 57 | print(f"⚠ Game server not available at {args.env_endpoint}, continuing without stats") 58 | print(f" Make sure to start: python opentinker/environment/math/code_interpreter_math_server.py --port 8088") 59 | game_stats = None 60 | 61 | # 3. Run inference using the remote vLLM server 62 | print(f"\nRunning code interpreter inference on {args.data_path}...") 63 | print(f" - Multi-turn: max_user_turns={args.multi_turn.max_user_turns}, max_assistant_turns={args.multi_turn.max_assistant_turns}") 64 | print(f" - Max tokens: {args.max_new_tokens} total, {args.get('max_tokens_per_turn', 'unlimited')} per turn") 65 | 66 | results = run_inference( 67 | model_path=None, # Not needed when using vllm_server_url 68 | vllm_server_url=vllm_server_url, 69 | tokenizer_path=args.get("tokenizer_path") or args.model_path, 70 | data_path=args.data_path, 71 | game_class=CodeInterpreterMathGame, 72 | env_endpoint=args.env_endpoint, 73 | job_id=job_id, # Pass job_id for stats isolation 74 | output_path=args.get("output_path"), 75 | temperature=args.temperature, 76 | top_p=args.top_p, 77 | max_tokens=args.max_new_tokens, 78 | max_tokens_per_turn=args.get("max_tokens_per_turn"), 79 | max_samples=args.get("max_samples"), 80 | max_user_turns=args.multi_turn.max_user_turns, 81 | max_assistant_turns=args.multi_turn.max_assistant_turns, 82 | ) 83 | 84 | # 4. Log game stats after inference 85 | if game_stats: 86 | stats = game_stats.get_all_stats() 87 | print(f"\nCode Interpreter Stats (job_id={job_id}):") 88 | print(f" Total samples: {stats.get('total_samples', 0)}") 89 | print(f" Games completed: {stats.get('games_in_step', 0)}") 90 | print(f" Mean reward: {stats.get('mean_final_reward', 0):.4f}") 91 | print(f" Code executions: {stats.get('code_executions', 'N/A')}") 92 | 93 | if args.get("output_path"): 94 | print(f"\nResults saved to: {args.output_path}") 95 | 96 | print(f"\n{'='*60}") 97 | print("Inference completed! 
vLLM server will be automatically cleaned up.") 98 | print(f"{'='*60}") 99 | 100 | 101 | if __name__ == "__main__": 102 | main() 103 | -------------------------------------------------------------------------------- /opentinker/client/geo3k_tool_rl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Geo3K Multi-Turn Vision-Language RL Training Client. 3 | 4 | This script launches Geo3K geometry problem training using vision-language models 5 | with multi-turn verification. The model can submit answers and receive feedback 6 | before giving the final answer. 7 | 8 | Usage: 9 | # First, start the scheduler: 10 | bash opentinker/scripts/launch_scheduler.sh 11 | 12 | # Then start the game server: 13 | python opentinker/environment/geo3k/geo3k_tool_server.py --port 8088 14 | 15 | # Finally, run this training script: 16 | python opentinker/client/geo3k_tool_rl.py 17 | """ 18 | 19 | import hydra 20 | from omegaconf import OmegaConf 21 | 22 | from opentinker.client.utils.http_training_client import ServiceClient, SchedulerClient 23 | from opentinker.environment.geo3k import Geo3KToolEnvironment 24 | from opentinker.environment.game_stats_client import GameStatsClient 25 | from opentinker.client.utils.utils import resolve_paths_in_config 26 | from opentinker.client.utils.scheduler_client_lifecycle import get_lifecycle_manager 27 | 28 | 29 | @hydra.main(config_path="client_config", config_name="geo3k_tool_param.yaml") 30 | def main(args): 31 | args = resolve_paths_in_config(args) 32 | lifecycle = get_lifecycle_manager() 33 | 34 | print("=" * 60) 35 | print("Geo3K Multi-Turn Vision-Language Training") 36 | print("=" * 60) 37 | 38 | # 1. Submit job to scheduler 39 | print("\n[1/4] Submitting job to scheduler...") 40 | scheduler_client = SchedulerClient( 41 | scheduler_url=args.get("scheduler_url", "http://localhost:8780"), 42 | api_key=args.get("scheduler_api_key") 43 | ) 44 | 45 | job_result = scheduler_client.submit_job( 46 | config=OmegaConf.to_container(args, resolve=True), 47 | enable_agent_loop=True, 48 | wandb_key=args.get("wandb_key"), 49 | num_gpus=args.get("num_gpus"), 50 | ) 51 | 52 | job_id = job_result["job_id"] 53 | server_url = job_result["server_url"] 54 | lifecycle.register_job(scheduler_client, job_id) 55 | 56 | print(f"✓ Job {job_id} allocated at {server_url}") 57 | 58 | # 2. Setup Geo3K multi-turn VL environment 59 | print("\n[2/4] Setting up environment...") 60 | env_endpoint = args.interaction.config.env_endpoint 61 | 62 | max_retries = args.multi_turn.get("max_assistant_turns", 3) - 1 # -1 for initial attempt 63 | env = Geo3KToolEnvironment( 64 | config=args, 65 | data_paths=[args.data_path], 66 | val_data_paths=[args.val_data_path] if args.val_data_path else None, 67 | job_id=job_id, 68 | max_retries=max_retries, 69 | ) 70 | print(f"✓ Geo3K multi-turn VL environment created") 71 | print(f" - Interaction config: {env.get_interaction_config_path()}") 72 | print(f" - Max retries: {max_retries}") 73 | 74 | # 3. Setup game stats client 75 | print("\n[3/4] Connecting to game server...") 76 | game_stats = GameStatsClient(env_endpoint, job_id=env.job_id) 77 | if game_stats.health_check(): 78 | game_stats.reset_all() 79 | print(f"✓ Connected to game server at {env_endpoint}") 80 | else: 81 | game_stats = None 82 | print(f"⚠ Game server not responding at {env_endpoint}") 83 | print(f" Make sure to start: python opentinker/environment/geo3k/geo3k_tool_server.py --port {args.interaction.config.env_port}") 84 | 85 | # 4. 
Connect to training server and train
    print("\n[4/4] Starting training...")
    client = ServiceClient(
        server_url=server_url,
        project_name=args.project_name,
        experiment_name=args.experiment_name,
        logger_backends=args.logger_backends,
    )
    client.set_config(args, env)

    print("\nTraining configuration:")
    print(f"  - Algorithm: {args.algorithm}")
    print(f"  - Epochs: {args.get('num_epochs')}")
    print(f"  - Batch size: {args.batch_size}")
    print(f"  - Max assistant turns: {args.multi_turn.max_assistant_turns}")
    print(f"  - ADV estimator: {args.adv_estimator}")
    print(f"  - Rollout N: {args.rollout_n}")

    try:
        final_metrics = client.fit(
            env=env,
            num_epochs=args.get("num_epochs"),
            num_steps=args.get("num_steps"),
            save_freq=args.save_freq,
            test_freq=args.test_freq,
            verbose=True,
            validate_before_training=True,
            game_stats_client=game_stats,
        )
        print("\n✓ Training completed!")
        print(f"Final metrics: {final_metrics}")
    finally:
        env.cleanup()


if __name__ == "__main__":
    main()

-------------------------------------------------------------------------------- /opentinker/environment/math/math_env.py: --------------------------------------------------------------------------------
from omegaconf import OmegaConf
from transformers import AutoTokenizer
from torchdata.stateful_dataloader import StatefulDataLoader

from opentinker.environment.base_game_environment import GameEnvironment
from opentinker.environment.base_data_generator import DynamicGameDataset, collate_fn
from opentinker.environment.math import MathGame
from opentinker.environment.static_data_generator import StaticDatasetGenerator
from verl.trainer.main_ppo import create_rl_sampler


class MathGameEnvironment(GameEnvironment):
    """GameEnvironment for static dataset math problems."""

    def __init__(self, game_class, config, data_paths, val_data_paths=None, game_kwargs=None, job_id=None):
        self.data_paths = [data_paths] if isinstance(data_paths, str) else list(data_paths)
        self.val_data_paths = [val_data_paths] if isinstance(val_data_paths, str) else (list(val_data_paths) if val_data_paths else None)
        super().__init__(game_class=game_class, config=config, game_kwargs=game_kwargs or {}, job_id=job_id)

    def _setup_dataloader(self):
        """Use StaticDatasetGenerator for a static dataset."""
        tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
        tokenizer.padding_side = "left"
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        dataset_config = OmegaConf.create({
            "max_prompt_length": self.config.max_prompt_tokens,
            "truncation": "right",
            "return_raw_chat": True,
        })

        math_game_for_prompt = MathGame()

        # Training data generator
        train_generator = StaticDatasetGenerator(
            data_paths=self.data_paths,
interaction_name=self.interaction_name, 46 | prompt_key="prompt", 47 | ground_truth_key="ground_truth", 48 | shuffle=True, 49 | system_prompt=math_game_for_prompt.get_system_prompt(), 50 | ) 51 | 52 | batch_size = self.config.batch_size 53 | num_steps = getattr(self.config, 'num_steps', None) 54 | virtual_size = num_steps * batch_size if num_steps else len(train_generator) * getattr(self.config, 'num_epochs', 1) 55 | 56 | train_dataset = DynamicGameDataset(train_generator, tokenizer, dataset_config, virtual_size=virtual_size) 57 | 58 | sampler_config = OmegaConf.create({ 59 | "shuffle": True, 60 | "seed": 42, 61 | "sampler": None, 62 | }) 63 | train_sampler = create_rl_sampler(sampler_config, train_dataset) 64 | 65 | 66 | self.train_dataloader = StatefulDataLoader(train_dataset, batch_size=batch_size, shuffle=False, 67 | sampler=train_sampler, 68 | num_workers=getattr(self.config, 'num_workers', 0), 69 | collate_fn=collate_fn, drop_last=True) 70 | print(f"Training dataloader: {len(self.train_dataloader)} batches") 71 | 72 | # Validation data generator - sample exactly val_batch_size samples, keep fixed 73 | if self.val_data_paths: 74 | val_generator = StaticDatasetGenerator( 75 | data_paths=self.val_data_paths, 76 | interaction_name=self.interaction_name, 77 | prompt_key="prompt", 78 | ground_truth_key="ground_truth", 79 | shuffle=False, # No shuffle - keep samples fixed 80 | seed=42, 81 | system_prompt=math_game_for_prompt.get_system_prompt(), 82 | ) 83 | val_batch_size = getattr(self.config, 'val_batch_size', min(64, len(val_generator))) 84 | # Use val_batch_size as virtual_size to sample exactly that many samples 85 | val_dataset = DynamicGameDataset(val_generator, tokenizer, dataset_config, 86 | virtual_size=val_batch_size, seed=42) 87 | self.val_dataloader = StatefulDataLoader(val_dataset, batch_size=val_batch_size, shuffle=False, 88 | num_workers=getattr(self.config, 'num_workers', 0), 89 | collate_fn=collate_fn, drop_last=False) 90 | print(f"Validation dataloader: {val_batch_size} fixed samples in {len(self.val_dataloader)} batch(es)") 91 | 92 | -------------------------------------------------------------------------------- /opentinker/server/config/data/legacy_data.yaml: -------------------------------------------------------------------------------- 1 | # Tokenizer class or path. If null, it will be inferred from the model. 2 | tokenizer: null 3 | 4 | # Whether to use shared memory for data loading. 5 | use_shm: False 6 | 7 | # Training set parquet. Can be a list or a single file. 8 | # The program will read all files into memory, so it can't be too large (< 100GB). 9 | # The path can be either a local path or an HDFS path. 10 | # For HDFS path, we provide utils to download it to DRAM and convert it to a local path. 11 | train_files: ~/data/rlhf/gsm8k/train.parquet 12 | 13 | # Validation parquet. Can be a list or a single file. 14 | val_files: ~/data/rlhf/gsm8k/test.parquet 15 | 16 | # Maximum sample length to be used. 17 | # Set to -1 to use full dataset, otherwise, randomly 18 | # select the specified number of samples from train dataset 19 | train_max_samples: -1 20 | 21 | # Maximum sample length to be used. 22 | # Set to -1 to use full dataset, otherwise, randomly 23 | # select the specified number of samples from val dataset 24 | val_max_samples: 100 25 | 26 | # The field in the dataset where the prompt is located. Default is 'prompt'. 27 | prompt_key: prompt 28 | 29 | # The field used to select the reward function (if using different ones per example). 
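# For example, a row whose data_source field is "openai/gsm8k" (an illustrative
# value) would be scored by the reward function associated with "openai/gsm8k".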
30 | reward_fn_key: data_source 31 | 32 | # Maximum prompt length. All prompts will be left-padded to this length. 33 | # An error will be reported if the length is too long. 34 | # oc.select: default val for rollout.prompt_length 35 | max_prompt_length: 512 36 | 37 | # Maximum response length. Rollout in RL algorithms (e.g. PPO) generates up to this length. 38 | # oc.select: default val for rollout.response_length 39 | max_response_length: 512 40 | 41 | # Batch size sampled for one training iteration of different RL algorithms. 42 | train_batch_size: 1024 43 | 44 | # Batch size used during validation. Can be null. 45 | val_batch_size: null 46 | 47 | # use tool config to calculate true prompt length 48 | tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, null} 49 | 50 | # Whether to return the original input_ids without adding chat template. 51 | # This is used when the reward model's chat template differs from the policy. 52 | # If using a model-based RM with different templates, this should be True. 53 | return_raw_input_ids: False 54 | 55 | # Whether to return the original chat (prompt) without applying chat template. 56 | return_raw_chat: False 57 | 58 | # Whether to return the full prompt with chat template. 59 | return_full_prompt: False 60 | 61 | # Whether to shuffle the data in the dataloader. 62 | shuffle: True 63 | 64 | # Seed to use when shuffling the data 65 | seed: null 66 | 67 | # num dataloader workers 68 | dataloader_num_workers: 8 69 | 70 | # image patch size 71 | image_patch_size: 14 72 | 73 | # Whether to shuffle the validation set. 74 | validation_shuffle: False 75 | 76 | # Whether to filter overlong prompts. 77 | filter_overlong_prompts: False 78 | 79 | # Number of workers for filtering overlong prompts. 80 | # For large-scale datasets, filtering can be time-consuming. 81 | # Use multiprocessing to speed up. Default is 1. 82 | filter_overlong_prompts_workers: 1 83 | 84 | # Truncate the input_ids or prompt if they exceed max_prompt_length. 85 | # Options: 'error', 'left', 'right', 'middle'. Default is 'error'. 86 | truncation: error 87 | 88 | # The field in the multi-modal dataset where the image is located. Default is 'images'. 89 | image_key: images 90 | 91 | # The field in the multi-modal dataset where the video is located. 92 | video_key: videos 93 | 94 | # If the remote tokenizer has a Python file, this flag determines whether to allow using it. 95 | trust_remote_code: False 96 | 97 | # Optional: specify a custom dataset class path and name if overriding default loading behavior. 98 | custom_cls: 99 | 100 | # The path to the file containing your customized dataset class. If not specified, pre-implemented dataset will be used. 101 | path: null 102 | 103 | # The name of the dataset class within the specified file. 104 | name: null 105 | 106 | # Whether to return multi-modal inputs in the dataset. Set to False if rollout generates new multi-modal inputs. 107 | return_multi_modal_inputs: True 108 | 109 | # settings related to data sampler 110 | sampler: 111 | 112 | # the path to the module containing a curriculum class which implements the 113 | # AbstractSampler interface 114 | class_path: null 115 | 116 | # the name of the curriculum class like `MySampler` 117 | class_name: null 118 | 119 | # Data generation configuration for augmenting the dataset. 120 | datagen: 121 | 122 | # The path to the file containing your customized data generation class. 123 | # E.g. 
'pkg://verl.experimental.dynamic_dataset.dynamicgen_dataset' 124 | path: null 125 | 126 | # The class name of the data generation class within the specified file. 127 | # E.g. 'MockDataGenerator' 128 | name: null 129 | 130 | # Additional kwargs when calling tokenizer.apply_chat_template 131 | apply_chat_template_kwargs: {} 132 | -------------------------------------------------------------------------------- /opentinker/environment/environment.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Environment API for PPO Training 4 | 5 | Provides abstract base class and concrete implementation for configuring 6 | dataloader and reward functions in PPO training. 7 | """ 8 | 9 | from abc import ABC, abstractmethod 10 | from dataclasses import dataclass 11 | from typing import Any, Dict, Optional, Callable 12 | import inspect 13 | from transformers import AutoTokenizer 14 | from omegaconf import OmegaConf 15 | 16 | # Note(Siqi): 17 | # ImportError: cannot import name 'ServiceClient' 18 | # from partially initialized module 'http_training_client' 19 | # (most likely due to a circular import) 20 | 21 | from verl.utils.dataset.rl_dataset import collate_fn 22 | from torchdata.stateful_dataloader import StatefulDataLoader 23 | from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler 24 | from opentinker.client.utils.utils import prepare_dataset, verify_raw_prompt_format 25 | 26 | @dataclass 27 | class RewardFunctionSpec: 28 | """Specification for reward function configuration. 29 | 30 | Supports three types: 31 | - "config": Load from Python file (path + function name) 32 | - "remote": Call remote API endpoint (future) 33 | - "code": Upload custom Python function to server 34 | """ 35 | type: str # "config", "remote", or "code" 36 | 37 | # For type="config" 38 | config_path: Optional[str] = None 39 | config_name: Optional[str] = None 40 | config_kwargs: Optional[Dict[str, Any]] = None 41 | 42 | # For type="remote" (future) 43 | remote_endpoint: Optional[str] = None 44 | remote_api_key: Optional[str] = None 45 | 46 | # For type="code" 47 | code_function: Optional[Callable] = None 48 | code_source: Optional[str] = None 49 | 50 | def __post_init__(self): 51 | """Validate configuration and extract source code if needed.""" 52 | if self.type not in ["config", "remote", "code"]: 53 | raise ValueError(f"Invalid reward function type: {self.type}. 
Must be 'config', 'remote', or 'code'") 54 | 55 | if self.type == "config": 56 | if not self.config_path or not self.config_name: 57 | raise ValueError("config_path and config_name are required for type='config'") 58 | 59 | elif self.type == "remote": 60 | if not self.remote_endpoint: 61 | raise ValueError("remote_endpoint is required for type='remote'") 62 | 63 | elif self.type == "code": 64 | if not self.code_function: 65 | raise ValueError("code_function is required for type='code'") 66 | 67 | # Auto-extract source code if not provided 68 | if self.code_source is None: 69 | try: 70 | self.code_source = inspect.getsource(self.code_function) 71 | except (OSError, TypeError) as e: 72 | raise ValueError(f"Could not extract source code from function: {e}") 73 | 74 | def to_config_dict(self) -> Dict[str, Any]: 75 | """Convert to configuration dictionary for server.""" 76 | if self.type == "config": 77 | config = { 78 | "type": "config", 79 | "config_path": self.config_path, 80 | "config_name": self.config_name, 81 | } 82 | if self.config_kwargs: 83 | config["config_kwargs"] = self.config_kwargs 84 | return config 85 | 86 | elif self.type == "remote": 87 | config = { 88 | "type": "remote", 89 | "remote_endpoint": self.remote_endpoint, 90 | } 91 | if self.remote_api_key: 92 | config["remote_api_key"] = self.remote_api_key 93 | return config 94 | 95 | elif self.type == "code": 96 | # Use 'name' field to match server config schema (not 'function_name') 97 | return { 98 | "type": "code", 99 | "name": self.code_function.__name__, 100 | } 101 | 102 | return {} 103 | 104 | 105 | class BaseEnvironment(ABC): 106 | """Abstract base class for PPO training environments. 107 | 108 | Subclasses must implement: 109 | - setup(client): Configure the environment on the server 110 | - dataloader property: Return the training dataloader 111 | - get_config(): Return configuration dict for server 112 | """ 113 | 114 | @abstractmethod 115 | def setup(self, client): 116 | """Setup environment on the server. 117 | 118 | Args: 119 | client: HTTPTrainingClient or ServiceClient instance 120 | """ 121 | pass 122 | 123 | @abstractmethod 124 | def get_dataloader(self): 125 | """Return the training dataloader.""" 126 | pass 127 | 128 | @abstractmethod 129 | def get_config(self) -> Dict[str, Any]: 130 | """Return configuration dictionary for server.""" 131 | pass -------------------------------------------------------------------------------- /opentinker/server/config/algorithm.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass, field 16 | from typing import Any, Optional 17 | 18 | from verl.base_config import BaseConfig 19 | 20 | __all__ = ["AlgoConfig", "FilterGroupsConfig", "KLControlConfig"] 21 | 22 | 23 | @dataclass 24 | class KLControlConfig(BaseConfig): 25 | """Configuration for KL control. 
26 | 27 | The inheritance from BaseConfig provides omegaconf.DictConfig-like interface for a dataclass config. 28 | 29 | Args: 30 | type (str): Type of KL control. Can be "fixed" or "adaptive". 31 | kl_coef (float): Initial coefficient for KL penalty. 32 | horizon (int): Horizon value for adaptive controller. 33 | target_kl (float): Target KL divergence for adaptive controller. 34 | """ 35 | 36 | type: str = "fixed" 37 | kl_coef: float = 0.001 38 | horizon: int = 10000 39 | target_kl: float = 0.1 40 | 41 | 42 | @dataclass 43 | class FilterGroupsConfig(BaseConfig): 44 | """Configuration for filter groups (used in DAPO and Entropy). 45 | 46 | The inheritance from BaseConfig provides omegaconf.DictConfig-like interface for a dataclass config. 47 | 48 | Args: 49 | enable (bool): Whether to enable filter groups. 50 | metric (Optional[str]): Metric to use for filtering: "acc", "score", "seq_reward", "seq_final_reward", etc. 51 | max_num_gen_batches (int): Non-positive values mean no upper limit. 52 | """ 53 | 54 | enable: bool = False 55 | metric: Optional[str] = None 56 | max_num_gen_batches: int = 0 57 | 58 | 59 | @dataclass 60 | class AlgoConfig(BaseConfig): 61 | """Configuration for the algorithm. 62 | 63 | The inheritance from BaseConfig provides omegaconf.DictConfig-like interface for a dataclass config. 64 | 65 | Args: 66 | gamma (float): Discount factor for future rewards. 67 | lam (float): Trade-off between bias and variance in the GAE estimator. 68 | adv_estimator (str): Advantage estimator type: "gae", "grpo", "reinforce_plus_plus", etc. 69 | norm_adv_by_std_in_grpo (bool): Whether to normalize advantages by std (specific to GRPO). 70 | use_kl_in_reward (bool): Whether to enable in-reward KL penalty. 71 | kl_penalty (str): How to estimate KL divergence: "kl", "abs", "mse", "low_var_kl", or "full". 72 | kl_ctrl (KLControlConfig): KL control configuration. 73 | use_pf_ppo (bool): Whether to enable preference feedback PPO. 74 | pf_ppo (dict[str, Any]): Preference feedback PPO settings. 75 | filter_groups (Optional[FilterGroupsConfig]): Filter groups configuration, used in DAPO and Entropy 76 | rollout_is_threshold (Optional[float]): Upper threshold for IS weights. null = disabled, 77 | float value = enabled (compute weights and metrics). This is the main on/off switch. 78 | rollout_is_threshold_lower (Optional[float]): Lower threshold for IS weights. If None, defaults to 1/upper. 79 | rollout_is_level (str): Aggregation level: "token", "sequence", or "geometric". 80 | rollout_is_mode (str): Bounding mode: "truncate" (cap upper only) or "mask" (zero outside bounds). 81 | rollout_is_veto_threshold (float or None): Per-token veto threshold for catastrophic outliers. None to disable. 82 | rollout_is (bool): Whether to apply IS weights to policy loss. True = apply weights, 83 | False = compute metrics only (useful for monitoring before enabling correction). Default: False. 
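
    Example (a minimal construction sketch; the values shown are illustrative,
    not recommended defaults):

        >>> cfg = AlgoConfig(
        ...     adv_estimator="grpo",
        ...     use_kl_in_reward=True,
        ...     kl_ctrl=KLControlConfig(type="adaptive", target_kl=0.05),
        ... )
        >>> cfg.gamma  # unset fields keep their defaults
        1.0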
    """

    gamma: float = 1.0
    lam: float = 1.0
    adv_estimator: str = "gae"
    norm_adv_by_std_in_grpo: bool = True
    use_kl_in_reward: bool = False
    kl_penalty: str = "kl"
    kl_ctrl: KLControlConfig = field(default_factory=KLControlConfig)
    use_pf_ppo: bool = False
    pf_ppo: dict[str, Any] = field(default_factory=dict)
    filter_groups: Optional[FilterGroupsConfig] = None
    # Rollout Importance Sampling
    # Controls computation of IS weights and mismatch metrics
    rollout_is_threshold: Optional[float] = None  # null = disabled, float = enabled
    rollout_is_threshold_lower: Optional[float] = None
    rollout_is_level: str = "token"
    rollout_is_mode: str = "truncate"
    rollout_is_veto_threshold: Optional[float] = None
    # Controls whether to apply IS weights to policy loss (only if rollout_is_threshold is set)
    # True = apply weights to loss, False = compute metrics only (no weight application)
    rollout_is: bool = False

-------------------------------------------------------------------------------- /opentinker/environment/math/math_tool_server.py: --------------------------------------------------------------------------------
#!/usr/bin/env python3
"""Code Interpreter Math Environment Server.

This script starts a game server for math problem solving with a code interpreter.
It also manages a sandbox server for Python code execution.

The server handles:
- /reset: Initialize a new math problem session
- /step: Process the LLM response, extract and execute code, return results

Usage:
    # Start with an auto-managed sandbox (recommended):
    python math_tool_server.py --port 8088

    # Start with an external sandbox:
    python math_tool_server.py --port 8088 --sandbox-url http://localhost:8000/run_code

    # For multi-worker mode (production):
    # First start the sandbox separately, then:
    uvicorn opentinker.environment.math.math_tool_server:app \\
        --host 0.0.0.0 --port 8088 --workers 4
"""

import argparse
import atexit
import time

from opentinker.environment.base_game_server import run_game_server, create_game_app
from opentinker.environment.math.math_tool_game import CodeInterpreterMathGame


# Global sandbox reference for cleanup
_sandbox_actor = None
_sandbox_url = None


def start_sandbox_background() -> str:
    """Start the sandbox server in the background and return its URL.

    Returns:
        URL of the sandbox server
    """
    global _sandbox_actor, _sandbox_url

    import ray
    from opentinker.server.sandbox import Sandbox

    # Initialize Ray if needed
    if not ray.is_initialized():
        ray.init(ignore_reinit_error=True)

    # Create and start sandbox
    _sandbox_actor = Sandbox.remote()
    ray.get(_sandbox_actor.start_server.remote())

    # Wait for server to be ready
    time.sleep(0.5)

    # Get address
    address = ray.get(_sandbox_actor.get_server_address.remote())
    _sandbox_url = f"http://{address}/run_code"

    print(f"✓ Sandbox server started at {_sandbox_url}")
    return _sandbox_url


def cleanup_sandbox():
    """Clean up the sandbox server on exit."""
    global _sandbox_actor
    if _sandbox_actor is not None:
        try:
            import ray
            ray.kill(_sandbox_actor)
            print("✓ Sandbox server stopped")
        except Exception:
            # Best-effort cleanup; Ray may already be shut down at exit time.
            pass


# Register cleanup
atexit.register(cleanup_sandbox)
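
# --- Optional manual smoke test (a sketch, kept commented out) ---------------
# The JSON payload below is an assumption about the sandbox's /run_code schema,
# not a documented contract; check the Sandbox actor before relying on it.
#
#   import requests
#   url = start_sandbox_background()   # -> http://<host>:<port>/run_code
#   resp = requests.post(url, json={"code": "print(1 + 1)"}, timeout=30)
#   print(resp.status_code, resp.text)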


def create_app_with_sandbox(sandbox_url: str):
    """Create a FastAPI app with the sandbox URL configured.

    Args:
        sandbox_url: URL of the sandbox server

    Returns:
        FastAPI app
    """
    # Create a game-class factory bound to sandbox_url
    def game_factory(**kwargs):
        return CodeInterpreterMathGame(sandbox_url=sandbox_url, **kwargs)

    return create_game_app(game_class=game_factory)


def main():
    parser = argparse.ArgumentParser(description="Code Interpreter Math Game Server")
    parser.add_argument("--host", default="0.0.0.0", help="Server host")
    parser.add_argument("--port", type=int, default=8088, help="Server port")
    parser.add_argument("--sandbox-url", type=str, default=None,
                        help="External sandbox URL. If not provided, starts an internal sandbox.")
    parser.add_argument("--max-turns", type=int, default=10,
                        help="Maximum turns per problem")
    parser.add_argument("--timeout", type=int, default=30,
                        help="Sandbox execution timeout in seconds")
    args = parser.parse_args()

    print("\n" + "=" * 60)
    print("Code Interpreter Math Game Server")
    print("=" * 60)

    # Determine sandbox URL
    if args.sandbox_url:
        sandbox_url = args.sandbox_url
        print(f"Using external sandbox at: {sandbox_url}")
    else:
        print("Starting internal sandbox server...")
        sandbox_url = start_sandbox_background()

    print("\nServer configuration:")
    print(f"  Host: {args.host}")
    print(f"  Port: {args.port}")
    print(f"  Sandbox URL: {sandbox_url}")
    print(f"  Max turns: {args.max_turns}")
    print(f"  Timeout: {args.timeout}s")
    print("=" * 60 + "\n")

    # Run game server with configured sandbox
    run_game_server(
        game_class=CodeInterpreterMathGame,
        host=args.host,
        port=args.port,
        sandbox_url=sandbox_url,
        max_turns=args.max_turns,
        timeout=args.timeout,
    )


# For uvicorn multi-worker mode, create the app with a default sandbox URL.
# Usage: set the SANDBOX_URL env var before running uvicorn.
import os

_default_sandbox_url = os.environ.get("SANDBOX_URL", "http://localhost:8000/run_code")
app = create_game_app(
    game_class=CodeInterpreterMathGame,
    sandbox_url=_default_sandbox_url,
)


if __name__ == "__main__":
    main()

-------------------------------------------------------------------------------- /opentinker/environment/math/math_tool_env.py: --------------------------------------------------------------------------------
from omegaconf import OmegaConf
from transformers import AutoTokenizer
from torchdata.stateful_dataloader import StatefulDataLoader

from opentinker.environment.base_game_environment import GameEnvironment
from opentinker.environment.base_data_generator import DynamicGameDataset, collate_fn
from opentinker.environment.static_data_generator import StaticDatasetGenerator
from verl.trainer.main_ppo import create_rl_sampler
from opentinker.environment.math.math_tool_game import CodeInterpreterMathGame


class MathCodeInterpreterEnvironment(GameEnvironment):
    """GameEnvironment for math with a code interpreter.

    Uses agent_loop (GenericAgentLoop) with GymEnvironmentInteraction.
    The game server handles code execution internally.
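
    Example (a minimal sketch mirroring the math_tool_rl.py client above;
    ``args`` is a resolved Hydra config and ``job_id`` comes from the scheduler):

        env = MathCodeInterpreterEnvironment(
            game_class=CodeInterpreterMathGame,
            config=args,
            data_paths=[args.data_path],
            val_data_paths=[args.val_data_path] if args.val_data_path else None,
            job_id=job_id,
        )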
24 | """ 25 | 26 | def __init__( 27 | self, 28 | game_class, 29 | config, 30 | data_paths, 31 | val_data_paths=None, 32 | game_kwargs=None, 33 | job_id=None, 34 | ): 35 | self.data_paths = [data_paths] if isinstance(data_paths, str) else list(data_paths) 36 | self.val_data_paths = [val_data_paths] if isinstance(val_data_paths, str) else (list(val_data_paths) if val_data_paths else None) 37 | super().__init__( 38 | game_class=game_class, 39 | config=config, 40 | game_kwargs=game_kwargs or {}, 41 | job_id=job_id 42 | ) 43 | 44 | def _setup_dataloader(self): 45 | """Use StaticDatasetGenerator for static dataset.""" 46 | tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path) 47 | tokenizer.padding_side = "left" 48 | if tokenizer.pad_token is None: 49 | tokenizer.pad_token = tokenizer.eos_token 50 | 51 | dataset_config = OmegaConf.create({ 52 | "max_prompt_length": self.config.max_prompt_tokens, 53 | "truncation": "right", 54 | "return_raw_chat": True, 55 | }) 56 | 57 | # Use CodeInterpreterMathGame for system prompt 58 | math_game_for_prompt = CodeInterpreterMathGame() 59 | 60 | # Training data generator 61 | train_generator = StaticDatasetGenerator( 62 | data_paths=self.data_paths, 63 | interaction_name=self.interaction_name, 64 | prompt_key="prompt", 65 | ground_truth_key="ground_truth", 66 | shuffle=True, 67 | system_prompt=math_game_for_prompt.get_system_prompt(), 68 | ) 69 | 70 | batch_size = self.config.batch_size 71 | num_steps = getattr(self.config, 'num_steps', None) 72 | virtual_size = num_steps * batch_size if num_steps else len(train_generator) * getattr(self.config, 'num_epochs', 1) 73 | 74 | train_dataset = DynamicGameDataset(train_generator, tokenizer, dataset_config, virtual_size=virtual_size) 75 | 76 | sampler_config = OmegaConf.create({ 77 | "shuffle": True, 78 | "seed": 42, 79 | "sampler": None, 80 | }) 81 | train_sampler = create_rl_sampler(sampler_config, train_dataset) 82 | 83 | self.train_dataloader = StatefulDataLoader( 84 | train_dataset, 85 | batch_size=batch_size, 86 | shuffle=False, 87 | sampler=train_sampler, 88 | num_workers=getattr(self.config, 'num_workers', 0), 89 | collate_fn=collate_fn, 90 | drop_last=True 91 | ) 92 | print(f"Training dataloader: {len(self.train_dataloader)} batches") 93 | 94 | # Validation data generator 95 | if self.val_data_paths: 96 | val_generator = StaticDatasetGenerator( 97 | data_paths=self.val_data_paths, 98 | interaction_name=self.interaction_name, 99 | prompt_key="prompt", 100 | ground_truth_key="ground_truth", 101 | shuffle=False, 102 | seed=42, 103 | system_prompt=math_game_for_prompt.get_system_prompt(), 104 | ) 105 | val_batch_size = getattr(self.config, 'val_batch_size', min(64, len(val_generator))) 106 | val_dataset = DynamicGameDataset( 107 | val_generator, tokenizer, dataset_config, 108 | virtual_size=val_batch_size, seed=42 109 | ) 110 | self.val_dataloader = StatefulDataLoader( 111 | val_dataset, 112 | batch_size=val_batch_size, 113 | shuffle=False, 114 | num_workers=getattr(self.config, 'num_workers', 0), 115 | collate_fn=collate_fn, 116 | drop_last=False 117 | ) 118 | print(f"Validation dataloader: {val_batch_size} fixed samples in {len(self.val_dataloader)} batch(es)") 119 | 120 | --------------------------------------------------------------------------------