├── opentinker
│   ├── client
│   │   ├── utils
│   │   │   └── __init__.py
│   │   ├── __init__.py
│   │   ├── client_config
│   │   │   ├── math_inference_config.yaml
│   │   │   ├── opentinker_param.yaml
│   │   │   ├── math_inference_scheduler_config.yaml
│   │   │   ├── gomoku_inference_config.yaml
│   │   │   ├── math_code_interpreter_inference_config.yaml
│   │   │   ├── gomoku_inference_scheduler_config.yaml
│   │   │   ├── geo3k_param.yaml
│   │   │   ├── math_code_interpreter_param.yaml
│   │   │   ├── geo3k_tool_param.yaml
│   │   │   ├── math_param.yaml
│   │   │   ├── generic_env_param.yaml
│   │   │   └── gomoku_param.yaml
│   │   ├── legacy
│   │   │   ├── math_inference.py
│   │   │   └── gomoku_inference.py
│   │   ├── geo3k_rl.py
│   │   ├── math_rl.py
│   │   ├── math_inference.py
│   │   ├── math_tool_rl.py
│   │   ├── gomoku_inference.py
│   │   ├── math_tool_inference.py
│   │   └── geo3k_tool_rl.py
│   ├── server
│   │   ├── __init__.py
│   │   ├── agent.yaml
│   │   ├── config
│   │   │   ├── tool_config_math.json
│   │   │   ├── evaluation.yaml
│   │   │   ├── actor
│   │   │   │   ├── megatron_actor.yaml
│   │   │   │   └── dp_actor.yaml
│   │   │   ├── ref
│   │   │   │   ├── megatron_ref.yaml
│   │   │   │   ├── dp_ref.yaml
│   │   │   │   └── ref.yaml
│   │   │   ├── __init__.py
│   │   │   ├── npu_profile
│   │   │   │   └── npu_profile.yaml
│   │   │   ├── optim
│   │   │   │   ├── megatron.yaml
│   │   │   │   └── fsdp.yaml
│   │   │   ├── critic
│   │   │   │   ├── megatron_critic.yaml
│   │   │   │   └── dp_critic.yaml
│   │   │   ├── engine
│   │   │   │   ├── fsdp.yaml
│   │   │   │   └── megatron.yaml
│   │   │   ├── reward_model
│   │   │   │   ├── dp_reward_model.yaml
│   │   │   │   ├── megatron_reward_model.yaml
│   │   │   │   └── reward_model.yaml
│   │   │   ├── generation.yaml
│   │   │   ├── model
│   │   │   │   └── hf_model.yaml
│   │   │   ├── sft_trainer_engine.yaml
│   │   │   ├── sft_trainer.yaml
│   │   │   ├── config.py
│   │   │   ├── data
│   │   │   │   └── legacy_data.yaml
│   │   │   └── algorithm.py
│   │   └── sandbox_tool.py
│   ├── scheduler
│   │   ├── __init__.py
│   │   ├── config
│   │   │   └── scheduler.yaml
│   │   ├── register_user_example.py
│   │   ├── web_dashboard.py
│   │   └── SCHEDULER_GUIDE.md
│   ├── data_preprocess
│   │   ├── __init__.py
│   │   ├── math.py
│   │   ├── geo3k.py
│   │   ├── math_multiturn_w_interaction.py
│   │   └── math_dataset.py
│   ├── reward_functions
│   │   ├── __init__.py
│   │   └── math_reward_server.py
│   ├── docs
│   │   ├── images
│   │   │   └── opentinker_arch.jpeg
│   │   ├── CORS_FIX.md
│   │   └── SERVER_CONNECTION_FIX.md
│   ├── __init__.py
│   ├── backend_patch
│   │   └── note.md
│   ├── environment
│   │   ├── geo3k
│   │   │   ├── __init__.py
│   │   │   ├── geo3k_env.py
│   │   │   ├── geo3k_tool_server.py
│   │   │   ├── geo3k_server.py
│   │   │   └── geo3k_tool_env.py
│   │   ├── math
│   │   │   ├── __init__.py
│   │   │   ├── math_server.py
│   │   │   ├── math_env.py
│   │   │   ├── math_tool_server.py
│   │   │   └── math_tool_env.py
│   │   ├── legacy
│   │   │   ├── example
│   │   │   │   └── interaction_config.yaml
│   │   │   └── generic
│   │   │       └── README.md
│   │   ├── gomoku
│   │   │   ├── __init__.py
│   │   │   └── gomoku_server.py
│   │   ├── __init__.py
│   │   ├── static_data_generator_vl.py
│   │   └── environment.py
│   ├── utils
│   │   └── __init__.py
│   ├── requirements.txt
│   ├── scripts
│   │   └── launch_scheduler.sh
│   ├── setup_cross_node.sh
│   └── test_geo3k_data.py
├── assets
│   ├── reallogo.png
│   └── README.md
├── scheduler_users.db
├── .gitmodules
├── data
│   ├── read.py
│   └── math
│       ├── test_example.json
│       └── train_example.json
├── .gitignore
├── setup.py
└── docs
    └── geo3k_quickstart.md

/opentinker/client/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/opentinker/client/__init__.py:
--------------------------------------------------------------------------------
1 | """OpenTinker client module."""
2 | 
--------------------------------------------------------------------------------
/opentinker/server/__init__.py:
--------------------------------------------------------------------------------
1 | """OpenTinker server module."""
2 | 
--------------------------------------------------------------------------------
/opentinker/scheduler/__init__.py:
--------------------------------------------------------------------------------
1 | """OpenTinker scheduler module."""
2 | 
--------------------------------------------------------------------------------
/opentinker/data_preprocess/__init__.py:
--------------------------------------------------------------------------------
1 | """OpenTinker data preprocessing module."""
2 | 
--------------------------------------------------------------------------------
/opentinker/reward_functions/__init__.py:
--------------------------------------------------------------------------------
1 | """OpenTinker reward functions module."""
2 | 
--------------------------------------------------------------------------------
/assets/reallogo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open-tinker/OpenTinker/HEAD/assets/reallogo.png
--------------------------------------------------------------------------------
/scheduler_users.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open-tinker/OpenTinker/HEAD/scheduler_users.db
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "verl"]
2 | path = verl
3 | url = https://github.com/volcengine/verl.git
--------------------------------------------------------------------------------
/opentinker/server/agent.yaml:
--------------------------------------------------------------------------------
1 | - name: generic_agent
2 |   _target_: opentinker.server.generic_agent_loop.GenericAgentLoop
--------------------------------------------------------------------------------
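The `_target_` entry in agent.yaml follows the Hydra instantiation convention used by the verl-based configs throughout this repository. A minimal sketch of resolving such an entry with the standard Hydra/OmegaConf APIs — the server's actual loader may differ:

```python
# Illustrative only: resolve the dotted class path from agent.yaml.
from hydra.utils import get_class
from omegaconf import OmegaConf

agents = OmegaConf.load("opentinker/server/agent.yaml")
for entry in agents:
    cls = get_class(entry["_target_"])  # -> the GenericAgentLoop class object
    print(entry["name"], "->", cls)
```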
/opentinker/docs/images/opentinker_arch.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open-tinker/OpenTinker/HEAD/opentinker/docs/images/opentinker_arch.jpeg
--------------------------------------------------------------------------------
/opentinker/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | OpenTinker: A framework for training and inference with interactive environments.
3 | """
4 | 
5 | __version__ = "0.1.0"
6 | 
--------------------------------------------------------------------------------
/data/read.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset
2 | import pandas as pd
3 | 
4 | # Load your data
5 | ds = load_dataset("parquet", data_files="./data/geo3k/test.parquet")
6 | print("Dataset columns:", ds['train'].column_names)
7 | print("\nFirst row:")
8 | print(ds['train'][0])
--------------------------------------------------------------------------------
/opentinker/backend_patch/note.md:
--------------------------------------------------------------------------------
1 | This is a patch for verl, making it work with OpenTinker.
2 | 
3 | The patch is based on verl 0.7.0.dev0:
4 | 
5 | git clone https://github.com/volcengine/verl.git
6 | cd verl
7 | git checkout 418f964ab84d2b7c49aa4404f65774917501b092
--------------------------------------------------------------------------------
/opentinker/server/config/tool_config_math.json:
--------------------------------------------------------------------------------
1 | {
2 |   "tools": [
3 |     {
4 |       "class_name": "opentinker.server.sandbox_tool.SandboxTool",
5 |       "config": {
6 |         "type": "native",
7 |         "sandbox_fusion_url": "http://localhost:8000/run_code"
8 |       }
9 |     }
10 |   ]
11 | }
12 | 
--------------------------------------------------------------------------------
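The `class_name` entries in tool_config_math.json are dotted import paths. A sketch of how such a registry can be resolved with only the standard library; passing the config dict as a `config=` keyword is an assumption, not SandboxTool's confirmed signature:

```python
import importlib
import json

with open("opentinker/server/config/tool_config_math.json") as f:
    spec = json.load(f)

for tool in spec["tools"]:
    # Split "pkg.module.Class" into module path and class name.
    module_path, _, cls_name = tool["class_name"].rpartition(".")
    tool_cls = getattr(importlib.import_module(module_path), cls_name)
    tool_instance = tool_cls(config=tool["config"])  # hypothetical constructor
```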
/opentinker/environment/geo3k/__init__.py:
--------------------------------------------------------------------------------
1 | """Geo3K geometry problem-solving game for OpenTinker."""
2 | 
3 | from .geo3k_game import Geo3KGame
4 | from .geo3k_env import Geo3KGameEnvironment
5 | from .geo3k_tool_game import Geo3KToolGame
6 | from .geo3k_tool_env import Geo3KToolEnvironment
7 | 
8 | __all__ = [
9 |     "Geo3KGame",
10 |     "Geo3KGameEnvironment",
11 |     "Geo3KToolGame",
12 |     "Geo3KToolEnvironment",
13 | ]
--------------------------------------------------------------------------------
/assets/README.md:
--------------------------------------------------------------------------------
1 | # Assets Directory
2 | 
3 | This directory contains visual assets for the README and documentation.
4 | 
5 | ## Required Files
6 | 
7 | | File | Description |
8 | |------|-------------|
9 | | `logo.png` | OpenTinker logo (recommended: 200x200 px) |
10 | | `demo.gif` | Demo animation showing OpenTinker in action |
11 | 
12 | ## Optional Files
13 | 
14 | Add any additional screenshots, diagrams, or visual materials here.
--------------------------------------------------------------------------------
/opentinker/server/config/evaluation.yaml:
--------------------------------------------------------------------------------
1 | data:
2 |   path: /tmp/math_Qwen2-7B-Instruct.parquet
3 |   prompt_key: prompt
4 |   response_key: responses
5 |   data_source_key: data_source
6 |   reward_model_key: reward_model
7 | 
8 | custom_reward_function:
9 |   path: null
10 |   name: compute_score
11 | 
12 | ray_kwargs:
13 |   ray_init:
14 |     num_cpus: null # `None` means using all CPUs, which might cause a hang if CPUs are limited in systems like SLURM. Set it to an allowed number in that case.
15 |   timeline_json_file: null
16 | 
--------------------------------------------------------------------------------
/opentinker/environment/math/__init__.py:
--------------------------------------------------------------------------------
1 | """Math Environment Package.
2 | 
3 | Provides MathGame and related components for math problem solving:
4 | - MathGame: Single-turn math problem solving with rewards computed in step()
5 | - CodeInterpreterMathGame: Multi-turn math with code interpreter tool support
6 | """
7 | 
8 | from opentinker.environment.math.math_game import MathGame
9 | from opentinker.environment.math.math_tool_game import CodeInterpreterMathGame
10 | 
11 | __all__ = ["MathGame", "CodeInterpreterMathGame"]
12 | 
--------------------------------------------------------------------------------
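A minimal usage sketch for MathGame, mirroring the GameEnvironment pattern documented in the gomoku package docstring later in this dump; treating `math_param.yaml` as a suitable config object here is an assumption:

```python
from omegaconf import OmegaConf

from opentinker.environment.base_game_environment import GameEnvironment
from opentinker.environment.math import MathGame

# Hypothetical config choice; any OmegaConf config with the expected fields works.
config = OmegaConf.load("opentinker/client/client_config/math_param.yaml")
env = GameEnvironment(game_class=MathGame, config=config)
```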
/opentinker/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Utility modules for OpenTinker
2 | from opentinker.utils.rollout_trace_saver import (
3 |     RolloutTraceSaver,
4 |     RolloutTrace,
5 |     init_weave_tracing,
6 |     init_mlflow_tracing,
7 |     get_global_saver,
8 |     set_global_saver,
9 |     init_global_saver,
10 | )
11 | 
12 | __all__ = [
13 |     "RolloutTraceSaver",
14 |     "RolloutTrace",
15 |     "init_weave_tracing",
16 |     "init_mlflow_tracing",
17 |     "get_global_saver",
18 |     "set_global_saver",
19 |     "init_global_saver",
20 | ]
--------------------------------------------------------------------------------
/opentinker/server/config/actor/megatron_actor.yaml:
--------------------------------------------------------------------------------
1 | # megatron actor config, inheriting from trainer/config/actor/actor.yaml
2 | defaults:
3 |   # megatron optimizer config
4 |   - ../optim@optim: megatron
5 | 
6 |   # megatron engine config
7 |   - ../engine@megatron: megatron
8 | 
9 |   - actor
10 | 
11 |   # load the reference default config, then apply the fields in the current yaml
12 |   - _self_
13 | 
14 | _target_: verl.workers.config.McoreActorConfig
15 | 
16 | strategy: megatron
17 | 
18 | data_loader_seed: null
19 | 
20 | load_weight: True
--------------------------------------------------------------------------------
/data/math/test_example.json:
--------------------------------------------------------------------------------
1 | {
2 |     "level": "Level 3",
3 |     "type": "Algebra",
4 |     "data_source": "DigitalLearningGmbH/MATH-lighteval",
5 |     "prompt": [
6 |         {
7 |             "content": "How many vertical asymptotes does the graph of $y=\\frac{2}{x^2+x-6}$ have? Let's think step by step and output the final answer within \\boxed{}.",
8 |             "role": "user"
9 |         }
10 |     ],
11 |     "ability": "math",
12 |     "reward_model": {
13 |         "ground_truth": "2",
14 |         "style": "rule"
15 |     },
16 |     "extra_info": {
17 |         "index": 0,
18 |         "split": "test"
19 |     }
20 | }
--------------------------------------------------------------------------------
/opentinker/requirements.txt:
--------------------------------------------------------------------------------
1 | # OpenTinker Requirements
2 | # Python 3.8+
3 | 
4 | # Core dependencies
5 | ray>=2.9.0
6 | torch>=2.0.0
7 | transformers>=4.35.0
8 | 
9 | # Web framework
10 | fastapi>=0.104.0
11 | uvicorn>=0.24.0
12 | pydantic>=2.0.0
13 | 
14 | # Configuration
15 | omegaconf>=2.3.0
16 | hydra-core>=1.3.0
17 | pyyaml>=6.0
18 | 
19 | # Data processing
20 | pandas>=2.0.0
21 | pyarrow>=14.0.0
22 | datasets>=2.14.0
23 | 
24 | # Utilities
25 | requests>=2.31.0
26 | aiohttp>=3.9.0
27 | 
28 | # Optional: Logging and monitoring
29 | wandb>=0.16.0
30 | 
31 | # Optional: Development tools
32 | pytest>=7.4.0
33 | black>=23.0.0
34 | flake8>=6.1.0
--------------------------------------------------------------------------------
/opentinker/server/config/ref/megatron_ref.yaml:
--------------------------------------------------------------------------------
1 | # megatron ref config, inheriting from trainer/config/ref/ref.yaml
2 | defaults:
3 |   - ref
4 | 
5 |   # megatron engine config
6 |   - ../engine@megatron: megatron
7 | 
8 |   # load the reference default config, then apply the fields in the current yaml
9 |   - _self_
10 | 
11 | strategy: megatron
12 | 
13 | megatron:
14 |   _target_: verl.workers.config.MegatronEngineConfig
15 |   seed: ${oc.select:actor_rollout_ref.actor.megatron.seed,42}
16 |   override_transformer_config: ${oc.select:actor_rollout_ref.actor.megatron.override_transformer_config,{}}
17 |   use_mbridge: ${oc.select:actor_rollout_ref.actor.megatron.use_mbridge,False}
18 | 
19 | load_weight: True
--------------------------------------------------------------------------------
/opentinker/scheduler/config/scheduler.yaml:
--------------------------------------------------------------------------------
1 | # Job Scheduler Configuration
2 | 
3 | # GPU IDs available for job allocation
4 | available_gpus: [0, 1, 2, 3]
5 | 
6 | # Number of GPUs to allocate per job
7 | gpus_per_job: 4
8 | 
9 | # Port range for spawned training servers [min, max]
10 | # Set to null to auto-detect available ports
11 | port_range: null # or [38564, 38600] for manual range
12 | 
13 | # Number of ports to auto-detect if port_range is null
14 | num_ports: 50
15 | 
16 | # Port for the scheduler server itself
17 | scheduler_port: 8765
18 | 
19 | # Authentication settings
20 | enable_auth: false # Set to false to disable authentication
21 | 
22 | # Path to SQLite user database
23 | user_db_path: "scheduler_users.db"
--------------------------------------------------------------------------------
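A sketch of querying the scheduler from Python. The `/list_jobs` endpoint and the Authorization header are referenced by the dashboard notes in `opentinker/docs/CORS_FIX.md`; the exact header scheme shown here is an assumption:

```python
import requests

SCHEDULER_URL = "http://localhost:8765"  # scheduler_port from scheduler.yaml
API_KEY = "otk_..."  # placeholder; only needed when enable_auth is true

resp = requests.get(
    f"{SCHEDULER_URL}/list_jobs",
    headers={"Authorization": f"Bearer {API_KEY}"},  # assumed Bearer scheme
    timeout=10,
)
resp.raise_for_status()
print(resp.json())
```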
/data/math/train_example.json:
--------------------------------------------------------------------------------
1 | {
2 |     "level": "Level 5",
3 |     "type": "Algebra",
4 |     "data_source": "DigitalLearningGmbH/MATH-lighteval",
5 |     "prompt": [
6 |         {
7 |             "content": "Let \\[f(x) = \\left\\{\n\\begin{array}{cl} ax+3, &\\text{ if }x>2, \\\\\nx-5 &\\text{ if } -2 \\le x \\le 2, \\\\\n2x-b &\\text{ if } x <-2.\n\\end{array}\n\\right.\\]Find $a+b$ if the piecewise function is continuous (which means that its graph can be drawn without lifting your pencil from the paper). Let's think step by step and output the final answer within \\boxed{}.",
8 |             "role": "user"
9 |         }
10 |     ],
11 |     "ability": "math",
12 |     "reward_model": {
13 |         "ground_truth": "0",
14 |         "style": "rule"
15 |     },
16 |     "extra_info": {
17 |         "index": 0,
18 |         "split": "train"
19 |     }
20 | }
--------------------------------------------------------------------------------
/opentinker/environment/legacy/example/interaction_config.yaml:
--------------------------------------------------------------------------------
1 | # Generic Environment Interaction Configuration
2 | # This file configures interactions for use with GenericAgentLoop.
3 | # Each entry defines an interaction that can be used during training.
4 | 
5 | # Gym Environment Interaction
6 | # Connects to an external Gym-like environment via HTTP API
7 | - name: gym_env
8 |   class: opentinker.environment.gym_environment_interaction.GymEnvironmentInteraction
9 |   config:
10 |     # HTTP endpoint for the environment server
11 |     env_endpoint: "http://localhost:8084"
12 |     # Maximum steps per episode
13 |     max_steps: 100
14 |     # Template for formatting observations as messages
15 |     # Available variables: {observation}, {reward}, {step}, {cumulative_reward}
16 |     observation_template: "Environment observation: {observation}"
--------------------------------------------------------------------------------
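The `observation_template` above uses Python `str.format`-style placeholders; the variable names come from the comment in interaction_config.yaml, while the surrounding wiring is assumed:

```python
template = "Environment observation: {observation}"

# Extra keyword arguments not referenced by the template are ignored by str.format.
message = template.format(
    observation="board state ...",
    reward=0.0,
    step=3,
    cumulative_reward=1.5,
)
```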
/opentinker/server/config/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from . import algorithm, config
16 | from .algorithm import *  # noqa: F401
17 | from .config import *  # noqa: F401
18 | 
19 | __all__ = config.__all__ + algorithm.__all__
--------------------------------------------------------------------------------
/opentinker/environment/gomoku/__init__.py:
--------------------------------------------------------------------------------
1 | """Gomoku Environment Module for LLM Training.
2 | 
3 | Usage:
4 |     from opentinker.environment.base_game_environment import GameEnvironment
5 |     from opentinker.environment.gomoku import GomokuGame
6 |     from opentinker.environment.game_stats_client import GameStatsClient
7 | 
8 |     env = GameEnvironment(game_class=GomokuGame, config=config)
9 |     stats_client = GameStatsClient(env_endpoint)
10 | 
11 |     # Optional: GomokuGameStats for server-side metrics
12 |     from opentinker.environment.gomoku import GomokuGameStats  # may be None
13 | """
14 | 
15 | from .gomoku_game import GomokuGame
16 | 
17 | # GomokuGameStats is optional - only available if gomoku_stats.py exists
18 | try:
19 |     from .gomoku_stats import GomokuGameStats
20 | except ImportError:
21 |     GomokuGameStats = None
22 | 
23 | __all__ = [
24 |     "GomokuGame",
25 |     "GomokuGameStats",
26 | ]
--------------------------------------------------------------------------------
/opentinker/client/client_config/math_inference_config.yaml:
--------------------------------------------------------------------------------
1 | # OpenTinker Inference Configuration
2 | # Use with: python math_inference.py
3 | 
4 | # Model settings
5 | model_path: null # Path to trained checkpoint (HuggingFace format)
6 | tokenizer_path: null # Tokenizer path (defaults to model_path if null)
7 | 
8 | # GPU settings
9 | tensor_parallel_size: 1 # Number of GPUs for tensor parallelism
10 | gpu_memory_utilization: 0.9
11 | 
12 | # Generation parameters (greedy by default for inference)
13 | temperature: 0.0 # 0.0 = greedy decoding
14 | top_p: 1.0
15 | max_new_tokens: 4096
16 | 
17 | # Data settings
18 | data_path: null # Input data file (parquet/jsonl)
19 | output_path: null # Output results file (jsonl)
20 | max_samples: null # Limit samples (null = all)
21 | 
22 | # Environment settings
23 | env_endpoint: http://localhost:8088
24 | 
25 | # Multi-turn settings (same as training config)
26 | multi_turn:
27 |   max_user_turns: 0
28 |   max_assistant_turns: 1
--------------------------------------------------------------------------------
/opentinker/server/config/ref/dp_ref.yaml:
--------------------------------------------------------------------------------
1 | # defaults specify the default config from each component
2 | defaults:
3 | 
4 |   # dp ref config, inheriting from trainer/config/ref/ref.yaml
5 |   - ref
6 | 
7 |   # fsdp engine config
8 |   - ../engine@fsdp_config: fsdp
9 | 
10 |   # load the reference default config, then apply the fields in the current yaml
11 |   - _self_
12 | 
13 | # ref model is assumed to be identical to actor model. Specify model.path for using a different ref model.
14 | # Potential use case involves on policy distillation where we calculate KL divergence between student actor
15 | # and teacher ref
16 | model: null
17 | 
18 | # sequence parallel size
19 | # same as actor_rollout_ref.actor.ulysses_sequence_parallel_size if it exists, otherwise 1
20 | ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1}
21 | 
22 | # calculate entropy with chunking to reduce memory peak
23 | entropy_from_logits_with_chunking: False
24 | 
25 | # recompute entropy
26 | entropy_checkpointing: False
27 | 
--------------------------------------------------------------------------------
/opentinker/server/config/npu_profile/npu_profile.yaml:
--------------------------------------------------------------------------------
1 | # Options for the npu profiler
2 | options:
3 | 
4 |   # Storage path of collected data.
5 |   save_path: ./profiler_data
6 | 
7 |   # The roles that will be profiled. Only takes effect in discrete mode.
8 |   # optional values: all, rollout_generate, actor_compute_log_prob, actor_update and ref_compute_log_prob.
9 |   # "all" means all roles will be profiled.
10 |   roles: ["all"]
11 | 
12 |   # Collection level, optional values: level_none, level0, level1, level2.
13 |   level: level1
14 | 
15 |   # Whether to enable memory analysis.
16 |   with_memory: False
17 | 
18 |   # Whether to record tensor shape.
19 |   record_shapes: False
20 | 
21 |   # Whether to record Device-side performance data.
22 |   with_npu: True
23 | 
24 |   # Whether to record Host-side performance data.
25 |   with_cpu: True
26 | 
27 |   # Whether to record Python call stack information.
28 |   with_module: False
29 | 
30 |   # Whether to record operator call stack information.
31 |   with_stack: False
32 | 
33 |   # Whether to automatically parse the data.
34 |   analysis: True
--------------------------------------------------------------------------------
/opentinker/server/config/optim/megatron.yaml:
--------------------------------------------------------------------------------
1 | _target_: verl.workers.config.McoreOptimizerConfig
2 | 
3 | # Learning rate
4 | lr: 1e-3
5 | 
6 | # LR warmup steps ratio
7 | lr_warmup_steps_ratio: 0.0
8 | 
9 | # Total training steps
10 | total_training_steps: -1
11 | 
12 | # Weight decay
13 | weight_decay: 0.01
14 | 
15 | # LR warmup steps
16 | lr_warmup_steps: -1
17 | 
18 | # Betas for Adam optimizer
19 | betas: [0.9, 0.999]
20 | 
21 | # Clip gradient
22 | clip_grad: 1.0
23 | 
24 | # optimizer type
25 | optimizer: adam
26 | 
27 | # initial learning rate for warmup, default to 0.0
28 | lr_warmup_init: 0.0
29 | 
30 | lr_decay_steps: null
31 | 
32 | # select from constant/linear/cosine/inverse_square_root
33 | lr_decay_style: constant
34 | 
35 | # minimum learning rate, default to 0.0
36 | min_lr: 0.0
37 | 
38 | # select from constant/linear/cosine
39 | weight_decay_incr_style: constant
40 | 
41 | # select from constant/exponential/cosine
42 | lr_wsd_decay_style: exponential
43 | 
44 | lr_wsd_decay_steps: null
45 | 
46 | # use checkpoint optimizer parameter scheduler
47 | use_checkpoint_opt_param_scheduler: False
48 | 
49 | override_optimizer_config: {}
--------------------------------------------------------------------------------
/opentinker/server/config/optim/fsdp.yaml:
--------------------------------------------------------------------------------
1 | # Target class for this configuration
2 | _target_: verl.workers.config.FSDPOptimizerConfig
3 | 
4 | # Optimizer class name (e.g., "AdamW", "AdamW8bit", "_AdamW", "Adam")
5 | optimizer: AdamW
6 | 
7 | # Module path to import optimizer
8 | # Examples: "torch.optim", "torchao.optim", "bitsandbytes.optim"
9 | optimizer_impl: torch.optim
10 | 
11 | # Learning rate
12 | lr: 1e-3
13 | 
14 | # LR warmup steps ratio
15 | lr_warmup_steps_ratio: 0.0
16 | 
17 | # Total training steps
18 | total_training_steps: -1
19 | 
20 | # Weight decay
21 | weight_decay: 0.01
22 | 
23 | # LR warmup steps
24 | lr_warmup_steps: -1
25 | 
26 | # Betas for Adam optimizer
27 | betas: [0.9, 0.999]
28 | 
29 | # Clip gradient
30 | clip_grad: 1.0
31 | 
32 | # Minimum LR ratio for cosine schedule
33 | min_lr_ratio: 0.0
34 | 
35 | # Number of cosine cycles in LR schedule
36 | num_cycles: 0.5
37 | 
38 | # LR scheduler type: "constant" or "cosine"
39 | lr_scheduler_type: constant
40 | 
41 | # deprecated
42 | warmup_style: null
43 | 
44 | # Additional optimizer-specific keyword arguments
45 | # Example for torchao with bf16 stochastic rounding:
46 | #   optimizer_impl: torchao.optim
47 | #   optimizer: _AdamW
48 | #   override_optimizer_config:
49 | #     bf16_stochastic_round: true
50 | override_optimizer_config: null
--------------------------------------------------------------------------------
/opentinker/client/client_config/opentinker_param.yaml:
--------------------------------------------------------------------------------
1 | server_url: "http://localhost:8000"
2 | scheduler_api_key: "otk_98b8db24ccd64c92e1fdd9a232e209fa" # Your API key for scheduler authentication
3 | 
4 | # GPU allocation
5 | num_gpus: 4 # Number of GPUs to request from scheduler (default: 4)
6 | 
7 | data_path: null
8 | val_data_path: null
9 | tokenizer_path: null
10 | batch_size: 64
11 | val_batch_size: 100
12 | # Training duration - set ONE of these (num_steps takes precedence if both set)
13 | num_epochs: 10 # Number of epochs (null = use num_steps)
14 | num_steps: null # Total training steps (null = use num_epochs)
15 | num_workers: 0
16 | 
17 | # reward function
18 | ## api reward function
19 | reward:
20 |   type: "remote"
21 |   remote:
22 |     reward_ip: "localhost"
23 |     reward_port: null
24 |     remote_api_key: null
25 |     auto_start: true # Enable auto-start of reward server
26 |   ## code reward function
27 |   code:
28 |     code_function: null
29 | 
30 | project_name: "agent_loop_training"
31 | experiment_name: "math_with_tools"
32 | save_freq: 100
33 | test_freq: 50
34 | 
35 | temperature: 1
36 | top_p: 1
37 | max_new_tokens: 4096
38 | max_prompt_tokens: 4096
39 | 
40 | algorithm: "toolcall"
41 | 
42 | logger_backends: ["console"] # options: ["console", "wandb"]
43 | wandb_key: null
--------------------------------------------------------------------------------
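A standalone sketch of loading a client config such as opentinker_param.yaml with OmegaConf (pinned in requirements.txt). The client scripts themselves go through Hydra decorators, so this is illustration only:

```python
from omegaconf import OmegaConf

cfg = OmegaConf.load("opentinker/client/client_config/opentinker_param.yaml")
print(cfg.batch_size)   # 64
print(cfg.reward.type)  # "remote"
```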
/opentinker/server/config/critic/megatron_critic.yaml:
--------------------------------------------------------------------------------
1 | # defaults specify the default config from each component
2 | defaults:
3 | 
4 |   # megatron optimizer config
5 |   - ../optim@optim: megatron
6 | 
7 |   # megatron engine config
8 |   - ../engine@megatron: megatron
9 | 
10 |   # dp actor config, inheriting from trainer/config/critic/critic.yaml
11 |   - critic
12 | 
13 |   # load the reference default config, then apply the fields in the current yaml
14 |   - _self_
15 | 
16 | # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
17 | _target_: verl.workers.config.McoreCriticConfig
18 | 
19 | strategy: megatron
20 | 
21 | # seconds, default is 10 minutes for torch, you can set it to a larger value if you have long-running operations like 32B or 72B model using megatron
22 | nccl_timeout: 600
23 | 
24 | # model config for the critic
25 | model:
26 | 
27 |   # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
28 |   _target_: verl.trainer.config.BaseModelConfig
29 | 
30 |   # override default empty mapping
31 |   override_config:
32 | 
33 |     model_config: {}
34 | 
35 |     moe_config:
36 | 
37 |       freeze_moe_router: False
38 | 
39 | # Whether to load initial weights
40 | load_weight: True
41 | 
42 | # seed for data loader
43 | data_loader_seed: ${oc.select:actor_rollout_ref.actor.data_loader_seed,null}
--------------------------------------------------------------------------------
/opentinker/client/client_config/math_inference_scheduler_config.yaml:
--------------------------------------------------------------------------------
1 | # Math Inference with Scheduler Configuration
2 | # Use with: python math_inference_with_scheduler.py
3 | 
4 | # Scheduler settings
5 | scheduler_url: http://0.0.0.0:8789
6 | scheduler_api_key: otk_98b8db24ccd64c92e1fdd9a232e209fa # Optional API key for authentication
7 | 
8 | # Model settings
9 | model_path: null # Path to trained checkpoint (HuggingFace format)
10 | tokenizer_path: null # Tokenizer path (defaults to model_path if null)
11 | 
12 | # GPU settings for vLLM server
13 | tensor_parallel_size: 1 # Number of GPUs for tensor parallelism
14 | num_gpus: null # Override number of GPUs (defaults to tensor_parallel_size)
15 | gpu_memory_utilization: 0.9
16 | max_model_len: null # Max model context length (optional)
17 | trust_remote_code: true
18 | 
19 | # Generation parameters (greedy by default for inference)
20 | temperature: 0.0 # 0.0 = greedy decoding
21 | top_p: 1.0
22 | max_new_tokens: 4096
23 | 
24 | # Data settings
25 | data_path: null # Input data file (parquet/jsonl)
26 | output_path: null # Output results file (jsonl)
27 | max_samples: null # Limit samples (null = all)
28 | 
29 | # Environment settings
30 | env_endpoint: http://0.0.0.0:8088
31 | 
32 | # Multi-turn settings (same as training config)
33 | multi_turn:
34 |   max_user_turns: 0
35 |   max_assistant_turns: 1
36 | 
--------------------------------------------------------------------------------
/opentinker/server/config/actor/dp_actor.yaml:
--------------------------------------------------------------------------------
1 | # Format checks enforced on CI:
2 | # 1. Comments must appear above each field.
3 | # 2. There must be a blank line between each field.
4 | # 3. Inline comments (after a field on the same line) are not allowed.
5 | # 4. Indentation level is respected for nested fields.
6 | 
7 | # defaults specify the default config from each component
8 | defaults:
9 | 
10 |   # fsdp optimizer config
11 |   - ../optim@optim: fsdp
12 | 
13 |   # fsdp engine config
14 |   - ../engine@fsdp_config: fsdp
15 | 
16 |   # dp actor config, inheriting from trainer/config/actor/actor.yaml
17 |   - actor
18 | 
19 |   # load the reference default config, then apply the fields in the current yaml
20 |   - _self_
21 | 
22 | # Target class for this configuration
23 | _target_: verl.workers.config.FSDPActorConfig
24 | 
25 | # TODO(haibin.lin): switch to fsdp2
26 | strategy: fsdp
27 | 
28 | # Gradient clipping for actor updates, specific to the strategy.
29 | grad_clip: 1.0
30 | 
31 | # Sequence parallelism size for Ulysses-style model parallelism
32 | # oc.select: the default val for ref.ulysses_sequence_parallel_size
33 | ulysses_sequence_parallel_size: 1
34 | 
35 | # calculate entropy with chunking to reduce memory peak
36 | entropy_from_logits_with_chunking: False
37 | 
38 | # recompute entropy
39 | entropy_checkpointing: False
40 | 
41 | # Whether to remove padding tokens in inputs during training
42 | use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false}
--------------------------------------------------------------------------------
/opentinker/client/client_config/gomoku_inference_config.yaml:
--------------------------------------------------------------------------------
1 | # Gomoku Inference Configuration
2 | # Use with: python gomoku_inference.py
3 | 
4 | # Model settings
5 | model_path: null # Path to trained checkpoint (HuggingFace format)
6 | tokenizer_path: null # Tokenizer path (defaults to model_path if null)
7 | vllm_server_url: null # vLLM server URL for server mode (e.g., "http://localhost:8000")
8 | 
9 | # GPU settings (offline mode only)
10 | tensor_parallel_size: 1 # Number of GPUs for tensor parallelism
11 | gpu_memory_utilization: 0.9
12 | 
13 | # Generation parameters
14 | temperature: 0.0 # 0.0 = greedy decoding
15 | top_p: 1.0
16 | max_new_tokens: 8192 # TOTAL response budget for entire multi-turn trajectory (NOT per-turn!)
17 | max_prompt_tokens: 4096
18 | max_context_length: 30000 # Max context before ending game (< model max 32768)
19 | 
20 | # Data settings (Gomoku uses dynamic generation, no data_path needed)
21 | data_path: null # Not needed for Gomoku (uses dynamic generation)
22 | output_path: null # Output results file (jsonl)
23 | max_samples: 10 # Number of games to play
24 | 
25 | # Environment settings
26 | env_endpoint: http://localhost:8091
27 | 
28 | # Multi-turn settings (Gomoku is multi-turn game)
29 | multi_turn:
30 |   max_user_turns: 39 # Max environment turns (moves)
31 |   max_assistant_turns: 39 # Max model response turns
32 |   max_tokens_per_turn: 256 # Per-turn response limit (optional, null for no limit)
33 | 
34 | 
35 | # Game-specific settings
36 | board_size: 9 # Gomoku board size (9x9)
--------------------------------------------------------------------------------
/opentinker/environment/geo3k/geo3k_env.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """Geo3K Game Environment using VL components."""
3 | 
4 | from opentinker.environment.vl_game_environment import VLGameEnvironment
5 | from opentinker.environment.geo3k.geo3k_game import Geo3KGame
6 | 
7 | 
8 | class Geo3KGameEnvironment(VLGameEnvironment):
9 |     """GameEnvironment for Geo3K geometry problems with vision-language models.
10 | 
11 |     This environment uses:
12 |     - VLGameEnvironment for multimodal data processing
13 |     - StaticDatasetGeneratorVL for loading Geo3K parquet data with images
14 |     - Geo3KGame for geometry problem logic
15 | 
16 |     Args:
17 |         config: Configuration object
18 |         data_paths: Training data paths (parquet files)
19 |         val_data_paths: Validation data paths (optional)
20 |         job_id: Job identifier
21 | 
22 |     Example:
23 |         env = Geo3KGameEnvironment(
24 |             config=config,
25 |             data_paths=["~/data/geo3k/train.parquet"],
26 |             val_data_paths=["~/data/geo3k/test.parquet"],
27 |             job_id="geo3k_training_001",
28 |         )
29 |     """
30 | 
31 |     def __init__(self, config, data_paths, val_data_paths=None, job_id=None):
32 |         # Initialize with Geo3K game and VL environment
33 |         super().__init__(
34 |             game_class=Geo3KGame,
35 |             config=config,
36 |             data_paths=data_paths,
37 |             val_data_paths=val_data_paths,
38 |             game_kwargs={},
39 |             job_id=job_id,
40 |             image_key="images",  # Geo3K uses "images" field
41 |         )
42 | 
--------------------------------------------------------------------------------
/opentinker/server/config/engine/fsdp.yaml:
--------------------------------------------------------------------------------
1 | # Target class for this configuration
2 | _target_: verl.workers.config.FSDPEngineConfig
3 | 
4 | # policy for wrapping the model
5 | wrap_policy:
6 | 
7 |   # Minimum number of parameters to trigger wrapping a layer with FSDP
8 |   min_num_params: 0
9 | 
10 | # Whether to offload model parameters to CPU (trades speed for memory)
11 | # Note that this differs from the offload_policy in FSDP
12 | param_offload: false
13 | 
14 | # Whether to offload optimizer state to CPU
15 | # Note that this differs from the offload_policy in FSDP
16 | optimizer_offload: false
17 | 
18 | # Only for FSDP2: offload param/grad/optimizer during train
19 | offload_policy: false
20 | 
21 | # Only for FSDP2: Reshard after forward pass to reduce memory footprint
22 | reshard_after_forward: true
23 | 
24 | # Number of GPUs in each FSDP shard group; -1 means auto
25 | fsdp_size: -1
26 | 
27 | # Only for FSDP1: FSDP1 configuration, prefetch the next forward-pass all-gather
28 | # before the current forward computation.
29 | forward_prefetch: False
30 | 
31 | # model dtype of fsdp
32 | model_dtype: fp32
33 | 
34 | # Whether to use original parameters in fsdp. Only available in fsdp1
35 | use_orig_params: false
36 | 
37 | # ulysses sequence parallel size
38 | ulysses_sequence_parallel_size: 1
39 | 
40 | # Whether to use entropy_from_logits_with_chunking in fsdp.
41 | entropy_from_logits_with_chunking: false
42 | 
43 | # Whether to use torch compile in fsdp.
44 | use_torch_compile: true
45 | 
46 | # Whether to use entropy checkpointing in fsdp.
47 | entropy_checkpointing: false
48 | 
49 | # Whether to use forward only in fsdp.
50 | forward_only: false
51 | 
52 | # fsdp or fsdp2
53 | strategy: fsdp
54 | 
--------------------------------------------------------------------------------
/opentinker/client/client_config/math_code_interpreter_inference_config.yaml:
--------------------------------------------------------------------------------
1 | # Math Code Interpreter Inference Configuration
2 | # Use with: python math_code_interpreter_inference.py
3 | # This config supports multi-turn code execution for math problem solving
4 | 
5 | # Scheduler settings
6 | scheduler_url: http://0.0.0.0:8780
7 | scheduler_api_key: null # Optional API key for authentication
8 | 
9 | # Model settings
10 | model_path: null # Path to trained checkpoint (HuggingFace format)
11 | tokenizer_path: null # Tokenizer path (defaults to model_path if null)
12 | 
13 | # GPU settings for vLLM server
14 | tensor_parallel_size: 1 # Number of GPUs for tensor parallelism
15 | num_gpus: null # Override number of GPUs (defaults to tensor_parallel_size)
16 | gpu_memory_utilization: 0.9
17 | max_model_len: null # Max model context length (optional)
18 | trust_remote_code: true
19 | 
20 | # Generation parameters (greedy by default for inference)
21 | temperature: 0.0 # 0.0 = greedy decoding
22 | top_p: 1.0
23 | max_new_tokens: 8192 # Total response budget for entire trajectory
24 | max_tokens_per_turn: 1024 # Per-turn response limit
25 | 
26 | # Data settings
27 | data_path: null # Input data file (parquet/jsonl)
28 | output_path: null # Output results file (jsonl)
29 | max_samples: null # Limit samples (null = all)
30 | 
31 | # Environment settings (code interpreter math server)
32 | env_endpoint: http://0.0.0.0:8088
33 | 
34 | # Multi-turn settings (allow code execution iterations)
35 | multi_turn:
36 |   max_user_turns: 5 # Max environment responses (code execution results)
37 |   max_assistant_turns: 5 # Max LLM responses for iterative solving
38 | 
--------------------------------------------------------------------------------
/opentinker/environment/geo3k/geo3k_tool_server.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """Geo3K Multi-Turn Environment Server.
3 | 
4 | This script starts a Geo3K geometry problem server using Geo3KToolGame
5 | for multi-turn verification-based interactions.
6 | 
7 | Usage:
8 |     python geo3k_tool_server.py
9 |     # Or with custom config:
10 |     python geo3k_tool_server.py --port 8088 --max_retries 3
11 | """
12 | 
13 | import argparse
14 | from opentinker.environment.base_game_server import run_game_server
15 | from opentinker.environment.geo3k.geo3k_tool_game import Geo3KToolGame
16 | 
17 | 
18 | def main():
19 |     parser = argparse.ArgumentParser(description="Geo3K Multi-Turn Server")
20 |     parser.add_argument("--host", default="0.0.0.0", help="Server host")
21 |     parser.add_argument("--port", type=int, default=8088, help="Server port")
22 |     parser.add_argument("--max_retries", type=int, default=3, help="Max verification attempts")
23 |     args = parser.parse_args()
24 | 
25 |     print(f"\nGeo3K Multi-Turn Server Configuration:")
26 |     print(f"  Max retries: {args.max_retries}")
27 |     print(f"\nFeedback format (verl-compatible):")
28 |     print(f"  'Current parsed answer={{answer}} reward={{0.0|1.0}}'")
29 |     print(f"\nReward structure:")
30 |     print(f"  Correct: +{Geo3KToolGame.REWARD_CORRECT}")
31 |     print(f"  Incorrect: {Geo3KToolGame.REWARD_INCORRECT}")
32 |     print(f"  No improvement penalty: {Geo3KToolGame.PENALTY_NO_IMPROVEMENT}")
33 | 
34 |     run_game_server(
35 |         game_class=Geo3KToolGame,
36 |         host=args.host,
37 |         port=args.port,
38 |         stats_class=None,  # Use BaseGameStats
39 |         max_retries=args.max_retries,
40 |     )
41 | 
42 | 
43 | if __name__ == "__main__":
44 |     main()
45 | 
--------------------------------------------------------------------------------
/opentinker/client/client_config/gomoku_inference_scheduler_config.yaml:
--------------------------------------------------------------------------------
1 | # Gomoku Inference with Scheduler Configuration
2 | # Use with: python gomoku_inference_with_scheduler.py
3 | 
4 | # Scheduler settings
5 | scheduler_url: http://0.0.0.0:8780
6 | scheduler_api_key: otk_98b8db24ccd64c92e1fdd9a232e209fa # Optional API key for authentication
7 | 
8 | # Model settings
9 | model_path: null # Path to trained checkpoint (HuggingFace format)
10 | tokenizer_path: null # Tokenizer path (defaults to model_path if null)
11 | 
12 | # GPU settings for vLLM server
13 | tensor_parallel_size: 1 # Number of GPUs for tensor parallelism
14 | num_gpus: null # Override number of GPUs (defaults to tensor_parallel_size)
15 | gpu_memory_utilization: 0.9
16 | max_model_len: null # Max model context length (optional)
17 | trust_remote_code: true
18 | 
19 | # Generation parameters
20 | temperature: 0.0 # 0.0 = greedy decoding
21 | top_p: 1.0
22 | max_new_tokens: 8192 # TOTAL response budget for entire multi-turn trajectory
23 | max_prompt_tokens: 4096
24 | max_context_length: 30000 # Max context before ending game
25 | 
26 | # Data settings (Gomoku uses dynamic generation, no data_path needed)
27 | data_path: null # Not needed for Gomoku (uses dynamic generation)
28 | output_path: null # Output results file (jsonl)
29 | max_samples: 10 # Number of games to play
30 | 
31 | # Environment settings
32 | env_endpoint: http://0.0.0.0:8091
33 | 
34 | # Multi-turn settings (Gomoku is multi-turn game)
35 | multi_turn:
36 |   max_user_turns: 39 # Max environment turns (moves)
37 |   max_assistant_turns: 39 # Max model response turns
38 |   max_tokens_per_turn: 256 # Per-turn response limit
39 | 
40 | # Game-specific settings
41 | board_size: 9 # Gomoku board size (9x9)
42 | 
--------------------------------------------------------------------------------
/opentinker/server/config/reward_model/dp_reward_model.yaml:
--------------------------------------------------------------------------------
1 | # Format checks enforced on CI:
2 | # 1. Comments must appear above each field.
3 | # 2. There must be a blank line between each field.
4 | # 3. Inline comments (after a field on the same line) are not allowed.
5 | # 4. Indentation level is respected for nested fields.
6 | 
7 | # defaults specify the default config from each component
8 | defaults:
9 | 
10 |   # dp actor config, inheriting from trainer/config/reward_model/reward_model.yaml
11 |   - reward_model
12 | 
13 |   # load the reference default config, then apply the fields in the current yaml
14 |   - _self_
15 | 
16 | strategy: fsdp
17 | 
18 | model:
19 | 
20 |   # Whether to use shared memory for loading the model
21 |   use_shm: False
22 | 
23 |   # Use remove padding optimization (saves compute)
24 |   use_remove_padding: False
25 | 
26 |   # Whether to use fused reward kernels for speedup
27 |   use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
28 | 
29 |   # FSDP-specific config
30 |   fsdp_config:
31 | 
32 |     # Target configuration dataclass
33 |     _target_: verl.workers.config.FSDPEngineConfig
34 | 
35 |     # Policy for wrapping layers with FSDP
36 |     wrap_policy:
37 | 
38 |       # Minimum number of parameters to trigger wrapping
39 |       min_num_params: 0
40 | 
41 |     # Whether to offload model parameters to CPU
42 |     param_offload: False
43 | 
44 |     # Only for FSDP2: Reshard after forward pass to reduce memory footprint
45 |     reshard_after_forward: True
46 | 
47 |     # Number of GPUs in each FSDP shard group; -1 means auto
48 |     fsdp_size: -1
49 | 
50 |     # Only for FSDP1: FSDP1 configuration, prefetch the next forward-pass all-gather
51 |     # before the current forward computation.
52 |     forward_prefetch: False
53 | 
54 | # Sequence parallelism size for Ulysses-style model parallelism
55 | ulysses_sequence_parallel_size: 1
--------------------------------------------------------------------------------
/opentinker/client/legacy/math_inference.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Math Environment Inference Script
4 | 
5 | Uses the shared InferencePipeline to run inference on math problems.
6 | 
7 | Usage:
8 |     1. Start the game server:
9 |        python opentinker/environment/math/math_server.py
10 | 
11 |     2. Run inference:
12 |        python math_inference.py \
13 |            model_path=/path/to/checkpoint \
14 |            data_path=/ \
15 |            output_path=/tmp/results.jsonl
16 | 
17 | 
18 | """
19 | 
20 | import hydra
21 | from opentinker.environment.inference_pipeline import run_inference
22 | from opentinker.environment.math import MathGame
23 | 
24 | 
25 | @hydra.main(config_path="client_config", config_name="math_inference_config.yaml", version_base=None)
26 | def main(args):
27 |     """Run inference on math problems."""
28 |     print("=" * 60)
29 |     print("Math Environment Inference")
30 |     print("=" * 60)
31 | 
32 |     if not args.model_path:
33 |         raise ValueError("model_path is required")
34 |     if not args.data_path:
35 |         raise ValueError("data_path is required")
36 | 
37 |     results = run_inference(
38 |         model_path=args.model_path,
39 |         data_path=args.data_path,
40 |         game_class=MathGame,
41 |         env_endpoint=args.env_endpoint,
42 |         output_path=args.get("output_path"),
43 |         temperature=args.temperature,
44 |         top_p=args.top_p,
45 |         max_tokens=args.max_new_tokens,
46 |         max_samples=args.get("max_samples"),
47 |         max_user_turns=args.multi_turn.max_user_turns,
48 |         max_assistant_turns=args.multi_turn.max_assistant_turns,
49 |         tensor_parallel_size=args.get("tensor_parallel_size", 1),
50 |     )
51 | 
52 |     if args.get("output_path"):
53 |         print(f"\nResults saved to: {args.output_path}")
54 | 
55 | 
56 | if __name__ == "__main__":
57 |     main()
--------------------------------------------------------------------------------
/opentinker/environment/geo3k/geo3k_server.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """Geo3K Environment Server - Simplified launcher.
3 | 
4 | This script starts a Geo3K geometry problem server using the generic base_game_server.
5 | 
6 | Usage:
7 |     python geo3k_server.py
8 |     # Or with custom config:
9 |     python geo3k_server.py --port 8082 --max_retries 0
10 | """
11 | 
12 | import argparse
13 | from opentinker.environment.base_game_server import run_game_server
14 | from opentinker.environment.geo3k.geo3k_game import Geo3KGame
15 | 
16 | # Geo3KGameStats is optional - falls back to BaseGameStats if not available
17 | try:
18 |     from opentinker.environment.geo3k.geo3k_stats import Geo3KGameStats
19 | except ImportError:
20 |     Geo3KGameStats = None
21 | 
22 | 
23 | def main():
24 |     parser = argparse.ArgumentParser(description="Geo3K Geometry Problem Server")
25 |     parser.add_argument("--host", default="0.0.0.0", help="Server host")
26 |     parser.add_argument("--port", type=int, default=8082, help="Server port")
27 |     parser.add_argument("--max_retries", type=int, default=0, help="Max retry attempts (0 = single turn)")
28 |     args = parser.parse_args()
29 | 
30 |     print(f"\nGeo3K Game Configuration:")
31 |     print(f"  Max retries: {args.max_retries}")
32 |     print(f"\nReward structure:")
33 |     print(f"  Correct: +{Geo3KGame.REWARD_CORRECT}")
34 |     print(f"  Incorrect: {Geo3KGame.REWARD_INCORRECT}")
35 | 
36 |     if Geo3KGameStats:
37 |         print(f"\nUsing Geo3KGameStats for tracking")
38 |     else:
39 |         print(f"\nUsing BaseGameStats (Geo3KGameStats not available)")
40 | 
41 |     run_game_server(
42 |         game_class=Geo3KGame,
43 |         host=args.host,
44 |         port=args.port,
45 |         stats_class=Geo3KGameStats,  # None falls back to BaseGameStats
46 |         max_retries=args.max_retries,
47 |     )
48 | 
49 | 
50 | if __name__ == "__main__":
51 |     main()
52 | 
--------------------------------------------------------------------------------
/opentinker/server/config/generation.yaml:
--------------------------------------------------------------------------------
1 | trainer:
2 |   nnodes: 1
3 |   n_gpus_per_node: 8
4 |   device: cuda
5 | 
6 | data:
7 |   path: ~/data/rlhf/math/test.parquet
8 |   prompt_key: prompt
9 |   n_samples: 5
10 |   output_path: /opt/tiger/math_Qwen2-7B-Instruct.parquet
11 |   batch_size: 128
12 | 
13 | model:
14 |   path: ~/models/Qwen2-7B-Instruct
15 |   external_lib: null
16 | rollout:
17 |   _target_: verl.workers.config.RolloutConfig
18 |   name: vllm
19 |   mode: sync # sync: LLM, async: AsyncLLM
20 |   temperature: 1.0
21 |   top_k: 50 # 0 for hf rollout, -1 for vllm rollout
22 |   top_p: 0.7
23 |   prompt_length: 1536
24 |   response_length: 512
25 |   # for vllm rollout
26 |   dtype: bfloat16 # should align with FSDP
27 |   gpu_memory_utilization: 0.5
28 |   ignore_eos: False
29 |   enforce_eager: True
30 |   free_cache_engine: True
31 |   load_format: auto
32 |   tensor_model_parallel_size: 1
33 |   data_parallel_size: 1
34 |   max_num_batched_tokens: 8192
35 |   max_model_len: null
36 |   max_num_seqs: 1024
37 |   log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
38 |   log_prob_micro_batch_size_per_gpu: 8
39 |   # for hf rollout
40 |   do_sample: True
41 |   disable_log_stats: True
42 |   enable_chunked_prefill: True
43 |   n: 1
44 |   # support logging rollout prob for debugging purpose
45 |   calculate_log_probs: False
46 | actor:
47 |   strategy: fsdp # This is for backward-compatibility
48 |   ulysses_sequence_parallel_size: 1 # sp size
49 |   entropy_from_logits_with_chunking: False # calculate entropy with chunking to reduce memory peak
50 |   entropy_checkpointing: False # recompute entropy
51 |   fsdp_config:
52 |     fsdp_size: -1
53 |     forward_prefetch: False # FSDP1 forward_prefetch configuration
54 | 
55 | ray_kwargs:
56 |   ray_init:
57 |     num_cpus: null # `None` means using all CPUs, which might cause a hang if CPUs are limited in systems like SLURM. Set it to an allowed number in that case.
58 |   timeline_json_file: null
59 | 
--------------------------------------------------------------------------------
/opentinker/docs/CORS_FIX.md:
--------------------------------------------------------------------------------
1 | # CORS Error Fix
2 | 
3 | ## Problem
4 | 
5 | The web dashboard shows this error:
6 | ```
7 | "OPTIONS /list_jobs HTTP/1.1" 405 Method Not Allowed
8 | ```
9 | 
10 | ## Cause
11 | 
12 | This is a **CORS (Cross-Origin Resource Sharing)** problem:
13 | 
14 | 1. The web dashboard runs in the browser (`http://localhost:8081`)
15 | 2. It tries to call the scheduler API (`http://localhost:8765`)
16 | 3. Because the ports differ, the browser treats this as a cross-origin request
17 | 4. When the request carries an `Authorization` header, the browser first sends an OPTIONS preflight request
18 | 5. The scheduler had no CORS configuration and rejected the OPTIONS request
19 | 
20 | ## Solution
21 | 
22 | CORS middleware support has been added to the scheduler!
23 | 
24 | ### Changes
25 | 
26 | File: `scheduler/job_scheduler.py`
27 | 
28 | 1. Import the CORS middleware
29 | 2. Allow cross-origin requests from all origins
30 | 3. Allow all HTTP methods (including OPTIONS)
31 | 4. Allow all request headers (including Authorization)
32 | 
33 | ### Applying the fix
34 | 
35 | **Restart the scheduler**:
36 | 
37 | ```bash
38 | # 1. Stop the currently running scheduler (Ctrl+C)
39 | 
40 | # 2. Restart the scheduler
41 | python scheduler/launch_scheduler.py \
42 |     available_gpus=[0,1,2,3] \
43 |     scheduler_port=8765
44 | ```
45 | 
46 | ### Verifying the fix
47 | 
48 | 1. **Start the web dashboard**:
49 |    ```bash
50 |    python scheduler/web_dashboard.py --port 8081
51 |    ```
52 | 
53 | 2. **Refresh the browser**: open `http://localhost:8081/web_dashboard.html`
54 | 
55 | 3. **Enter the API key** and save it
56 | 
57 | 4. **Check the results**:
58 |    - ✅ The job list should be visible
59 |    - ✅ No more 405 errors
60 |    - ✅ OPTIONS requests now return 200
61 | 
62 | ### Technical details
63 | 
64 | The added CORS configuration:
65 | ```python
66 | app.add_middleware(
67 |     CORSMiddleware,
68 |     allow_origins=["*"],  # allow all origins
69 |     allow_credentials=True,  # allow credentials
70 |     allow_methods=["*"],  # allow all methods
71 |     allow_headers=["*"],  # allow all request headers
72 | )
73 | ```
74 | 
75 | **Production note**:
76 | In production, `allow_origins` should be restricted to specific domains:
77 | ```python
78 | allow_origins=["https://your-dashboard-domain.com"]
79 | ```
80 | 
81 | ### Full flow
82 | 
83 | The complete web dashboard workflow is now:
84 | 
85 | 1. 🌐 **Browser**: open the dashboard → `http://localhost:8081/web_dashboard.html`
86 | 2. 🔑 **Enter the API key**: saved to localStorage
87 | 3. 📡 **OPTIONS request**: the browser sends the preflight request → the scheduler allows it
88 | 4. 📊 **GET request**: data is requested with the Authorization header → the scheduler returns the job list
89 | 5. ✅ **Display data**: the dashboard shows all jobs
90 | 
91 | The whole flow now works correctly!
92 | 
--------------------------------------------------------------------------------
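A quick way to check the preflight from Python instead of a browser. `requests.options` sends a bare OPTIONS request; the extra headers below imitate what a browser preflight includes (header names per the CORS spec; the scheduler setup is assumed from the fix above):

```python
import requests

resp = requests.options(
    "http://localhost:8765/list_jobs",
    headers={
        "Origin": "http://localhost:8081",
        "Access-Control-Request-Method": "GET",
        "Access-Control-Request-Headers": "authorization",
    },
)
print(resp.status_code)  # expect 200 after the fix, not 405
print(resp.headers.get("access-control-allow-origin"))
```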
/opentinker/scripts/launch_scheduler.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Convenience script to launch the job scheduler
3 | 
4 | # Default configuration
5 | AVAILABLE_GPUS="[4,5,6,7,8,9]"
6 | PORT_RANGE="null" # Set to null for auto-detection
7 | NUM_PORTS=200
8 | SCHEDULER_PORT=8780
9 | 
10 | # Parse command line arguments (optional)
11 | while [[ $# -gt 0 ]]; do
12 |     case $1 in
13 |         --gpus)
14 |             AVAILABLE_GPUS="$2"
15 |             shift 2
16 |             ;;
17 |         --ports)
18 |             PORT_RANGE="$2"
19 |             shift 2
20 |             ;;
21 |         --num-ports)
22 |             NUM_PORTS="$2"
23 |             shift 2
24 |             ;;
25 |         --scheduler-port)
26 |             SCHEDULER_PORT="$2"
27 |             shift 2
28 |             ;;
29 |         --auto-ports)
30 |             PORT_RANGE="null"
31 |             shift 1
32 |             ;;
33 |         *)
34 |             echo "Unknown option: $1"
35 |             echo "Usage: $0 [--gpus '[0,1,2,3]'] [--ports '[38564,38600]' | --auto-ports] [--num-ports 50] [--scheduler-port 8765]"
36 |             exit 1
37 |             ;;
38 |     esac
39 | done
40 | 
41 | echo "========================================"
42 | echo "Launching Job Scheduler"
43 | echo "========================================"
44 | echo "Available GPUs: $AVAILABLE_GPUS"
45 | if [ "$PORT_RANGE" = "null" ]; then
46 |     echo "Port mode: Auto-detect ($NUM_PORTS ports)"
47 | else
48 |     echo "Port range: $PORT_RANGE"
49 | fi
50 | echo "Scheduler port: $SCHEDULER_PORT"
51 | echo "========================================"
52 | echo ""
53 | 
54 | # Launch scheduler
55 | if [ "$PORT_RANGE" = "null" ]; then
56 |     python opentinker/scheduler/launch_scheduler_kill.py \
57 |         available_gpus=$AVAILABLE_GPUS \
58 |         port_range=null \
59 |         num_ports=$NUM_PORTS \
60 |         scheduler_port=$SCHEDULER_PORT
61 | else
62 |     python opentinker/scheduler/launch_scheduler_kill.py \
63 |         available_gpus=$AVAILABLE_GPUS \
64 |         port_range=$PORT_RANGE \
65 |         scheduler_port=$SCHEDULER_PORT
66 | fi
--------------------------------------------------------------------------------
/opentinker/server/config/model/hf_model.yaml:
--------------------------------------------------------------------------------
1 | # Format checks enforced on CI:
2 | # 1. Comments must appear above each field.
3 | # 2. There must be a blank line between each field.
4 | # 3. Inline comments (after a field on the same line) are not allowed.
5 | # 4. Indentation level is respected for nested fields.
6 | 
7 | _target_: verl.workers.config.HFModelConfig
8 | 
9 | # path to the huggingface model
10 | path: ~/models/deepseek-llm-7b-chat
11 | 
12 | # config to the huggingface config. In case it is not the same as path
13 | hf_config_path: null
14 | 
15 | # path to the huggingface tokenizer. In case it is not the same as path
16 | tokenizer_path: null
17 | 
18 | # whether to use shared memory for model loading
19 | use_shm: False
20 | 
21 | # whether to trust remote code.
22 | trust_remote_code: False
23 | 
24 | # custom chat template for the model
25 | custom_chat_template: null
26 | 
27 | # whether to use external libs for the model
28 | external_lib: null
29 | 
30 | # override hf config
31 | override_config: {}
32 | 
33 | # whether to enable gradient checkpointing. Only valid when we use hf model definition
34 | enable_gradient_checkpointing: True
35 | 
36 | # whether to enable activation offload. Only valid when we use hf model definition
37 | enable_activation_offload: False
38 | 
39 | # whether to use remove padding. Only valid when we use hf model definition
40 | use_remove_padding: False
41 | 
42 | # Set to positive value to enable LoRA (e.g., 32)
43 | lora_rank: 0
44 | 
45 | # LoRA scaling factor
46 | lora_alpha: 16
47 | 
48 | # Target modules for LoRA adaptation
49 | target_modules: all-linear
50 | 
51 | # Exclude modules from LoRA adaptation
52 | exclude_modules: null
53 | 
54 | # Path to pre-trained LoRA adapter to load for continued training
55 | lora_adapter_path: null
56 | 
57 | # whether to use liger. Only valid when we use hf model definition
58 | use_liger: False
59 | 
60 | # whether to use fused kernels.
61 | use_fused_kernels: False
62 | 
63 | # fused kernel options.
64 | fused_kernel_options:
65 | 
66 |   # the implementation backend for fused kernels.
67 |   impl_backend: torch
68 | 
--------------------------------------------------------------------------------
/opentinker/server/config/reward_model/megatron_reward_model.yaml:
--------------------------------------------------------------------------------
1 | # defaults specify the default config from each component
2 | defaults:
3 | 
4 |   # dp actor config, inheriting from trainer/config/reward_model/reward_model.yaml
5 |   - reward_model
6 | 
7 |   # load the reference default config, then apply the fields in the current yaml
8 |   - _self_
9 | 
10 | strategy: megatron
11 | 
12 | # seconds, default is 10 minutes for torch, you can set it to a larger value
13 | # if you have long-running operations like 32B or 72B model using megatron
14 | nccl_timeout: 600
15 | 
16 | # Megatron parallelism & checkpointing config
17 | megatron:
18 | 
19 |   # Target configuration dataclass
20 |   _target_: verl.workers.config.MegatronEngineConfig
21 | 
22 |   # Whether to offload model parameters to CPU
23 |   param_offload: False
24 | 
25 |   # Number of GPUs in tensor model parallel group
26 |   tensor_model_parallel_size: 1
27 | 
28 |   # Number of GPUs in expert model parallel group
29 |   expert_model_parallel_size: 1
30 | 
31 |   # Expert tensor parallel size
32 |   expert_tensor_parallel_size: 1
33 | 
34 |   # Number of pipeline model parallel stages
35 |   pipeline_model_parallel_size: 1
36 | 
37 |   # change VPP interface for parallelism tests
38 |   virtual_pipeline_model_parallel_size: null
39 | 
40 |   # Context parallel size
41 |   context_parallel_size: 1
42 | 
43 |   # Whether to use sequence parallelism
44 |   sequence_parallel: True
45 | 
46 |   # Whether to use distributed optimizer
47 |   use_distributed_optimizer: False
48 | 
49 |   # Whether to enable distributed checkpointing
50 |   use_dist_checkpointing: False
51 | 
52 |   # Path for distributed checkpoints
53 |   dist_checkpointing_path: null
54 | 
55 |   # RNG seed for megatron
56 |   seed: ${oc.select:actor_rollout_ref.actor.megatron.seed,42}
57 | 
58 |   # Any overrides to transformer config
59 |   override_transformer_config: ${oc.select:actor_rollout_ref.actor.megatron.override_transformer_config,{}}
60 | 
61 |   # Whether to use mbridge for faster comms
62 |   use_mbridge: ${oc.select:actor_rollout_ref.actor.megatron.use_mbridge,False}
63 | 
64 | # Whether to load weights (default True)
65 | load_weight: True
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | **/*.pt
3 | **/checkpoints
4 | **/wget-log
5 | **/_build/
6 | **/*.ckpt
7 | **/outputs
8 | **/*.tar.gz
9 | **/playground
10 | **/wandb
11 | **/tensorboard_log
12 | **/evaluation_results
13 | **/verl_debug
14 | **/tool
15 | .api_key_siqizhu4
16 | 
17 | # Byte-compiled / optimized / DLL files
18 | __pycache__/
19 | *.py[cod]
20 | *$py.class
21 | dataset/*
22 | tensorflow/my_graph/*
23 | .idea/
24 | # C extensions
25 | *.so
26 | 
27 | # Distribution / packaging
28 | .Python
29 | env/
30 | build/
31 | develop-eggs/
32 | dist/
33 | downloads/
34 | eggs/
35 | .eggs/
36 | lib/
37 | lib64/
38 | parts/
39 | sdist/
40 | var/
41 | tmp/
42 | *.egg-info/
43 | .installed.cfg
44 | *.egg
45 | 
46 | # PyInstaller
47 | # Usually these files are written by a python script from a template
48 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
49 | *.manifest
50 | *.spec
51 | 
52 | # Installer logs
53 | pip-log.txt
54 | pip-delete-this-directory.txt
55 | 
56 | # Unit test / coverage reports
57 | htmlcov/
58 | .tox/
59 | .coverage
60 | .coverage.*
61 | .cache
62 | nosetests.xml
63 | coverage.xml
64 | *,cover
65 | .hypothesis/
66 | pytest.ini
67 | output.txt
68 | 
69 | # Translations
70 | *.mo
71 | *.pot
72 | 
73 | # Django stuff:
74 | *.log
75 | local_settings.py
76 | 
77 | # Flask stuff:
78 | instance/
79 | .webassets-cache
80 | 
81 | # Scrapy stuff:
82 | .scrapy
83 | 
84 | # Sphinx documentation
85 | docs/_build/
86 | 
87 | # PyBuilder
88 | target/
89 | 
90 | # IPython Notebook
91 | .ipynb_checkpoints
92 | 
93 | # pyenv
94 | .python-version
95 | 
96 | # celery beat schedule file
97 | celerybeat-schedule
98 | 
99 | # dotenv
100 | .env
101 | 
102 | # virtualenv
103 | venv/
104 | .venv/
105 | ENV/
106 | 
107 | # Spyder project settings
108 | .spyderproject
109 | 
110 | # Rope project settings
111 | .ropeproject
112 | 
113 | # vscode
114 | .vscode
115 | 
116 | # Mac
117 | .DS_Store
118 | 
119 | # vim
120 | *.swp
121 | 
122 | # ckpt
123 | *.lock
124 | 
125 | # data
126 | *.parquet
127 | 
128 | 
129 | # local logs
130 | logs
131 | log
132 | outputs
133 | .history
--------------------------------------------------------------------------------
/opentinker/environment/__init__.py:
--------------------------------------------------------------------------------
1 | """OpenTinker Environment Module.
2 | 
3 | This module provides the environment framework for LLM training, including:
4 | - BaseEnvironment: Abstract base class for all environments
5 | - GameEnvironment: For multi-turn game environments (Gomoku, etc.)
6 | - StaticDataEnvironment: For single-turn static datasets (Math, etc.)
7 | - Data generators and utilities
8 | """
9 | 
10 | # Base classes
11 | from opentinker.environment.environment import BaseEnvironment, RewardFunctionSpec
12 | from opentinker.environment.base_game import AbstractGame, StepResult, GameDataGenerator
13 | from opentinker.environment.base_game_environment import GameEnvironment, InteractionSpec
14 | from opentinker.environment.base_data_generator import (
15 |     AbstractGameDataGenerator,
16 |     DynamicGameDataset,
17 |     collate_fn,
18 | )
19 | 
20 | # Static data support
21 | from opentinker.environment.static_data_generator import StaticDatasetGenerator
22 | # from opentinker.environment.static_data_environment import StaticDataEnvironment
23 | 
24 | # Server utilities
25 | from opentinker.environment.base_game_server import (
26 |     BaseGameStats,
27 |     GameStats,
28 |     create_game_server,
29 |     run_game_server,
30 | )
31 | 
32 | from opentinker.environment.inference_pipeline import (
33 |     InferencePipeline,
34 |     InferenceResult,
35 |     RemoteEnvironmentClient,
36 |     run_inference,
37 |     load_samples,
38 |     generate_samples,
39 | )
40 | 
41 | __all__ = [
42 |     # Base
43 |     "BaseEnvironment",
44 |     "RewardFunctionSpec",
45 |     # Game
46 |     "AbstractGame",
47 |     "StepResult",
48 |     "GameDataGenerator",
49 |     "GameEnvironment",
50 |     "InteractionSpec",
51 |     # Data
52 |     "AbstractGameDataGenerator",
53 |     "DynamicGameDataset",
54 |     "collate_fn",
55 |     # Static
56 |     "StaticDatasetGenerator",
57 |     # Inference
58 |     "InferencePipeline",
59 |     "InferenceResult",
60 |     "RemoteEnvironmentClient",
61 |     "run_inference",
62 |     "load_samples",
63 |     "generate_samples",
64 |     # "StaticDataEnvironment",
65 |     # Server
66 |     "BaseGameStats",
67 |     "GameStats",
68 |     "create_game_server",
69 |     "run_game_server",
70 | ]
--------------------------------------------------------------------------------
/opentinker/scheduler/register_user_example.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Example: User Registration and Authentication
4 | 
5 | This script demonstrates how to register a user and use the API key
6 | for authenticated scheduler operations.
7 | """ 8 | 9 | import requests 10 | 11 | SCHEDULER_URL = "http://localhost:8765" 12 | 13 | def main(): 14 | print("=" * 60) 15 | print("User Registration Example") 16 | print("=" * 60) 17 | 18 | # Step 1: Register a new user 19 | username = input("Enter username to register: ") 20 | 21 | print(f"\n📝 Registering user '{username}'...") 22 | 23 | response = requests.post( 24 | f"{SCHEDULER_URL}/register", 25 | params={"username": username} 26 | ) 27 | 28 | if response.status_code == 200: 29 | result = response.json() 30 | print("\n✅ Registration successful!") 31 | print("=" * 60) 32 | print("🔑 YOUR API KEY (save this - cannot be retrieved later!):") 33 | print("") 34 | print(f" {result['api_key']}") 35 | print("") 36 | print("=" * 60) 37 | print(f"User ID: {result['user_id']}") 38 | print(f"Username: {result['username']}") 39 | 40 | # Step 2: Test authentication with the API key 41 | api_key = result['api_key'] 42 | print(f"\n✅ Testing authentication...") 43 | 44 | # Try to list jobs with the API key 45 | headers = {"Authorization": f"Bearer {api_key}"} 46 | jobs_response = requests.get(f"{SCHEDULER_URL}/list_jobs", headers=headers) 47 | 48 | if jobs_response.status_code == 200: 49 | print("✅ Authentication successful!") 50 | jobs = jobs_response.json() 51 | print(f"Current jobs: {len(jobs['jobs'])}") 52 | else: 53 | print(f"❌ Failed to list jobs: {jobs_response.text}") 54 | 55 | # Save to file for easy reference 56 | with open(f".api_key_{username}", "w") as f: 57 | f.write(api_key) 58 | print(f"\n💾 API key saved to .api_key_{username}") 59 | 60 | else: 61 | print(f"❌ Registration failed: {response.text}") 62 | 63 | 64 | if __name__ == "__main__": 65 | main() 66 | -------------------------------------------------------------------------------- /opentinker/server/config/critic/dp_critic.yaml: -------------------------------------------------------------------------------- 1 | # Format checks enforced on CI: 2 | # 1. Comments must appear above each field. 3 | # 2. There must be a blank line between each field. 4 | # 3. Inline comments (after a field on the same line) are not allowed. 5 | # 4. Indentation level is respected for nested fields. 6 | 7 | # defaults specify the default config from each component 8 | defaults: 9 | 10 | # fsdp optimizer config 11 | - ../optim@optim: fsdp 12 | 13 | # fsdp engine config 14 | - ../engine@model.fsdp_config: fsdp 15 | 16 | # dp actor config, inheriting from trainer/config/critic/critic.yaml 17 | - critic 18 | 19 | # load the reference default config, then apply the fields in the current yaml 20 | - _self_ 21 | 22 | # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs 23 | _target_: verl.workers.config.FSDPCriticConfig 24 | 25 | # distribution strategy. 
Options: fsdp (being deprecated), fsdp2 26 | strategy: fsdp 27 | 28 | # model config for the critic 29 | model: 30 | 31 | # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs 32 | _target_: verl.workers.config.FSDPCriticModelCfg 33 | 34 | # Whether to use shared memory for loading the model 35 | use_shm: False 36 | 37 | # Enable gradient checkpointing to save memory 38 | enable_gradient_checkpointing: True 39 | 40 | # Offload activations to CPU to reduce GPU memory usage 41 | enable_activation_offload: False 42 | 43 | # Use remove padding optimization (saves compute) 44 | use_remove_padding: False 45 | 46 | # Set to positive value to enable LoRA (e.g., 32) 47 | lora_rank: 0 48 | 49 | # LoRA scaling factor 50 | lora_alpha: 16 51 | 52 | # LoRA target modules: "all-linear" or list of linear projection layers 53 | target_modules: all-linear 54 | 55 | # Forward-only batch size during inference (global) 56 | forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} 57 | 58 | # Forward-only batch size during inference (per GPU) 59 | forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} 60 | 61 | # Sequence parallelism size for Ulysses-style model parallelism 62 | ulysses_sequence_parallel_size: 1 63 | 64 | # Gradient clipping for critic updates 65 | grad_clip: 1.0 66 | -------------------------------------------------------------------------------- /opentinker/environment/math/math_server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Math Environment Server - HTTP server for math problem solving. 3 | 4 | This script starts a math game server using the generic base_game_server. 5 | For single-turn math problems, the server: 6 | - Receives reset() with ground_truth 7 | - Receives step() with model's answer 8 | - Returns reward computed by MathGame 9 | 10 | Usage: 11 | python math_server.py 12 | # Or with custom port: 13 | python math_server.py --port 8082 14 | 15 | # For multi-worker mode (faster handling of concurrent requests): 16 | uvicorn opentinker.environment.math.math_server:app --host 0.0.0.0 --port 8082 --workers 4 17 | """ 18 | 19 | import argparse 20 | from opentinker.environment.base_game_server import run_game_server, create_game_app 21 | from opentinker.environment.math.math_game import MathGame 22 | 23 | # Pre-import reward function to avoid first-request latency 24 | # (The first import of verl.utils.reward_score can be slow) 25 | try: 26 | from verl.utils.reward_score import default_compute_score 27 | except ImportError: 28 | pass 29 | 30 | # # Module-level app for uvicorn multi-worker mode 31 | # # Usage: uvicorn opentinker.environment.math.math_server:app --host 0.0.0.0 --port 8082 --workers 4 32 | # app = create_game_app(game_class=MathGame) 33 | 34 | 35 | def main(): 36 | parser = argparse.ArgumentParser(description="Math Game Server") 37 | parser.add_argument("--host", default="0.0.0.0", help="Server host") 38 | parser.add_argument("--port", type=int, default=8082, help="Server port") 39 | parser.add_argument("--max_retries", type=int, default=0, help="Max retry attempts (0=single turn)") 40 | args = parser.parse_args() 41 | 42 | print(f"\nMath Game Server Configuration:") 43 | print(f" Single-turn mode: {'Yes' if args.max_retries == 0 else 'No'}") 44 | print(f" Max retries: {args.max_retries}") 45 | print(f"\nReward structure:") 46 | print(f" Correct answer: +{MathGame.REWARD_CORRECT}") 47 | print(f" Incorrect answer:
{MathGame.REWARD_INCORRECT}") 48 | 49 | run_game_server( 50 | game_class=MathGame, 51 | host=args.host, 52 | port=args.port, 53 | max_retries=args.max_retries, 54 | ) 55 | 56 | 57 | if __name__ == "__main__": 58 | main() 59 | -------------------------------------------------------------------------------- /opentinker/scheduler/web_dashboard.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Web Dashboard Server for OpenTinker 4 | 5 | Serves the HTML dashboard and provides a simple HTTP server. 6 | """ 7 | 8 | import argparse 9 | import http.server 10 | import socketserver 11 | import os 12 | from pathlib import Path 13 | 14 | 15 | class CORSRequestHandler(http.server.SimpleHTTPRequestHandler): 16 | """HTTP request handler with CORS enabled""" 17 | 18 | def end_headers(self): 19 | """Add CORS headers""" 20 | self.send_header('Access-Control-Allow-Origin', '*') 21 | self.send_header('Access-Control-Allow-Methods', 'GET, POST, OPTIONS') 22 | self.send_header('Access-Control-Allow-Headers', 'Content-Type, Authorization') 23 | super().end_headers() 24 | 25 | def do_OPTIONS(self): 26 | """Handle OPTIONS request for CORS preflight""" 27 | self.send_response(200) 28 | self.end_headers() 29 | 30 | 31 | def main(): 32 | parser = argparse.ArgumentParser(description='OpenTinker Web Dashboard Server') 33 | parser.add_argument('--port', type=int, default=8081, help='Port to run the server on (default: 8080)') 34 | parser.add_argument('--scheduler-url', default='http://localhost:8767', 35 | help='Scheduler API URL (default: http://localhost:8767)') 36 | args = parser.parse_args() 37 | 38 | # Change to the directory containing the HTML file 39 | dashboard_dir = Path(__file__).parent 40 | os.chdir(dashboard_dir) 41 | 42 | print("="*70) 43 | print("🌐 OpenTinker Web Dashboard") 44 | print("="*70) 45 | print(f"\n📍 Dashboard URL: http://localhost:{args.port}") 46 | print(f"🔗 Scheduler URL: {args.scheduler_url}") 47 | print(f"📁 Serving from: {dashboard_dir}") 48 | print("\n💡 Press Ctrl+C to stop the server\n") 49 | print("="*70 + "\n") 50 | 51 | # Start server 52 | with socketserver.TCPServer(("", args.port), CORSRequestHandler) as httpd: 53 | try: 54 | print(f"✅ Server running on port {args.port}") 55 | print(f"\n🚀 Open http://localhost:{args.port}/web_dashboard.html in your browser\n") 56 | httpd.serve_forever() 57 | except KeyboardInterrupt: 58 | print("\n\n👋 Shutting down server...") 59 | 60 | 61 | if __name__ == '__main__': 62 | main() 63 | -------------------------------------------------------------------------------- /opentinker/client/client_config/geo3k_param.yaml: -------------------------------------------------------------------------------- 1 | # Geo3K Vision-Language Training Configuration 2 | # Use with: python opentinker/client/geo3k_rl.py 3 | 4 | # Project settings 5 | project_name: opentinker 6 | experiment_name: geo3k_vl_training 7 | 8 | # Logging 9 | logger_backends: ["console", "wandb"] 10 | 11 | # Tracing (optional) 12 | enable_tracing: true 13 | weave_project: null 14 | 15 | # WandB (optional) 16 | wandb_key: 2ed6f8544ac3e30d5c08879166cc10d9c6232448 17 | 18 | # Model and processor paths 19 | # For VL models, both tokenizer_path and processor_path should point to the same model 20 | tokenizer_path: Qwen/Qwen2.5-VL-7B-Instruct 21 | processor_path: Qwen/Qwen2.5-VL-7B-Instruct # AutoProcessor for VL models 22 | 23 | # Data paths - use Geo3K parquet files 24 | data_path: ./data/geo3k/train.parquet 25 | val_data_path: 
./data/geo3k/test.parquet 26 | 27 | # Training parameters 28 | batch_size: 16 29 | num_workers: 0 30 | num_epochs: 5 # Total epochs 31 | num_steps: null # Or set num_steps for step-based training 32 | save_freq: -1 # Save checkpoint every N steps 33 | test_freq: 5 # Validate every N steps 34 | 35 | # Validation parameters 36 | val_batch_size: 32 # Total validation samples 37 | 38 | # Generation parameters 39 | temperature: 1.0 40 | top_p: 1.0 41 | max_new_tokens: 2048 # Max tokens per response 42 | max_prompt_tokens: 1024 # Max prompt length (shorter for VL due to image tokens) 43 | 44 | # Algorithm 45 | algorithm: "agent_loop" 46 | 47 | # RL Algorithm settings 48 | # GRPO is recommended for VL tasks 49 | adv_estimator: "grpo" 50 | rollout_n: 5 # Number of samples per prompt for GRPO 51 | 52 | # Interaction configuration 53 | interaction: 54 | name: geo3k 55 | class_path: opentinker.environment.gym_environment_interaction.GymEnvironmentInteraction 56 | config: 57 | env_host: 0.0.0.0 58 | env_port: 8088 59 | env_endpoint: http://${interaction.config.env_host}:${interaction.config.env_port} 60 | max_steps: 1 # Single-turn geometry problem solving 61 | 62 | # Multi-turn settings (single-turn for Geo3K) 63 | multi_turn: 64 | max_user_turns: 0 65 | max_assistant_turns: 1 66 | max_tokens_per_turn: 2048 67 | weave_project: null 68 | experiment_name: "geo3k_vl_interaction" 69 | 70 | # Scheduler settings 71 | scheduler_url: "http://0.0.0.0:8780" 72 | scheduler_api_key: null 73 | 74 | # GPU settings 75 | num_gpus: 4 # Adjust based on your setup 76 | -------------------------------------------------------------------------------- /opentinker/client/legacy/gomoku_inference.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Gomoku Environment Inference Script 4 | 5 | Uses the shared InferencePipeline to run inference on Gomoku games. 6 | 7 | Usage: 8 | 1. Start the game server: 9 | python opentinker/environment/gomoku/gomoku_server.py 10 | 11 | 2. 
Run inference: 12 | python gomoku_inference.py \ 13 | model_path=/path/to/checkpoint \ 14 | env_endpoint=http://localhost:8089 15 | """ 16 | 17 | import hydra 18 | from opentinker.environment.inference_pipeline import ( 19 | InferencePipeline, load_samples, run_inference 20 | ) 21 | from opentinker.environment.gomoku import GomokuGame 22 | 23 | 24 | @hydra.main(config_path="client_config", config_name="gomoku_inference_config.yaml", version_base=None) 25 | def main(args): 26 | """Run inference on Gomoku games.""" 27 | print("=" * 60) 28 | print("Gomoku Environment Inference") 29 | print("=" * 60) 30 | 31 | if not args.model_path and not args.get("vllm_server_url"): 32 | raise ValueError("model_path or vllm_server_url is required") 33 | 34 | # Gomoku is multi-turn: max_user_turns should be > 0 35 | max_user_turns = args.multi_turn.get("max_user_turns", 50) 36 | max_assistant_turns = args.multi_turn.get("max_assistant_turns", 50) 37 | 38 | results = run_inference( 39 | model_path=args.get("model_path"), 40 | vllm_server_url=args.get("vllm_server_url"), 41 | tokenizer_path=args.get("tokenizer_path"), 42 | data_path=args.get("data_path"), # None for dynamic generation 43 | game_class=GomokuGame, 44 | env_endpoint=args.env_endpoint, 45 | output_path=args.get("output_path"), 46 | temperature=args.temperature, 47 | top_p=args.top_p, 48 | max_tokens=args.max_new_tokens, 49 | max_tokens_per_turn=args.multi_turn.get("max_tokens_per_turn"), 50 | max_samples=args.get("max_samples", 10), 51 | max_user_turns=max_user_turns, 52 | max_assistant_turns=max_assistant_turns, 53 | max_context_length=args.get("max_context_length", 30000), 54 | tensor_parallel_size=args.get("tensor_parallel_size", 1), 55 | # GomokuGame kwargs 56 | board_size=args.get("board_size", 9), 57 | ) 58 | 59 | if args.get("output_path"): 60 | print(f"\nResults saved to: {args.output_path}") 61 | 62 | 63 | if __name__ == "__main__": 64 | main() -------------------------------------------------------------------------------- /opentinker/client/client_config/math_code_interpreter_param.yaml: -------------------------------------------------------------------------------- 1 | # Math Code Interpreter Training Configuration 2 | # Multi-turn agent with code interpreter using agent_loop algorithm 3 | # Use with: python math_code_interpreter_client.py 4 | 5 | # Project settings 6 | project_name: opentinker 7 | experiment_name: math_code_interpreter 8 | 9 | # Logging 10 | logger_backends: ["console", "wandb"] 11 | 12 | # Tracing (optional) 13 | enable_tracing: true 14 | weave_project: null 15 | 16 | # WandB (optional) 17 | wandb_key: null 18 | 19 | # Model and tokenizer 20 | tokenizer_path: null 21 | 22 | # Data paths 23 | data_path: null # Path to training data (parquet/JSON/JSONL) 24 | val_data_path: null # Path to validation data (parquet/JSON/JSONL) 25 | 26 | # Training parameters 27 | batch_size: 32 28 | num_workers: 0 29 | # Training duration - set ONE of these (num_steps takes precedence if both set) 30 | num_epochs: 10 # Number of epochs (null = use num_steps) 31 | num_steps: null # Total training steps (null = use num_epochs) 32 | save_freq: 100 33 | test_freq: 50 # Validation frequency (every N steps) 34 | 35 | # Validation parameters 36 | val_batch_size: 64 # Total validation samples 37 | 38 | # Generation parameters 39 | temperature: 1 40 | top_p: 1 41 | max_new_tokens: 8192 # TOTAL response budget for entire trajectory 42 | max_prompt_tokens: 2048 43 | 44 | # Algorithm - agent_loop for multi-turn with GymEnvironmentInteraction 45 | 
algorithm: "agent_loop" 46 | 47 | # RL Algorithm settings (passed to server via scheduler) 48 | # adv_estimator: "grpo" or "gae" (for PPO) 49 | adv_estimator: "grpo" 50 | # rollout_n: number of samples per prompt for GRPO (only used when adv_estimator=grpo) 51 | rollout_n: 8 52 | 53 | # Multi-turn configuration 54 | multi_turn: 55 | max_user_turns: 5 # Max environment responses (code execution results) 56 | max_assistant_turns: 5 # Max LLM responses 57 | max_tokens_per_turn: 1024 # Per-turn response limit 58 | weave_project: null 59 | experiment_name: "math_code_interpreter" 60 | 61 | interaction: 62 | name: math_code_interpreter 63 | class_path: opentinker.environment.gym_environment_interaction.GymEnvironmentInteraction 64 | config: 65 | env_host: 0.0.0.0 66 | env_port: 8088 67 | env_endpoint: http://${interaction.config.env_host}:${interaction.config.env_port} 68 | max_steps: 5 # Max interaction steps (code executions) 69 | 70 | # Scheduler settings 71 | scheduler_url: "http://0.0.0.0:8780" 72 | scheduler_api_key: null 73 | 74 | # GPU settings 75 | num_gpus: 4 76 | 77 | -------------------------------------------------------------------------------- /opentinker/client/client_config/geo3k_tool_param.yaml: -------------------------------------------------------------------------------- 1 | # Geo3K Multi-Turn Training Configuration 2 | # Multi-turn agent with answer verification using agent_loop algorithm 3 | # Use with: python opentinker/client/geo3k_tool_rl.py 4 | 5 | # Project settings 6 | project_name: opentinker 7 | experiment_name: geo3k_multiturn 8 | 9 | # Logging 10 | logger_backends: ["console", "wandb"] 11 | 12 | # Tracing (optional) 13 | enable_tracing: true 14 | weave_project: null 15 | 16 | # WandB (optional) 17 | wandb_key: 2ed6f8544ac3e30d5c08879166cc10d9c6232448 18 | 19 | # Model and processor paths (VL model) 20 | tokenizer_path: Qwen/Qwen2.5-VL-7B-Instruct 21 | processor_path: Qwen/Qwen2.5-VL-7B-Instruct 22 | 23 | # Data paths - use preprocessed Geo3K multi-turn data 24 | data_path: ./data/geo3k_multiturn/train.parquet 25 | val_data_path: ./data/geo3k_multiturn/test.parquet 26 | 27 | # Training parameters 28 | batch_size: 4 # Reduced from 16 for VL model 29 | num_workers: 0 30 | num_epochs: 5 # Total epochs 31 | num_steps: null # Or set num_steps for step-based training 32 | save_freq: -1 # Save checkpoint every N steps 33 | test_freq: 5 # Validate every N steps 34 | 35 | # Validation parameters 36 | val_batch_size: 8 # Reduced from 32 for VL model 37 | 38 | # Generation parameters 39 | temperature: 1.0 40 | top_p: 1.0 41 | max_new_tokens: 2048 # Reduced from 4096 42 | max_prompt_tokens: 1024 # Max prompt length 43 | 44 | # Algorithm - agent_loop for multi-turn with GymEnvironmentInteraction 45 | algorithm: "agent_loop" 46 | 47 | # RL Algorithm settings (passed to server via scheduler) 48 | adv_estimator: "grpo" 49 | rollout_n: 4 # Reduced from 5 for VL model 50 | 51 | # Multi-turn configuration 52 | multi_turn: 53 | max_user_turns: 3 # Max environment responses (verification feedback) 54 | max_assistant_turns: 4 # Max LLM responses (initial + refinements) 55 | max_tokens_per_turn: 1024 # Per-turn response limit 56 | weave_project: null 57 | experiment_name: "geo3k_multiturn" 58 | 59 | # Interaction configuration - uses GymEnvironmentInteraction 60 | interaction: 61 | name: geo3k_tool 62 | class_path: opentinker.environment.gym_environment_interaction.GymEnvironmentInteraction 63 | config: 64 | env_host: 0.0.0.0 65 | env_port: 8088 66 | env_endpoint: 
http://${interaction.config.env_host}:${interaction.config.env_port} 67 | max_steps: 3 # Max verification steps 68 | 69 | # Scheduler settings 70 | scheduler_url: "http://0.0.0.0:8780" 71 | scheduler_api_key: null 72 | 73 | # GPU settings 74 | num_gpus: 4 # Adjust based on your setup 75 | -------------------------------------------------------------------------------- /opentinker/client/client_config/math_param.yaml: -------------------------------------------------------------------------------- 1 | # Math Training Configuration (GameEnvironment Pattern) 2 | # Use with: python math_client_unified.py 3 | 4 | # Project settings 5 | project_name: opentinker 6 | experiment_name: math_training 7 | 8 | # Logging 9 | logger_backends: ["console", "wandb"] 10 | 11 | # Tracing (optional) 12 | enable_tracing: true 13 | weave_project: null 14 | 15 | # WandB (optional) 16 | wandb_key: null 17 | 18 | # Model and tokenizer 19 | tokenizer_path: null 20 | 21 | # Data paths 22 | data_path: null # Path to training data (JSON/JSONL) 23 | val_data_path: null # Path to validation data (JSON/JSONL) 24 | 25 | # Training parameters 26 | batch_size: 64 27 | num_workers: 0 28 | # Training duration - set ONE of these (num_steps takes precedence if both set) 29 | num_epochs: 10 # Number of epochs (null = use num_steps) 30 | num_steps: null # Total training steps (null = use num_epochs) 31 | save_freq: 100 32 | test_freq: 50 # Validation frequency (every N steps) 33 | 34 | # Validation parameters 35 | val_batch_size: 100 # Total validation samples 36 | 37 | # Model parameters 38 | # Generation parameters 39 | temperature: 1 40 | top_p: 1 41 | max_new_tokens: 4098 # TOTAL response budget for entire trajectory 42 | max_prompt_tokens: 4096 43 | 44 | # Algorithm - toolcall for math with tool use 45 | algorithm: "agent_loop" 46 | 47 | 48 | # RL Algorithm settings (passed to server via scheduler) 49 | # adv_estimator options: 50 | # - "grpo" : Standard GRPO (outcome-only advantage) 51 | # - "grpo_per_step" : Per-step GRPO with return-based advantages (for multi-turn tasks) 52 | # - "gae" : Generalized Advantage Estimation (for PPO, requires critic) 53 | adv_estimator: "grpo" 54 | # rollout_n: number of samples per prompt for GRPO/grpo_per_step (only used when adv_estimator=grpo or grpo_per_step) 55 | rollout_n: 8 56 | 57 | 58 | # Interaction configuration 59 | interaction: 60 | name: math 61 | class_path: opentinker.environment.gym_environment_interaction.GymEnvironmentInteraction 62 | config: 63 | env_host: 0.0.0.0 64 | env_port: 8088 65 | env_endpoint: http://${interaction.config.env_host}:${interaction.config.env_port} 66 | max_steps: 1 # Max interaction steps 67 | 68 | multi_turn: 69 | max_user_turns: 0 70 | max_assistant_turns: 1 71 | max_tokens_per_turn: 4096 # Per-turn response limit 72 | weave_project: null 73 | experiment_name: "math_interaction" 74 | 75 | # Scheduler settings 76 | scheduler_url: "http://0.0.0.0:8780" 77 | scheduler_api_key: otk_98b8db24ccd64c92e1fdd9a232e209fa 78 | 79 | # GPU settings 80 | num_gpus: 4 81 | -------------------------------------------------------------------------------- /opentinker/environment/gomoku/gomoku_server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Gomoku Environment Server - Simplified launcher. 3 | 4 | This script starts a Gomoku game server using the generic base_game_server. 
5 | 6 | Usage: 7 | python gomoku_server.py 8 | # Or with custom config: 9 | python gomoku_server.py --port 8081 --board_size 9 10 | """ 11 | 12 | import argparse 13 | from opentinker.environment.base_game_server import run_game_server 14 | from opentinker.environment.gomoku.gomoku_game import GomokuGame 15 | 16 | # GomokuGameStats is optional - falls back to BaseGameStats if not available 17 | try: 18 | from opentinker.environment.gomoku.gomoku_stats import GomokuGameStats 19 | except ImportError: 20 | GomokuGameStats = None 21 | 22 | 23 | def main(): 24 | parser = argparse.ArgumentParser(description="Gomoku Game Server") 25 | parser.add_argument("--host", default="0.0.0.0", help="Server host") 26 | parser.add_argument("--port", type=int, default=8081, help="Server port") 27 | parser.add_argument("--board_size", type=int, default=9, help="Board size") 28 | parser.add_argument("--max_total_steps", type=int, default=40, help="Max steps") 29 | parser.add_argument("--max_initial_moves", type=int, default=6, help="Max initial moves (0-6)") 30 | parser.add_argument("--empty_board_prob", type=float, default=0.2, help="Probability of empty board (0.0-1.0)") 31 | args = parser.parse_args() 32 | 33 | print(f"\nGomoku Game Configuration:") 34 | print(f" Board size: {args.board_size}x{args.board_size}") 35 | print(f" Max steps: {args.max_total_steps}") 36 | print(f" Max initial moves: {args.max_initial_moves}") 37 | print(f" Empty board prob: {args.empty_board_prob}") 38 | print(f"\nReward structure:") 39 | print(f" Win: +{GomokuGame.REWARD_WIN}") 40 | print(f" Loss: {GomokuGame.REWARD_LOSS}") 41 | print(f" Invalid format: {GomokuGame.REWARD_INVALID_FORMAT}") 42 | print(f" Timeout: {GomokuGame.REWARD_TIMEOUT}") 43 | 44 | if GomokuGameStats: 45 | print(f"\nUsing GomokuGameStats for win/loss/draw tracking") 46 | else: 47 | print(f"\nUsing BaseGameStats (GomokuGameStats not available)") 48 | 49 | run_game_server( 50 | game_class=GomokuGame, 51 | host=args.host, 52 | port=args.port, 53 | stats_class=GomokuGameStats, # None falls back to BaseGameStats 54 | board_size=args.board_size, 55 | max_total_steps=args.max_total_steps, 56 | max_initial_moves=args.max_initial_moves, 57 | empty_board_prob=args.empty_board_prob, 58 | ) 59 | 60 | 61 | if __name__ == "__main__": 62 | main() 63 | -------------------------------------------------------------------------------- /opentinker/server/config/sft_trainer_engine.yaml: -------------------------------------------------------------------------------- 1 | # Format checks enforced on CI: 2 | # 1. Comments must appear above each field. 3 | # 2. There must be a blank line between each field. 4 | # 3. Inline comments (after a field on the same line) are not allowed. 5 | # 4. Indentation level is respected for nested fields. 
6 | 7 | 8 | 9 | defaults: 10 | - model@model: hf_model 11 | - engine@engine: fsdp 12 | - optim@optim: fsdp 13 | - _self_ 14 | 15 | data: 16 | train_batch_size: 256 # global batch size 17 | micro_batch_size_per_gpu: 4 # this is also val batch size 18 | max_token_len_per_gpu: 8192 19 | use_dynamic_bsz: True 20 | train_files: ~/data/gsm8k/train.parquet 21 | val_files: null 22 | train_max_samples: -1 # set to -1 to use full dataset 23 | val_max_samples: -1 # set to -1 to use full dataset 24 | # Multi-turn settings 25 | messages_key: messages # Key for messages list in multi-turn mode 26 | tools_key: tools # Key for tools list in multi-turn mode 27 | enable_thinking_key: enable_thinking # Whether to enable thinking in multi-turn mode 28 | pad_mode: no_padding 29 | # for right padding 30 | max_length: 1024 31 | truncation: error 32 | balance_dp_token: False # to be implemented 33 | custom_cls: 34 | path: null 35 | name: null 36 | use_shm: False 37 | apply_chat_template_kwargs: {} 38 | 39 | # Checkpoint configuration 40 | checkpoint: 41 | _target_: verl.trainer.config.CheckpointConfig 42 | # What to include in saved checkpoints 43 | # with 'hf_model' you can save whole model as hf format, now only use sharded model checkpoint to save space 44 | save_contents: ["model", "optimizer", "extra"] 45 | 46 | # For more flexibility, you can specify the contents to load from the checkpoint. 47 | load_contents: ${checkpoint.save_contents} 48 | 49 | trainer: 50 | default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} 51 | default_hdfs_dir: null 52 | project_name: gsm8k-sft 53 | experiment_name: test 54 | total_epochs: 4 55 | total_training_steps: null 56 | logger: [ 'console', 'wandb' ] 57 | seed: 1 58 | save_freq: -1 59 | test_freq: -1 60 | max_ckpt_to_keep: null # Maximum number of checkpoints to keep, set to null to keep all 61 | 62 | # Resume mode: "auto", "disable", or "resume_path" 63 | # "auto": resume from last checkpoint if available 64 | # "disable": start from scratch 65 | # "resume_path": resume from a user-defined path 66 | resume_mode: auto 67 | 68 | # Path to resume training from (used when resume_mode is "resume_path" or "auto") 69 | resume_from_path: null 70 | device: cuda 71 | -------------------------------------------------------------------------------- /opentinker/server/config/engine/megatron.yaml: -------------------------------------------------------------------------------- 1 | # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs 2 | _target_: verl.workers.config.McoreEngineConfig 3 | 4 | # Whether to offload model parameters to CPU 5 | param_offload: False 6 | 7 | # Whether to offload gradients to CPU 8 | grad_offload: False 9 | 10 | # Whether to offload optimizer state to CPU 11 | optimizer_offload: False 12 | 13 | # tensor model parallel size 14 | tensor_model_parallel_size: 1 15 | 16 | # expert model parallel size 17 | expert_model_parallel_size: 1 18 | 19 | # expert tensor parallel size 20 | expert_tensor_parallel_size: 1 21 | 22 | # pipeline model parallel size 23 | pipeline_model_parallel_size: 1 24 | 25 | # virtual pipeline model parallel size 26 | virtual_pipeline_model_parallel_size: null 27 | 28 | # context parallel size 29 | context_parallel_size: 1 30 | 31 | # sequence parallel 32 | sequence_parallel: True 33 | 34 | # Whether to use distributed optimizer 35 | use_distributed_optimizer: True 36 | 37 | # Whether to use distributed checkpointing 38 | use_dist_checkpointing: False 39 | 40 | # distributed checkpointing path 41 | dist_checkpointing_path: null 42 | 43 | # oc.select: default val for ref.megatron.seed 44 | seed: 42 45 | 46 | # Allow to override Distributed Data Parallel (DDP) config 47 | override_ddp_config: {} 48 | 49 | # additional transformer config like: num_layers_in_first(/last)_pipeline_stage 50 | # oc.select: default val for ref.megatron.override_transformer_config 51 | override_transformer_config: 52 | # Recompute configuration, same as in megatron.training.arguments 53 | # default use minimal performance-interference recompute methods 54 | # Recompute granularity, choices: ["full", "selective"]; 'full' recomputes the entire transformer layer, 'selective' only the memory-intensive parts of attention 55 | recompute_granularity: null 56 | 57 | # Recompute modules, multiple choices: ["core_attn", "moe_act", "layernorm", "mla_up_proj", "mlp", "moe"] 58 | # Please use correct module in matched model 59 | recompute_modules: ["core_attn"] 60 | 61 | # 'uniform', 'block' 62 | # 'uniform' divides the total number of transformer layers and checkpoints the input activation of each chunk 63 | # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity 64 | recompute_method: null 65 | 66 | # Number of transformer layers to recompute (interpretation depends on the recompute method above) 67 | recompute_num_layers: null 68 | 69 | # Attention backend to use (flash,fused,unfused,local,auto). Defaults to auto in mcore, flash in verl 70 | attention_backend: flash 71 | 72 | override_mcore_model_config: {} 73 | 74 | # oc.select: default val for ref.megatron.use_mbridge 75 | use_mbridge: False 76 | 77 | # whether to use forward only 78 | forward_only: False 79 | -------------------------------------------------------------------------------- /opentinker/environment/geo3k/geo3k_tool_env.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2025 OpenTinker 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Geo3K Multi-Turn VL Environment. 16 | 17 | This environment supports multi-turn geometry problem solving with vision-language models. 18 | It uses Geo3KToolGame for verification-based interactions. 19 | """ 20 | 21 | from opentinker.environment.vl_game_environment import VLGameEnvironment 22 | from opentinker.environment.geo3k.geo3k_tool_game import Geo3KToolGame 23 | 24 | 25 | class Geo3KToolEnvironment(VLGameEnvironment): 26 | """Multi-turn VL environment for Geo3K geometry problems.
27 | 28 | This environment uses: 29 | - Geo3KToolGame for multi-turn verification logic 30 | - StaticDatasetGeneratorVL for image handling 31 | - GymEnvironmentInteraction for HTTP communication 32 | 33 | The model can submit answers multiple times and receive feedback 34 | in verl-compatible format: "Current parsed answer={answer} reward={0.0|1.0}" 35 | 36 | Args: 37 | config: Configuration object 38 | data_paths: Training data paths (parquet files) 39 | val_data_paths: Validation data paths (optional) 40 | job_id: Job identifier 41 | max_retries: Max verification attempts per problem (default: 3) 42 | 43 | Example: 44 | env = Geo3KToolEnvironment( 45 | config=config, 46 | data_paths=["~/data/geo3k_multiturn/train.parquet"], 47 | val_data_paths=["~/data/geo3k_multiturn/test.parquet"], 48 | job_id="geo3k_tool_training_001", 49 | ) 50 | """ 51 | 52 | def __init__( 53 | self, 54 | config, 55 | data_paths, 56 | val_data_paths=None, 57 | job_id=None, 58 | max_retries: int = 3, 59 | ): 60 | # Initialize with multi-turn Geo3K game and VL environment 61 | super().__init__( 62 | game_class=Geo3KToolGame, 63 | config=config, 64 | data_paths=data_paths, 65 | val_data_paths=val_data_paths, 66 | game_kwargs={"max_retries": max_retries}, 67 | job_id=job_id, 68 | image_key="images", # Geo3K uses "images" field 69 | ) 70 | -------------------------------------------------------------------------------- /opentinker/client/client_config/generic_env_param.yaml: -------------------------------------------------------------------------------- 1 | # Configuration for Generic Environment Training (LLM-Environment Interaction) 2 | # 3 | # Key differences from opentinker_param.yaml: 4 | # - algorithm: "agent_loop" (uses GenericAgentLoop) 5 | # - No reward function (environment provides rewards) 6 | # - Interaction configuration for Gym environment 7 | 8 | server_url: "http://localhost:8000" # Not needed when running via the scheduler 9 | scheduler_url: "http://localhost:8766" 10 | scheduler_api_key: "otk_98b8db24ccd64c92e1fdd9a232e209fa" 11 | 12 | # GPU allocation 13 | num_gpus: 4 14 | 15 | # Data configuration 16 | data_path: null # Path to training data (parquet file) 17 | val_data_path: null # Optional validation data 18 | tokenizer_path: null # Path to tokenizer/model 19 | batch_size: 16 # Smaller batch for multi-turn 20 | val_batch_size: 50 # Validation batch size (also controls dataset size if val_max_samples not set) 21 | # Training duration - set ONE of these (num_steps takes precedence if both set) 22 | num_epochs: 3 # Number of epochs (null = use num_steps) 23 | num_steps: null # Total training steps (null = use num_epochs) 24 | num_workers: 0 25 | 26 | # Interaction configuration (for GenericAgentLoop) 27 | # This replaces the reward function - environment provides rewards 28 | interaction: 29 | name: "gym_env" # Name referenced in dataset's interaction_kwargs 30 | class_path: opentinker.environment.gym_environment_interaction.GymEnvironmentInteraction 31 | config: 32 | env_endpoint: "http://localhost:8084" # Mock environment server 33 | max_steps: 50 34 | observation_template: "Environment: {observation}" 35 | 36 | # Multi-turn configuration 37 | multi_turn: 38 | max_user_turns: 12 39 | max_assistant_turns: 12 40 | max_tokens_per_turn: 512 # Per-turn response limit (optional, null for no limit) 41 | # Weave tracing (optional - runs on SERVER side) 42 | weave_project: "zsqzz/generic-env-test" # W&B project for tracing 43 | experiment_name:
"gym_interaction" # Experiment name in Weave 44 | 45 | # Project tracking 46 | project_name: "generic_env_training" 47 | experiment_name: "gym_interaction" 48 | save_freq: 50000 49 | test_freq: 5 50 | 51 | # Generation parameters 52 | temperature: 1 # Lower temperature for more focused responses 53 | top_p: 1 54 | max_new_tokens: 8192 # TOTAL response budget for entire multi-turn trajectory (NOT per-turn!) 55 | max_prompt_tokens: 2048 56 | 57 | # Algorithm selection 58 | # IMPORTANT: Must be "agent_loop" for GenericEnvironment 59 | algorithm: "agent_loop" 60 | 61 | # Logging 62 | logger_backends: ["console","wandb"] # Add "wandb" if needed 63 | wandb_key: null 64 | enable_tracing: true 65 | # No reward configuration needed! 66 | # GenericEnvironment gets rewards from the interaction 67 | -------------------------------------------------------------------------------- /opentinker/client/client_config/gomoku_param.yaml: -------------------------------------------------------------------------------- 1 | # Gomoku Training Configuration 2 | # Use with: python gomoku_client.py 3 | 4 | # Project settings 5 | project_name: opentinker 6 | experiment_name: gomoku_training 7 | 8 | # Logging 9 | logger_backends: ["console","wandb"] 10 | 11 | # Tracing (optional) 12 | enable_tracing: true 13 | weave_project: null 14 | 15 | # WandB (optional) 16 | wandb_key: 2ed6f8544ac3e30d5c08879166cc10d9c6232448 17 | 18 | 19 | # Model and tokenizer 20 | tokenizer_path: null 21 | 22 | 23 | # Training parameters 24 | batch_size: 4 25 | num_workers: 4 26 | # Training duration - set ONE of these (num_steps takes precedence if both set) 27 | num_epochs: null # Number of epochs (null = use num_steps) 28 | num_steps: 1000 # Total training steps (null = use num_epochs) 29 | save_freq: 20000 30 | test_freq: 10 # Validation frequency (every N steps) 31 | 32 | # Validation parameters 33 | val_batch_size: 50 # Total validation samples (null = 50) 34 | 35 | # Model parameters 36 | # Generation parameters 37 | temperature: 1 # Lower temperature for more focused responses 38 | top_p: 1 39 | max_new_tokens: 8192 # TOTAL response budget for entire multi-turn trajectory (NOT per-turn!) 
40 | max_prompt_tokens: 4096 41 | 42 | # Algorithm (must be agent_loop for multi-turn) 43 | algorithm: "agent_loop" 44 | 45 | # RL Algorithm settings (passed to server via scheduler) 46 | # adv_estimator options: 47 | # - "grpo" : Standard GRPO (outcome-only advantage) 48 | # - "grpo_per_step" : Per-step GRPO with return-based advantages (for multi-turn tasks) 49 | # - "gae" : Generalized Advantage Estimation (for PPO, requires critic) 50 | adv_estimator: "grpo_per_step" 51 | # rollout_n: number of samples per prompt for GRPO/grpo_per_step (only used when adv_estimator=grpo or grpo_per_step) 52 | rollout_n: 16 53 | 54 | # Interaction configuration 55 | interaction: 56 | name: gomoku 57 | class_path: opentinker.environment.gym_environment_interaction.GymEnvironmentInteraction 58 | config: 59 | env_host: 0.0.0.0 60 | env_port: 8088 61 | env_endpoint: http://${interaction.config.env_host}:${interaction.config.env_port} 62 | max_steps: 81 # Parameter of the backend GymEnvironmentInteraction 63 | max_total_steps: 39 # Hard cap on env.step() calls (prevents invalid-move hacking) 64 | max_initial_moves: 0 65 | board_size: 9 66 | observation_template: "{observation}" 67 | empty_board_prob: 1.0 68 | 69 | 70 | multi_turn: 71 | max_user_turns: ${interaction.config.max_total_steps} # vLLM-side parameter 72 | max_assistant_turns: ${interaction.config.max_total_steps} # vLLM-side parameter 73 | max_tokens_per_turn: 256 # Per-turn response limit (optional, null for no limit) 74 | # Weave tracing (optional - runs on SERVER side) 75 | weave_project: "zsqzz/gomoku-env-test" # W&B project for tracing 76 | experiment_name: "gomoku_interaction" # Experiment name in Weave 77 | 78 | 79 | # Scheduler settings 80 | scheduler_url: "http://0.0.0.0:8780" 81 | scheduler_api_key: otk_98b8db24ccd64c92e1fdd9a232e209fa # User API key 82 | 83 | # GPU settings 84 | num_gpus: 4 85 | 86 | -------------------------------------------------------------------------------- /opentinker/server/config/sft_trainer.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - optim: fsdp 3 | - _self_ 4 | 5 | data: 6 | train_batch_size: 256 7 | micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu 8 | micro_batch_size_per_gpu: 4 # this is also val batch size 9 | train_files: ~/data/gsm8k/train.parquet 10 | val_files: ~/data/gsm8k/test.parquet 11 | train_max_samples: -1 # set to -1 to use full dataset 12 | val_max_samples: -1 # set to -1 to use full dataset 13 | # Single-turn settings 14 | prompt_key: question 15 | response_key: answer 16 | prompt_dict_keys: null 17 | response_dict_keys: null 18 | # Multi-turn settings 19 | multiturn: 20 | enable: false # Set to true to use multi-turn dataset 21 | messages_key: messages # Key for messages list in multi-turn mode 22 | tools_key: tools # Key for tools list in multi-turn mode 23 | enable_thinking_key: enable_thinking # Whether to enable thinking in multi-turn mode 24 | max_length: 1024 25 | truncation: error 26 | balance_dp_token: False 27 | chat_template: null 28 | custom_cls: 29 | path: null 30 | name: null 31 | use_shm: False 32 | apply_chat_template_kwargs: {} 33 | model: 34 | partial_pretrain: ~/models/gemma-1.1-7b-it 35 | use_shm: False 36 | fsdp_config: 37 | model_dtype: fp32 38 | wrap_policy: 39 | min_num_params: 0 40 | cpu_offload: False 41 | offload_params: False 42 | external_lib: null 43 | enable_gradient_checkpointing: True 44 | trust_remote_code: False 45 | lora_rank: 0 # Set to positive value to
enable LoRA (e.g., 32) 46 | lora_alpha: 16 # LoRA scaling factor 47 | target_modules: all-linear # Target modules for LoRA adaptation 48 | use_liger: False 49 | strategy: fsdp2 50 | optim: 51 | lr: 1e-5 52 | betas: [0.9, 0.95] 53 | weight_decay: 0.01 54 | lr_warmup_steps_ratio: 0.1 55 | clip_grad: 1.0 56 | lr_scheduler: cosine 57 | ulysses_sequence_parallel_size: 1 58 | use_remove_padding: False 59 | trainer: 60 | default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} 61 | default_hdfs_dir: null 62 | project_name: gsm8k-sft 63 | experiment_name: test 64 | total_epochs: 4 65 | total_training_steps: null 66 | logger: [ 'console', 'wandb' ] 67 | seed: 1 68 | save_freq: -1 69 | test_freq: -1 70 | nnodes: 1 71 | n_gpus_per_node: 8 72 | max_ckpt_to_keep: null # Maximum number of checkpoints to keep, set to null to keep all 73 | 74 | # Resume mode: "auto", "disable", or "resume_path" 75 | # "auto": resume from last checkpoint if available 76 | # "disable": start from scratch 77 | # "resume_path": resume from a user-defined path 78 | resume_mode: auto 79 | 80 | # Path to resume training from (used when resume_mode is "resume_path" or "auto") 81 | resume_from_path: null 82 | 83 | # Checkpoint configuration 84 | checkpoint: 85 | # What to include in saved checkpoints 86 | # with 'hf_model' you can save whole model as hf format, now only use sharded model checkpoint to save space 87 | save_contents: ["model", "optimizer", "extra"] 88 | 89 | # For more flexibility, you can specify the contents to load from the checkpoint. 90 | load_contents: ${trainer.checkpoint.save_contents} 91 | device: cuda 92 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Setup script for OpenTinker. 4 | 5 | This allows the package to be installed in development mode with: 6 | pip install -e . 
7 | """ 8 | 9 | from setuptools import setup, find_packages 10 | from pathlib import Path 11 | 12 | # Read requirements 13 | def read_requirements(filename): 14 | """Read requirements from file, ignoring comments and empty lines.""" 15 | requirements = [] 16 | filepath = Path(__file__).parent / filename 17 | if filepath.exists(): 18 | with open(filepath, "r") as f: 19 | for line in f: 20 | line = line.strip() 21 | if line and not line.startswith("#"): 22 | requirements.append(line) 23 | return requirements 24 | 25 | 26 | # Read README for long description 27 | readme_path = Path(__file__).parent / "README.md" 28 | long_description = "" 29 | if readme_path.exists(): 30 | with open(readme_path, "r", encoding="utf-8") as f: 31 | long_description = f.read() 32 | 33 | 34 | setup( 35 | name="opentinker", 36 | version="0.1.0", 37 | description="OpenTinker: A distributedframework for training and inference with interactive environments", 38 | long_description=long_description, 39 | long_description_content_type="text/markdown", 40 | author="Siqi Zhu, Jiaxuan You", 41 | author_email="siqizhu4@illinois.edu, jiaxuan@illinois.edu", 42 | url="https://github.com/open-tinker/OpenTinker", 43 | python_requires=">=3.8", 44 | packages=find_packages(include=["opentinker", "opentinker.*"]), 45 | install_requires=[ 46 | # Core dependencies 47 | "ray>=2.9.0", 48 | "torch>=2.0.0", 49 | "transformers>=4.35.0", 50 | # Web framework 51 | "fastapi>=0.104.0", 52 | "uvicorn>=0.24.0", 53 | "pydantic>=2.0.0", 54 | # Configuration 55 | "omegaconf>=2.3.0", 56 | "hydra-core>=1.3.0", 57 | "pyyaml>=6.0", 58 | # Data processing 59 | "pandas>=2.0.0", 60 | "pyarrow>=14.0.0", 61 | "datasets>=2.14.0", 62 | # Utilities 63 | "requests>=2.31.0", 64 | "aiohttp>=3.9.0", 65 | ], 66 | extras_require={ 67 | "dev": [ 68 | "pytest>=7.4.0", 69 | "black>=23.0.0", 70 | "flake8>=6.1.0", 71 | ], 72 | "logging": [ 73 | "wandb>=0.16.0", 74 | ], 75 | }, 76 | entry_points={ 77 | # "console_scripts": [ 78 | # "opentinker-scheduler=opentinker.scheduler.launch_scheduler_kill:main", 79 | # ], 80 | }, 81 | classifiers=[ 82 | "Development Status :: 3 - Alpha", 83 | "Intended Audience :: Developers", 84 | "Intended Audience :: Science/Research", 85 | "License :: OSI Approved :: Apache Software License", 86 | "Programming Language :: Python :: 3", 87 | "Programming Language :: Python :: 3.8", 88 | "Programming Language :: Python :: 3.9", 89 | "Programming Language :: Python :: 3.10", 90 | "Programming Language :: Python :: 3.11", 91 | "Programming Language :: Python :: 3.12", 92 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 93 | ], 94 | ) 95 | -------------------------------------------------------------------------------- /opentinker/server/config/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from dataclasses import dataclass, field 16 | from typing import Any, Optional 17 | 18 | from verl.base_config import BaseConfig 19 | 20 | __all__ = ["CheckpointConfig", "ProfileConfig", "BaseModelConfig"] 21 | 22 | 23 | @dataclass 24 | class CheckpointConfig(BaseConfig): 25 | """Configuration for model checkpointing. 26 | 27 | The inheritance from BaseConfig provides omegaconf.DictConfig-like interface for a dataclass config. 28 | 29 | Args: 30 | save_contents (list[str]): What to include in saved checkpoints. 31 | Options: 'model', 'optimizer', 'extra', 'hf_model'. 32 | load_contents (list[str]): Contents to load from checkpoint. Defaults to same as save_contents. 33 | async_save (bool): Whether to save checkpoints asynchronously. Only implemented for Megatron as of now. 34 | """ 35 | 36 | save_contents: list[str] = field(default_factory=lambda: ["model", "optimizer", "extra"]) 37 | load_contents: list[str] = field(default_factory=lambda: ["model", "optimizer", "extra"]) 38 | async_save: bool = False 39 | 40 | 41 | @dataclass 42 | class ProfileConfig(BaseConfig): 43 | """Configuration for profiling. 44 | 45 | The inheritance from BaseConfig provides omegaconf.DictConfig-like interface for a dataclass config. 46 | 47 | Args: 48 | profile_ranks (Optional[list[int]]): List of ranks to profile. None means all ranks. 49 | step_start (int): Starting step for profiling. 50 | step_end (int): Ending step for profiling. 51 | save_path (Optional[str]): Path to save profiling results. 52 | """ 53 | 54 | profile_ranks: Optional[list[int]] = None 55 | step_start: int = -1 56 | step_end: int = -1 57 | save_path: Optional[str] = None 58 | 59 | 60 | @dataclass 61 | class BaseModelConfig(BaseConfig): 62 | """Base configuration for a model. 63 | Contains core settings for loading and initializing a pretrained model checkpoint. 64 | 65 | Args: 66 | path (str): Path to pretrained model weights. 67 | tokenizer_path (Optional[str]): Tokenizer path (defaults to actor's model path if not set). 68 | override_config (dict): Hugging Face config override. 69 | external_lib (Optional[str]): External model implementation (optional). 70 | trust_remote_code (bool): Whether to trust remote code from Hugging Face models. 71 | """ 72 | 73 | path: str = "~/models/deepseek-llm-7b-chat" 74 | tokenizer_path: Optional[str] = None 75 | override_config: dict[str, Any] = field(default_factory=dict) 76 | external_lib: Optional[str] = None 77 | trust_remote_code: bool = False 78 | -------------------------------------------------------------------------------- /opentinker/data_preprocess/math.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. 
2 | # Licensed under the Apache License, Version 2.0 3 | 4 | import argparse 5 | import json 6 | import os 7 | 8 | import datasets 9 | 10 | from verl.utils.hdfs_io import copy, makedirs 11 | from verl.utils.reward_score.math_reward import last_boxed_only_string, remove_boxed 12 | 13 | 14 | def extract_solution(solution_str): 15 | return remove_boxed(last_boxed_only_string(solution_str)) 16 | 17 | 18 | if __name__ == "__main__": 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("--local_dir", default="./data", help="The local directory for the preprocessed dataset.") 21 | parser.add_argument("--hdfs_dir", default=None) 22 | parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.") 23 | parser.add_argument( 24 | "--local_save_dir", default="./data", help="The save directory for the preprocessed dataset." 25 | ) 26 | 27 | args = parser.parse_args() 28 | local_dataset_path = args.local_dataset_path 29 | 30 | data_source = "DigitalLearningGmbH/MATH-lighteval" 31 | print(f"Loading the {data_source} dataset from huggingface...", flush=True) 32 | 33 | if local_dataset_path is not None: 34 | dataset = datasets.load_dataset(local_dataset_path) 35 | else: 36 | dataset = datasets.load_dataset(data_source) 37 | 38 | train_dataset = dataset["train"] 39 | test_dataset = dataset["test"] 40 | 41 | instruction_following = "Let's think step by step and output the final answer within \\boxed{}." 42 | 43 | # build map function 44 | def make_map_fn(): 45 | def process_fn(example, idx): 46 | question = example.pop("problem") + " " + instruction_following 47 | 48 | answer = example.pop("solution") 49 | solution = extract_solution(answer) 50 | 51 | return { 52 | "prompt": [{"role": "user", "content": question}], 53 | "ground_truth": solution, 54 | "data_source": data_source, 55 | } 56 | return process_fn 57 | 58 | train_dataset = train_dataset.map(make_map_fn(), with_indices=True) 59 | test_dataset = test_dataset.map(make_map_fn(), with_indices=True) 60 | 61 | local_save_dir = args.local_dir or args.local_save_dir 62 | local_dir = os.path.expanduser(local_save_dir) 63 | os.makedirs(local_dir, exist_ok=True) 64 | 65 | # -------------------- 66 | # Write JSONL files 67 | # -------------------- 68 | def write_jsonl(ds, path): 69 | with open(path, "w") as f: 70 | for item in ds: 71 | json.dump(item, f) 72 | f.write("\n") 73 | 74 | train_jsonl = os.path.join(local_dir, "train.jsonl") 75 | test_jsonl = os.path.join(local_dir, "test.jsonl") 76 | 77 | print(f"Writing JSONL to {train_jsonl} and {test_jsonl}") 78 | write_jsonl(train_dataset, train_jsonl) 79 | write_jsonl(test_dataset, test_jsonl) 80 | 81 | # Save first example for reference 82 | with open(os.path.join(local_dir, "train_example.json"), "w") as f: 83 | json.dump(train_dataset[0], f, indent=2) 84 | with open(os.path.join(local_dir, "test_example.json"), "w") as f: 85 | json.dump(test_dataset[0], f, indent=2) 86 | 87 | if args.hdfs_dir is not None: 88 | makedirs(args.hdfs_dir) 89 | copy(src=local_dir, dst=args.hdfs_dir) 90 | -------------------------------------------------------------------------------- /docs/geo3k_quickstart.md: -------------------------------------------------------------------------------- 1 | # Geo3K Vision-Language Training Quick Start 2 | 3 | ## Overview 4 | 5 | This guide shows how to train vision-language models (like Qwen2.5-VL) on Geo3K geometry problems using OpenTinker. 
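
Once the data has been prepared (see Prerequisites below), a quick way to sanity-check it is to read the parquet directly. A minimal sketch, assuming `pandas` and `pyarrow` are installed; the column names are the ones this repo's Geo3K preprocessing produces, with images stored under an `images` field:

```python
import os
import pandas as pd

# Load the prepared Geo3K training split
df = pd.read_parquet(os.path.expanduser("~/data/geo3k/train.parquet"))

print(df.columns.tolist())   # expect at least "prompt" and "images" columns
print(df.iloc[0]["prompt"])  # chat-style messages for the first problem
```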
6 | 
7 | ## Prerequisites
8 | 
9 | ```bash
10 | # Install required packages (quote the version spec so the shell does not treat ">" as a redirect)
11 | pip install "transformers>=4.37.0" pillow
12 | 
13 | # Prepare Geo3K data (if not already done)
14 | cd verl/examples/data_preprocess
15 | python geo3k.py --local_save_dir ~/data/geo3k
16 | ```
17 | 
18 | ## Quick Start
19 | 
20 | ### 1. Test Data Loading (Optional)
21 | 
22 | ```bash
23 | python opentinker/test_geo3k_data.py --data_path ~/data/geo3k/train.parquet
24 | ```
25 | 
26 | ### 2. Configure Training
27 | 
28 | Edit `opentinker/client/client_config/geo3k_param.yaml`:
29 | 
30 | ```yaml
31 | # Model paths
32 | tokenizer_path: Qwen/Qwen2.5-VL-7B-Instruct
33 | processor_path: Qwen/Qwen2.5-VL-7B-Instruct
34 | 
35 | # Data
36 | data_path: ~/data/geo3k/train.parquet
37 | val_data_path: ~/data/geo3k/test.parquet
38 | 
39 | # GRPO settings
40 | adv_estimator: "grpo"
41 | rollout_n: 5
42 | 
43 | # Resources
44 | num_gpus: 8
45 | batch_size: 64
46 | ```
47 | 
48 | ### 3. Launch Training
49 | 
50 | ```bash
51 | python opentinker/client/geo3k_rl.py
52 | ```
53 | 
54 | Or with custom parameters:
55 | 
56 | ```bash
57 | python opentinker/client/geo3k_rl.py \
58 |     tokenizer_path=Qwen/Qwen2.5-VL-7B-Instruct \
59 |     batch_size=32 \
60 |     num_epochs=15 \
61 |     num_gpus=4
62 | ```
63 | 
64 | ## Architecture Components
65 | 
66 | - **Data Generator**: `StaticDatasetGeneratorVL` - loads images from parquet
67 | - **Dataset**: `DynamicGameDatasetVL` - processes text + images with AutoProcessor
68 | - **Environment**: `VLGameEnvironment` - VL-aware training environment
69 | - **Game**: `Geo3KGame` - geometry problem logic with reward computation
70 | - **Client**: `geo3k_rl.py` - training launcher
71 | 
72 | ## Key Differences from Text-Only Training
73 | 
74 | | Aspect | Text-Only | Vision-Language |
75 | |--------|-----------|-----------------|
76 | | Processor | AutoTokenizer | AutoProcessor |
77 | | Data Generator | StaticDatasetGenerator | StaticDatasetGeneratorVL |
78 | | Dataset | DynamicGameDataset | DynamicGameDatasetVL |
79 | | Environment | GameEnvironment | VLGameEnvironment |
80 | | Data Fields | prompt | prompt + images |
81 | | Model Input | input_ids, attention_mask | + pixel_values, image_grid_thw |
82 | 
83 | ## Next Steps
84 | 
85 | ### Add Multi-Turn Support
86 | 
87 | Create a multi-turn version that allows reasoning refinement:
88 | 
89 | 1. Extend `Geo3KGame` for multi-turn interactions
90 | 2. Update config: `max_user_turns: 2`, `max_assistant_turns: 3`
91 | 3. Optionally add tools for intermediate verification
92 | 
93 | ### Add Other VL Tasks
94 | 
95 | Follow the Geo3K pattern for:
96 | - MathVista (math with diagrams)
97 | - ChartQA (chart understanding)
98 | - DocVQA (document QA)
99 | 
100 | ## Troubleshooting
101 | 
102 | ### "No module named transformers"
103 | ```bash
104 | pip install "transformers>=4.37.0"
105 | ```
106 | 
107 | ### "Data file not found"
108 | ```bash
109 | python verl/examples/data_preprocess/geo3k.py --local_save_dir ~/data/geo3k
110 | ```
111 | 
112 | ### "AutoProcessor not found"
113 | Ensure you're using a VL model path (e.g., Qwen2.5-VL, not Qwen2.5).
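
A quick way to check this is to load the processor directly. This is a minimal sanity check, assuming `transformers` is installed and using the model name from the config above; a VL checkpoint should yield a multimodal processor, while a text-only checkpoint typically yields a plain tokenizer:

```python
from transformers import AutoProcessor

# Expect a processor class (with image handling), not a plain tokenizer
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
print(type(processor).__name__)
```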
114 | 115 | ## References 116 | 117 | - Implementation Plan: `implementation_plan.md` 118 | - Walkthrough: `walkthrough.md` 119 | - verl Geo3K Example: `verl/examples/grpo_trainer/run_qwen2_5_vl-7b.sh` 120 | -------------------------------------------------------------------------------- /opentinker/client/geo3k_rl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Geo3K Vision-Language RL Training Client. 3 | 4 | This script launches Geo3K geometry problem training using vision-language models. 5 | It follows the same pattern as math_rl.py but uses VL-specific components. 6 | """ 7 | 8 | import hydra 9 | from omegaconf import OmegaConf 10 | 11 | from opentinker.client.utils.http_training_client import ServiceClient, SchedulerClient 12 | from opentinker.environment.geo3k import Geo3KGameEnvironment 13 | from opentinker.environment.game_stats_client import GameStatsClient 14 | from opentinker.client.utils.utils import resolve_paths_in_config 15 | from opentinker.client.utils.scheduler_client_lifecycle import get_lifecycle_manager 16 | 17 | 18 | @hydra.main(config_path="client_config", config_name="geo3k_param.yaml") 19 | def main(args): 20 | args = resolve_paths_in_config(args) 21 | lifecycle = get_lifecycle_manager() 22 | 23 | print("=" * 60) 24 | print("Geo3K Vision-Language Training with OpenTinker") 25 | print("=" * 60) 26 | 27 | # 1. Submit job to scheduler 28 | scheduler_client = SchedulerClient( 29 | scheduler_url=args.get("scheduler_url", "http://localhost:8780"), 30 | api_key=args.get("scheduler_api_key") 31 | ) 32 | 33 | job_result = scheduler_client.submit_job( 34 | config=OmegaConf.to_container(args, resolve=True), 35 | enable_agent_loop=True, 36 | wandb_key=args.get("wandb_key"), 37 | num_gpus=args.get("num_gpus"), 38 | ) 39 | 40 | job_id = job_result["job_id"] 41 | server_url = job_result["server_url"] 42 | lifecycle.register_job(scheduler_client, job_id) 43 | 44 | print(f"✓ Job {job_id} allocated at {server_url}") 45 | 46 | # 2. Setup Geo3K VL environment 47 | env_endpoint = args.interaction.config.env_endpoint 48 | env = Geo3KGameEnvironment( 49 | config=args, 50 | data_paths=[args.data_path], 51 | val_data_paths=[args.val_data_path] if args.val_data_path else None, 52 | job_id=job_id, 53 | ) 54 | print(f"✓ Geo3K VL environment created, interaction config: {env.get_interaction_config_path()}") 55 | 56 | # 3. Setup game stats client (optional) 57 | game_stats = GameStatsClient(env_endpoint, job_id=env.job_id) 58 | if game_stats.health_check(): 59 | game_stats.reset_all() 60 | print(f"✓ Connected to game server at {env_endpoint}") 61 | else: 62 | game_stats = None 63 | print(f"⚠ Game server not responding at {env_endpoint}") 64 | 65 | # 4. Connect to training server 66 | client = ServiceClient( 67 | server_url=server_url, 68 | project_name=args.project_name, 69 | experiment_name=args.experiment_name, 70 | logger_backends=args.logger_backends, 71 | ) 72 | client.set_config(args, env) 73 | 74 | # 5. Train 75 | print(f"Starting Geo3K training: steps={args.get('num_steps')}, epochs={args.get('num_epochs')}") 76 | 77 | try: 78 | final_metrics = client.fit( 79 | env=env, 80 | num_epochs=args.get("num_epochs"), 81 | num_steps=args.get("num_steps"), 82 | save_freq=args.save_freq, 83 | test_freq=args.test_freq, 84 | verbose=True, 85 | validate_before_training=True, 86 | game_stats_client=game_stats, 87 | ) 88 | print(f"Training completed! 
Metrics: {final_metrics}") 89 | finally: 90 | env.cleanup() 91 | 92 | 93 | if __name__ == "__main__": 94 | main() 95 | -------------------------------------------------------------------------------- /opentinker/environment/static_data_generator_vl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Vision-Language Static Data Generator for OpenTinker. 3 | 4 | This module extends StaticDatasetGenerator to support vision-language models 5 | by loading and processing images from parquet files. 6 | """ 7 | 8 | import logging 9 | from typing import Any, Dict, List, Optional 10 | 11 | from opentinker.environment.static_data_generator import StaticDatasetGenerator 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class StaticDatasetGeneratorVL(StaticDatasetGenerator): 17 | """Static dataset generator with vision-language support. 18 | 19 | This generator extends StaticDatasetGenerator to handle image data 20 | from parquet files. Images are typically stored as lists of PIL images 21 | or image paths in the dataset. 22 | 23 | Args: 24 | data_paths: List of parquet file paths 25 | interaction_name: Name of the interaction handler 26 | prompt_key: Key for prompt field in data (default: "prompt") 27 | ground_truth_key: Key for ground truth answer (default: "ground_truth") 28 | image_key: Key for image field in data (default: "images") 29 | shuffle: Whether to shuffle data 30 | seed: Random seed for shuffling 31 | system_prompt: Optional system prompt to prepend 32 | 33 | Example: 34 | generator = StaticDatasetGeneratorVL( 35 | data_paths=["~/data/geo3k/train.parquet"], 36 | interaction_name="game", 37 | image_key="images", 38 | ) 39 | """ 40 | 41 | def __init__( 42 | self, 43 | data_paths: List[str], 44 | interaction_name: str = "game", 45 | prompt_key: str = "prompt", 46 | ground_truth_key: str = "ground_truth", 47 | image_key: str = "images", 48 | shuffle: bool = False, 49 | seed: Optional[int] = None, 50 | system_prompt: Optional[str] = None, 51 | ): 52 | super().__init__( 53 | data_paths=data_paths, 54 | interaction_name=interaction_name, 55 | prompt_key=prompt_key, 56 | ground_truth_key=ground_truth_key, 57 | shuffle=shuffle, 58 | seed=seed, 59 | system_prompt=system_prompt, 60 | ) 61 | self.image_key = image_key 62 | logger.info(f"StaticDatasetGeneratorVL initialized with image_key='{image_key}'") 63 | 64 | def generate_sample(self, index: int) -> Dict[str, Any]: 65 | """Generate a sample with vision-language data. 
66 | 
67 |         Args:
68 |             index: Sample index
69 | 
70 |         Returns:
71 |             Dict with keys:
72 |             - prompt: List of message dicts
73 |             - env_kwargs: Dict with ground_truth
74 |             - images: List of images (if present)
75 |             - data_source: Data source identifier
76 |         """
77 |         # Get base sample from parent class
78 |         sample = super().generate_sample(index)
79 | 
80 |         # Add images if present in the data
81 |         actual_idx = self._indices[index % len(self._samples)]
82 |         row = self._samples[actual_idx]
83 |         if self.image_key in row:
84 |             images = row[self.image_key]
85 |             # Ensure images is a list
86 |             if not isinstance(images, list):
87 |                 images = [images] if images is not None else []
88 |             sample["images"] = images
89 |         else:
90 |             sample["images"] = []
91 | 
92 |         return sample
--------------------------------------------------------------------------------
/opentinker/reward_functions/math_reward_server.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Example Remote Reward API Server
4 | 
5 | This is a simple FastAPI server that implements the reward computation endpoint.
6 | Use this as a template for creating your own remote reward services.
7 | 
8 | Start the server:
9 |     python opentinker/reward_functions/math_reward_server.py
10 | 
11 | The server listens on http://localhost:30001 by default.
12 | """
13 | 
14 | from fastapi import FastAPI, HTTPException
15 | from pydantic import BaseModel
16 | from typing import Any, Dict, Optional
17 | import uvicorn
18 | from verl.utils.reward_score import default_compute_score
19 | from transformers import PreTrainedTokenizer
20 | 
21 | 
22 | 
23 | app = FastAPI(
24 |     title="Remote Reward API",
25 |     description="Example reward computation service",
26 |     version="1.0.0"
27 | )
28 | 
29 | 
30 | class ComputeRewardRequest(BaseModel):
31 |     """Request model for reward computation"""
32 |     data_source: str
33 |     solution_str: str
34 |     ground_truth: str
35 |     extra_info: Dict[str, Any]
36 |     sandbox_fusion_url: Optional[str] = None
37 |     concurrent_semaphore: Optional[int] = None
38 |     memory_limit_mb: Optional[int] = None
39 |     reward_router_address: Optional[str] = None
40 |     # reward_model_tokenizer: PreTrainedTokenizer = None
41 |     reward_model_tokenizer: Any = None
42 | 
43 | class ComputeRewardResponse(BaseModel):
44 |     """Response model for reward computation"""
45 |     reward: float
46 | 
47 | 
48 | @app.get("/health")
49 | async def health_check():
50 |     """Health check endpoint"""
51 |     return {"status": "healthy", "service": "remote_reward_api"}
52 | 
53 | 
54 | @app.post("/compute_reward", response_model=ComputeRewardResponse)
55 | async def compute_reward(request: ComputeRewardRequest):
56 |     """
57 |     Compute reward for a single solution.
58 | 
59 |     This is a simple example implementation. Replace with your own logic.
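
    Illustrative request/response pair (the values are hypothetical; the field
    names come from ComputeRewardRequest / ComputeRewardResponse above):

        POST /compute_reward
        {
            "data_source": "DigitalLearningGmbH/MATH-lighteval",
            "solution_str": "... so the answer is \\boxed{42}.",
            "ground_truth": "42",
            "extra_info": {}
        }

        -> {"reward": 1.0}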
60 | """ 61 | func_rm_score = default_compute_score( 62 | request.data_source, 63 | request.solution_str, 64 | request.ground_truth, 65 | request.extra_info, 66 | # request.sandbox_fusion_url, 67 | # request.concurrent_semaphore, 68 | # request.memory_limit_mb, 69 | ) 70 | 71 | # Handle both dict and scalar return values 72 | # default_compute_score may return dict with {"score": ..., other_keys: ...} 73 | if isinstance(func_rm_score, dict): 74 | reward = float(func_rm_score.get("score", 0.0)) 75 | else: 76 | reward = float(func_rm_score) 77 | 78 | return ComputeRewardResponse(reward=reward) 79 | 80 | 81 | def main(): 82 | """Start the remote reward API server""" 83 | import argparse 84 | 85 | parser = argparse.ArgumentParser(description="Remote Reward API Server") 86 | parser.add_argument("--port", type=int, default=30001, help="Port to listen on (default: 30001)") 87 | parser.add_argument("--host", type=str, default="0.0.0.0", help="Host to bind to (default: 0.0.0.0)") 88 | args = parser.parse_args() 89 | 90 | print("="*60) 91 | print("Starting Remote Reward API Server") 92 | print("="*60) 93 | print("Endpoints:") 94 | print(f" - Health: http://localhost:{args.port}/health") 95 | print(f" - Compute reward: http://localhost:{args.port}/compute_reward") 96 | print("="*60) 97 | 98 | uvicorn.run( 99 | app, 100 | host=args.host, 101 | port=args.port, 102 | log_level="info" 103 | ) 104 | 105 | 106 | if __name__ == "__main__": 107 | main() 108 | -------------------------------------------------------------------------------- /opentinker/server/config/reward_model/reward_model.yaml: -------------------------------------------------------------------------------- 1 | # configs for the reward model 2 | 3 | # Whether to enable reward model. If False, we compute the reward only with the user-defined reward functions. 4 | # In GSM8K and Math examples, we disable reward model. 5 | # For RLHF alignment example using full_hh_rlhf, we utilize reward model to assess the responses. 6 | # If False, the following parameters are not effective 7 | enable: False 8 | 9 | # Whether to deploy the model to a separate resource pool. 10 | # If true, n_gpus_per_node & nnodes will be used to determine the resource node. 11 | enable_resource_pool: False 12 | n_gpus_per_node: 0 13 | nnodes: 0 14 | 15 | # FSDP strategy: "fsdp" or "fsdp2" 16 | strategy: ??? 17 | 18 | # model config for reward scoring 19 | model: 20 | 21 | # Input tokenizer. If the reward model's chat template is inconsistent with the policy, 22 | # we need to first decode to plaintext, then apply the rm's chat_template. 23 | # Then score with RM. If chat_templates are consistent, it can be set to null. 24 | # set this to null if the chat template is identical 25 | input_tokenizer: ${actor_rollout_ref.model.path} 26 | 27 | # RM’s HDFS path or local path. Note that RM only supports AutoModelForSequenceClassification. 28 | # Other model types need to define their own RewardModelWorker and pass it from the code. 
29 |   path: ~/models/FsfairX-LLaMA3-RM-v0.1
30 | 
31 |   # External model implementation (optional)
32 |   external_lib: ${actor_rollout_ref.model.external_lib}
33 | 
34 |   # Whether to enable loading a remote code model, default to False
35 |   trust_remote_code: False
36 | 
37 | # [Deprecated] Global micro batch size
38 | # will be deprecated, use micro_batch_size_per_gpu
39 | micro_batch_size: null
40 | 
41 | # Local per-GPU micro batch size
42 | micro_batch_size_per_gpu: null
43 | 
44 | # Maximum sequence length to process for scoring
45 | max_length: null
46 | 
47 | # Whether to dynamically adjust batch size at runtime
48 | use_dynamic_bsz: ${critic.use_dynamic_bsz}
49 | 
50 | # Maximum number of tokens per GPU in one forward pass
51 | forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
52 | 
53 | # Reward Manager. This defines the mechanism of computing rule-based reward and handling different reward sources.
54 | # Default is naive. If all verification functions are multiprocessing-safe,
55 | # the reward manager can be set to prime for parallel verification.
56 | reward_manager: naive
57 | 
58 | # Whether to launch custom reward function asynchronously during log_prob
59 | # custom reward function executed async on CPU, during log_prob
60 | launch_reward_fn_async: False
61 | 
62 | # Cloud/local sandbox fusion configuration for custom reward logic
63 | sandbox_fusion:
64 | 
65 |   # Cloud/local function URL for sandbox execution
66 |   url: null
67 | 
68 |   # Max concurrent requests allowed to sandbox
69 |   max_concurrent: 64
70 | 
71 |   # Max memory limit for each sandbox process in MB
72 |   memory_limit_mb: 1024
73 | 
74 | # profile the reward model in `compute_reward`
75 | profiler:
76 | 
77 |   # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
78 |   _target_: verl.utils.profiler.ProfilerConfig
79 | 
80 |   # profiler tool, default same as profiler.tool in global config
81 |   # choices: nsys, npu, torch
82 |   tool: ${oc.select:global_profiler.tool,null}
83 | 
84 |   # Whether to enable profiling of the reward model
85 |   enable: False
86 | 
87 |   # Whether to profile all ranks.
88 |   all_ranks: False
89 | 
90 |   # The ranks that will be profiled. [] or [0,1,...]
91 |   ranks: []
92 | 
93 |   # profile results saving path
94 |   save_path: ${oc.select:global_profiler.save_path,null}
95 | 
96 |   # specific tool config
97 |   tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
--------------------------------------------------------------------------------
/opentinker/server/sandbox_tool.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
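"""Client-side tool for executing model-generated code against a sandbox server.

Request/response contract assumed by this module (see ``code_interpreter``
below): POST ``{"code": ...}`` to the configured ``sandbox_fusion_url`` and
read ``run_result.stdout`` / ``run_result.stderr`` from the JSON response.
"""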
14 | 
15 | import re
16 | import aiohttp
17 | from transformers.utils import get_json_schema
18 | from verl.tools.base_tool import BaseTool, OpenAIFunctionToolSchema, ToolResponse
19 | 
20 | 
21 | class SandboxTool(BaseTool):
22 |     """Client tool to interact with the Sandbox server."""
23 | 
24 |     def __init__(self, config: dict, tool_schema: OpenAIFunctionToolSchema):
25 |         super().__init__(config, tool_schema)
26 |         # Different models may use different code-fence tags, e.g. ```python, ```py, etc.
27 |         self.code_pattern = re.compile(r"```py(.*?)```", re.DOTALL)
28 | 
29 |     async def code_interpreter(self, code: str) -> str:
30 |         """Execute the code in the sandbox.
31 | 
32 |         Args:
33 |             code: The code to be executed.
34 | 
35 |         Returns:
36 |             str: The output of the code execution.
37 |         """
38 |         async with aiohttp.ClientSession() as session:
39 |             async with session.post(
40 |                 self.config.get("sandbox_fusion_url"),
41 |                 json={"code": code},
42 |             ) as resp:
43 |                 resp.raise_for_status()
44 |                 result = await resp.json()
45 |                 stdout, stderr = result["run_result"]["stdout"], result["run_result"]["stderr"]
46 |                 return stdout + stderr
47 | 
48 |     def get_openai_tool_schema(self) -> OpenAIFunctionToolSchema:
49 |         schema = get_json_schema(self.code_interpreter)
50 |         return OpenAIFunctionToolSchema(**schema)
51 | 
52 |     async def execute(self, instance_id: str, parameters: dict, **kwargs) -> tuple[ToolResponse, float, dict]:
53 |         code = parameters["code"]
54 |         matches = self.code_pattern.findall(code)
55 |         if matches:
56 |             code = matches[0].strip()
57 | 
58 |         # NOTE: Some scripts may not explicitly print a result, so we wrap the last non-empty line in a print statement.
59 |         # A better approach is to SFT the model to print results by default; we skip the SFT stage in this tutorial.
60 |         lines = code.split("\n")
61 |         for i, line in reversed(list(enumerate(lines))):
62 |             if line == "":
63 |                 continue
64 |             if not lines[i].startswith("print"):
65 |                 lines[i] = f"print({line})"
66 |             break
67 |         code = "\n".join(lines)
68 | 
69 |         result = await self.code_interpreter(code)
70 |         return ToolResponse(text=result), 0.0, {}
71 | 
72 | 
73 | if __name__ == "__main__":
74 |     # Example usage - assumes sandbox server is running
75 |     import asyncio
76 | 
77 |     async def test_sandbox_tool():
78 |         sandbox_tool = SandboxTool(
79 |             config={"sandbox_fusion_url": "http://localhost:8000/run_code"},
80 |             tool_schema=None
81 |         )
82 | 
83 |         # Test code execution (the trailing expression gets wrapped in print())
84 |         test_code = "result = 2 + 2\nresult"
85 |         response, reward, info = await sandbox_tool.execute(
86 |             instance_id="test",
87 |             parameters={"code": test_code}
88 |         )
89 |         print(f"Response: {response.text}")
90 | 
91 |     asyncio.run(test_sandbox_tool())
92 | 
--------------------------------------------------------------------------------
/opentinker/client/math_rl.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import hydra
3 | from omegaconf import OmegaConf
4 | from torch.utils.data import DataLoader
5 | from transformers import AutoTokenizer
6 | from torchdata.stateful_dataloader import StatefulDataLoader
7 | 
8 | from utils.http_training_client import ServiceClient, SchedulerClient
9 | from opentinker.environment.base_game_environment import GameEnvironment
10 | from opentinker.environment.base_data_generator import DynamicGameDataset, collate_fn
11 | from opentinker.environment.math import MathGame
12 | from opentinker.environment.static_data_generator import StaticDatasetGenerator
13 | from
opentinker.environment.game_stats_client import GameStatsClient 14 | from utils.utils import resolve_paths_in_config 15 | from utils.scheduler_client_lifecycle import get_lifecycle_manager 16 | from verl.trainer.main_ppo import create_rl_sampler 17 | from opentinker.environment.math.math_env import MathGameEnvironment 18 | 19 | 20 | @hydra.main(config_path="client_config", config_name="math_param.yaml") 21 | def main(args): 22 | args = resolve_paths_in_config(args) 23 | lifecycle = get_lifecycle_manager() 24 | 25 | print("=" * 60) 26 | print("Math Training with GameEnvironment Pattern") 27 | print("=" * 60) 28 | 29 | # 1. Submit job to scheduler 30 | scheduler_client = SchedulerClient( 31 | scheduler_url=args.get("scheduler_url", "http://localhost:8780"), 32 | api_key=args.get("scheduler_api_key") 33 | ) 34 | 35 | job_result = scheduler_client.submit_job( 36 | config=OmegaConf.to_container(args, resolve=True), 37 | enable_agent_loop=True, 38 | wandb_key=args.get("wandb_key"), 39 | num_gpus=args.get("num_gpus"), 40 | ) 41 | 42 | job_id = job_result["job_id"] 43 | server_url = job_result["server_url"] 44 | lifecycle.register_job(scheduler_client, job_id) 45 | 46 | print(f"✓ Job {job_id} allocated at {server_url}") 47 | 48 | # 2. Setup environment (job_id is automatically handled) 49 | env_endpoint = args.interaction.config.env_endpoint 50 | env = MathGameEnvironment( 51 | game_class=MathGame, 52 | config=args, 53 | data_paths=[args.data_path], 54 | val_data_paths=[args.val_data_path] if args.val_data_path else None, 55 | job_id=job_id, # Pass job_id directly 56 | ) 57 | print(f"✓ Environment created, interaction config: {env.get_interaction_config_path()}") 58 | 59 | # 3. Setup game stats client (use env.job_id for consistency) 60 | game_stats = GameStatsClient(env_endpoint, job_id=env.job_id) 61 | if game_stats.health_check(): 62 | game_stats.reset_all() 63 | print(f"✓ Connected to math server at {env_endpoint}") 64 | else: 65 | game_stats = None 66 | print(f"⚠ Math server not responding at {env_endpoint}") 67 | 68 | # 4. Connect to training server 69 | client = ServiceClient( 70 | server_url=server_url, 71 | project_name=args.project_name, 72 | experiment_name=args.experiment_name, 73 | logger_backends=args.logger_backends, 74 | ) 75 | client.set_config(args, env) 76 | 77 | # 5. Train 78 | print(f"Starting training: steps={args.get('num_steps')}, epochs={args.get('num_epochs')}") 79 | 80 | try: 81 | final_metrics = client.fit( 82 | env=env, 83 | num_epochs=args.get("num_epochs"), 84 | num_steps=args.get("num_steps"), 85 | save_freq=args.save_freq, 86 | test_freq=args.test_freq, 87 | verbose=True, 88 | validate_before_training=True, 89 | game_stats_client=game_stats, 90 | ) 91 | print(f"Training completed! Metrics: {final_metrics}") 92 | finally: 93 | env.cleanup() 94 | 95 | 96 | if __name__ == "__main__": 97 | main() 98 | -------------------------------------------------------------------------------- /opentinker/data_preprocess/geo3k.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | Preprocess the Geometry3k dataset to parquet format
16 | """
17 | 
18 | import argparse
19 | import os
20 | 
21 | import datasets
22 | 
23 | from verl.utils.hdfs_io import copy, makedirs
24 | 
25 | if __name__ == "__main__":
26 |     parser = argparse.ArgumentParser()
27 |     parser.add_argument("--local_dir", default=None)
28 |     parser.add_argument("--hdfs_dir", default=None)
29 |     parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
30 |     parser.add_argument(
31 |         "--local_save_dir", default="./data/geo3k", help="The save directory for the preprocessed dataset."
32 |     )
33 | 
34 |     args = parser.parse_args()
35 |     local_dataset_path = args.local_dataset_path
36 | 
37 |     data_source = "hiyouga/geometry3k"
38 | 
39 |     if local_dataset_path is not None:
40 |         dataset = datasets.load_dataset(
41 |             local_dataset_path,
42 |         )
43 |     else:
44 |         dataset = datasets.load_dataset(
45 |             data_source,
46 |         )
47 | 
48 |     train_dataset = dataset["train"]
49 |     test_dataset = dataset["test"]
50 | 
51 |     instruction_following = (
52 |         r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. "
53 |         r"The reasoning process MUST BE enclosed within <think> </think> tags. "
54 |         r"The final answer MUST BE put in \boxed{}."
55 |     )
56 | 
57 |     # build a map function that reformats each example for training
58 |     def make_map_fn(split):
59 |         def process_fn(example, idx):
60 |             problem = example.pop("problem")
61 |             prompt = problem + " " + instruction_following
62 |             answer = example.pop("answer")
63 |             images = example.pop("images")
64 | 
65 |             data = {
66 |                 "data_source": data_source,
67 |                 "prompt": [
68 |                     {
69 |                         "role": "user",
70 |                         "content": prompt,
71 |                     }
72 |                 ],
73 |                 "images": images,
74 |                 "ability": "math",
75 |                 "reward_model": {"style": "rule", "ground_truth": answer},
76 |                 "extra_info": {
77 |                     "split": split,
78 |                     "index": idx,
79 |                     "answer": answer,
80 |                     "question": problem,
81 |                 },
82 |             }
83 |             return data
84 | 
85 |         return process_fn
86 | 
87 |     train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True, num_proc=8)
88 |     test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True, num_proc=8)
89 | 
90 |     hdfs_dir = args.hdfs_dir
91 |     local_save_dir = args.local_dir
92 |     if local_save_dir is not None:
93 |         print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
94 |     else:
95 |         local_save_dir = args.local_save_dir
96 | 
97 |     train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
98 |     test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))
99 | 
100 |     if hdfs_dir is not None:
101 |         makedirs(hdfs_dir)
102 |         copy(src=local_save_dir, dst=hdfs_dir)
--------------------------------------------------------------------------------
/opentinker/scheduler/SCHEDULER_GUIDE.md:
--------------------------------------------------------------------------------
1 | # Scheduler & Web Dashboard Guide
2 | 
3 | This guide covers configuration and usage for the OpenTinker Job Scheduler and Web Dashboard.
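
Before any of the steps below, the scheduler itself must be running. A typical invocation is sketched here; the exact entrypoint (the `launch_scheduler.sh` helper and its location) is an assumption, so adjust it to your checkout:

```bash
# Start the scheduler with the default scheduler.yaml config (hypothetical path)
bash opentinker/scripts/launch_scheduler.sh
```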
4 | 
5 | ## Configuration
6 | 
7 | The scheduler is configured via `opentinker/scheduler/config/scheduler.yaml`.
8 | 
9 | ### Key Settings
10 | 
11 | ```yaml
12 | # Authentication
13 | enable_auth: true  # Set to true to require API keys
14 | user_db_path: "scheduler_users.db"
15 | 
16 | # Resources
17 | available_gpus: [0, 1, 2, 3]  # GPUs to manage
18 | port_range: null  # null for auto-detect, or [min, max]
19 | num_ports: 50  # Number of ports to auto-detect
20 | scheduler_port: 8765  # Main API port
21 | ```
22 | 
23 | ## Authentication
24 | 
25 | ### 1. Registering Users
26 | 
27 | **Method 1: Interactive Script (Recommended)**
28 | ```bash
29 | python opentinker/scheduler/register_user_example.py
30 | ```
31 | This script prompts for a username, registers the user, and saves the API key to a local file.
32 | 
33 | **Method 2: REST API**
34 | ```bash
35 | # Register a new user
36 | curl -X POST "http://<scheduler_host>:<scheduler_port>/register?username=<username>"
37 | ```
38 | **Response:**
39 | ```json
40 | {
41 |   "user_id": "user_abc123",
42 |   "username": "your_username",
43 |   "api_key": "otk_98b8db24ccd64c92e1fdd9a232e209fa",
44 |   "message": "User registered successfully..."
45 | }
46 | ```
47 | > ⚠️ **Important**: Save your API key immediately! It cannot be retrieved after registration.
48 | 
49 | ### 2. Using the API Key
50 | 
51 | Include the API key in the `Authorization` header for all requests:
52 | 
53 | **cURL**:
54 | ```bash
55 | curl -H "Authorization: Bearer <api_key>" http://<scheduler_host>:<scheduler_port>/list_jobs
56 | ```
57 | 
58 | **Python**:
59 | ```python
60 | import requests
61 | headers = {"Authorization": f"Bearer {api_key}"}
62 | response = requests.get(f"{scheduler_url}/list_jobs", headers=headers)
63 | ```
64 | ## Web Dashboard
65 | 
66 | The Web Dashboard provides a real-time view of job status and resource usage.
67 | 
68 | ### 1. Start the Dashboard
69 | 
70 | ```bash
71 | python opentinker/scheduler/web_dashboard.py --port 8081
72 | ```
73 | 
74 | ### 2. Access
75 | 
76 | Open [http://localhost:8081/web_dashboard.html](http://localhost:8081/web_dashboard.html) in your browser.
77 | 
78 | ### 3. Authentication
79 | 
80 | If `enable_auth` is true in the scheduler config, you must provide an API Key.
81 | 
82 | 1. **Get your API Key**:
83 |    - Run: `python opentinker/scheduler/register_user_example.py`
84 |    - Or check your client config: `cat client/client_config/opentinker_param.yaml | grep scheduler_api_key`
85 | 2. **Enter in Dashboard**:
86 |    - Go to the "Settings" section at the top of the dashboard.
87 |    - Paste your key into the "API Key" field.
88 |    - The key is automatically saved to your browser's local storage.
89 | 
90 | ## Scheduler API Reference
91 | 
92 | Base URL: `http://localhost:<scheduler_port>`
93 | 
94 | | Method | Endpoint | Description |
95 | |--------|----------|-------------|
96 | | POST | `/submit_job` | Submit a new training job |
97 | | GET | `/list_jobs` | List all jobs and their status |
98 | | GET | `/job_status/{job_id}` | Get details for a specific job |
99 | | DELETE | `/cancel_job/{job_id}` | Cancel a running or queued job |
100 | | POST | `/complete_job/{job_id}` | Mark a job as completed (called by client) |
101 | | POST | `/register` | Register a new user (if auth enabled) |
102 | 
103 | ## Troubleshooting
104 | 
105 | ### Job stuck in QUEUED
106 | - Check GPU availability with `nvidia-smi`.
107 | - Verify the scheduler has free ports in its range.
108 | 
109 | ### 401 Unauthorized Errors
110 | - Ensure you are providing a valid `Authorization: Bearer <api_key>` header (API) or have entered the key in the dashboard.
110 | - If running locally without need for auth, set `enable_auth: false` in `scheduler.yaml`. 111 | 112 | ### Server Launch Failures 113 | - Check the scheduler console logs for Python tracebacks. 114 | - Ensure all dependencies are installed in the environment where the scheduler runs. 115 | -------------------------------------------------------------------------------- /opentinker/client/math_inference.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import hydra 3 | from omegaconf import OmegaConf 4 | 5 | from utils.http_training_client import InferenceSchedulerClient 6 | from utils.scheduler_client_lifecycle import get_lifecycle_manager 7 | from opentinker.environment.inference_pipeline import run_inference 8 | from opentinker.environment.math import MathGame 9 | from opentinker.environment.game_stats_client import GameStatsClient 10 | 11 | 12 | @hydra.main(config_path="client_config", config_name="math_inference_scheduler_config.yaml", version_base=None) 13 | def main(args): 14 | """Run math inference with scheduler-managed vLLM server.""" 15 | lifecycle = get_lifecycle_manager() 16 | 17 | print("=" * 60) 18 | print("Math Inference with Scheduler") 19 | print("=" * 60) 20 | 21 | if not args.model_path: 22 | raise ValueError("model_path is required") 23 | if not args.data_path: 24 | raise ValueError("data_path is required") 25 | 26 | # 1. Submit inference job to scheduler 27 | scheduler_client = InferenceSchedulerClient( 28 | scheduler_url=args.get("scheduler_url", "http://localhost:8780"), 29 | api_key=args.get("scheduler_api_key"), 30 | ) 31 | 32 | print(f"Submitting inference job to scheduler...") 33 | job_result = scheduler_client.submit_inference_job( 34 | model_path=args.model_path, 35 | tokenizer_path=args.get("tokenizer_path"), 36 | tensor_parallel_size=args.get("tensor_parallel_size", 1), 37 | num_gpus=args.get("num_gpus"), 38 | gpu_memory_utilization=args.get("gpu_memory_utilization", 0.9), 39 | max_model_len=args.get("max_model_len"), 40 | trust_remote_code=args.get("trust_remote_code", True), 41 | ) 42 | 43 | job_id = job_result["job_id"] 44 | vllm_server_url = job_result["vllm_server_url"] 45 | 46 | # Register job for lifecycle cleanup 47 | lifecycle.register_job(scheduler_client, job_id) 48 | 49 | print(f"✓ Inference job {job_id} started at {vllm_server_url}") 50 | 51 | # 2. Setup GameStatsClient for per-step metrics (with job_id isolation) 52 | game_stats = GameStatsClient(args.env_endpoint, job_id=job_id) 53 | if game_stats.health_check(): 54 | print(f"✓ Connected to game server at {args.env_endpoint}") 55 | game_stats.reset_all() # Reset stats for this job before inference 56 | else: 57 | print(f"⚠ Game server not available at {args.env_endpoint}, continuing without stats") 58 | game_stats = None 59 | 60 | # 3. 
Run inference using the remote vLLM server 61 | print(f"Running inference on {args.data_path}...") 62 | 63 | results = run_inference( 64 | model_path=None, # Not needed when using vllm_server_url 65 | vllm_server_url=vllm_server_url, 66 | tokenizer_path=args.get("tokenizer_path") or args.model_path, 67 | data_path=args.data_path, 68 | game_class=MathGame, 69 | env_endpoint=args.env_endpoint, 70 | job_id=job_id, # Pass job_id for stats isolation 71 | output_path=args.get("output_path"), 72 | temperature=args.temperature, 73 | top_p=args.top_p, 74 | max_tokens=args.max_new_tokens, 75 | max_samples=args.get("max_samples"), 76 | max_user_turns=args.multi_turn.max_user_turns, 77 | max_assistant_turns=args.multi_turn.max_assistant_turns, 78 | ) 79 | 80 | # 4. Log game stats after inference 81 | if game_stats: 82 | stats = game_stats.get_all_stats() 83 | print(f"\nGame Server Stats (job_id={job_id}):") 84 | print(f" Total samples: {stats.get('total_samples', 0)}") 85 | print(f" Games completed: {stats.get('games_in_step', 0)}") 86 | print(f" Mean reward: {stats.get('mean_final_reward', 0):.4f}") 87 | 88 | if args.get("output_path"): 89 | print(f"\nResults saved to: {args.output_path}") 90 | 91 | print(f"\n{'='*60}") 92 | print("Inference completed! vLLM server will be automatically cleaned up.") 93 | print(f"{'='*60}") 94 | 95 | 96 | if __name__ == "__main__": 97 | main() 98 | 99 | -------------------------------------------------------------------------------- /opentinker/server/config/ref/ref.yaml: -------------------------------------------------------------------------------- 1 | # actor_rollout_ref.ref: FSDP config same as actor. For models larger than 7B, it’s recommended to turn on offload for ref by default 2 | strategy: ${actor_rollout_ref.actor.strategy} 3 | 4 | # whether to enable torch.compile 5 | # same as actor_rollout_ref.actor.use_torch_compile if it exists, otherwise 1 6 | use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} 7 | 8 | # [Will be deprecated, use log_prob_micro_batch_size_per_gpu] 9 | # The batch size for one forward pass in the computation of log_prob. Global batch size. 10 | log_prob_micro_batch_size: null 11 | 12 | # The batch size for one forward pass in the computation of log_prob. Local batch size per GPU. 13 | log_prob_micro_batch_size_per_gpu: null 14 | 15 | # enable dynamic batch size (sequence packing) for log_prob computation 16 | # same as actor_rollout_ref.actor.use_dynamic_bsz if it exists, otherwise false 17 | log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} 18 | 19 | # the max token length per GPU 20 | # same as actor_rollout_ref.actor.ppo_max_token_len_per_gpu if it exists, otherwise 16384 21 | log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} 22 | 23 | # profile the ref model in `compute_log_prob` 24 | profiler: 25 | 26 | # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs 27 | _target_: verl.utils.profiler.ProfilerConfig 28 | 29 | # choices: nsys, npu, torch, torch_memory 30 | tool: ${oc.select:global_profiler.tool,null} 31 | 32 | # whether enable profile on Ref 33 | enable: False 34 | 35 | # Whether to profile all ranks. 36 | all_ranks: False 37 | 38 | # The ranks that will be profiled. [] or [0,1,...] 
39 | ranks: [] 40 | 41 | # profile results saving path 42 | save_path: ${oc.select:global_profiler.save_path,null} 43 | 44 | # specific tool config which only related to the role 45 | tool_config: 46 | 47 | # nsys tool config 48 | nsys: 49 | 50 | # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs 51 | _target_: verl.utils.profiler.config.NsightToolConfig 52 | 53 | # True for each task has its own database, False for all tasks in one training step share one database. 54 | discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} 55 | 56 | # npu config 57 | npu: 58 | 59 | # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs 60 | _target_: verl.utils.profiler.config.NPUToolConfig 61 | 62 | # Contents to profile, can be empty 63 | # options: npu, cpu, memory, shapes, module, stack 64 | contents: [] 65 | 66 | # Collection level, optional values: level_none, level0, level1, level2. 67 | level: "level1" 68 | 69 | # Whether to automatically parse the data. 70 | analysis: True 71 | 72 | # True for each task has its own database, False for all tasks in one training step share one database. 73 | discrete: False 74 | 75 | # torch profiler config 76 | torch: 77 | 78 | # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs 79 | _target_: verl.utils.profiler.config.TorchProfilerToolConfig 80 | 81 | # start profile mini-batch in training 82 | # NOTICE: different with global steps config which refers to iteration 83 | # This field only related with mini-batch 84 | step_start: 0 85 | 86 | # stop profile mini-batch in training 87 | step_end: null 88 | 89 | # torch memory profiler config 90 | torch_memory: 91 | 92 | # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs 93 | _target_: verl.utils.profiler.config.TorchMemoryToolConfig 94 | 95 | # Maximum number of memory allocation entries to track 96 | trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} 97 | 98 | # Stack trace depth for memory allocations 99 | stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} -------------------------------------------------------------------------------- /opentinker/data_preprocess/math_multiturn_w_interaction.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # Copyright 2023-2024 SGLang Team 3 | # Copyright 2025 ModelBest Inc. and/or its affiliates 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """
17 | Preprocess the MATH-lighteval dataset to parquet format, with multi-turn interaction kwargs
18 | """
19 | 
20 | import argparse
21 | import os
22 | import re
23 | 
24 | import datasets
25 | 
26 | from verl.utils.hdfs_io import copy, makedirs
27 | from verl.utils.reward_score.math_reward import last_boxed_only_string, remove_boxed
28 | 
29 | 
30 | def extract_solution(solution_str):
31 |     return remove_boxed(last_boxed_only_string(solution_str))
32 | 
33 | 
34 | if __name__ == "__main__":
35 |     parser = argparse.ArgumentParser()
36 |     parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.")
37 |     parser.add_argument("--hdfs_dir", default=None)
38 |     parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
39 |     parser.add_argument(
40 |         "--local_save_dir", default="data/math_agentloop", help="The save directory for the preprocessed dataset."
41 |     )
42 | 
43 |     args = parser.parse_args()
44 |     local_dataset_path = args.local_dataset_path
45 | 
46 |     data_source = "DigitalLearningGmbH/MATH-lighteval"
47 | 
48 |     dataset = datasets.load_dataset(
49 |         local_dataset_path if local_dataset_path is not None else data_source,
50 |     )
51 |     train_dataset = dataset["train"]
52 |     test_dataset = dataset["test"]
53 | 
54 |     instruction_following = "Let's think step by step and output the final answer within \\boxed{}."
55 | 
56 |     # build a map function that reformats each example for training
57 |     def make_map_fn(split):
58 |         def process_fn(example, idx):
59 |             question_raw = example.pop("problem")
60 | 
61 |             question = question_raw + " " + instruction_following
62 | 
63 |             answer_raw = example.pop("solution")
64 |             solution = extract_solution(answer_raw)
65 |             data = {
66 |                 "data_source": data_source,
67 |                 "prompt": [
68 |                     {
69 |                         "role": "user",
70 |                         "content": question,
71 |                     },
72 |                 ],
73 |                 "ability": "math",
74 |                 "reward_model": {"style": "rule", "ground_truth": solution},
75 |                 "extra_info": {
76 |                     "split": split,
77 |                     "index": idx,
78 |                     "answer": answer_raw,
79 |                     "question": question_raw,
80 |                     "interaction_kwargs": {
81 |                         "name": "math",
82 |                         "query": question,
83 |                         "ground_truth": solution,
84 |                     },
85 |                 },
86 |             }
87 |             return data
88 | 
89 |         return process_fn
90 | 
91 |     train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
92 |     test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)
93 | 
94 |     hdfs_dir = args.hdfs_dir
95 |     local_save_dir = args.local_dir
96 |     if local_save_dir is not None:
97 |         print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
98 |     else:
99 |         local_save_dir = args.local_save_dir
100 | 
101 |     train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
102 |     test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))
103 | 
104 |     if hdfs_dir is not None:
105 |         makedirs(hdfs_dir)
106 |         copy(src=local_save_dir, dst=hdfs_dir)
107 | 
--------------------------------------------------------------------------------
/opentinker/data_preprocess/math_dataset.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Preprocess the MATH-lighteval dataset to parquet format 16 | """ 17 | 18 | import argparse 19 | import json 20 | import os 21 | 22 | import datasets 23 | 24 | from verl.utils.hdfs_io import copy, makedirs 25 | from verl.utils.reward_score.math_reward import last_boxed_only_string, remove_boxed 26 | 27 | 28 | def extract_solution(solution_str): 29 | return remove_boxed(last_boxed_only_string(solution_str)) 30 | 31 | 32 | if __name__ == "__main__": 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument("--local_dir", default=None) 35 | parser.add_argument("--hdfs_dir", default=None) 36 | parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.") 37 | parser.add_argument( 38 | "--local_save_dir", default="~/data/math", help="The save directory for the preprocessed dataset." 39 | ) 40 | 41 | args = parser.parse_args() 42 | local_dataset_path = args.local_dataset_path 43 | 44 | # 'lighteval/MATH' is no longer available on huggingface. 45 | # Use mirror repo: DigitalLearningGmbH/MATH-lighteval 46 | data_source = "DigitalLearningGmbH/MATH-lighteval" 47 | print(f"Loading the {data_source} dataset from huggingface...", flush=True) 48 | if local_dataset_path is not None: 49 | dataset = datasets.load_dataset( 50 | local_dataset_path, 51 | ) 52 | else: 53 | dataset = datasets.load_dataset( 54 | data_source, 55 | ) 56 | 57 | train_dataset = dataset["train"] 58 | test_dataset = dataset["test"] 59 | 60 | instruction_following = "Let's think step by step and output the final answer within \\boxed{}." 61 | 62 | 63 | # add a row to each data item that represents a unique id 64 | def make_map_fn(split): 65 | def process_fn(example, idx): 66 | question = example.pop("problem") 67 | 68 | question = question + " " + instruction_following 69 | 70 | answer = example.pop("solution") 71 | solution = extract_solution(answer) 72 | data = { 73 | "data_source": data_source, 74 | "prompt": [{"role": "user", "content": question}], 75 | "ability": "math", 76 | "reward_model": {"style": "rule", "ground_truth": solution}, 77 | "extra_info": {"split": split, "index": idx}, 78 | } 79 | return data 80 | 81 | return process_fn 82 | 83 | train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True) 84 | test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True) 85 | 86 | local_save_dir = args.local_dir 87 | if local_save_dir is not None: 88 | print("Warning: Argument 'local_dir' is deprecated. 
Please use 'local_save_dir' instead.") 89 | else: 90 | local_save_dir = args.local_save_dir 91 | 92 | local_dir = os.path.expanduser(local_save_dir) 93 | hdfs_dir = args.hdfs_dir 94 | 95 | train_dataset.to_parquet(os.path.join(local_dir, "train.parquet")) 96 | test_dataset.to_parquet(os.path.join(local_dir, "test.parquet")) 97 | # Save one example as JSON for reference 98 | example = train_dataset[0] 99 | with open(os.path.join(local_dir, "train_example.json"), "w") as f: 100 | json.dump(example, f, indent=2) 101 | example = test_dataset[0] 102 | with open(os.path.join(local_dir, "test_example.json"), "w") as f: 103 | json.dump(example, f, indent=2) 104 | if hdfs_dir is not None: 105 | makedirs(hdfs_dir) 106 | 107 | copy(src=local_dir, dst=hdfs_dir) 108 | -------------------------------------------------------------------------------- /opentinker/environment/legacy/generic/README.md: -------------------------------------------------------------------------------- 1 | # Generic Environment for LLM-Environment Interaction 2 | 3 | This directory contains the generic environment implementation for training LLMs 4 | to interact with external environments (like OpenAI Gym). 5 | 6 | ## Architecture 7 | 8 | ``` 9 | ┌─────────────────────────────────────────────────────────────────────┐ 10 | │ Training Pipeline │ 11 | ├─────────────────────────────────────────────────────────────────────┤ 12 | │ │ 13 | │ ┌─────────────────┐ ┌─────────────────────────────────────┐ │ 14 | │ │ GenericEnvironment│───────▶│ GenericAgentLoop │ │ 15 | │ │ (BaseEnvironment) │ │ (verl/experimental/agent_loop/) │ │ 16 | │ │ │ │ │ │ 17 | │ │ - Dataloader │ │ PENDING → GENERATING → INTERACTING │ │ 18 | │ │ - InteractionSpec │ │ │ │ │ │ 19 | │ └─────────────────┘ │ ▼ ▼ │ │ 20 | │ │ LLM Server Environment │ │ 21 | │ │ (mask=1) (mask=0) │ │ 22 | │ └─────────────────────────────────────┘ │ 23 | │ │ │ 24 | │ ┌───────────────┴───────────────┐ │ 25 | │ │ BaseInteraction │ │ 26 | │ │ (verl/interactions/) │ │ 27 | │ │ │ │ 28 | │ │ - GymEnvironmentInteraction │ │ 29 | │ │ - SimpleTextEnvironment │ │ 30 | │ │ - Gsm8kInteraction │ │ 31 | │ └───────────────────────────────┘ │ 32 | └─────────────────────────────────────────────────────────────────────┘ 33 | ``` 34 | 35 | ## Key Concept: Environment Provides Rewards 36 | 37 | Unlike standard PPO training where a separate reward function evaluates completions, 38 | in environment interaction: 39 | 40 | - **Reward comes from the environment** via `interaction.generate_response()` 41 | - **No external `reward_function` is needed** 42 | - `response_mask` ensures only LLM tokens contribute to the loss 43 | 44 | ## Quick Start 45 | 46 | ```python 47 | from omegaconf import OmegaConf 48 | from opentinker.environment.generic.generic_env import ( 49 | GenericEnvironment, 50 | InteractionSpec, 51 | ) 52 | 53 | # 1. Configure environment 54 | config = OmegaConf.create({ 55 | "tokenizer_path": "meta-llama/Llama-2-7b-chat-hf", 56 | "data_path": "data/train.parquet", 57 | "max_prompt_tokens": 1024, 58 | "max_new_tokens": 512, 59 | "batch_size": 4, 60 | "num_workers": 4, 61 | "algorithm": "agent_loop", 62 | }) 63 | 64 | # 2. Define interaction with Gym environment 65 | interaction_specs = [ 66 | InteractionSpec( 67 | name="my_env", 68 | class_path="verl.interactions.gym_environment_interaction.GymEnvironmentInteraction", 69 | config={"env_endpoint": "http://localhost:8080", "max_steps": 100} 70 | ) 71 | ] 72 | 73 | # 3. 
Create environment 74 | env = GenericEnvironment(config, interaction_specs) 75 | 76 | # 4. Use with training client 77 | train_dl, val_dl = env.get_dataloader() 78 | env_config = env.setup(client) 79 | ``` 80 | 81 | ## Files 82 | 83 | | File | Description | 84 | |------|-------------| 85 | | `generic_env.py` | Main GenericEnvironment class | 86 | | `example_usage.py` | Usage examples | 87 | 88 | ## Dataset Format 89 | 90 | Your training data should include `interaction_kwargs` to specify which interaction to use: 91 | 92 | ```json 93 | { 94 | "prompt": [ 95 | {"role": "system", "content": "You are playing a text adventure..."}, 96 | {"role": "user", "content": "You are in a cave. What do you do?"} 97 | ], 98 | "extra_info": { 99 | "interaction_kwargs": {"name": "my_env"} 100 | } 101 | } 102 | ``` 103 | 104 | ## Related Files 105 | 106 | - Agent Loop: `verl/experimental/agent_loop/generic_agent_loop.py` 107 | - Interactions: `verl/interactions/gym_environment_interaction.py` 108 | - Mock Server: `opentinker/environment/example/mock_env_server.py` 109 | -------------------------------------------------------------------------------- /opentinker/client/math_tool_rl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import hydra 3 | from omegaconf import OmegaConf 4 | from transformers import AutoTokenizer 5 | from torchdata.stateful_dataloader import StatefulDataLoader 6 | 7 | from utils.http_training_client import ServiceClient, SchedulerClient 8 | from opentinker.environment.base_game_environment import GameEnvironment 9 | from opentinker.environment.base_data_generator import DynamicGameDataset, collate_fn 10 | from opentinker.environment.math.math_tool_game import CodeInterpreterMathGame 11 | from opentinker.environment.static_data_generator import StaticDatasetGenerator 12 | from opentinker.environment.game_stats_client import GameStatsClient 13 | from utils.utils import resolve_paths_in_config 14 | from utils.scheduler_client_lifecycle import get_lifecycle_manager 15 | from verl.trainer.main_ppo import create_rl_sampler 16 | from opentinker.environment.math.math_tool_env import MathCodeInterpreterEnvironment 17 | 18 | @hydra.main(config_path="client_config", config_name="math_code_interpreter_param.yaml") 19 | def main(args): 20 | args = resolve_paths_in_config(args) 21 | lifecycle = get_lifecycle_manager() 22 | 23 | print("=" * 60) 24 | print("Math Training with Code Interpreter (Agent Loop)") 25 | print("=" * 60) 26 | 27 | # 1. Submit job to scheduler 28 | print("\n[1/4] Submitting job to scheduler...") 29 | scheduler_client = SchedulerClient( 30 | scheduler_url=args.get("scheduler_url", "http://localhost:8780"), 31 | api_key=args.get("scheduler_api_key") 32 | ) 33 | 34 | job_result = scheduler_client.submit_job( 35 | config=OmegaConf.to_container(args, resolve=True), 36 | enable_agent_loop=True, 37 | wandb_key=args.get("wandb_key"), 38 | num_gpus=args.get("num_gpus"), 39 | ) 40 | 41 | job_id = job_result["job_id"] 42 | server_url = job_result["server_url"] 43 | lifecycle.register_job(scheduler_client, job_id) 44 | 45 | print(f"✓ Job {job_id} allocated at {server_url}") 46 | 47 | # 2. 
Setup environment 48 | print("\n[2/4] Setting up environment...") 49 | env_endpoint = args.interaction.config.env_endpoint 50 | env = MathCodeInterpreterEnvironment( 51 | game_class=CodeInterpreterMathGame, 52 | config=args, 53 | data_paths=[args.data_path], 54 | val_data_paths=[args.val_data_path] if args.val_data_path else None, 55 | job_id=job_id, 56 | ) 57 | print(f"✓ Environment created") 58 | print(f" - Interaction config: {env.get_interaction_config_path()}") 59 | print(f" - Game server endpoint: {env_endpoint}") 60 | 61 | # 3. Setup game stats client 62 | print("\n[3/4] Connecting to game server...") 63 | game_stats = GameStatsClient(env_endpoint, job_id=env.job_id) 64 | if game_stats.health_check(): 65 | game_stats.reset_all() 66 | print(f"✓ Connected to game server at {env_endpoint}") 67 | else: 68 | game_stats = None 69 | print(f"⚠ Game server not responding at {env_endpoint}") 70 | print(f" Make sure to start: python opentinker/environment/math/code_interpreter_math_server.py --port {args.interaction.config.env_port}") 71 | 72 | # 4. Connect to training server and train 73 | print("\n[4/4] Starting training...") 74 | client = ServiceClient( 75 | server_url=server_url, 76 | project_name=args.project_name, 77 | experiment_name=args.experiment_name, 78 | logger_backends=args.logger_backends, 79 | ) 80 | client.set_config(args, env) 81 | 82 | print(f"\nTraining configuration:") 83 | print(f" - Algorithm: {args.algorithm}") 84 | print(f" - Steps: {args.get('num_steps')}") 85 | print(f" - Epochs: {args.get('num_epochs')}") 86 | print(f" - Batch size: {args.batch_size}") 87 | print(f" - Max turns: {args.multi_turn.max_assistant_turns}") 88 | 89 | try: 90 | final_metrics = client.fit( 91 | env=env, 92 | num_epochs=args.get("num_epochs"), 93 | num_steps=args.get("num_steps"), 94 | save_freq=args.save_freq, 95 | test_freq=args.test_freq, 96 | verbose=True, 97 | validate_before_training=True, 98 | game_stats_client=game_stats, 99 | ) 100 | print(f"\n✓ Training completed!") 101 | print(f"Final metrics: {final_metrics}") 102 | finally: 103 | env.cleanup() 104 | 105 | 106 | if __name__ == "__main__": 107 | main() 108 | -------------------------------------------------------------------------------- /opentinker/client/gomoku_inference.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import hydra 3 | from omegaconf import OmegaConf 4 | 5 | from utils.http_training_client import InferenceSchedulerClient 6 | from utils.scheduler_client_lifecycle import get_lifecycle_manager 7 | from opentinker.environment.inference_pipeline import run_inference 8 | from opentinker.environment.gomoku import GomokuGame 9 | from opentinker.environment.game_stats_client import GameStatsClient 10 | 11 | 12 | @hydra.main(config_path="client_config", config_name="gomoku_inference_scheduler_config.yaml", version_base=None) 13 | def main(args): 14 | """Run gomoku inference with scheduler-managed vLLM server.""" 15 | lifecycle = get_lifecycle_manager() 16 | 17 | print("=" * 60) 18 | print("Gomoku Inference with Scheduler") 19 | print("=" * 60) 20 | 21 | if not args.model_path: 22 | raise ValueError("model_path is required") 23 | 24 | # 1. 
Submit inference job to scheduler 25 | scheduler_client = InferenceSchedulerClient( 26 | scheduler_url=args.get("scheduler_url", "http://localhost:8780"), 27 | api_key=args.get("scheduler_api_key"), 28 | ) 29 | 30 | print(f"Submitting inference job to scheduler...") 31 | job_result = scheduler_client.submit_inference_job( 32 | model_path=args.model_path, 33 | tokenizer_path=args.get("tokenizer_path"), 34 | tensor_parallel_size=args.get("tensor_parallel_size", 1), 35 | num_gpus=args.get("num_gpus"), 36 | gpu_memory_utilization=args.get("gpu_memory_utilization", 0.9), 37 | max_model_len=args.get("max_model_len"), 38 | trust_remote_code=args.get("trust_remote_code", True), 39 | ) 40 | 41 | job_id = job_result["job_id"] 42 | vllm_server_url = job_result["vllm_server_url"] 43 | 44 | # Register job for lifecycle cleanup 45 | lifecycle.register_job(scheduler_client, job_id) 46 | 47 | print(f"✓ Inference job {job_id} started at {vllm_server_url}") 48 | 49 | # 2. Setup GameStatsClient for per-step metrics (with job_id isolation) 50 | game_stats = GameStatsClient(args.env_endpoint, job_id=job_id) 51 | if game_stats.health_check(): 52 | print(f"✓ Connected to game server at {args.env_endpoint}") 53 | game_stats.reset_all() # Reset stats for this job before inference 54 | else: 55 | print(f"⚠ Game server not available at {args.env_endpoint}, continuing without stats") 56 | game_stats = None 57 | 58 | # 3. Run inference using the remote vLLM server 59 | max_user_turns = args.multi_turn.get("max_user_turns", 50) 60 | max_assistant_turns = args.multi_turn.get("max_assistant_turns", 50) 61 | 62 | print(f"Running inference with max_samples={args.get('max_samples', 10)}...") 63 | 64 | results = run_inference( 65 | model_path=None, # Not needed when using vllm_server_url 66 | vllm_server_url=vllm_server_url, 67 | tokenizer_path=args.get("tokenizer_path") or args.model_path, 68 | data_path=args.get("data_path"), # None for dynamic generation 69 | game_class=GomokuGame, 70 | env_endpoint=args.env_endpoint, 71 | job_id=job_id, # Pass job_id for stats isolation 72 | output_path=args.get("output_path"), 73 | temperature=args.temperature, 74 | top_p=args.top_p, 75 | max_tokens=args.max_new_tokens, 76 | max_tokens_per_turn=args.multi_turn.get("max_tokens_per_turn"), 77 | max_samples=args.get("max_samples", 10), 78 | max_user_turns=max_user_turns, 79 | max_assistant_turns=max_assistant_turns, 80 | max_context_length=args.get("max_context_length", 30000), 81 | # GomokuGame kwargs 82 | board_size=args.get("board_size", 9), 83 | ) 84 | 85 | # 4. Log game stats after inference 86 | if game_stats: 87 | stats = game_stats.get_all_stats() 88 | print(f"\nGame Server Stats (job_id={job_id}):") 89 | print(f" Total samples: {stats.get('total_samples', 0)}") 90 | print(f" Games completed: {stats.get('games_in_step', 0)}") 91 | print(f" Win rate: {stats.get('win_rate', 0):.2%}") 92 | print(f" Mean reward: {stats.get('mean_final_reward', 0):.4f}") 93 | 94 | if args.get("output_path"): 95 | print(f"\nResults saved to: {args.output_path}") 96 | 97 | print(f"\n{'='*60}") 98 | print("Inference completed! 
vLLM server will be automatically cleaned up.")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()

-------------------------------------------------------------------------------- /opentinker/docs/SERVER_CONNECTION_FIX.md: --------------------------------------------------------------------------------
# Server Connection Timeout - Solution

## Problem Description

The client fails when trying to connect to the HTTP training server, with the error:
```
RuntimeError: Failed to complete request to set_generation_config after 3 attempts
```

## Root Cause

**Server initialization takes too long**: the HTTP training server needs time to:
1. Start Ray actors
2. Load the large language model onto the GPU
3. Initialize each component (actor, critic, reference model, etc.)

Meanwhile, the client's retry logic is too "impatient":
- Only 3 retries
- Short retry intervals (2s, 4s, 8s)
- About 14 seconds of total waiting

For a server that has to load a large model, this is nowhere near enough!

---

## Implemented Fixes

### 1. ✅ Increase the HTTP client's retry count and delay

**Modified file**: `client/http_training_client.py`

**Changes**:
- `max_retries`: 3 → **10** (more retries)
- `retry_delay`: 2.0s → **5.0s** (longer base delay)
- Added a cap on the exponential backoff: wait at most 60 seconds

**New retry schedule**:
```
Attempt 1:  0s
Attempt 2:  5s   → wait 5s
Attempt 3:  10s  → wait 5s
Attempt 4:  20s  → wait 10s
Attempt 5:  40s  → wait 20s
Attempt 6:  80s  → wait 40s
Attempt 7:  140s → wait 60s (cap)
Attempt 8:  200s → wait 60s
Attempt 9:  260s → wait 60s
Attempt 10: 320s → wait 60s
```

**Total wait time**: ~5 minutes (enough for the server to fully start)
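
A quick sanity check of that schedule (a sketch; the waits are read straight off the table above):

```python
# Waits between consecutive attempts, per the schedule above:
# 5s base, doubling, capped at 60s.
waits = [5, 5, 10, 20, 40, 60, 60, 60, 60]
print(sum(waits))  # 320 -> attempt 10 fires ~320s after attempt 1, about 5 minutes
```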
### 2. ✅ Add a server-readiness wait tool

**New file**: `scripts/wait_for_server.py`

A standalone tool that checks whether the server is ready before the client connects.

**Usage**:
```bash
# Wait for the server to become ready (default timeout: 5 minutes)
python scripts/wait_for_server.py http://localhost:38001

# Custom timeout and polling interval
python scripts/wait_for_server.py http://localhost:38001 600 10
```

**Example output**:
```
⏳ Waiting for server: http://localhost:38001
   Timeout: 300s
   Health endpoint: /api/v1/health

⏳ Attempt 1: connection failed (elapsed: 0s)
⏳ Attempt 2: HTTP 404 (elapsed: 5s)
⏳ Attempt 3: HTTP 200 ✓

✅ Server is ready! (took 47.2s)
```

---

## How to Use

### Method 1: Just run it (recommended)

Since `http_training_client.py` has already been updated, the client now waits much longer automatically:

```bash
python client/custom_client_with_scheduler.py \
    data_path=data/math/train.parquet \
    val_data_path=data/math/test.parquet \
    tokenizer_path=/path/to/tokenizer \
    num_gpus=4
```

The client will automatically:
- Retry 10 times (instead of 3)
- Use an exponential backoff strategy
- Wait up to about 5 minutes

### Method 2: Wait manually (safest)

If the server is particularly slow, wait for it explicitly first:

```bash
# Step 1: Submit the job to the scheduler
# (get server_url from the client output, e.g. http://localhost:38001)

# Step 2: Wait for the server to become ready
python scripts/wait_for_server.py http://localhost:38001 600

# Step 3: Once the server is ready, the client connects automatically
```

### Method 3: Increase the scheduler's server-startup wait

**Modified file**: `scheduler/job_scheduler.py`

Find line 394:
```python
time.sleep(0.5)  # Current: 0.5 seconds
```

Change it to:
```python
time.sleep(10.0)  # Wait 10 seconds for server to initialize
```

This makes the scheduler wait longer after launching a server before returning it to the client.

---

## Troubleshooting

### Check that the server is actually running

```bash
ps aux | grep launch_http_server
```

You should see something like:
```
root 3283256 8.6 0.0 python .../launch_http_server.py server.port=38001 ...
```

### Check that the server port is reachable

```bash
curl http://localhost:38001/api/v1/health
```

**Expected responses**:
- If the server is still initializing: `{"detail":"Not Found"}` or a connection error
- If the server is ready: `{"status": "healthy", ...}`

### Inspect the server logs

Server logs are saved under `/workspace/logs/`:
```bash
# Find the most recent log files
ls -lth /workspace/logs/ | head -5

# Check stderr (the error log)
tail -100 /workspace/logs/job_*_stderr.log
```

### Extend the client timeout

If the server is extremely slow (for example, loading a very large model for the first time), increase the timeouts when creating the client:

```python
# In custom_client_with_scheduler.py
client = ServiceClient(
    server_url=server_url,
    timeout=10000.0,   # Increase to 10000 seconds
    max_retries=15,    # Increase to 15 retries
    retry_delay=10.0   # Increase the base delay to 10 seconds
)
```

---

## Prevention

### 1. Warm up the server

Before submitting the first job, start a server and let it load the model:

```bash
# Start a "warm-up" server
python server/launch_http_server.py server.port=38000

# Wait for the model to finish loading (watch GPU memory grow)
watch -n 1 nvidia-smi

# Later jobs will start faster (the model is already in the cache)
```

### 2. Use a faster startup configuration

If you don't need every component, simplify the configuration to speed up startup:
- Use fewer GPUs
- Use a smaller model
- Disable features you don't need

### 3. Monitor server startup progress

Add a log-watching command:
```bash
# Follow the latest server log in real time
tail -f /workspace/logs/job_*_stderr.log
```

---

## Summary

**Problem**: the client times out while connecting (3 retries, ~14 seconds)
**Cause**: server initialization takes longer than that (loading the model onto the GPU)
**Fix**:
1. ✅ Increase the retry count to 10
2. ✅ Increase the retry delay to 5 seconds
3. ✅ Add the wait_for_server.py tool
4. ✅ Use exponential backoff with a 60-second cap

**The client now waits up to about 5 minutes**, which should be enough for most servers to finish initializing.

-------------------------------------------------------------------------------- /opentinker/setup_cross_node.sh: --------------------------------------------------------------------------------
#!/bin/bash
# Quick setup script for cross-node configuration
# Usage: ./setup_cross_node.sh <scheduler_ip> <env_ip> [scheduler_port] [env_port]

set -e

# Colored output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Check arguments
if [ $# -lt 2 ]; then
    echo -e "${RED}Error: not enough arguments${NC}"
    echo "Usage: $0 <scheduler_ip> <env_ip> [scheduler_port] [env_port]"
    echo ""
    echo "Examples:"
    echo "  $0 192.168.1.100 192.168.1.101"
    echo "  $0 192.168.1.100 192.168.1.101 8766 8084"
    exit 1
fi

SCHEDULER_IP=$1
ENV_IP=$2
SCHEDULER_PORT=${3:-8766}  # Default: 8766
ENV_PORT=${4:-8084}        # Default: 8084

echo -e "${GREEN}========================================${NC}"
echo -e "${GREEN}OpenTinker Cross-Node Setup Tool${NC}"
echo -e "${GREEN}========================================${NC}"
echo ""
echo -e "${YELLOW}Configuration:${NC}"
echo "  Scheduler:   http://${SCHEDULER_IP}:${SCHEDULER_PORT}"
echo "  Environment: http://${ENV_IP}:${ENV_PORT}"
echo ""

# Locate the directory containing this script
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
CLIENT_CONFIG_DIR="${SCRIPT_DIR}/client/client_config"

# Make sure the config directory exists
if [ ! -d "$CLIENT_CONFIG_DIR" ]; then
    echo -e "${RED}Error: config directory does not exist: ${CLIENT_CONFIG_DIR}${NC}"
    echo "Make sure this script is located in the OpenTinker project root"
    exit 1
fi

# Config files to update
CONFIG_FILES=(
    "generic_env_param.yaml"
    "gomoku_param.yaml"
)

# Back up and update each config file
for CONFIG_FILE in "${CONFIG_FILES[@]}"; do
    CONFIG_PATH="${CLIENT_CONFIG_DIR}/${CONFIG_FILE}"

    if [ ! -f "$CONFIG_PATH" ]; then
        echo -e "${YELLOW}Skipping: ${CONFIG_FILE} (file not found)${NC}"
        continue
    fi

    echo -e "${GREEN}Processing: ${CONFIG_FILE}${NC}"

    # Create a backup
    BACKUP_PATH="${CONFIG_PATH}.backup.$(date +%Y%m%d_%H%M%S)"
    cp "$CONFIG_PATH" "$BACKUP_PATH"
    echo "  ✓ Backup created: $(basename $BACKUP_PATH)"

    # Update scheduler_url
    sed -i.tmp "s|scheduler_url:.*|scheduler_url: \"http://${SCHEDULER_IP}:${SCHEDULER_PORT}\"|" "$CONFIG_PATH"

    # Update env_endpoint
    sed -i.tmp "s|env_endpoint:.*|env_endpoint: \"http://${ENV_IP}:${ENV_PORT}\" # Modified by setup script|" "$CONFIG_PATH"

    # Remove sed temp files
    rm -f "${CONFIG_PATH}.tmp"

    echo "  ✓ Config updated"
    echo ""
done

# Verify the changes
echo -e "${GREEN}========================================${NC}"
echo -e "${GREEN}Configuration Check${NC}"
echo -e "${GREEN}========================================${NC}"
echo ""

for CONFIG_FILE in "${CONFIG_FILES[@]}"; do
    CONFIG_PATH="${CLIENT_CONFIG_DIR}/${CONFIG_FILE}"

    if [ ! -f "$CONFIG_PATH" ]; then
        continue
    fi

    echo -e "${YELLOW}${CONFIG_FILE}:${NC}"

    # Show scheduler_url
    SCHEDULER_LINE=$(grep "scheduler_url:" "$CONFIG_PATH" | head -1)
    echo "  ${SCHEDULER_LINE}"

    # Show env_endpoint
    ENV_LINE=$(grep "env_endpoint:" "$CONFIG_PATH" | head -1)
    echo "  ${ENV_LINE}"
    echo ""
done

# Network connectivity test
echo -e "${GREEN}========================================${NC}"
echo -e "${GREEN}Network Connectivity Test${NC}"
echo -e "${GREEN}========================================${NC}"
echo ""

echo -e "${YELLOW}Testing scheduler connectivity...${NC}"
if ping -c 1 -W 2 "$SCHEDULER_IP" &> /dev/null; then
    echo -e "  ${GREEN}✓ Ping ${SCHEDULER_IP} succeeded${NC}"
else
    echo -e "  ${RED}✗ Ping ${SCHEDULER_IP} failed${NC}"
fi

echo ""
echo -e "${YELLOW}Testing environment connectivity...${NC}"
if ping -c 1 -W 2 "$ENV_IP" &> /dev/null; then
    echo -e "  ${GREEN}✓ Ping ${ENV_IP} succeeded${NC}"
else
    echo -e "  ${RED}✗ Ping ${ENV_IP} failed${NC}"
fi

echo ""
echo -e "${GREEN}========================================${NC}"
echo -e "${GREEN}Setup complete!${NC}"
echo -e "${GREEN}========================================${NC}"
echo ""
echo "Next steps:"
echo "  1. Start the scheduler (on node ${SCHEDULER_IP}):"
echo "     cd scheduler && python launch_scheduler.py"
echo ""
echo "  2. Start the environment server (on node ${ENV_IP}):"
echo "     cd environment/example && python mock_env_server.py --port ${ENV_PORT}"
echo ""
echo "  3. Run the client (on the current node):"
echo "     cd client && python generic_env_client.py"
echo ""
echo -e "${YELLOW}Note: remember to fill in the correct scheduler_api_key in generic_env_param.yaml!${NC}"

-------------------------------------------------------------------------------- /opentinker/test_geo3k_data.py: --------------------------------------------------------------------------------
#!/usr/bin/env python3
"""Quick test script to verify Geo3K data loading with vision-language support.

This script tests:
1. Loading parquet data with images
2. VL data generator functionality
3.
Image tensor processing 8 | 9 | Usage: 10 | python test_geo3k_data.py --data_path ~/data/geo3k/train.parquet 11 | """ 12 | 13 | import argparse 14 | from transformers import AutoProcessor 15 | from omegaconf import OmegaConf 16 | 17 | from opentinker.environment.static_data_generator_vl import StaticDatasetGeneratorVL 18 | from opentinker.environment.base_data_generator_vl import DynamicGameDatasetVL 19 | 20 | 21 | def test_geo3k_data(data_path: str, num_samples: int = 5): 22 | """Test Geo3K data loading and processing. 23 | 24 | Args: 25 | data_path: Path to Geo3K parquet file 26 | num_samples: Number of samples to test 27 | """ 28 | print("=" * 60) 29 | print("Testing Geo3K Vision-Language Data Loading") 30 | print("=" * 60) 31 | 32 | # 1. Test static data generator 33 | print("\n1. Testing StaticDatasetGeneratorVL...") 34 | generator = StaticDatasetGeneratorVL( 35 | data_paths=[data_path], 36 | interaction_name="game", 37 | image_key="images", 38 | shuffle=False, 39 | ) 40 | print(f" ✓ Loaded dataset with {len(generator)} samples") 41 | 42 | # Check first sample 43 | sample = generator.generate_sample(0) 44 | print(f" ✓ Sample keys: {sample.keys()}") 45 | print(f" ✓ Prompt type: {type(sample['prompt'])}") 46 | print(f" ✓ Images: {len(sample.get('images', []))} image(s)") 47 | if sample.get('images'): 48 | print(f" ✓ First image type: {type(sample['images'][0])}") 49 | 50 | # 2. Test processor loading 51 | print("\n2. Testing AutoProcessor...") 52 | processor = AutoProcessor.from_pretrained( 53 | "Qwen/Qwen2.5-VL-7B-Instruct", 54 | trust_remote_code=True 55 | ) 56 | print(f" ✓ Loaded processor: {type(processor).__name__}") 57 | 58 | # 3. Test dynamic dataset 59 | print("\n3. Testing DynamicGameDatasetVL...") 60 | config = OmegaConf.create({ 61 | "max_prompt_length": 1024, 62 | "truncation": "right", 63 | "return_raw_chat": True, 64 | }) 65 | 66 | dataset = DynamicGameDatasetVL( 67 | data_generator=generator, 68 | tokenizer=None, 69 | processor=processor, 70 | config=config, 71 | virtual_size=num_samples, 72 | ) 73 | print(f" ✓ Created dataset with {len(dataset)} samples") 74 | 75 | # Test sample fetching 76 | print(f"\n4. 
Testing sample processing (first {num_samples} samples)...") 77 | for i in range(min(num_samples, len(dataset))): 78 | sample = dataset[i] 79 | print(f"\n Sample {i}:") 80 | print(f" - input_ids shape: {sample['input_ids'].shape}") 81 | print(f" - attention_mask shape: {sample['attention_mask'].shape}") 82 | 83 | # Check for image tensors 84 | image_keys = [k for k in sample.keys() if 'pixel' in k or 'image' in k] 85 | if image_keys: 86 | print(f" - Image tensor keys: {image_keys}") 87 | for key in image_keys: 88 | print(f" - {key} shape: {sample[key].shape}") 89 | else: 90 | print(f" - No image tensors found") 91 | 92 | print(f" - data_source: {sample.get('data_source')}") 93 | print(f" - interaction_kwargs: {sample.get('interaction_kwargs', {}).get('name')}") 94 | 95 | print("\n" + "=" * 60) 96 | print("✓ All tests passed!") 97 | print("=" * 60) 98 | 99 | return True 100 | 101 | 102 | def main(): 103 | parser = argparse.ArgumentParser(description="Test Geo3K VL data loading") 104 | parser.add_argument( 105 | "--data_path", 106 | type=str, 107 | default="~/data/geo3k/train.parquet", 108 | help="Path to Geo3K parquet file" 109 | ) 110 | parser.add_argument( 111 | "--num_samples", 112 | type=int, 113 | default=3, 114 | help="Number of samples to test" 115 | ) 116 | 117 | args = parser.parse_args() 118 | 119 | # Expand path 120 | import os 121 | data_path = os.path.expanduser(args.data_path) 122 | 123 | if not os.path.exists(data_path): 124 | print(f"Error: Data file not found: {data_path}") 125 | print(f"\nPlease prepare Geo3K data first:") 126 | print(f" python verl/examples/data_preprocess/geo3k.py --local_save_dir ~/data/geo3k") 127 | return False 128 | 129 | test_geo3k_data(data_path, args.num_samples) 130 | 131 | 132 | if __name__ == "__main__": 133 | main() 134 | -------------------------------------------------------------------------------- /opentinker/client/math_tool_inference.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import hydra 3 | from omegaconf import OmegaConf 4 | 5 | from utils.http_training_client import InferenceSchedulerClient 6 | from utils.scheduler_client_lifecycle import get_lifecycle_manager 7 | from opentinker.environment.inference_pipeline import run_inference 8 | from opentinker.environment.math.math_tool_game import CodeInterpreterMathGame 9 | from opentinker.environment.game_stats_client import GameStatsClient 10 | 11 | 12 | @hydra.main(config_path="client_config", config_name="math_code_interpreter_inference_config.yaml", version_base=None) 13 | def main(args): 14 | """Run math code interpreter inference with scheduler-managed vLLM server.""" 15 | lifecycle = get_lifecycle_manager() 16 | 17 | print("=" * 60) 18 | print("Math Code Interpreter Inference with Scheduler") 19 | print("=" * 60) 20 | 21 | if not args.model_path: 22 | raise ValueError("model_path is required") 23 | if not args.data_path: 24 | raise ValueError("data_path is required") 25 | 26 | # 1. 
Submit inference job to scheduler 27 | scheduler_client = InferenceSchedulerClient( 28 | scheduler_url=args.get("scheduler_url", "http://localhost:8780"), 29 | api_key=args.get("scheduler_api_key"), 30 | ) 31 | 32 | print(f"Submitting inference job to scheduler...") 33 | job_result = scheduler_client.submit_inference_job( 34 | model_path=args.model_path, 35 | tokenizer_path=args.get("tokenizer_path"), 36 | tensor_parallel_size=args.get("tensor_parallel_size", 1), 37 | num_gpus=args.get("num_gpus"), 38 | gpu_memory_utilization=args.get("gpu_memory_utilization", 0.9), 39 | max_model_len=args.get("max_model_len"), 40 | trust_remote_code=args.get("trust_remote_code", True), 41 | ) 42 | 43 | job_id = job_result["job_id"] 44 | vllm_server_url = job_result["vllm_server_url"] 45 | 46 | # Register job for lifecycle cleanup 47 | lifecycle.register_job(scheduler_client, job_id) 48 | 49 | print(f"✓ Inference job {job_id} started at {vllm_server_url}") 50 | 51 | # 2. Setup GameStatsClient for per-step metrics (with job_id isolation) 52 | game_stats = GameStatsClient(args.env_endpoint, job_id=job_id) 53 | if game_stats.health_check(): 54 | print(f"✓ Connected to code interpreter game server at {args.env_endpoint}") 55 | game_stats.reset_all() # Reset stats for this job before inference 56 | else: 57 | print(f"⚠ Game server not available at {args.env_endpoint}, continuing without stats") 58 | print(f" Make sure to start: python opentinker/environment/math/code_interpreter_math_server.py --port 8088") 59 | game_stats = None 60 | 61 | # 3. Run inference using the remote vLLM server 62 | print(f"\nRunning code interpreter inference on {args.data_path}...") 63 | print(f" - Multi-turn: max_user_turns={args.multi_turn.max_user_turns}, max_assistant_turns={args.multi_turn.max_assistant_turns}") 64 | print(f" - Max tokens: {args.max_new_tokens} total, {args.get('max_tokens_per_turn', 'unlimited')} per turn") 65 | 66 | results = run_inference( 67 | model_path=None, # Not needed when using vllm_server_url 68 | vllm_server_url=vllm_server_url, 69 | tokenizer_path=args.get("tokenizer_path") or args.model_path, 70 | data_path=args.data_path, 71 | game_class=CodeInterpreterMathGame, 72 | env_endpoint=args.env_endpoint, 73 | job_id=job_id, # Pass job_id for stats isolation 74 | output_path=args.get("output_path"), 75 | temperature=args.temperature, 76 | top_p=args.top_p, 77 | max_tokens=args.max_new_tokens, 78 | max_tokens_per_turn=args.get("max_tokens_per_turn"), 79 | max_samples=args.get("max_samples"), 80 | max_user_turns=args.multi_turn.max_user_turns, 81 | max_assistant_turns=args.multi_turn.max_assistant_turns, 82 | ) 83 | 84 | # 4. Log game stats after inference 85 | if game_stats: 86 | stats = game_stats.get_all_stats() 87 | print(f"\nCode Interpreter Stats (job_id={job_id}):") 88 | print(f" Total samples: {stats.get('total_samples', 0)}") 89 | print(f" Games completed: {stats.get('games_in_step', 0)}") 90 | print(f" Mean reward: {stats.get('mean_final_reward', 0):.4f}") 91 | print(f" Code executions: {stats.get('code_executions', 'N/A')}") 92 | 93 | if args.get("output_path"): 94 | print(f"\nResults saved to: {args.output_path}") 95 | 96 | print(f"\n{'='*60}") 97 | print("Inference completed! 
vLLM server will be automatically cleaned up.") 98 | print(f"{'='*60}") 99 | 100 | 101 | if __name__ == "__main__": 102 | main() 103 | -------------------------------------------------------------------------------- /opentinker/client/geo3k_tool_rl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Geo3K Multi-Turn Vision-Language RL Training Client. 3 | 4 | This script launches Geo3K geometry problem training using vision-language models 5 | with multi-turn verification. The model can submit answers and receive feedback 6 | before giving the final answer. 7 | 8 | Usage: 9 | # First, start the scheduler: 10 | bash opentinker/scripts/launch_scheduler.sh 11 | 12 | # Then start the game server: 13 | python opentinker/environment/geo3k/geo3k_tool_server.py --port 8088 14 | 15 | # Finally, run this training script: 16 | python opentinker/client/geo3k_tool_rl.py 17 | """ 18 | 19 | import hydra 20 | from omegaconf import OmegaConf 21 | 22 | from opentinker.client.utils.http_training_client import ServiceClient, SchedulerClient 23 | from opentinker.environment.geo3k import Geo3KToolEnvironment 24 | from opentinker.environment.game_stats_client import GameStatsClient 25 | from opentinker.client.utils.utils import resolve_paths_in_config 26 | from opentinker.client.utils.scheduler_client_lifecycle import get_lifecycle_manager 27 | 28 | 29 | @hydra.main(config_path="client_config", config_name="geo3k_tool_param.yaml") 30 | def main(args): 31 | args = resolve_paths_in_config(args) 32 | lifecycle = get_lifecycle_manager() 33 | 34 | print("=" * 60) 35 | print("Geo3K Multi-Turn Vision-Language Training") 36 | print("=" * 60) 37 | 38 | # 1. Submit job to scheduler 39 | print("\n[1/4] Submitting job to scheduler...") 40 | scheduler_client = SchedulerClient( 41 | scheduler_url=args.get("scheduler_url", "http://localhost:8780"), 42 | api_key=args.get("scheduler_api_key") 43 | ) 44 | 45 | job_result = scheduler_client.submit_job( 46 | config=OmegaConf.to_container(args, resolve=True), 47 | enable_agent_loop=True, 48 | wandb_key=args.get("wandb_key"), 49 | num_gpus=args.get("num_gpus"), 50 | ) 51 | 52 | job_id = job_result["job_id"] 53 | server_url = job_result["server_url"] 54 | lifecycle.register_job(scheduler_client, job_id) 55 | 56 | print(f"✓ Job {job_id} allocated at {server_url}") 57 | 58 | # 2. Setup Geo3K multi-turn VL environment 59 | print("\n[2/4] Setting up environment...") 60 | env_endpoint = args.interaction.config.env_endpoint 61 | 62 | max_retries = args.multi_turn.get("max_assistant_turns", 3) - 1 # -1 for initial attempt 63 | env = Geo3KToolEnvironment( 64 | config=args, 65 | data_paths=[args.data_path], 66 | val_data_paths=[args.val_data_path] if args.val_data_path else None, 67 | job_id=job_id, 68 | max_retries=max_retries, 69 | ) 70 | print(f"✓ Geo3K multi-turn VL environment created") 71 | print(f" - Interaction config: {env.get_interaction_config_path()}") 72 | print(f" - Max retries: {max_retries}") 73 | 74 | # 3. Setup game stats client 75 | print("\n[3/4] Connecting to game server...") 76 | game_stats = GameStatsClient(env_endpoint, job_id=env.job_id) 77 | if game_stats.health_check(): 78 | game_stats.reset_all() 79 | print(f"✓ Connected to game server at {env_endpoint}") 80 | else: 81 | game_stats = None 82 | print(f"⚠ Game server not responding at {env_endpoint}") 83 | print(f" Make sure to start: python opentinker/environment/geo3k/geo3k_tool_server.py --port {args.interaction.config.env_port}") 84 | 85 | # 4. 
Connect to training server and train
    print("\n[4/4] Starting training...")
    client = ServiceClient(
        server_url=server_url,
        project_name=args.project_name,
        experiment_name=args.experiment_name,
        logger_backends=args.logger_backends,
    )
    client.set_config(args, env)

    print("\nTraining configuration:")
    print(f"  - Algorithm: {args.algorithm}")
    print(f"  - Epochs: {args.get('num_epochs')}")
    print(f"  - Batch size: {args.batch_size}")
    print(f"  - Max assistant turns: {args.multi_turn.max_assistant_turns}")
    print(f"  - ADV estimator: {args.adv_estimator}")
    print(f"  - Rollout N: {args.rollout_n}")

    try:
        final_metrics = client.fit(
            env=env,
            num_epochs=args.get("num_epochs"),
            num_steps=args.get("num_steps"),
            save_freq=args.save_freq,
            test_freq=args.test_freq,
            verbose=True,
            validate_before_training=True,
            game_stats_client=game_stats,
        )
        print("\n✓ Training completed!")
        print(f"Final metrics: {final_metrics}")
    finally:
        env.cleanup()


if __name__ == "__main__":
    main()

-------------------------------------------------------------------------------- /opentinker/environment/math/math_env.py: --------------------------------------------------------------------------------
from omegaconf import OmegaConf
from transformers import AutoTokenizer
from torchdata.stateful_dataloader import StatefulDataLoader

from opentinker.environment.base_game_environment import GameEnvironment
from opentinker.environment.base_data_generator import DynamicGameDataset, collate_fn
from opentinker.environment.math import MathGame
from opentinker.environment.static_data_generator import StaticDatasetGenerator
from verl.trainer.main_ppo import create_rl_sampler


class MathGameEnvironment(GameEnvironment):
    """GameEnvironment for static dataset math problems."""

    def __init__(self, game_class, config, data_paths, val_data_paths=None, game_kwargs=None, job_id=None):
        self.data_paths = [data_paths] if isinstance(data_paths, str) else list(data_paths)
        self.val_data_paths = [val_data_paths] if isinstance(val_data_paths, str) else (list(val_data_paths) if val_data_paths else None)
        super().__init__(game_class=game_class, config=config, game_kwargs=game_kwargs or {}, job_id=job_id)

    def _setup_dataloader(self):
        """Use StaticDatasetGenerator for a static dataset."""
        tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
        tokenizer.padding_side = "left"
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        dataset_config = OmegaConf.create({
            "max_prompt_length": self.config.max_prompt_tokens,
            "truncation": "right",
            "return_raw_chat": True,
        })

        math_game_for_prompt = MathGame()

        # Training data generator
        train_generator = StaticDatasetGenerator(
            data_paths=self.data_paths,
interaction_name=self.interaction_name, 46 | prompt_key="prompt", 47 | ground_truth_key="ground_truth", 48 | shuffle=True, 49 | system_prompt=math_game_for_prompt.get_system_prompt(), 50 | ) 51 | 52 | batch_size = self.config.batch_size 53 | num_steps = getattr(self.config, 'num_steps', None) 54 | virtual_size = num_steps * batch_size if num_steps else len(train_generator) * getattr(self.config, 'num_epochs', 1) 55 | 56 | train_dataset = DynamicGameDataset(train_generator, tokenizer, dataset_config, virtual_size=virtual_size) 57 | 58 | sampler_config = OmegaConf.create({ 59 | "shuffle": True, 60 | "seed": 42, 61 | "sampler": None, 62 | }) 63 | train_sampler = create_rl_sampler(sampler_config, train_dataset) 64 | 65 | 66 | self.train_dataloader = StatefulDataLoader(train_dataset, batch_size=batch_size, shuffle=False, 67 | sampler=train_sampler, 68 | num_workers=getattr(self.config, 'num_workers', 0), 69 | collate_fn=collate_fn, drop_last=True) 70 | print(f"Training dataloader: {len(self.train_dataloader)} batches") 71 | 72 | # Validation data generator - sample exactly val_batch_size samples, keep fixed 73 | if self.val_data_paths: 74 | val_generator = StaticDatasetGenerator( 75 | data_paths=self.val_data_paths, 76 | interaction_name=self.interaction_name, 77 | prompt_key="prompt", 78 | ground_truth_key="ground_truth", 79 | shuffle=False, # No shuffle - keep samples fixed 80 | seed=42, 81 | system_prompt=math_game_for_prompt.get_system_prompt(), 82 | ) 83 | val_batch_size = getattr(self.config, 'val_batch_size', min(64, len(val_generator))) 84 | # Use val_batch_size as virtual_size to sample exactly that many samples 85 | val_dataset = DynamicGameDataset(val_generator, tokenizer, dataset_config, 86 | virtual_size=val_batch_size, seed=42) 87 | self.val_dataloader = StatefulDataLoader(val_dataset, batch_size=val_batch_size, shuffle=False, 88 | num_workers=getattr(self.config, 'num_workers', 0), 89 | collate_fn=collate_fn, drop_last=False) 90 | print(f"Validation dataloader: {val_batch_size} fixed samples in {len(self.val_dataloader)} batch(es)") 91 | 92 | -------------------------------------------------------------------------------- /opentinker/server/config/data/legacy_data.yaml: -------------------------------------------------------------------------------- 1 | # Tokenizer class or path. If null, it will be inferred from the model. 2 | tokenizer: null 3 | 4 | # Whether to use shared memory for data loading. 5 | use_shm: False 6 | 7 | # Training set parquet. Can be a list or a single file. 8 | # The program will read all files into memory, so it can't be too large (< 100GB). 9 | # The path can be either a local path or an HDFS path. 10 | # For HDFS path, we provide utils to download it to DRAM and convert it to a local path. 11 | train_files: ~/data/rlhf/gsm8k/train.parquet 12 | 13 | # Validation parquet. Can be a list or a single file. 14 | val_files: ~/data/rlhf/gsm8k/test.parquet 15 | 16 | # Maximum sample length to be used. 17 | # Set to -1 to use full dataset, otherwise, randomly 18 | # select the specified number of samples from train dataset 19 | train_max_samples: -1 20 | 21 | # Maximum sample length to be used. 22 | # Set to -1 to use full dataset, otherwise, randomly 23 | # select the specified number of samples from val dataset 24 | val_max_samples: 100 25 | 26 | # The field in the dataset where the prompt is located. Default is 'prompt'. 27 | prompt_key: prompt 28 | 29 | # The field used to select the reward function (if using different ones per example). 
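# For example, a row whose data_source field is "openai/gsm8k" (an illustrative
# value) would be scored by the reward function associated with "openai/gsm8k".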
30 | reward_fn_key: data_source 31 | 32 | # Maximum prompt length. All prompts will be left-padded to this length. 33 | # An error will be reported if the length is too long. 34 | # oc.select: default val for rollout.prompt_length 35 | max_prompt_length: 512 36 | 37 | # Maximum response length. Rollout in RL algorithms (e.g. PPO) generates up to this length. 38 | # oc.select: default val for rollout.response_length 39 | max_response_length: 512 40 | 41 | # Batch size sampled for one training iteration of different RL algorithms. 42 | train_batch_size: 1024 43 | 44 | # Batch size used during validation. Can be null. 45 | val_batch_size: null 46 | 47 | # use tool config to calculate true prompt length 48 | tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, null} 49 | 50 | # Whether to return the original input_ids without adding chat template. 51 | # This is used when the reward model's chat template differs from the policy. 52 | # If using a model-based RM with different templates, this should be True. 53 | return_raw_input_ids: False 54 | 55 | # Whether to return the original chat (prompt) without applying chat template. 56 | return_raw_chat: False 57 | 58 | # Whether to return the full prompt with chat template. 59 | return_full_prompt: False 60 | 61 | # Whether to shuffle the data in the dataloader. 62 | shuffle: True 63 | 64 | # Seed to use when shuffling the data 65 | seed: null 66 | 67 | # num dataloader workers 68 | dataloader_num_workers: 8 69 | 70 | # image patch size 71 | image_patch_size: 14 72 | 73 | # Whether to shuffle the validation set. 74 | validation_shuffle: False 75 | 76 | # Whether to filter overlong prompts. 77 | filter_overlong_prompts: False 78 | 79 | # Number of workers for filtering overlong prompts. 80 | # For large-scale datasets, filtering can be time-consuming. 81 | # Use multiprocessing to speed up. Default is 1. 82 | filter_overlong_prompts_workers: 1 83 | 84 | # Truncate the input_ids or prompt if they exceed max_prompt_length. 85 | # Options: 'error', 'left', 'right', 'middle'. Default is 'error'. 86 | truncation: error 87 | 88 | # The field in the multi-modal dataset where the image is located. Default is 'images'. 89 | image_key: images 90 | 91 | # The field in the multi-modal dataset where the video is located. 92 | video_key: videos 93 | 94 | # If the remote tokenizer has a Python file, this flag determines whether to allow using it. 95 | trust_remote_code: False 96 | 97 | # Optional: specify a custom dataset class path and name if overriding default loading behavior. 98 | custom_cls: 99 | 100 | # The path to the file containing your customized dataset class. If not specified, pre-implemented dataset will be used. 101 | path: null 102 | 103 | # The name of the dataset class within the specified file. 104 | name: null 105 | 106 | # Whether to return multi-modal inputs in the dataset. Set to False if rollout generates new multi-modal inputs. 107 | return_multi_modal_inputs: True 108 | 109 | # settings related to data sampler 110 | sampler: 111 | 112 | # the path to the module containing a curriculum class which implements the 113 | # AbstractSampler interface 114 | class_path: null 115 | 116 | # the name of the curriculum class like `MySampler` 117 | class_name: null 118 | 119 | # Data generation configuration for augmenting the dataset. 120 | datagen: 121 | 122 | # The path to the file containing your customized data generation class. 123 | # E.g. 
'pkg://verl.experimental.dynamic_dataset.dynamicgen_dataset' 124 | path: null 125 | 126 | # The class name of the data generation class within the specified file. 127 | # E.g. 'MockDataGenerator' 128 | name: null 129 | 130 | # Additional kwargs when calling tokenizer.apply_chat_template 131 | apply_chat_template_kwargs: {} 132 | -------------------------------------------------------------------------------- /opentinker/environment/environment.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Environment API for PPO Training 4 | 5 | Provides abstract base class and concrete implementation for configuring 6 | dataloader and reward functions in PPO training. 7 | """ 8 | 9 | from abc import ABC, abstractmethod 10 | from dataclasses import dataclass 11 | from typing import Any, Dict, Optional, Callable 12 | import inspect 13 | from transformers import AutoTokenizer 14 | from omegaconf import OmegaConf 15 | 16 | # Note(Siqi): 17 | # ImportError: cannot import name 'ServiceClient' 18 | # from partially initialized module 'http_training_client' 19 | # (most likely due to a circular import) 20 | 21 | from verl.utils.dataset.rl_dataset import collate_fn 22 | from torchdata.stateful_dataloader import StatefulDataLoader 23 | from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler 24 | from opentinker.client.utils.utils import prepare_dataset, verify_raw_prompt_format 25 | 26 | @dataclass 27 | class RewardFunctionSpec: 28 | """Specification for reward function configuration. 29 | 30 | Supports three types: 31 | - "config": Load from Python file (path + function name) 32 | - "remote": Call remote API endpoint (future) 33 | - "code": Upload custom Python function to server 34 | """ 35 | type: str # "config", "remote", or "code" 36 | 37 | # For type="config" 38 | config_path: Optional[str] = None 39 | config_name: Optional[str] = None 40 | config_kwargs: Optional[Dict[str, Any]] = None 41 | 42 | # For type="remote" (future) 43 | remote_endpoint: Optional[str] = None 44 | remote_api_key: Optional[str] = None 45 | 46 | # For type="code" 47 | code_function: Optional[Callable] = None 48 | code_source: Optional[str] = None 49 | 50 | def __post_init__(self): 51 | """Validate configuration and extract source code if needed.""" 52 | if self.type not in ["config", "remote", "code"]: 53 | raise ValueError(f"Invalid reward function type: {self.type}. 
Must be 'config', 'remote', or 'code'") 54 | 55 | if self.type == "config": 56 | if not self.config_path or not self.config_name: 57 | raise ValueError("config_path and config_name are required for type='config'") 58 | 59 | elif self.type == "remote": 60 | if not self.remote_endpoint: 61 | raise ValueError("remote_endpoint is required for type='remote'") 62 | 63 | elif self.type == "code": 64 | if not self.code_function: 65 | raise ValueError("code_function is required for type='code'") 66 | 67 | # Auto-extract source code if not provided 68 | if self.code_source is None: 69 | try: 70 | self.code_source = inspect.getsource(self.code_function) 71 | except (OSError, TypeError) as e: 72 | raise ValueError(f"Could not extract source code from function: {e}") 73 | 74 | def to_config_dict(self) -> Dict[str, Any]: 75 | """Convert to configuration dictionary for server.""" 76 | if self.type == "config": 77 | config = { 78 | "type": "config", 79 | "config_path": self.config_path, 80 | "config_name": self.config_name, 81 | } 82 | if self.config_kwargs: 83 | config["config_kwargs"] = self.config_kwargs 84 | return config 85 | 86 | elif self.type == "remote": 87 | config = { 88 | "type": "remote", 89 | "remote_endpoint": self.remote_endpoint, 90 | } 91 | if self.remote_api_key: 92 | config["remote_api_key"] = self.remote_api_key 93 | return config 94 | 95 | elif self.type == "code": 96 | # Use 'name' field to match server config schema (not 'function_name') 97 | return { 98 | "type": "code", 99 | "name": self.code_function.__name__, 100 | } 101 | 102 | return {} 103 | 104 | 105 | class BaseEnvironment(ABC): 106 | """Abstract base class for PPO training environments. 107 | 108 | Subclasses must implement: 109 | - setup(client): Configure the environment on the server 110 | - dataloader property: Return the training dataloader 111 | - get_config(): Return configuration dict for server 112 | """ 113 | 114 | @abstractmethod 115 | def setup(self, client): 116 | """Setup environment on the server. 117 | 118 | Args: 119 | client: HTTPTrainingClient or ServiceClient instance 120 | """ 121 | pass 122 | 123 | @abstractmethod 124 | def get_dataloader(self): 125 | """Return the training dataloader.""" 126 | pass 127 | 128 | @abstractmethod 129 | def get_config(self) -> Dict[str, Any]: 130 | """Return configuration dictionary for server.""" 131 | pass -------------------------------------------------------------------------------- /opentinker/server/config/algorithm.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Bytedance Ltd. and/or its affiliates 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass, field 16 | from typing import Any, Optional 17 | 18 | from verl.base_config import BaseConfig 19 | 20 | __all__ = ["AlgoConfig", "FilterGroupsConfig", "KLControlConfig"] 21 | 22 | 23 | @dataclass 24 | class KLControlConfig(BaseConfig): 25 | """Configuration for KL control. 
26 | 27 | The inheritance from BaseConfig provides omegaconf.DictConfig-like interface for a dataclass config. 28 | 29 | Args: 30 | type (str): Type of KL control. Can be "fixed" or "adaptive". 31 | kl_coef (float): Initial coefficient for KL penalty. 32 | horizon (int): Horizon value for adaptive controller. 33 | target_kl (float): Target KL divergence for adaptive controller. 34 | """ 35 | 36 | type: str = "fixed" 37 | kl_coef: float = 0.001 38 | horizon: int = 10000 39 | target_kl: float = 0.1 40 | 41 | 42 | @dataclass 43 | class FilterGroupsConfig(BaseConfig): 44 | """Configuration for filter groups (used in DAPO and Entropy). 45 | 46 | The inheritance from BaseConfig provides omegaconf.DictConfig-like interface for a dataclass config. 47 | 48 | Args: 49 | enable (bool): Whether to enable filter groups. 50 | metric (Optional[str]): Metric to use for filtering: "acc", "score", "seq_reward", "seq_final_reward", etc. 51 | max_num_gen_batches (int): Non-positive values mean no upper limit. 52 | """ 53 | 54 | enable: bool = False 55 | metric: Optional[str] = None 56 | max_num_gen_batches: int = 0 57 | 58 | 59 | @dataclass 60 | class AlgoConfig(BaseConfig): 61 | """Configuration for the algorithm. 62 | 63 | The inheritance from BaseConfig provides omegaconf.DictConfig-like interface for a dataclass config. 64 | 65 | Args: 66 | gamma (float): Discount factor for future rewards. 67 | lam (float): Trade-off between bias and variance in the GAE estimator. 68 | adv_estimator (str): Advantage estimator type: "gae", "grpo", "reinforce_plus_plus", etc. 69 | norm_adv_by_std_in_grpo (bool): Whether to normalize advantages by std (specific to GRPO). 70 | use_kl_in_reward (bool): Whether to enable in-reward KL penalty. 71 | kl_penalty (str): How to estimate KL divergence: "kl", "abs", "mse", "low_var_kl", or "full". 72 | kl_ctrl (KLControlConfig): KL control configuration. 73 | use_pf_ppo (bool): Whether to enable preference feedback PPO. 74 | pf_ppo (dict[str, Any]): Preference feedback PPO settings. 75 | filter_groups (Optional[FilterGroupsConfig]): Filter groups configuration, used in DAPO and Entropy 76 | rollout_is_threshold (Optional[float]): Upper threshold for IS weights. null = disabled, 77 | float value = enabled (compute weights and metrics). This is the main on/off switch. 78 | rollout_is_threshold_lower (Optional[float]): Lower threshold for IS weights. If None, defaults to 1/upper. 79 | rollout_is_level (str): Aggregation level: "token", "sequence", or "geometric". 80 | rollout_is_mode (str): Bounding mode: "truncate" (cap upper only) or "mask" (zero outside bounds). 81 | rollout_is_veto_threshold (float or None): Per-token veto threshold for catastrophic outliers. None to disable. 82 | rollout_is (bool): Whether to apply IS weights to policy loss. True = apply weights, 83 | False = compute metrics only (useful for monitoring before enabling correction). Default: False. 
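
    Example (a minimal construction sketch; the values shown are illustrative,
    not recommended defaults):

        >>> cfg = AlgoConfig(
        ...     adv_estimator="grpo",
        ...     use_kl_in_reward=True,
        ...     kl_ctrl=KLControlConfig(type="adaptive", target_kl=0.05),
        ... )
        >>> cfg.gamma  # unset fields keep their defaults
        1.0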
    """

    gamma: float = 1.0
    lam: float = 1.0
    adv_estimator: str = "gae"
    norm_adv_by_std_in_grpo: bool = True
    use_kl_in_reward: bool = False
    kl_penalty: str = "kl"
    kl_ctrl: KLControlConfig = field(default_factory=KLControlConfig)
    use_pf_ppo: bool = False
    pf_ppo: dict[str, Any] = field(default_factory=dict)
    filter_groups: Optional[FilterGroupsConfig] = None
    # Rollout Importance Sampling
    # Controls computation of IS weights and mismatch metrics
    rollout_is_threshold: Optional[float] = None  # null = disabled, float = enabled
    rollout_is_threshold_lower: Optional[float] = None
    rollout_is_level: str = "token"
    rollout_is_mode: str = "truncate"
    rollout_is_veto_threshold: Optional[float] = None
    # Controls whether to apply IS weights to policy loss (only if rollout_is_threshold is set)
    # True = apply weights to loss, False = compute metrics only (no weight application)
    rollout_is: bool = False

-------------------------------------------------------------------------------- /opentinker/environment/math/math_tool_server.py: --------------------------------------------------------------------------------
#!/usr/bin/env python3
"""Code Interpreter Math Environment Server.

This script starts a game server for math problem solving with a code interpreter.
It also manages a sandbox server for Python code execution.

The server handles:
- /reset: Initialize a new math problem session
- /step: Process the LLM response, extract and execute code, return results

Usage:
    # Start with an auto-managed sandbox (recommended):
    python math_tool_server.py --port 8088

    # Start with an external sandbox:
    python math_tool_server.py --port 8088 --sandbox-url http://localhost:8000/run_code

    # For multi-worker mode (production):
    # First start the sandbox separately, then:
    uvicorn opentinker.environment.math.math_tool_server:app \\
        --host 0.0.0.0 --port 8088 --workers 4
"""

import argparse
import atexit
import time

from opentinker.environment.base_game_server import run_game_server, create_game_app
from opentinker.environment.math.math_tool_game import CodeInterpreterMathGame


# Global sandbox reference for cleanup
_sandbox_actor = None
_sandbox_url = None


def start_sandbox_background() -> str:
    """Start the sandbox server in the background and return its URL.

    Returns:
        URL of the sandbox server
    """
    global _sandbox_actor, _sandbox_url

    import ray
    from opentinker.server.sandbox import Sandbox

    # Initialize Ray if needed
    if not ray.is_initialized():
        ray.init(ignore_reinit_error=True)

    # Create and start sandbox
    _sandbox_actor = Sandbox.remote()
    ray.get(_sandbox_actor.start_server.remote())

    # Wait for server to be ready
    time.sleep(0.5)

    # Get address
    address = ray.get(_sandbox_actor.get_server_address.remote())
    _sandbox_url = f"http://{address}/run_code"

    print(f"✓ Sandbox server started at {_sandbox_url}")
    return _sandbox_url


def cleanup_sandbox():
    """Clean up the sandbox server on exit."""
    global _sandbox_actor
    if _sandbox_actor is not None:
        try:
            import ray
            ray.kill(_sandbox_actor)
            print("✓ Sandbox server stopped")
        except Exception:
            # Best-effort cleanup; Ray may already be shut down at exit time.
            pass


# Register cleanup
atexit.register(cleanup_sandbox)
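
# --- Optional manual smoke test (a sketch, kept commented out) ---------------
# The JSON payload below is an assumption about the sandbox's /run_code schema,
# not a documented contract; check the Sandbox actor before relying on it.
#
#   import requests
#   url = start_sandbox_background()   # -> http://<host>:<port>/run_code
#   resp = requests.post(url, json={"code": "print(1 + 1)"}, timeout=30)
#   print(resp.status_code, resp.text)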


def create_app_with_sandbox(sandbox_url: str):
    """Create a FastAPI app with the sandbox URL configured.

    Args:
        sandbox_url: URL of the sandbox server

    Returns:
        FastAPI app
    """
    # Create a game-class factory bound to sandbox_url
    def game_factory(**kwargs):
        return CodeInterpreterMathGame(sandbox_url=sandbox_url, **kwargs)

    return create_game_app(game_class=game_factory)


def main():
    parser = argparse.ArgumentParser(description="Code Interpreter Math Game Server")
    parser.add_argument("--host", default="0.0.0.0", help="Server host")
    parser.add_argument("--port", type=int, default=8088, help="Server port")
    parser.add_argument("--sandbox-url", type=str, default=None,
                        help="External sandbox URL. If not provided, starts an internal sandbox.")
    parser.add_argument("--max-turns", type=int, default=10,
                        help="Maximum turns per problem")
    parser.add_argument("--timeout", type=int, default=30,
                        help="Sandbox execution timeout in seconds")
    args = parser.parse_args()

    print("\n" + "=" * 60)
    print("Code Interpreter Math Game Server")
    print("=" * 60)

    # Determine sandbox URL
    if args.sandbox_url:
        sandbox_url = args.sandbox_url
        print(f"Using external sandbox at: {sandbox_url}")
    else:
        print("Starting internal sandbox server...")
        sandbox_url = start_sandbox_background()

    print("\nServer configuration:")
    print(f"  Host: {args.host}")
    print(f"  Port: {args.port}")
    print(f"  Sandbox URL: {sandbox_url}")
    print(f"  Max turns: {args.max_turns}")
    print(f"  Timeout: {args.timeout}s")
    print("=" * 60 + "\n")

    # Run game server with configured sandbox
    run_game_server(
        game_class=CodeInterpreterMathGame,
        host=args.host,
        port=args.port,
        sandbox_url=sandbox_url,
        max_turns=args.max_turns,
        timeout=args.timeout,
    )


# For uvicorn multi-worker mode, create the app with a default sandbox URL.
# Usage: set the SANDBOX_URL env var before running uvicorn.
import os

_default_sandbox_url = os.environ.get("SANDBOX_URL", "http://localhost:8000/run_code")
app = create_game_app(
    game_class=CodeInterpreterMathGame,
    sandbox_url=_default_sandbox_url,
)


if __name__ == "__main__":
    main()

-------------------------------------------------------------------------------- /opentinker/environment/math/math_tool_env.py: --------------------------------------------------------------------------------
from omegaconf import OmegaConf
from transformers import AutoTokenizer
from torchdata.stateful_dataloader import StatefulDataLoader

from opentinker.environment.base_game_environment import GameEnvironment
from opentinker.environment.base_data_generator import DynamicGameDataset, collate_fn
from opentinker.environment.static_data_generator import StaticDatasetGenerator
from verl.trainer.main_ppo import create_rl_sampler
from opentinker.environment.math.math_tool_game import CodeInterpreterMathGame


class MathCodeInterpreterEnvironment(GameEnvironment):
    """GameEnvironment for math with a code interpreter.

    Uses agent_loop (GenericAgentLoop) with GymEnvironmentInteraction.
    The game server handles code execution internally.
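
    Example (a minimal sketch mirroring the math_tool_rl.py client above;
    ``args`` is a resolved Hydra config and ``job_id`` comes from the scheduler):

        env = MathCodeInterpreterEnvironment(
            game_class=CodeInterpreterMathGame,
            config=args,
            data_paths=[args.data_path],
            val_data_paths=[args.val_data_path] if args.val_data_path else None,
            job_id=job_id,
        )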
24 | """ 25 | 26 | def __init__( 27 | self, 28 | game_class, 29 | config, 30 | data_paths, 31 | val_data_paths=None, 32 | game_kwargs=None, 33 | job_id=None, 34 | ): 35 | self.data_paths = [data_paths] if isinstance(data_paths, str) else list(data_paths) 36 | self.val_data_paths = [val_data_paths] if isinstance(val_data_paths, str) else (list(val_data_paths) if val_data_paths else None) 37 | super().__init__( 38 | game_class=game_class, 39 | config=config, 40 | game_kwargs=game_kwargs or {}, 41 | job_id=job_id 42 | ) 43 | 44 | def _setup_dataloader(self): 45 | """Use StaticDatasetGenerator for static dataset.""" 46 | tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path) 47 | tokenizer.padding_side = "left" 48 | if tokenizer.pad_token is None: 49 | tokenizer.pad_token = tokenizer.eos_token 50 | 51 | dataset_config = OmegaConf.create({ 52 | "max_prompt_length": self.config.max_prompt_tokens, 53 | "truncation": "right", 54 | "return_raw_chat": True, 55 | }) 56 | 57 | # Use CodeInterpreterMathGame for system prompt 58 | math_game_for_prompt = CodeInterpreterMathGame() 59 | 60 | # Training data generator 61 | train_generator = StaticDatasetGenerator( 62 | data_paths=self.data_paths, 63 | interaction_name=self.interaction_name, 64 | prompt_key="prompt", 65 | ground_truth_key="ground_truth", 66 | shuffle=True, 67 | system_prompt=math_game_for_prompt.get_system_prompt(), 68 | ) 69 | 70 | batch_size = self.config.batch_size 71 | num_steps = getattr(self.config, 'num_steps', None) 72 | virtual_size = num_steps * batch_size if num_steps else len(train_generator) * getattr(self.config, 'num_epochs', 1) 73 | 74 | train_dataset = DynamicGameDataset(train_generator, tokenizer, dataset_config, virtual_size=virtual_size) 75 | 76 | sampler_config = OmegaConf.create({ 77 | "shuffle": True, 78 | "seed": 42, 79 | "sampler": None, 80 | }) 81 | train_sampler = create_rl_sampler(sampler_config, train_dataset) 82 | 83 | self.train_dataloader = StatefulDataLoader( 84 | train_dataset, 85 | batch_size=batch_size, 86 | shuffle=False, 87 | sampler=train_sampler, 88 | num_workers=getattr(self.config, 'num_workers', 0), 89 | collate_fn=collate_fn, 90 | drop_last=True 91 | ) 92 | print(f"Training dataloader: {len(self.train_dataloader)} batches") 93 | 94 | # Validation data generator 95 | if self.val_data_paths: 96 | val_generator = StaticDatasetGenerator( 97 | data_paths=self.val_data_paths, 98 | interaction_name=self.interaction_name, 99 | prompt_key="prompt", 100 | ground_truth_key="ground_truth", 101 | shuffle=False, 102 | seed=42, 103 | system_prompt=math_game_for_prompt.get_system_prompt(), 104 | ) 105 | val_batch_size = getattr(self.config, 'val_batch_size', min(64, len(val_generator))) 106 | val_dataset = DynamicGameDataset( 107 | val_generator, tokenizer, dataset_config, 108 | virtual_size=val_batch_size, seed=42 109 | ) 110 | self.val_dataloader = StatefulDataLoader( 111 | val_dataset, 112 | batch_size=val_batch_size, 113 | shuffle=False, 114 | num_workers=getattr(self.config, 'num_workers', 0), 115 | collate_fn=collate_fn, 116 | drop_last=False 117 | ) 118 | print(f"Validation dataloader: {val_batch_size} fixed samples in {len(self.val_dataloader)} batch(es)") 119 | 120 | --------------------------------------------------------------------------------