├── .github └── workflows │ └── CI-runpod_dep.yml ├── .gitignore ├── .gitmodules ├── .runpod ├── hub.json └── tests.json ├── Dockerfile ├── LICENSE ├── README.md ├── builder └── requirements.txt ├── docker-bake.hcl ├── media └── ui_demo.gif ├── src ├── __init__.py ├── constants.py ├── download_model.py ├── engine.py ├── engine_args.py ├── handler.py ├── tokenizer.py └── utils.py └── worker-config.json /.github/workflows/CI-runpod_dep.yml: -------------------------------------------------------------------------------- 1 | name: CI | Update runpod package version 2 | 3 | on: 4 | repository_dispatch: 5 | types: [python-package-release] 6 | 7 | push: 8 | branches: ["main"] 9 | 10 | workflow_dispatch: 11 | 12 | jobs: 13 | check_dep: 14 | runs-on: ubuntu-latest 15 | name: Check python requirements file and update 16 | steps: 17 | - name: Checkout 18 | uses: actions/checkout@v2 19 | 20 | - name: Check for new package version and update 21 | run: | 22 | echo "Fetching the current runpod version from requirements.txt..." 23 | 24 | # Get current version, allowing both == and ~= in the search pattern 25 | current_version=$(grep -oP 'runpod[~=]{1,2}\K[^"]+' ./builder/requirements.txt) 26 | echo "Current version: $current_version" 27 | 28 | # Extract major and minor from current version 29 | current_major_minor=$(echo $current_version | cut -d. -f1,2) 30 | echo "Current major.minor: $current_major_minor" 31 | 32 | echo "Fetching the latest runpod version from PyPI..." 33 | 34 | # Get new version from PyPI 35 | new_version=$(curl -s https://pypi.org/pypi/runpod/json | jq -r .info.version) 36 | echo "NEW_VERSION_ENV=$new_version" >> $GITHUB_ENV 37 | echo "New version: $new_version" 38 | 39 | # Extract major and minor from new version 40 | new_major_minor=$(echo $new_version | cut -d. -f1,2) 41 | echo "New major.minor: $new_major_minor" 42 | 43 | if [ -z "$new_version" ]; then 44 | echo "ERROR: Failed to fetch the new version from PyPI." 45 | exit 1 46 | fi 47 | 48 | # Check if the major or minor version is different 49 | if [ "$current_major_minor" = "$new_major_minor" ]; then 50 | echo "No update needed. The new version ($new_major_minor) is within the allowed range (~= $current_major_minor)." 51 | exit 0 52 | fi 53 | 54 | echo "New major/minor detected ($new_major_minor). Updating requirements.txt..." 55 | 56 | # Update requirements.txt, preserving the existing constraint type (~= or ==) 57 | sed -i "s/runpod[~=][^ ]*/runpod~=$new_version/" ./builder/requirements.txt 58 | echo "requirements.txt has been updated." 
59 | 60 | - name: Create Pull Request 61 | uses: peter-evans/create-pull-request@v3 62 | with: 63 | token: ${{ secrets.GITHUB_TOKEN }} 64 | commit-message: Update runpod package version 65 | title: Update runpod package version 66 | body: The package version has been updated to ${{ env.NEW_VERSION_ENV }} 67 | branch: runpod-package-update 68 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | runpod.toml 3 | *.pyc 4 | .env 5 | test/* 6 | vllm-base/vllm-* 7 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "vllm-base-image/vllm"] 2 | path = vllm-base-image/vllm 3 | url = https://github.com/runpod/vllm-fork-for-sls-worker.git 4 | -------------------------------------------------------------------------------- /.runpod/hub.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "vLLM", 3 | "description": "Deploy OpenAI-Compatible Blazing-Fast LLM Endpoints powered by the vLLM Inference Engine on RunPod Serverless", 4 | "type": "serverless", 5 | "category": "language", 6 | "iconUrl": "https://registry.npmmirror.com/@lobehub/icons-static-png/latest/files/dark/vllm-color.png", 7 | "config": { 8 | "runsOn": "GPU", 9 | "containerDiskInGb": 200, 10 | "presets": [ 11 | { 12 | "name": "deepseek-ai/deepseek-r1-distill-llama-8b", 13 | "defaults": { 14 | "MODEL_NAME": "deepseek-ai/deepseek-r1-distill-llama-8b" 15 | } 16 | } 17 | ], 18 | "env": [ 19 | { 20 | "key": "TOKENIZER", 21 | "input": { 22 | "name": "Tokenizer", 23 | "type": "string", 24 | "description": "Name or path of the Hugging Face tokenizer to use.", 25 | "advanced": true 26 | } 27 | }, 28 | { 29 | "key": "TOKENIZER_MODE", 30 | "input": { 31 | "name": "Tokenizer Mode", 32 | "type": "string", 33 | "description": "The tokenizer mode.", 34 | "options": [ 35 | { 36 | "label": "auto", 37 | "value": "auto" 38 | }, 39 | { 40 | "label": "slow", 41 | "value": "slow" 42 | } 43 | ], 44 | "default": "auto", 45 | "advanced": true 46 | } 47 | }, 48 | { 49 | "key": "SKIP_TOKENIZER_INIT", 50 | "input": { 51 | "name": "Skip Tokenizer Init", 52 | "type": "boolean", 53 | "description": "Skip initialization of tokenizer and detokenizer.", 54 | "default": false, 55 | "advanced": true 56 | } 57 | }, 58 | { 59 | "key": "TRUST_REMOTE_CODE", 60 | "input": { 61 | "name": "Trust Remote Code", 62 | "type": "boolean", 63 | "description": "Trust remote code from Hugging Face.", 64 | "default": false, 65 | "advanced": true 66 | } 67 | }, 68 | { 69 | "key": "DOWNLOAD_DIR", 70 | "input": { 71 | "name": "Download Directory", 72 | "type": "string", 73 | "description": "Directory to download and load the weights.", 74 | "advanced": true 75 | } 76 | }, 77 | { 78 | "key": "LOAD_FORMAT", 79 | "input": { 80 | "name": "Load Format", 81 | "type": "string", 82 | "description": "The format of the model weights to load.", 83 | "options": [ 84 | { 85 | "label": "auto", 86 | "value": "auto" 87 | }, 88 | { 89 | "label": "pt", 90 | "value": "pt" 91 | }, 92 | { 93 | "label": "safetensors", 94 | "value": "safetensors" 95 | }, 96 | { 97 | "label": "npcache", 98 | "value": "npcache" 99 | }, 100 | { 101 | "label": "dummy", 102 | "value": "dummy" 103 | }, 104 | { 105 | "label": "tensorizer", 106 | "value": "tensorizer" 107 | }, 108 | { 109 | "label": "bitsandbytes", 110 | 
"value": "bitsandbytes" 111 | } 112 | ], 113 | "default": "auto", 114 | "advanced": true 115 | } 116 | }, 117 | { 118 | "key": "DTYPE", 119 | "input": { 120 | "name": "Data Type", 121 | "type": "string", 122 | "description": "Data type for model weights and activations.", 123 | "options": [ 124 | { 125 | "label": "auto", 126 | "value": "auto" 127 | }, 128 | { 129 | "label": "half", 130 | "value": "half" 131 | }, 132 | { 133 | "label": "float16", 134 | "value": "float16" 135 | }, 136 | { 137 | "label": "bfloat16", 138 | "value": "bfloat16" 139 | }, 140 | { 141 | "label": "float", 142 | "value": "float" 143 | }, 144 | { 145 | "label": "float32", 146 | "value": "float32" 147 | } 148 | ], 149 | "default": "auto", 150 | "advanced": true 151 | } 152 | }, 153 | { 154 | "key": "KV_CACHE_DTYPE", 155 | "input": { 156 | "name": "KV Cache Data Type", 157 | "type": "string", 158 | "description": "Data type for KV cache storage.", 159 | "options": [ 160 | { 161 | "label": "auto", 162 | "value": "auto" 163 | }, 164 | { 165 | "label": "fp8", 166 | "value": "fp8" 167 | } 168 | ], 169 | "default": "auto", 170 | "advanced": true 171 | } 172 | }, 173 | { 174 | "key": "QUANTIZATION_PARAM_PATH", 175 | "input": { 176 | "name": "Quantization Param Path", 177 | "type": "string", 178 | "description": "Path to the JSON file containing the KV cache scaling factors.", 179 | "advanced": true 180 | } 181 | }, 182 | { 183 | "key": "MAX_MODEL_LEN", 184 | "input": { 185 | "name": "Max Model Length", 186 | "type": "number", 187 | "description": "Model context length.", 188 | "advanced": true 189 | } 190 | }, 191 | { 192 | "key": "GUIDED_DECODING_BACKEND", 193 | "input": { 194 | "name": "Guided Decoding Backend", 195 | "type": "string", 196 | "description": "Which engine will be used for guided decoding by default.", 197 | "options": [ 198 | { 199 | "label": "outlines", 200 | "value": "outlines" 201 | }, 202 | { 203 | "label": "lm-format-enforcer", 204 | "value": "lm-format-enforcer" 205 | } 206 | ], 207 | "default": "outlines", 208 | "advanced": true 209 | } 210 | }, 211 | { 212 | "key": "DISTRIBUTED_EXECUTOR_BACKEND", 213 | "input": { 214 | "name": "Distributed Executor Backend", 215 | "type": "string", 216 | "description": "Backend to use for distributed serving.", 217 | "options": [ 218 | { 219 | "label": "ray", 220 | "value": "ray" 221 | }, 222 | { 223 | "label": "mp", 224 | "value": "mp" 225 | } 226 | ], 227 | "advanced": true 228 | } 229 | }, 230 | { 231 | "key": "WORKER_USE_RAY", 232 | "input": { 233 | "name": "Worker Use Ray", 234 | "type": "boolean", 235 | "description": "Deprecated, use --distributed-executor-backend=ray.", 236 | "default": false, 237 | "advanced": true 238 | } 239 | }, 240 | { 241 | "key": "RAY_WORKERS_USE_NSIGHT", 242 | "input": { 243 | "name": "Ray Workers Use Nsight", 244 | "type": "boolean", 245 | "description": "If specified, use nsight to profile Ray workers.", 246 | "default": false, 247 | "advanced": true 248 | } 249 | }, 250 | { 251 | "key": "PIPELINE_PARALLEL_SIZE", 252 | "input": { 253 | "name": "Pipeline Parallel Size", 254 | "type": "number", 255 | "description": "Number of pipeline stages.", 256 | "default": 1, 257 | "advanced": true 258 | } 259 | }, 260 | { 261 | "key": "TENSOR_PARALLEL_SIZE", 262 | "input": { 263 | "name": "Tensor Parallel Size", 264 | "type": "number", 265 | "description": "Number of tensor parallel replicas.", 266 | "default": 1, 267 | "advanced": true 268 | } 269 | }, 270 | { 271 | "key": "MAX_PARALLEL_LOADING_WORKERS", 272 | "input": { 273 | "name": "Max 
Parallel Loading Workers", 274 | "type": "number", 275 | "description": "Load model sequentially in multiple batches.", 276 | "advanced": true 277 | } 278 | }, 279 | { 280 | "key": "ENABLE_PREFIX_CACHING", 281 | "input": { 282 | "name": "Enable Prefix Caching", 283 | "type": "boolean", 284 | "description": "Enables automatic prefix caching.", 285 | "default": false, 286 | "advanced": true 287 | } 288 | }, 289 | { 290 | "key": "DISABLE_SLIDING_WINDOW", 291 | "input": { 292 | "name": "Disable Sliding Window", 293 | "type": "boolean", 294 | "description": "Disables sliding window, capping to sliding window size.", 295 | "default": false, 296 | "advanced": true 297 | } 298 | }, 299 | { 300 | "key": "USE_V2_BLOCK_MANAGER", 301 | "input": { 302 | "name": "Use V2 Block Manager", 303 | "type": "boolean", 304 | "description": "Use BlockSpaceMangerV2.", 305 | "default": false, 306 | "advanced": true 307 | } 308 | }, 309 | { 310 | "key": "NUM_LOOKAHEAD_SLOTS", 311 | "input": { 312 | "name": "Num Lookahead Slots", 313 | "type": "number", 314 | "description": "Experimental scheduling config necessary for speculative decoding.", 315 | "default": 0, 316 | "advanced": true 317 | } 318 | }, 319 | { 320 | "key": "SEED", 321 | "input": { 322 | "name": "Seed", 323 | "type": "number", 324 | "description": "Random seed for operations.", 325 | "default": 0, 326 | "advanced": true 327 | } 328 | }, 329 | { 330 | "key": "NUM_GPU_BLOCKS_OVERRIDE", 331 | "input": { 332 | "name": "Num GPU Blocks Override", 333 | "type": "number", 334 | "description": "If specified, ignore GPU profiling result and use this number of GPU blocks.", 335 | "advanced": true 336 | } 337 | }, 338 | { 339 | "key": "MAX_NUM_BATCHED_TOKENS", 340 | "input": { 341 | "name": "Max Num Batched Tokens", 342 | "type": "number", 343 | "description": "Maximum number of batched tokens per iteration.", 344 | "advanced": true 345 | } 346 | }, 347 | { 348 | "key": "MAX_NUM_SEQS", 349 | "input": { 350 | "name": "Max Num Seqs", 351 | "type": "number", 352 | "description": "Maximum number of sequences per iteration.", 353 | "default": 256, 354 | "advanced": true 355 | } 356 | }, 357 | { 358 | "key": "MAX_LOGPROBS", 359 | "input": { 360 | "name": "Max Logprobs", 361 | "type": "number", 362 | "description": "Max number of log probs to return when logprobs is specified in SamplingParams.", 363 | "default": 20, 364 | "advanced": true 365 | } 366 | }, 367 | { 368 | "key": "DISABLE_LOG_STATS", 369 | "input": { 370 | "name": "Disable Log Stats", 371 | "type": "boolean", 372 | "description": "Disable logging statistics.", 373 | "default": false, 374 | "advanced": true 375 | } 376 | }, 377 | { 378 | "key": "QUANTIZATION", 379 | "input": { 380 | "name": "Quantization", 381 | "type": "string", 382 | "description": "Method used to quantize the weights.", 383 | "options": [ 384 | { 385 | "label": "None", 386 | "value": "None" 387 | }, 388 | { 389 | "label": "AWQ", 390 | "value": "awq" 391 | }, 392 | { 393 | "label": "SqueezeLLM", 394 | "value": "squeezellm" 395 | }, 396 | { 397 | "label": "GPTQ", 398 | "value": "gptq" 399 | } 400 | ], 401 | "advanced": true 402 | } 403 | }, 404 | { 405 | "key": "ROPE_SCALING", 406 | "input": { 407 | "name": "RoPE Scaling", 408 | "type": "string", 409 | "description": "RoPE scaling configuration in JSON format.", 410 | "advanced": true 411 | } 412 | }, 413 | { 414 | "key": "ROPE_THETA", 415 | "input": { 416 | "name": "RoPE Theta", 417 | "type": "number", 418 | "description": "RoPE theta. 
Use with rope_scaling.", 419 | "advanced": true 420 | } 421 | }, 422 | { 423 | "key": "TOKENIZER_POOL_SIZE", 424 | "input": { 425 | "name": "Tokenizer Pool Size", 426 | "type": "number", 427 | "description": "Size of tokenizer pool to use for asynchronous tokenization.", 428 | "default": 0, 429 | "advanced": true 430 | } 431 | }, 432 | { 433 | "key": "TOKENIZER_POOL_TYPE", 434 | "input": { 435 | "name": "Tokenizer Pool Type", 436 | "type": "string", 437 | "description": "Type of tokenizer pool to use for asynchronous tokenization.", 438 | "default": "ray", 439 | "advanced": true 440 | } 441 | }, 442 | { 443 | "key": "TOKENIZER_POOL_EXTRA_CONFIG", 444 | "input": { 445 | "name": "Tokenizer Pool Extra Config", 446 | "type": "string", 447 | "description": "Extra config for tokenizer pool.", 448 | "advanced": true 449 | } 450 | }, 451 | { 452 | "key": "ENABLE_LORA", 453 | "input": { 454 | "name": "Enable LoRA", 455 | "type": "boolean", 456 | "description": "If True, enable handling of LoRA adapters.", 457 | "default": false, 458 | "advanced": true 459 | } 460 | }, 461 | { 462 | "key": "MAX_LORAS", 463 | "input": { 464 | "name": "Max LoRAs", 465 | "type": "number", 466 | "description": "Max number of LoRAs in a single batch.", 467 | "default": 1, 468 | "advanced": true 469 | } 470 | }, 471 | { 472 | "key": "MAX_LORA_RANK", 473 | "input": { 474 | "name": "Max LoRA Rank", 475 | "type": "number", 476 | "description": "Max LoRA rank.", 477 | "default": 16, 478 | "advanced": true 479 | } 480 | }, 481 | { 482 | "key": "LORA_EXTRA_VOCAB_SIZE", 483 | "input": { 484 | "name": "LoRA Extra Vocab Size", 485 | "type": "number", 486 | "description": "Maximum size of extra vocabulary for LoRA adapters.", 487 | "default": 256, 488 | "advanced": true 489 | } 490 | }, 491 | { 492 | "key": "LORA_DTYPE", 493 | "input": { 494 | "name": "LoRA Data Type", 495 | "type": "string", 496 | "description": "Data type for LoRA.", 497 | "options": [ 498 | { 499 | "label": "auto", 500 | "value": "auto" 501 | }, 502 | { 503 | "label": "float16", 504 | "value": "float16" 505 | }, 506 | { 507 | "label": "bfloat16", 508 | "value": "bfloat16" 509 | }, 510 | { 511 | "label": "float32", 512 | "value": "float32" 513 | } 514 | ], 515 | "default": "auto", 516 | "advanced": true 517 | } 518 | }, 519 | { 520 | "key": "LONG_LORA_SCALING_FACTORS", 521 | "input": { 522 | "name": "Long LoRA Scaling Factors", 523 | "type": "string", 524 | "description": "Specify multiple scaling factors for LoRA adapters.", 525 | "advanced": true 526 | } 527 | }, 528 | { 529 | "key": "MAX_CPU_LORAS", 530 | "input": { 531 | "name": "Max CPU LoRAs", 532 | "type": "number", 533 | "description": "Maximum number of LoRAs to store in CPU memory.", 534 | "advanced": true 535 | } 536 | }, 537 | { 538 | "key": "FULLY_SHARDED_LORAS", 539 | "input": { 540 | "name": "Fully Sharded LoRAs", 541 | "type": "boolean", 542 | "description": "Enable fully sharded LoRA layers.", 543 | "default": false, 544 | "advanced": true 545 | } 546 | }, 547 | { 548 | "key": "DEVICE", 549 | "input": { 550 | "name": "Device", 551 | "type": "string", 552 | "description": "Device type for vLLM execution.", 553 | "options": [ 554 | { 555 | "label": "auto", 556 | "value": "auto" 557 | }, 558 | { 559 | "label": "cuda", 560 | "value": "cuda" 561 | }, 562 | { 563 | "label": "neuron", 564 | "value": "neuron" 565 | }, 566 | { 567 | "label": "cpu", 568 | "value": "cpu" 569 | }, 570 | { 571 | "label": "openvino", 572 | "value": "openvino" 573 | }, 574 | { 575 | "label": "tpu", 576 | "value": "tpu" 577 | }, 
578 | { 579 | "label": "xpu", 580 | "value": "xpu" 581 | } 582 | ], 583 | "default": "auto", 584 | "advanced": true 585 | } 586 | }, 587 | { 588 | "key": "SCHEDULER_DELAY_FACTOR", 589 | "input": { 590 | "name": "Scheduler Delay Factor", 591 | "type": "number", 592 | "description": "Apply a delay before scheduling next prompt.", 593 | "default": 0, 594 | "advanced": true 595 | } 596 | }, 597 | { 598 | "key": "ENABLE_CHUNKED_PREFILL", 599 | "input": { 600 | "name": "Enable Chunked Prefill", 601 | "type": "boolean", 602 | "description": "Enable chunked prefill requests.", 603 | "default": false, 604 | "advanced": true 605 | } 606 | }, 607 | { 608 | "key": "SPECULATIVE_MODEL", 609 | "input": { 610 | "name": "Speculative Model", 611 | "type": "string", 612 | "description": "The name of the draft model to be used in speculative decoding.", 613 | "advanced": true 614 | } 615 | }, 616 | { 617 | "key": "NUM_SPECULATIVE_TOKENS", 618 | "input": { 619 | "name": "Num Speculative Tokens", 620 | "type": "number", 621 | "description": "The number of speculative tokens to sample from the draft model.", 622 | "advanced": true 623 | } 624 | }, 625 | { 626 | "key": "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE", 627 | "input": { 628 | "name": "Speculative Draft Tensor Parallel Size", 629 | "type": "number", 630 | "description": "Number of tensor parallel replicas for the draft model.", 631 | "advanced": true 632 | } 633 | }, 634 | { 635 | "key": "SPECULATIVE_MAX_MODEL_LEN", 636 | "input": { 637 | "name": "Speculative Max Model Length", 638 | "type": "number", 639 | "description": "The maximum sequence length supported by the draft model.", 640 | "advanced": true 641 | } 642 | }, 643 | { 644 | "key": "SPECULATIVE_DISABLE_BY_BATCH_SIZE", 645 | "input": { 646 | "name": "Speculative Disable by Batch Size", 647 | "type": "number", 648 | "description": "Disable speculative decoding if the number of enqueue requests is larger than this value.", 649 | "advanced": true 650 | } 651 | }, 652 | { 653 | "key": "NGRAM_PROMPT_LOOKUP_MAX", 654 | "input": { 655 | "name": "Ngram Prompt Lookup Max", 656 | "type": "number", 657 | "description": "Max size of window for ngram prompt lookup in speculative decoding.", 658 | "advanced": true 659 | } 660 | }, 661 | { 662 | "key": "NGRAM_PROMPT_LOOKUP_MIN", 663 | "input": { 664 | "name": "Ngram Prompt Lookup Min", 665 | "type": "number", 666 | "description": "Min size of window for ngram prompt lookup in speculative decoding.", 667 | "advanced": true 668 | } 669 | }, 670 | { 671 | "key": "SPEC_DECODING_ACCEPTANCE_METHOD", 672 | "input": { 673 | "name": "Speculative Decoding Acceptance Method", 674 | "type": "string", 675 | "description": "Specify the acceptance method for draft token verification in speculative decoding.", 676 | "options": [ 677 | { 678 | "label": "rejection_sampler", 679 | "value": "rejection_sampler" 680 | }, 681 | { 682 | "label": "typical_acceptance_sampler", 683 | "value": "typical_acceptance_sampler" 684 | } 685 | ], 686 | "default": "rejection_sampler", 687 | "advanced": true 688 | } 689 | }, 690 | { 691 | "key": "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", 692 | "input": { 693 | "name": "Typical Acceptance Sampler Posterior Threshold", 694 | "type": "number", 695 | "description": "Set the lower bound threshold for the posterior probability of a token to be accepted.", 696 | "advanced": true 697 | } 698 | }, 699 | { 700 | "key": "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA", 701 | "input": { 702 | "name": "Typical Acceptance Sampler Posterior Alpha", 703 | "type": 
"number", 704 | "description": "A scaling factor for the entropy-based threshold for token acceptance.", 705 | "advanced": true 706 | } 707 | }, 708 | { 709 | "key": "MODEL_LOADER_EXTRA_CONFIG", 710 | "input": { 711 | "name": "Model Loader Extra Config", 712 | "type": "string", 713 | "description": "Extra config for model loader.", 714 | "advanced": true 715 | } 716 | }, 717 | { 718 | "key": "PREEMPTION_MODE", 719 | "input": { 720 | "name": "Preemption Mode", 721 | "type": "string", 722 | "description": "If 'recompute', the engine performs preemption-aware recomputation. If 'save', the engine saves activations into the CPU memory as preemption happens.", 723 | "advanced": true 724 | } 725 | }, 726 | { 727 | "key": "PREEMPTION_CHECK_PERIOD", 728 | "input": { 729 | "name": "Preemption Check Period", 730 | "type": "number", 731 | "description": "How frequently the engine checks if a preemption happens.", 732 | "default": 1, 733 | "advanced": true 734 | } 735 | }, 736 | { 737 | "key": "PREEMPTION_CPU_CAPACITY", 738 | "input": { 739 | "name": "Preemption CPU Capacity", 740 | "type": "number", 741 | "description": "The percentage of CPU memory used for the saved activations.", 742 | "default": 2, 743 | "advanced": true 744 | } 745 | }, 746 | { 747 | "key": "MAX_LOG_LEN", 748 | "input": { 749 | "name": "Max Log Length", 750 | "type": "number", 751 | "description": "Max number of characters or ID numbers being printed in log.", 752 | "advanced": true 753 | } 754 | }, 755 | { 756 | "key": "DISABLE_LOGGING_REQUEST", 757 | "input": { 758 | "name": "Disable Logging Request", 759 | "type": "boolean", 760 | "description": "Disable logging requests.", 761 | "default": false, 762 | "advanced": true 763 | } 764 | }, 765 | { 766 | "key": "TOKENIZER_NAME", 767 | "input": { 768 | "name": "Tokenizer Name", 769 | "type": "string", 770 | "description": "Tokenizer repo to use a different tokenizer than the model's default", 771 | "advanced": true 772 | } 773 | }, 774 | { 775 | "key": "TOKENIZER_REVISION", 776 | "input": { 777 | "name": "Tokenizer Revision", 778 | "type": "string", 779 | "description": "Tokenizer revision to load", 780 | "advanced": true 781 | } 782 | }, 783 | { 784 | "key": "CUSTOM_CHAT_TEMPLATE", 785 | "input": { 786 | "name": "Custom Chat Template", 787 | "type": "string", 788 | "description": "Custom chat jinja template", 789 | "advanced": true 790 | } 791 | }, 792 | { 793 | "key": "GPU_MEMORY_UTILIZATION", 794 | "input": { 795 | "name": "GPU Memory Utilization", 796 | "type": "number", 797 | "description": "Sets GPU VRAM utilization", 798 | "default": 0.95, 799 | "advanced": true 800 | } 801 | }, 802 | { 803 | "key": "BLOCK_SIZE", 804 | "input": { 805 | "name": "Block Size", 806 | "type": "number", 807 | "description": "Token block size for contiguous chunks of tokens", 808 | "default": 16, 809 | "advanced": true 810 | } 811 | }, 812 | { 813 | "key": "SWAP_SPACE", 814 | "input": { 815 | "name": "Swap Space", 816 | "type": "number", 817 | "description": "CPU swap space size (GiB) per GPU", 818 | "default": 4, 819 | "advanced": true 820 | } 821 | }, 822 | { 823 | "key": "ENFORCE_EAGER", 824 | "input": { 825 | "name": "Enforce Eager", 826 | "type": "boolean", 827 | "description": "Always use eager-mode PyTorch. 
If False (0), will use eager mode and CUDA graph in hybrid for maximal performance and flexibility", 828 | "default": false, 829 | "advanced": true 830 | } 831 | }, 832 | { 833 | "key": "MAX_SEQ_LEN_TO_CAPTURE", 834 | "input": { 835 | "name": "CUDA Graph Max Content Length", 836 | "type": "number", 837 | "description": "Maximum context length covered by CUDA graphs. If a sequence has context length larger than this, we fall back to eager mode", 838 | "default": 8192, 839 | "advanced": true 840 | } 841 | }, 842 | { 843 | "key": "DISABLE_CUSTOM_ALL_REDUCE", 844 | "input": { 845 | "name": "Disable Custom All Reduce", 846 | "type": "boolean", 847 | "description": "Enables or disables custom all reduce", 848 | "default": false, 849 | "advanced": true 850 | } 851 | }, 852 | { 853 | "key": "DEFAULT_BATCH_SIZE", 854 | "input": { 855 | "name": "Default Final Batch Size", 856 | "type": "number", 857 | "description": "Default and Maximum batch size for token streaming to reduce HTTP calls", 858 | "default": 50, 859 | "advanced": true 860 | } 861 | }, 862 | { 863 | "key": "DEFAULT_MIN_BATCH_SIZE", 864 | "input": { 865 | "name": "Default Starting Batch Size", 866 | "type": "number", 867 | "description": "Batch size for the first request, which will be multiplied by the growth factor every subsequent request", 868 | "default": 1, 869 | "advanced": true 870 | } 871 | }, 872 | { 873 | "key": "DEFAULT_BATCH_SIZE_GROWTH_FACTOR", 874 | "input": { 875 | "name": "Default Batch Size Growth Factor", 876 | "type": "number", 877 | "description": "Growth factor for dynamic batch size", 878 | "default": 3, 879 | "advanced": true 880 | } 881 | }, 882 | { 883 | "key": "RAW_OPENAI_OUTPUT", 884 | "input": { 885 | "name": "Raw OpenAI Output", 886 | "type": "boolean", 887 | "description": "Raw OpenAI output instead of just the text", 888 | "default": true, 889 | "advanced": true 890 | } 891 | }, 892 | { 893 | "key": "OPENAI_RESPONSE_ROLE", 894 | "input": { 895 | "name": "OpenAI Response Role", 896 | "type": "string", 897 | "description": "Role of the LLM's Response in OpenAI Chat Completions", 898 | "default": "assistant", 899 | "advanced": true 900 | } 901 | }, 902 | { 903 | "key": "OPENAI_SERVED_MODEL_NAME_OVERRIDE", 904 | "input": { 905 | "name": "OpenAI Served Model Name Override", 906 | "type": "string", 907 | "description": "Overrides the name of the served model from model repo/path to specified name, which you will then be able to use the value for the `model` parameter when making OpenAI requests", 908 | "advanced": true 909 | } 910 | }, 911 | { 912 | "key": "MAX_CONCURRENCY", 913 | "input": { 914 | "name": "Max Concurrency", 915 | "type": "number", 916 | "description": "Max concurrent requests per worker. 
vLLM has an internal queue, so you don't have to worry about limiting by VRAM, this is for improving scaling/load balancing efficiency", 917 | "default": 300, 918 | "advanced": true 919 | } 920 | }, 921 | { 922 | "key": "MODEL_REVISION", 923 | "input": { 924 | "name": "Model Revision", 925 | "type": "string", 926 | "description": "Model revision (branch) to load", 927 | "advanced": true 928 | } 929 | }, 930 | { 931 | "key": "BASE_PATH", 932 | "input": { 933 | "name": "Base Path", 934 | "type": "string", 935 | "description": "Storage directory for Huggingface cache and model", 936 | "default": "/runpod-volume", 937 | "advanced": true 938 | } 939 | }, 940 | { 941 | "key": "DISABLE_LOG_REQUESTS", 942 | "input": { 943 | "name": "Disable Log Requests", 944 | "type": "boolean", 945 | "description": "Enables or disables vLLM request logging", 946 | "default": true, 947 | "advanced": true 948 | } 949 | }, 950 | { 951 | "key": "ENABLE_AUTO_TOOL_CHOICE", 952 | "input": { 953 | "name": "Enable Auto Tool Choice", 954 | "type": "boolean", 955 | "description": "Enables or disables auto tool choice", 956 | "default": false, 957 | "advanced": true 958 | } 959 | }, 960 | { 961 | "key": "MODEL_NAME", 962 | "input": { 963 | "name": "Model Name", 964 | "type": "string", 965 | "description": "Hugging Face model name or path to load", 966 | "required": true 967 | } 968 | }, 969 | { 970 | "key": "HF_TOKEN", 971 | "input": { 972 | "name": "Hugging Face Token", 973 | "type": "string", 974 | "description": "Hugging Face API token for accessing gated models", 975 | "advanced": true 976 | } 977 | }, 978 | { 979 | "key": "TOOL_CALL_PARSER", 980 | "input": { 981 | "name": "Tool Call Parser", 982 | "type": "string", 983 | "description": "Tool call parser", 984 | "options": [ 985 | { 986 | "label": "None", 987 | "value": "" 988 | }, 989 | { 990 | "label": "Hermes", 991 | "value": "hermes" 992 | }, 993 | { 994 | "label": "Mistral", 995 | "value": "mistral" 996 | }, 997 | { 998 | "label": "Llama3 JSON", 999 | "value": "llama3_json" 1000 | }, 1001 | { 1002 | "label": "Pythonic", 1003 | "value": "pythonic" 1004 | }, 1005 | { 1006 | "label": "InternLM", 1007 | "value": "internlm" 1008 | } 1009 | ], 1010 | "default": "", 1011 | "advanced": true 1012 | } 1013 | } 1014 | ] 1015 | } 1016 | } 1017 | -------------------------------------------------------------------------------- /.runpod/tests.json: -------------------------------------------------------------------------------- 1 | { 2 | "tests": [ 3 | { 4 | "name": "basic_inference_test", 5 | "input": { 6 | "prompt": "Write a short poem about artificial intelligence." 
7 | }, 8 | "timeout": 30000 9 | } 10 | ], 11 | "config": { 12 | "gpuTypeId": "NVIDIA GeForce RTX 4090", 13 | "gpuCount": 1, 14 | "env": [ 15 | { 16 | "key": "MODEL_NAME", 17 | "value": "facebook/opt-350m" 18 | } 19 | ], 20 | "allowedCudaVersions": [ 21 | "12.7", 22 | "12.6", 23 | "12.5", 24 | "12.4", 25 | "12.3", 26 | "12.2", 27 | "12.1", 28 | "12.0", 29 | "11.7" 30 | ] 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:12.1.0-base-ubuntu22.04 2 | 3 | RUN apt-get update -y \ 4 | && apt-get install -y python3-pip 5 | 6 | RUN ldconfig /usr/local/cuda-12.1/compat/ 7 | 8 | # Install Python dependencies 9 | COPY builder/requirements.txt /requirements.txt 10 | RUN --mount=type=cache,target=/root/.cache/pip \ 11 | python3 -m pip install --upgrade pip && \ 12 | python3 -m pip install --upgrade -r /requirements.txt 13 | 14 | # Install vLLM (switching back to pip installs since issues that required building fork are fixed and space optimization is not as important since caching) and FlashInfer 15 | RUN python3 -m pip install vllm==0.9.0.1 && \ 16 | python3 -m pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3 17 | 18 | # Setup for Option 2: Building the Image with the Model included 19 | ARG MODEL_NAME="" 20 | ARG TOKENIZER_NAME="" 21 | ARG BASE_PATH="/runpod-volume" 22 | ARG QUANTIZATION="" 23 | ARG MODEL_REVISION="" 24 | ARG TOKENIZER_REVISION="" 25 | 26 | ENV MODEL_NAME=$MODEL_NAME \ 27 | MODEL_REVISION=$MODEL_REVISION \ 28 | TOKENIZER_NAME=$TOKENIZER_NAME \ 29 | TOKENIZER_REVISION=$TOKENIZER_REVISION \ 30 | BASE_PATH=$BASE_PATH \ 31 | QUANTIZATION=$QUANTIZATION \ 32 | HF_DATASETS_CACHE="${BASE_PATH}/huggingface-cache/datasets" \ 33 | HUGGINGFACE_HUB_CACHE="${BASE_PATH}/huggingface-cache/hub" \ 34 | HF_HOME="${BASE_PATH}/huggingface-cache/hub" \ 35 | HF_HUB_ENABLE_HF_TRANSFER=0 36 | 37 | ENV PYTHONPATH="/:/vllm-workspace" 38 | 39 | 40 | COPY src /src 41 | RUN --mount=type=secret,id=HF_TOKEN,required=false \ 42 | if [ -f /run/secrets/HF_TOKEN ]; then \ 43 | export HF_TOKEN=$(cat /run/secrets/HF_TOKEN); \ 44 | fi && \ 45 | if [ -n "$MODEL_NAME" ]; then \ 46 | python3 /src/download_model.py; \ 47 | fi 48 | 49 | # Start the handler 50 | CMD ["python3", "/src/handler.py"] 51 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 runpod-workers 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | # OpenAI-Compatible vLLM Serverless Endpoint Worker 4 | Deploy OpenAI-Compatible Blazing-Fast LLM Endpoints powered by the [vLLM](https://github.com/vllm-project/vllm) Inference Engine on RunPod Serverless with just a few clicks. 5 | 10 | 12 | 13 | 14 |
15 |  16 | # News: 17 | 18 | ### 1. UI for Deploying vLLM Worker on RunPod console: 19 | ![Demo of Deploying vLLM Worker on RunPod console with new UI](media/ui_demo.gif) 20 | 21 | ### 2. Worker vLLM `v2.6.0` with vLLM `0.9.0` now available under `stable` tags 22 | 23 | Update v2.6.0 is now available; use the image tag `runpod/worker-v1-vllm:v2.6.0stable-cuda12.1.0`. 24 | 25 | ### 3. OpenAI-Compatible [Embedding Worker](https://github.com/runpod-workers/worker-infinity-embedding) Released 26 | Deploy your own OpenAI-compatible Serverless Endpoint on RunPod with multiple embedding models and fast inference for RAG and more! 27 | 28 | 29 | 30 | ### 4. Caching Across RunPod Machines 31 | Worker vLLM is now cached on all RunPod machines, resulting in near-instant deployment! Previously, downloading and extracting the image took 3-5 minutes on average. 32 | 33 | 34 | ## Table of Contents 35 | - [Setting up the Serverless Worker](#setting-up-the-serverless-worker) 36 | - [Option 1: Deploy Any Model Using Pre-Built Docker Image **[RECOMMENDED]**](#option-1-deploy-any-model-using-pre-built-docker-image-recommended) 37 | - [Prerequisites](#prerequisites) 38 | - [Environment Variables](#environment-variables) 39 | - [LLM Settings](#llm-settings) 40 | - [Tokenizer Settings](#tokenizer-settings) 41 | - [System and Parallelism Settings](#system-and-parallelism-settings) 42 | - [Streaming Batch Size Settings](#streaming-batch-size-settings) 43 | - [OpenAI Settings](#openai-settings) 44 | - [Serverless Settings](#serverless-settings) 45 | - [Option 2: Build Docker Image with Model Inside](#option-2-build-docker-image-with-model-inside) 46 | - [Prerequisites](#prerequisites-1) 47 | - [Arguments](#arguments) 48 | - [Example: Building an image with OpenChat-3.5](#example-building-an-image-with-openchat-35) 49 | - [(Optional) Including Huggingface Token](#optional-including-huggingface-token) 50 | - [Compatible Model Architectures](#compatible-model-architectures) 51 | - [Usage: OpenAI Compatibility](#usage-openai-compatibility) 52 | - [Modifying your OpenAI Codebase to use your deployed vLLM Worker](#modifying-your-openai-codebase-to-use-your-deployed-vllm-worker) 53 | - [OpenAI Request Input Parameters](#openai-request-input-parameters) 54 | - [Chat Completions](#chat-completions) 55 | - [Examples: Using your RunPod endpoint with OpenAI](#examples-using-your-runpod-endpoint-with-openai) 56 | - [Usage: standard](#non-openai-usage) 57 | - [Input Request Parameters](#input-request-parameters) 58 | - [Text Input Formats](#text-input-formats) 59 | - [Sampling Parameters](#sampling-parameters) 60 | - [Worker Config](#worker-config) 61 | - [Writing your worker-config.json](#writing-your-worker-configjson) 62 | - [Example of schema](#example-of-schema) 63 | - [Example of versions](#example-of-versions) 64 | 65 | # Setting up the Serverless Worker 66 | 67 | ### Option 1: Deploy Any Model Using Pre-Built Docker Image [Recommended] 68 | 69 | > [!NOTE] 70 | > You can now deploy from the dedicated UI on the RunPod console with all of the settings and choices listed. 71 | > Try it now from the Explore or Serverless pages of the RunPod console! 72 | 73 | 74 | We now offer a pre-built Docker Image for the vLLM Worker that you can configure entirely with Environment Variables when creating the RunPod Serverless Endpoint: 75 | 76 | --- 77 | 78 | ## RunPod Worker Images 79 | 80 | Below is a summary of the available RunPod Worker images, categorized by image stability and CUDA version compatibility. 
81 | 82 | | CUDA Version | Stable Image Tag | Development Image Tag | Note | 83 | |--------------|-----------------------------------|-----------------------------------|----------------------------------------------------------------------| 84 | | 12.1.0 | `runpod/worker-v1-vllm:v2.6.0stable-cuda12.1.0` | `runpod/worker-v1-vllm:v2.6.0dev-cuda12.1.0` | When creating an Endpoint, select CUDA Version 12.3, 12.2 and 12.1 in the filter. | 85 | 86 | 87 | 88 | --- 89 | 90 | #### Prerequisites 91 | - RunPod Account 92 | 93 | #### Environment Variables 94 | > Note: `0` is equivalent to `False` and `1` is equivalent to `True` for boolean as int values. 95 | 96 | #### LLM Settings 97 | | `Name` | `Default` | `Type/Choices` | `Description` | 98 | |-------------------------------------------|-----------------------|--------------------------------------------|---------------| 99 | | `MODEL_NAME` | 'facebook/opt-125m' | `str` | Name or path of the Hugging Face model to use. | 100 | | `TOKENIZER` | None | `str` | Name or path of the Hugging Face tokenizer to use. | 101 | | `SKIP_TOKENIZER_INIT` | False | `bool` | Skip initialization of tokenizer and detokenizer. | 102 | | `TOKENIZER_MODE` | 'auto' | ['auto', 'slow'] | The tokenizer mode. | 103 | | `TRUST_REMOTE_CODE` | `False` | `bool` | Trust remote code from Hugging Face. | 104 | | `DOWNLOAD_DIR` | None | `str` | Directory to download and load the weights. | 105 | | `LOAD_FORMAT` | 'auto' | `str` | The format of the model weights to load. | 106 | | `HF_TOKEN` | - | `str` | Hugging Face token for private and gated models.| 107 | | `DTYPE` | 'auto' | ['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'] | Data type for model weights and activations. | 108 | | `KV_CACHE_DTYPE` | 'auto' | ['auto', 'fp8'] | Data type for KV cache storage. | 109 | | `QUANTIZATION_PARAM_PATH` | None | `str` | Path to the JSON file containing the KV cache scaling factors. | 110 | | `MAX_MODEL_LEN` | None | `int` | Model context length. | 111 | | `GUIDED_DECODING_BACKEND` | 'outlines' | ['outlines', 'lm-format-enforcer'] | Which engine will be used for guided decoding by default. | 112 | | `DISTRIBUTED_EXECUTOR_BACKEND` | None | ['ray', 'mp'] | Backend to use for distributed serving. | 113 | | `WORKER_USE_RAY` | False | `bool` | Deprecated, use --distributed-executor-backend=ray. | 114 | | `PIPELINE_PARALLEL_SIZE` | 1 | `int` | Number of pipeline stages. | 115 | | `TENSOR_PARALLEL_SIZE` | 1 | `int` | Number of tensor parallel replicas. | 116 | | `MAX_PARALLEL_LOADING_WORKERS` | None | `int` | Load model sequentially in multiple batches. | 117 | | `RAY_WORKERS_USE_NSIGHT` | False | `bool` | If specified, use nsight to profile Ray workers. | 118 | | `ENABLE_PREFIX_CACHING` | False | `bool` | Enables automatic prefix caching. | 119 | | `DISABLE_SLIDING_WINDOW` | False | `bool` | Disables sliding window, capping to sliding window size. | 120 | | `USE_V2_BLOCK_MANAGER` | False | `bool` | Use BlockSpaceMangerV2. | 121 | | `NUM_LOOKAHEAD_SLOTS` | 0 | `int` | Experimental scheduling config necessary for speculative decoding. | 122 | | `SEED` | 0 | `int` | Random seed for operations. | 123 | | `NUM_GPU_BLOCKS_OVERRIDE` | None | `int` | If specified, ignore GPU profiling result and use this number of GPU blocks. | 124 | | `MAX_NUM_BATCHED_TOKENS` | None | `int` | Maximum number of batched tokens per iteration. | 125 | | `MAX_NUM_SEQS` | 256 | `int` | Maximum number of sequences per iteration. 
| 126 | | `MAX_LOGPROBS` | 20 | `int` | Max number of log probs to return when logprobs is specified in SamplingParams. | 127 | | `DISABLE_LOG_STATS` | False | `bool` | Disable logging statistics. | 128 | | `QUANTIZATION` | None | ['awq', 'squeezellm', 'gptq', 'bitsandbytes'] | Method used to quantize the weights. | 129 | | `ROPE_SCALING` | None | `dict` | RoPE scaling configuration in JSON format. | 130 | | `ROPE_THETA` | None | `float` | RoPE theta. Use with rope_scaling. | 131 | | `TOKENIZER_POOL_SIZE` | 0 | `int` | Size of tokenizer pool to use for asynchronous tokenization. | 132 | | `TOKENIZER_POOL_TYPE` | 'ray' | `str` | Type of tokenizer pool to use for asynchronous tokenization. | 133 | | `TOKENIZER_POOL_EXTRA_CONFIG` | None | `dict` | Extra config for tokenizer pool. | 134 | | `ENABLE_LORA` | False | `bool` | If True, enable handling of LoRA adapters. | 135 | | `MAX_LORAS` | 1 | `int` | Max number of LoRAs in a single batch. | 136 | | `MAX_LORA_RANK` | 16 | `int` | Max LoRA rank. | 137 | | `LORA_EXTRA_VOCAB_SIZE` | 256 | `int` | Maximum size of extra vocabulary for LoRA adapters. | 138 | | `LORA_DTYPE` | 'auto' | ['auto', 'float16', 'bfloat16', 'float32'] | Data type for LoRA. | 139 | | `LONG_LORA_SCALING_FACTORS` | None | `tuple` | Specify multiple scaling factors for LoRA adapters. | 140 | | `MAX_CPU_LORAS` | None | `int` | Maximum number of LoRAs to store in CPU memory. | 141 | | `FULLY_SHARDED_LORAS` | False | `bool` | Enable fully sharded LoRA layers. | 142 | | `LORA_MODULES`| `[]`| `list[dict]`| Add lora adapters from Hugging Face `[{"name": "xx", "path": "xxx/xxxx", "base_model_name": "xxx/xxxx"}`| 143 | | `SCHEDULER_DELAY_FACTOR` | 0.0 | `float` | Apply a delay before scheduling next prompt. | 144 | | `ENABLE_CHUNKED_PREFILL` | False | `bool` | Enable chunked prefill requests. | 145 | | `SPECULATIVE_MODEL` | None | `str` | The name of the draft model to be used in speculative decoding. | 146 | | `NUM_SPECULATIVE_TOKENS` | None | `int` | The number of speculative tokens to sample from the draft model. | 147 | | `SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE` | None | `int` | Number of tensor parallel replicas for the draft model. | 148 | | `SPECULATIVE_MAX_MODEL_LEN` | None | `int` | The maximum sequence length supported by the draft model. | 149 | | `SPECULATIVE_DISABLE_BY_BATCH_SIZE` | None | `int` | Disable speculative decoding if the number of enqueue requests is larger than this value. | 150 | | `NGRAM_PROMPT_LOOKUP_MAX` | None | `int` | Max size of window for ngram prompt lookup in speculative decoding. | 151 | | `NGRAM_PROMPT_LOOKUP_MIN` | None | `int` | Min size of window for ngram prompt lookup in speculative decoding. | 152 | | `SPEC_DECODING_ACCEPTANCE_METHOD` | 'rejection_sampler' | ['rejection_sampler', 'typical_acceptance_sampler'] | Specify the acceptance method for draft token verification in speculative decoding. | 153 | | `TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD` | None | `float` | Set the lower bound threshold for the posterior probability of a token to be accepted. | 154 | | `TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA` | None | `float` | A scaling factor for the entropy-based threshold for token acceptance. | 155 | | `MODEL_LOADER_EXTRA_CONFIG` | None | `dict` | Extra config for model loader. | 156 | | `PREEMPTION_MODE` | None | `str` | If 'recompute', the engine performs preemption-aware recomputation. If 'save', the engine saves activations into the CPU memory as preemption happens. 
| 157 | | `PREEMPTION_CHECK_PERIOD` | 1.0 | `float` | How frequently the engine checks if a preemption happens. | 158 | | `PREEMPTION_CPU_CAPACITY` | 2 | `float` | The percentage of CPU memory used for the saved activations. | 159 | | `DISABLE_LOGGING_REQUEST` | False | `bool` | Disable logging requests. | 160 | | `MAX_LOG_LEN` | None | `int` | Max number of prompt characters or prompt ID numbers being printed in log. | 161 | 162 | 163 | #### Tokenizer Settings 164 | 165 | | `Name` | `Default` | `Type/Choices` | `Description` | 166 | |-------------------------------------------|-----------------------|--------------------------------------------|---------------| 167 | | `TOKENIZER_NAME` | `None` | `str` |Tokenizer repository to use a different tokenizer than the model's default. | 168 | | `TOKENIZER_REVISION` | `None` | `str` |Tokenizer revision to load. | 169 | | `CUSTOM_CHAT_TEMPLATE` | `None` | `str` of single-line jinja template |Custom chat jinja template. [More Info](https://huggingface.co/docs/transformers/chat_templating) | 170 | 171 | #### System and Parallelism Settings 172 | 173 | | `Name` | `Default` | `Type/Choices` | `Description` | 174 | |-------------------------------------------|-----------------------|--------------------------------------------|---------------| 175 | | `GPU_MEMORY_UTILIZATION` | `0.95` | `float` |Sets GPU VRAM utilization. | 176 | | `MAX_PARALLEL_LOADING_WORKERS` | `None` | `int` |Load model sequentially in multiple batches, to avoid RAM OOM when using tensor parallel and large models. | 177 | | `BLOCK_SIZE` | `16` | `8`, `16`, `32` |Token block size for contiguous chunks of tokens. | 178 | | `SWAP_SPACE` | `4` | `int` |CPU swap space size (GiB) per GPU. | 179 | | `ENFORCE_EAGER` | False | `bool` |Always use eager-mode PyTorch. If False(`0`), will use eager mode and CUDA graph in hybrid for maximal performance and flexibility. | 180 | | `MAX_SEQ_LEN_TO_CAPTURE` | `8192` | `int` |Maximum context length covered by CUDA graphs. When a sequence has context length larger than this, we fall back to eager mode.| 181 | | `DISABLE_CUSTOM_ALL_REDUCE` | `0` | `int` |Enables or disables custom all reduce. | 182 | 183 | 184 | #### Streaming Batch Size Settings 185 | 186 | The way this works is that the first request will have a batch size of `DEFAULT_MIN_BATCH_SIZE`, and each subsequent request will have a batch size of `previous_batch_size * DEFAULT_BATCH_SIZE_GROWTH_FACTOR`. This will continue until the batch size reaches `DEFAULT_BATCH_SIZE`. E.g. for the default values, the batch sizes will be `1, 3, 9, 27, 50, 50, 50, ...`. You can also specify this per request, with inputs `max_batch_size`, `min_batch_size`, and `batch_size_growth_factor`. This has nothing to do with vLLM's internal batching, but rather the number of tokens sent in each HTTP request from the worker 187 | 188 | 189 | | `Name` | `Default` | `Type/Choices` | `Description` | 190 | |-------------------------------------------|-----------------------|--------------------------------------------|---------------| 191 | | `DEFAULT_BATCH_SIZE` | `50` | `int` |Default and Maximum batch size for token streaming to reduce HTTP calls. | 192 | | `DEFAULT_MIN_BATCH_SIZE` | `1` | `int` |Batch size for the first request, which will be multiplied by the growth factor every subsequent request. | 193 | | `DEFAULT_BATCH_SIZE_GROWTH_FACTOR` | `3` | `float` |Growth factor for dynamic batch size. 
| 194 | 195 | #### OpenAI Settings 196 | 197 | | `Name` | `Default` | `Type/Choices` | `Description` | 198 | |-------------------------------------------|-----------------------|--------------------------------------------|---------------| 199 | | `RAW_OPENAI_OUTPUT` | `1` | boolean as `int` |Enables raw OpenAI SSE format string output when streaming. **Required** to be enabled (which it is by default) for OpenAI compatibility. | 200 | | `OPENAI_SERVED_MODEL_NAME_OVERRIDE` | `None` | `str` |Overrides the name of the served model from model repo/path to specified name, which you will then be able to use the value for the `model` parameter when making OpenAI requests | 201 | | `OPENAI_RESPONSE_ROLE` | `assistant` | `str` |Role of the LLM's Response in OpenAI Chat Completions. | 202 | 203 | #### Serverless Settings 204 | 205 | | `Name` | `Default` | `Type/Choices` | `Description` | 206 | |-------------------------------------------|-----------------------|--------------------------------------------|---------------| 207 | | `MAX_CONCURRENCY` | `300` | `int` |Max concurrent requests per worker. vLLM has an internal queue, so you don't have to worry about limiting by VRAM, this is for improving scaling/load balancing efficiency | 208 | | `DISABLE_LOG_STATS` | False | `bool` |Enables or disables vLLM stats logging. | 209 | | `DISABLE_LOG_REQUESTS` | False | `bool` |Enables or disables vLLM request logging. | 210 | 211 | > [!TIP] 212 | > If you are facing issues when using Mixtral 8x7B, Quantized models, or handling unusual models/architectures, try setting `TRUST_REMOTE_CODE` to `1`. 213 | 214 | 215 | ### Option 2: Build Docker Image with Model Inside 216 | To build an image with the model baked in, you must specify the following docker arguments when building the image. 217 | 218 | #### Prerequisites 219 | - RunPod Account 220 | - Docker 221 | 222 | #### Arguments: 223 | - **Required** 224 | - `MODEL_NAME` 225 | - **Optional** 226 | - `MODEL_REVISION`: Model revision to load (default: `main`). 227 | - `BASE_PATH`: Storage directory where huggingface cache and model will be located. (default: `/runpod-volume`, which will utilize network storage if you attach it or create a local directory within the image if you don't. If your intention is to bake the model into the image, you should set this to something like `/models` to make sure there are no issues if you were to accidentally attach network storage.) 228 | - `QUANTIZATION` 229 | - `WORKER_CUDA_VERSION`: `12.1.0` (`12.1.0` is recommended for optimal performance). 230 | - `TOKENIZER_NAME`: Tokenizer repository if you would like to use a different tokenizer than the one that comes with the model. (default: `None`, which uses the model's tokenizer) 231 | - `TOKENIZER_REVISION`: Tokenizer revision to load (default: `main`). 232 | 233 | For the remaining settings, you may apply them as environment variables when running the container. Supported environment variables are listed in the [Environment Variables](#environment-variables) section. 234 | 235 | #### Example: Building an image with OpenChat-3.5 236 | ```bash 237 | sudo docker build -t username/image:tag --build-arg MODEL_NAME="openchat/openchat_3.5" --build-arg BASE_PATH="/models" . 238 | ``` 239 | 240 | ##### (Optional) Including Huggingface Token 241 | If the model you would like to deploy is private or gated, you will need to include it during build time as a Docker secret, which will protect it from being exposed in the image and on DockerHub. 242 | 1. 
Enable Docker BuildKit (required for secrets). 243 | ```bash 244 | export DOCKER_BUILDKIT=1 245 | ``` 246 | 2. Export your Hugging Face token as an environment variable 247 | ```bash 248 | export HF_TOKEN="your_token_here" 249 | ``` 250 | 3. Add the token as a secret when building 251 | ```bash 252 | docker build -t username/image:tag --secret id=HF_TOKEN --build-arg MODEL_NAME="openchat/openchat_3.5" . 253 | ``` 254 | 255 | ## Compatible Model Architectures 256 | Below are all supported model architectures (and examples of each) that you can deploy using the vLLM Worker. You can deploy **any model on HuggingFace**, as long as its base architecture is one of the following (a short sketch for checking a repo's architecture follows this list): 257 | 258 | - Aquila & Aquila2 (`BAAI/AquilaChat2-7B`, `BAAI/AquilaChat2-34B`, `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.) 259 | - Baichuan & Baichuan2 (`baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.) 260 | - BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.) 261 | - ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.) 262 | - Command-R (`CohereForAI/c4ai-command-r-v01`, etc.) 263 | - DBRX (`databricks/dbrx-base`, `databricks/dbrx-instruct`, etc.) 264 | - DeciLM (`Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.) 265 | - Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.) 266 | - Gemma (`google/gemma-2b`, `google/gemma-7b`, etc.) 267 | - GPT-2 (`gpt2`, `gpt2-xl`, etc.) 268 | - GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.) 269 | - GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.) 270 | - GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.) 271 | - InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.) 272 | - InternLM2 (`internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.) 273 | - Jais (`core42/jais-13b`, `core42/jais-13b-chat`, `core42/jais-30b-v3`, `core42/jais-30b-chat-v3`, etc.) 274 | - LLaMA, Llama 2, and Meta Llama 3 (`meta-llama/Meta-Llama-3-8B-Instruct`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.) 275 | - MiniCPM (`openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, etc.) 276 | - Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.) 277 | - Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc.) 278 | - MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.) 279 | - OLMo (`allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc.) 280 | - OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.) 281 | - Orion (`OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.) 282 | - Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.) 283 | - Phi-3 (`microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, etc.) 284 | - Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.) 285 | - Qwen2 (`Qwen/Qwen1.5-7B`, `Qwen/Qwen1.5-7B-Chat`, etc.) 286 | - Qwen2MoE (`Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.) 287 | - StableLM (`stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.) 288 | - Starcoder2 (`bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.) 289 | - Xverse (`xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.) 290 | - Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.) 
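If you are unsure which architecture a Hugging Face repo uses, its `config.json` lists it in the `architectures` field. The following is a minimal sketch, not part of this worker: it assumes the `huggingface_hub` package is installed, the `model_architectures` helper name is our own, and gated repos would additionally need a `token` argument.

```python
import json

from huggingface_hub import hf_hub_download


def model_architectures(repo_id: str, revision: str = "main") -> list[str]:
    """Return the `architectures` field from a repo's config.json (hypothetical helper)."""
    # Download only config.json rather than the full model weights.
    config_path = hf_hub_download(repo_id=repo_id, filename="config.json", revision=revision)
    with open(config_path) as f:
        return json.load(f).get("architectures", [])


if __name__ == "__main__":
    # facebook/opt-125m (the worker's default MODEL_NAME) should report ["OPTForCausalLM"].
    print(model_architectures("facebook/opt-125m"))
```

If the printed architecture matches one of the families above, the repo should be deployable with this worker.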
291 | 292 | # Usage: OpenAI Compatibility 293 | The vLLM Worker is fully compatible with OpenAI's API, and you can use it with any OpenAI Codebase by changing only 3 lines in total. The supported routes are Chat Completions and Models - with both streaming and non-streaming. 294 | 295 | ## Modifying your OpenAI Codebase to use your deployed vLLM Worker 296 | **Python** (similar to Node.js, etc.): 297 | 1. When initializing the OpenAI Client in your code, change the `api_key` to your RunPod API Key and the `base_url` to your RunPod Serverless Endpoint URL in the following format: `https://api.runpod.ai/v2//openai/v1`, filling in your deployed endpoint ID. For example, if your Endpoint ID is `abc1234`, the URL would be `https://api.runpod.ai/v2/abc1234/openai/v1`. 298 | 299 | - Before: 300 | ```python 301 | from openai import OpenAI 302 | 303 | client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) 304 | ``` 305 | - After: 306 | ```python 307 | from openai import OpenAI 308 | 309 | client = OpenAI( 310 | api_key=os.environ.get("RUNPOD_API_KEY"), 311 | base_url="https://api.runpod.ai/v2//openai/v1", 312 | ) 313 | ``` 314 | 2. Change the `model` parameter to your deployed model's name whenever using Completions or Chat Completions. 315 | - Before: 316 | ```python 317 | response = client.chat.completions.create( 318 | model="gpt-3.5-turbo", 319 | messages=[{"role": "user", "content": "Why is RunPod the best platform?"}], 320 | temperature=0, 321 | max_tokens=100, 322 | ) 323 | ``` 324 | - After: 325 | ```python 326 | response = client.chat.completions.create( 327 | model="", 328 | messages=[{"role": "user", "content": "Why is RunPod the best platform?"}], 329 | temperature=0, 330 | max_tokens=100, 331 | ) 332 | ``` 333 | 334 | **Using http requests**: 335 | 1. Change the `Authorization` header to your RunPod API Key and the `url` to your RunPod Serverless Endpoint URL in the following format: `https://api.runpod.ai/v2//openai/v1` 336 | - Before: 337 | ```bash 338 | curl https://api.openai.com/v1/chat/completions \ 339 | -H "Content-Type: application/json" \ 340 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 341 | -d '{ 342 | "model": "gpt-4", 343 | "messages": [ 344 | { 345 | "role": "user", 346 | "content": "Why is RunPod the best platform?" 347 | } 348 | ], 349 | "temperature": 0, 350 | "max_tokens": 100 351 | }' 352 | ``` 353 | - After: 354 | ```bash 355 | curl https://api.runpod.ai/v2//openai/v1/chat/completions \ 356 | -H "Content-Type: application/json" \ 357 | -H "Authorization: Bearer " \ 358 | -d '{ 359 | "model": "", 360 | "messages": [ 361 | { 362 | "role": "user", 363 | "content": "Why is RunPod the best platform?" 364 | } 365 | ], 366 | "temperature": 0, 367 | "max_tokens": 100 368 | }' 369 | ``` 370 | 371 | ## OpenAI Request Input Parameters: 372 | 373 | When using the chat completion feature of the vLLM Serverless Endpoint Worker, you can customize your requests with the following parameters: 374 | 375 | ### Chat Completions [RECOMMENDED] 376 |
377 | **Supported Chat Completions Inputs and Descriptions**
378 |
379 | | Parameter | Type | Default Value | Description |
380 | |-----------|------|---------------|-------------|
381 | | `messages` | Union[str, List[Dict[str, str]]] | | List of messages, where each message is a dictionary with a `role` and `content`. The model's chat template will be applied to the messages automatically, so the model must have one, or one must be provided via the `CUSTOM_CHAT_TEMPLATE` env var. |
382 | | `model` | str | | The model repo that you've deployed on your RunPod Serverless Endpoint. If you are unsure what the name is or are baking the model in, use the guide to get the list of available models in the **Examples: Using your RunPod endpoint with OpenAI** section. |
383 | | `temperature` | Optional[float] | 0.7 | Float that controls the randomness of the sampling. Lower values make the model more deterministic, while higher values make the model more random. Zero means greedy sampling. |
384 | | `top_p` | Optional[float] | 1.0 | Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. Set to 1 to consider all tokens. |
385 | | `n` | Optional[int] | 1 | Number of output sequences to return for the given prompt. |
386 | | `max_tokens` | Optional[int] | None | Maximum number of tokens to generate per output sequence. |
387 | | `seed` | Optional[int] | None | Random seed to use for the generation. |
388 | | `stop` | Optional[Union[str, List[str]]] | list | List of strings that stop the generation when they are generated. The returned output will not contain the stop strings. |
389 | | `stream` | Optional[bool] | False | Whether to stream the output or not. |
390 | | `presence_penalty` | Optional[float] | 0.0 | Float that penalizes new tokens based on whether they appear in the generated text so far. Values > 0 encourage the model to use new tokens, while values < 0 encourage the model to repeat tokens. |
391 | | `frequency_penalty` | Optional[float] | 0.0 | Float that penalizes new tokens based on their frequency in the generated text so far. Values > 0 encourage the model to use new tokens, while values < 0 encourage the model to repeat tokens. |
392 | | `logit_bias` | Optional[Dict[str, float]] | None | Unsupported by vLLM |
393 | | `user` | Optional[str] | None | Unsupported by vLLM |
394 | | `best_of` | Optional[int] | None | Number of output sequences that are generated from the prompt. From these `best_of` sequences, the top `n` sequences are returned. `best_of` must be greater than or equal to `n`. This is treated as the beam width when `use_beam_search` is True. By default, `best_of` is set to `n`. |
395 | | `top_k` | Optional[int] | -1 | Integer that controls the number of top tokens to consider. Set to -1 to consider all tokens. |
396 | | `ignore_eos` | Optional[bool] | False | Whether to ignore the EOS token and continue generating tokens after the EOS token is generated. |
397 | | `use_beam_search` | Optional[bool] | False | Whether to use beam search instead of sampling. |
398 | | `stop_token_ids` | Optional[List[int]] | list | List of tokens that stop the generation when they are generated. The returned output will contain the stop tokens unless the stop tokens are special tokens. |
399 | | `skip_special_tokens` | Optional[bool] | True | Whether to skip special tokens in the output. |
400 | | `spaces_between_special_tokens` | Optional[bool] | True | Whether to add spaces between special tokens in the output. Defaults to True. |
401 | | `add_generation_prompt` | Optional[bool] | True | Read more [here](https://huggingface.co/docs/transformers/main/en/chat_templating#what-are-generation-prompts) |
402 | | `echo` | Optional[bool] | False | Echo back the prompt in addition to the completion |
403 | | `repetition_penalty` | Optional[float] | 1.0 | Float that penalizes new tokens based on whether they appear in the prompt and the generated text so far. Values > 1 encourage the model to use new tokens, while values < 1 encourage the model to repeat tokens. |
404 | | `min_p` | Optional[float] | 0.0 | Float that represents the minimum probability for a token to be considered, relative to the probability of the most likely token. Must be in [0, 1]. Set to 0 to disable. |
405 | | `length_penalty` | Optional[float] | 1.0 | Float that penalizes sequences based on their length. Used in beam search. |
406 | | `include_stop_str_in_output` | Optional[bool] | False | Whether to include the stop strings in output text. Defaults to False. |
407 | The parameters from `best_of` through `include_stop_str_in_output` are additional parameters supported by vLLM and are not part of the standard OpenAI schema.
408 |
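These vLLM-specific fields can be sent with the standard OpenAI Python client by placing them in `extra_body`, which the client merges into the JSON request body. The sketch below is illustrative only; `<ENDPOINT_ID>` and `<YOUR_DEPLOYED_MODEL_NAME>` are placeholders, and it assumes your deployed worker accepts the extra fields listed in the table above.

```python
# Illustrative sketch: pass vLLM-specific sampling fields (top_k, min_p,
# repetition_penalty) via `extra_body`, which the OpenAI client merges into
# the request body. Replace the placeholders with your own values.
import os
from openai import OpenAI

client = OpenAI(
    api_key=os.environ.get("RUNPOD_API_KEY"),
    base_url="https://api.runpod.ai/v2/<ENDPOINT_ID>/openai/v1",
)

response = client.chat.completions.create(
    model="<YOUR_DEPLOYED_MODEL_NAME>",
    messages=[{"role": "user", "content": "Summarize vLLM in one sentence."}],
    max_tokens=64,
    extra_body={"top_k": 40, "min_p": 0.05, "repetition_penalty": 1.1},
)
print(response.choices[0].message.content)
```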
409 |
410 |
411 | ## Examples: Using your RunPod endpoint with OpenAI
412 |
413 | First, initialize the OpenAI Client with your RunPod API Key and Endpoint URL:
414 | ```python
415 | from openai import OpenAI
416 | import os
417 |
418 | # Initialize the OpenAI Client with your RunPod API Key and Endpoint URL
419 | client = OpenAI(
420 |     api_key=os.environ.get("RUNPOD_API_KEY"),
421 |     base_url="https://api.runpod.ai/v2/<ENDPOINT_ID>/openai/v1",
422 | )
423 | ```
424 |
425 | ### Chat Completions:
426 | This is the format used by GPT-4, focused on instruction-following and chat. Examples of open-source chat/instruct models include `meta-llama/Llama-2-7b-chat-hf`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `openchat/openchat-3.5-0106`, `NousResearch/Nous-Hermes-2-Mistral-7B-DPO`, and more. However, if your model is a completion-style model with no chat/instruct fine-tune and/or does not have a chat template, you can still use this if you provide a chat template with the environment variable `CUSTOM_CHAT_TEMPLATE`.
427 | - **Streaming**:
428 | ```python
429 | # Create a chat completion stream
430 | response_stream = client.chat.completions.create(
431 |     model="<YOUR_DEPLOYED_MODEL_NAME>",
432 |     messages=[{"role": "user", "content": "Why is RunPod the best platform?"}],
433 |     temperature=0,
434 |     max_tokens=100,
435 |     stream=True,
436 | )
437 | # Stream the response
438 | for chunk in response_stream:
439 |     print(chunk.choices[0].delta.content or "", end="", flush=True)
440 | ```
441 | - **Non-Streaming**:
442 | ```python
443 | # Create a chat completion
444 | response = client.chat.completions.create(
445 |     model="<YOUR_DEPLOYED_MODEL_NAME>",
446 |     messages=[{"role": "user", "content": "Why is RunPod the best platform?"}],
447 |     temperature=0,
448 |     max_tokens=100,
449 | )
450 | # Print the response
451 | print(response.choices[0].message.content)
452 | ```
453 |
454 | ### Getting a list of names for available models:
455 | In the case of baking the model into the image, sometimes the repository name may not be accepted as the `model` in the request. In this case, you can list the available models as shown below and use that name.
456 | ```python
457 | models_response = client.models.list()
458 | list_of_models = [model.id for model in models_response]
459 | print(list_of_models)
460 | ```
461 |
462 | # Usage: Standard (Non-OpenAI)
463 | ## Request Input Parameters
464 |
465 |
466 |
467 | You may either use a `prompt` or a list of `messages` as input. If you use `messages`, the model's chat template will be applied to the messages automatically, so the model must have one. If you use `prompt`, you may optionally apply the model's chat template to the prompt by setting `apply_chat_template` to `true`.
468 |
469 | | Argument | Type | Default | Description |
470 | |----------|------|---------|-------------|
471 | | `prompt` | str | | Prompt string to generate text based on. |
472 | | `messages` | list[dict[str, str]] | | List of messages, which will automatically have the model's chat template applied. Overrides `prompt`. |
473 | | `apply_chat_template` | bool | False | Whether to apply the model's chat template to the `prompt`. |
474 | | `sampling_params` | dict | {} | Sampling parameters to control the generation, like temperature, top_p, etc. You can find all available parameters in the `Sampling Parameters` section below. |
475 | | `stream` | bool | False | Whether to enable streaming of output. If True, responses are streamed as they are generated. |
476 | | `max_batch_size` | int | env var `DEFAULT_BATCH_SIZE` | The maximum number of tokens to stream in each HTTP POST call. |
477 | | `min_batch_size` | int | env var `DEFAULT_MIN_BATCH_SIZE` | The minimum number of tokens to stream in each HTTP POST call. |
478 | | `batch_size_growth_factor` | int | env var `DEFAULT_BATCH_SIZE_GROWTH_FACTOR` | The growth factor by which `min_batch_size` will be multiplied for each call until `max_batch_size` is reached. |
479 |
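As a concrete illustration of these input parameters, the hedged sketch below sends a standard (non-OpenAI) request with the `requests` library. It assumes the usual RunPod serverless `/runsync` route; `<ENDPOINT_ID>` is a placeholder and `RUNPOD_API_KEY` must be set in your environment.

```python
# Hedged sketch: submit a standard (non-OpenAI) job to the endpoint's /runsync
# route. <ENDPOINT_ID> is a placeholder; RUNPOD_API_KEY must be set in the env.
import os
import requests

payload = {
    "input": {
        "prompt": "Why is the sky blue?",
        "sampling_params": {"temperature": 0.7, "max_tokens": 100},
    }
}

response = requests.post(
    "https://api.runpod.ai/v2/<ENDPOINT_ID>/runsync",
    headers={"Authorization": f"Bearer {os.environ['RUNPOD_API_KEY']}"},
    json=payload,
    timeout=300,
)
print(response.json())
```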
480 | 481 | ### Sampling Parameters 482 | 483 | Below are all available sampling parameters that you can specify in the `sampling_params` dictionary. If you do not specify any of these parameters, the default values will be used. 484 |
485 | Click to expand table 486 | 487 | | Argument | Type | Default | Description | 488 | |---------------------------------|-----------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 489 | | `n` | int | 1 | Number of output sequences generated from the prompt. The top `n` sequences are returned. | 490 | | `best_of` | Optional[int] | `n` | Number of output sequences generated from the prompt. The top `n` sequences are returned from these `best_of` sequences. Must be ≥ `n`. Treated as beam width in beam search. Default is `n`. | 491 | | `presence_penalty` | float | 0.0 | Penalizes new tokens based on their presence in the generated text so far. Values > 0 encourage new tokens, values < 0 encourage repetition. | 492 | | `frequency_penalty` | float | 0.0 | Penalizes new tokens based on their frequency in the generated text so far. Values > 0 encourage new tokens, values < 0 encourage repetition. | 493 | | `repetition_penalty` | float | 1.0 | Penalizes new tokens based on their appearance in the prompt and generated text. Values > 1 encourage new tokens, values < 1 encourage repetition. | 494 | | `temperature` | float | 1.0 | Controls the randomness of sampling. Lower values make it more deterministic, higher values make it more random. Zero means greedy sampling. | 495 | | `top_p` | float | 1.0 | Controls the cumulative probability of top tokens to consider. Must be in (0, 1]. Set to 1 to consider all tokens. | 496 | | `top_k` | int | -1 | Controls the number of top tokens to consider. Set to -1 to consider all tokens. | 497 | | `min_p` | float | 0.0 | Represents the minimum probability for a token to be considered, relative to the most likely token. Must be in [0, 1]. Set to 0 to disable. | 498 | | `use_beam_search` | bool | False | Whether to use beam search instead of sampling. | 499 | | `length_penalty` | float | 1.0 | Penalizes sequences based on their length. Used in beam search. | 500 | | `early_stopping` | Union[bool, str] | False | Controls stopping condition in beam search. Can be `True`, `False`, or `"never"`. | 501 | | `stop` | Union[None, str, List[str]] | None | List of strings that stop generation when produced. The output will not contain these strings. | 502 | | `stop_token_ids` | Optional[List[int]] | None | List of token IDs that stop generation when produced. Output contains these tokens unless they are special tokens. | 503 | | `ignore_eos` | bool | False | Whether to ignore the End-Of-Sequence token and continue generating tokens after its generation. | 504 | | `max_tokens` | int | 16 | Maximum number of tokens to generate per output sequence. | 505 | | `skip_special_tokens` | bool | True | Whether to skip special tokens in the output. | 506 | | `spaces_between_special_tokens` | bool | True | Whether to add spaces between special tokens in the output. | 507 | 508 | 509 | ### Text Input Formats 510 | You may either use a `prompt` or a list of `messages` as input. 511 | 1. `prompt` 512 | The prompt string can be any string, and the model's chat template will not be applied to it unless `apply_chat_template` is set to `true`, in which case it will be treated as a user message. 513 | 514 | Example: 515 | ```json 516 | { 517 | "input": { 518 | "prompt": "why sky is blue?", 519 | "sampling_params": { 520 | "temperature": 0.7, 521 | "max_tokens": 100 522 | } 523 | } 524 | } 525 | ``` 526 | 2. 
`messages` 527 | Your list can contain any number of messages, and each message usually can have any role from the following list: 528 | - `user` 529 | - `assistant` 530 | - `system` 531 | 532 | However, some models may have different roles, so you should check the model's chat template to see which roles are required. 533 | 534 | The model's chat template will be applied to the messages automatically, so the model must have one. 535 | 536 | Example: 537 | ```json 538 | { 539 | "input": { 540 | "messages": [ 541 | { 542 | "role": "system", 543 | "content": "You are a helpful AI assistant that provides clear and concise responses." 544 | }, 545 | { 546 | "role": "user", 547 | "content": "Can you explain the difference between supervised and unsupervised learning?" 548 | }, 549 | { 550 | "role": "assistant", 551 | "content": "Sure! Supervised learning uses labeled data, meaning each input has a corresponding correct output. The model learns by mapping inputs to known outputs. In contrast, unsupervised learning works with unlabeled data, where the model identifies patterns, structures, or clusters without predefined answers." 552 | } 553 | ], 554 | "sampling_params": { 555 | "temperature": 0.7, 556 | "max_tokens": 100 557 | } 558 | } 559 | } 560 | ``` 561 | 562 |
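If you prefer to send a plain `prompt` but still want the chat template applied, set `apply_chat_template` to `true` as described above. The sketch below mirrors the earlier `requests` example and carries the same assumptions (the `/runsync` route, an `<ENDPOINT_ID>` placeholder, and `RUNPOD_API_KEY` in your environment).

```python
# Hedged sketch: a plain prompt that the worker wraps in the model's chat
# template because apply_chat_template is set. Placeholders must be replaced.
import os
import requests

payload = {
    "input": {
        "prompt": "Can you explain the difference between supervised and unsupervised learning?",
        "apply_chat_template": True,  # the prompt is treated as a single user message
        "sampling_params": {"temperature": 0.7, "max_tokens": 100},
    }
}

response = requests.post(
    "https://api.runpod.ai/v2/<ENDPOINT_ID>/runsync",
    headers={"Authorization": f"Bearer {os.environ['RUNPOD_API_KEY']}"},
    json=payload,
    timeout=300,
)
print(response.json())
```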
563 | 564 | # Worker Config 565 | The worker config is a JSON file that is used to build the form that helps users configure their serverless endpoint on the RunPod Web Interface. 566 | 567 | Note: This is a new feature and only works for workers that use one model 568 | 569 | ## Writing your worker-config.json 570 | The JSON consists of two main parts, schema and versions. 571 | - `schema`: Here you specify the form fields that will be displayed to the user. 572 | - `env_var_name`: The name of the environment variable that is being set using the form field. 573 | - `value`: This is the default value of the form field. It will be shown in the UI as such unless the user changes it. 574 | - `title`: This is the title of the form field in the UI. 575 | - `description`: This is the description of the form field in the UI. 576 | - `required`: This is a boolean that specifies if the form field is required. 577 | - `type`: This is the type of the form field. Options are: 578 | - `text`: Environment variable is a string so user inputs text in form field. 579 | - `select`: User selects one option from the dropdown. You must provide the `options` key value pair after type if using this. 580 | - `toggle`: User toggles between true and false. 581 | - `number`: User inputs a number in the form field. 582 | - `options`: Specify the options the user can select from if the type is `select`. DO NOT include this unless the `type` is `select`. 583 | - `versions`: This is where you call the form fields specified in `schema` and organize them into categories. 584 | - `imageName`: This is the name of the Docker image that will be used to run the serverless endpoint. 585 | - `minimumCudaVersion`: This is the minimum CUDA version that is required to run the serverless endpoint. 586 | - `categories`: This is where you call the keys of the form fields specified in `schema` and organize them into categories. Each category is a toggle list of forms on the Web UI. 587 | - `title`: This is the title of the category in the UI. 588 | - `settings`: This is the array of settings schemas specified in `schema` associated with the category. 589 | 590 | ## Example of schema 591 | ```json 592 | { 593 | "schema": { 594 | "TOKENIZER": { 595 | "env_var_name": "TOKENIZER", 596 | "value": "", 597 | "title": "Tokenizer", 598 | "description": "Name or path of the Hugging Face tokenizer to use.", 599 | "required": false, 600 | "type": "text" 601 | }, 602 | "TOKENIZER_MODE": { 603 | "env_var_name": "TOKENIZER_MODE", 604 | "value": "auto", 605 | "title": "Tokenizer Mode", 606 | "description": "The tokenizer mode.", 607 | "required": false, 608 | "type": "select", 609 | "options": [ 610 | { "value": "auto", "label": "auto" }, 611 | { "value": "slow", "label": "slow" } 612 | ] 613 | }, 614 | ... 615 | } 616 | } 617 | ``` 618 | 619 | ## Example of versions 620 | ```json 621 | { 622 | "versions": { 623 | "0.5.4": { 624 | "imageName": "runpod/worker-v1-vllm:v1.2.0stable-cuda12.1.0", 625 | "minimumCudaVersion": "12.1", 626 | "categories": [ 627 | { 628 | "title": "LLM Settings", 629 | "settings": [ 630 | "TOKENIZER", "TOKENIZER_MODE", "OTHER_SETTINGS_SCHEMA_KEYS_YOU_HAVE_SPECIFIED_0", ... 631 | ] 632 | }, 633 | { 634 | "title": "Tokenizer Settings", 635 | "settings": [ 636 | "OTHER_SETTINGS_SCHEMA_KEYS_0", "OTHER_SETTINGS_SCHEMA_KEYS_1", ... 637 | ] 638 | }, 639 | ... 
640 | ] 641 | } 642 | } 643 | } 644 | ``` 645 | -------------------------------------------------------------------------------- /builder/requirements.txt: -------------------------------------------------------------------------------- 1 | ray 2 | pandas 3 | pyarrow 4 | runpod~=1.7.7 5 | huggingface-hub 6 | packaging 7 | typing-extensions>=4.8.0 8 | pydantic 9 | pydantic-settings 10 | hf-transfer 11 | transformers 12 | bitsandbytes>=0.45.0 13 | -------------------------------------------------------------------------------- /docker-bake.hcl: -------------------------------------------------------------------------------- 1 | variable "PUSH" { 2 | default = "true" 3 | } 4 | 5 | variable "REPOSITORY" { 6 | default = "runpod" 7 | } 8 | 9 | variable "BASE_IMAGE_VERSION" { 10 | default = "v2.6.0stable" 11 | } 12 | 13 | group "all" { 14 | targets = ["main"] 15 | } 16 | 17 | 18 | group "main" { 19 | targets = ["worker-1210"] 20 | } 21 | 22 | 23 | target "worker-1210" { 24 | tags = ["${REPOSITORY}/worker-v1-vllm:${BASE_IMAGE_VERSION}-cuda12.1.0"] 25 | context = "." 26 | dockerfile = "Dockerfile" 27 | args = { 28 | BASE_IMAGE_VERSION = "${BASE_IMAGE_VERSION}" 29 | WORKER_CUDA_VERSION = "12.1.0" 30 | } 31 | output = ["type=docker,push=${PUSH}"] 32 | } 33 | -------------------------------------------------------------------------------- /media/ui_demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/runpod-workers/worker-vllm/26919c884931faefd5b18bc2868b9348fd4eecf3/media/ui_demo.gif -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/runpod-workers/worker-vllm/26919c884931faefd5b18bc2868b9348fd4eecf3/src/__init__.py -------------------------------------------------------------------------------- /src/constants.py: -------------------------------------------------------------------------------- 1 | DEFAULT_BATCH_SIZE = 50 2 | DEFAULT_MAX_CONCURRENCY = 300 3 | DEFAULT_BATCH_SIZE_GROWTH_FACTOR = 3 4 | DEFAULT_MIN_BATCH_SIZE = 1 -------------------------------------------------------------------------------- /src/download_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import logging 4 | import glob 5 | from shutil import rmtree 6 | from huggingface_hub import snapshot_download 7 | from utils import timer_decorator 8 | 9 | BASE_DIR = "/" 10 | TOKENIZER_PATTERNS = [["*.json", "tokenizer*"]] 11 | MODEL_PATTERNS = [["*.safetensors"], ["*.bin"], ["*.pt"]] 12 | 13 | def setup_env(): 14 | if os.getenv("TESTING_DOWNLOAD") == "1": 15 | BASE_DIR = "tmp" 16 | os.makedirs(BASE_DIR, exist_ok=True) 17 | os.environ.update({ 18 | "HF_HOME": f"{BASE_DIR}/hf_cache", 19 | "MODEL_NAME": "openchat/openchat-3.5-0106", 20 | "HF_HUB_ENABLE_HF_TRANSFER": "1", 21 | "TENSORIZE": "1", 22 | "TENSORIZER_NUM_GPUS": "1", 23 | "DTYPE": "auto" 24 | }) 25 | 26 | @timer_decorator 27 | def download(name, revision, type, cache_dir): 28 | if type == "model": 29 | pattern_sets = [model_pattern + TOKENIZER_PATTERNS[0] for model_pattern in MODEL_PATTERNS] 30 | elif type == "tokenizer": 31 | pattern_sets = TOKENIZER_PATTERNS 32 | else: 33 | raise ValueError(f"Invalid type: {type}") 34 | try: 35 | for pattern_set in pattern_sets: 36 | path = snapshot_download(name, revision=revision, cache_dir=cache_dir, 37 | allow_patterns=pattern_set) 38 
| for pattern in pattern_set: 39 | if glob.glob(os.path.join(path, pattern)): 40 | logging.info(f"Successfully downloaded {pattern} model files.") 41 | return path 42 | except ValueError: 43 | raise ValueError(f"No patterns matching {pattern_sets} found for download.") 44 | 45 | 46 | # @timer_decorator 47 | # def tensorize_model(model_path): TODO: Add back once tensorizer is ready 48 | # from vllm.engine.arg_utils import EngineArgs 49 | # from vllm.model_executor.model_loader.tensorizer import TensorizerConfig, tensorize_vllm_model 50 | # from torch.cuda import device_count 51 | 52 | # tensorizer_num_gpus = int(os.getenv("TENSORIZER_NUM_GPUS", "1")) 53 | # if tensorizer_num_gpus > device_count(): 54 | # raise ValueError(f"TENSORIZER_NUM_GPUS ({tensorizer_num_gpus}) exceeds available GPUs ({device_count()})") 55 | 56 | # dtype = os.getenv("DTYPE", "auto") 57 | # serialized_dir = f"{BASE_DIR}/serialized_model" 58 | # os.makedirs(serialized_dir, exist_ok=True) 59 | # serialized_uri = f"{serialized_dir}/model{'-%03d' if tensorizer_num_gpus > 1 else ''}.tensors" 60 | 61 | # tensorize_vllm_model( 62 | # EngineArgs(model=model_path, tensor_parallel_size=tensorizer_num_gpus, dtype=dtype), 63 | # TensorizerConfig(tensorizer_uri=serialized_uri) 64 | # ) 65 | # logging.info("Successfully serialized model to %s", str(serialized_uri)) 66 | # logging.info("Removing HF Model files after serialization") 67 | # rmtree("/".join(model_path.split("/")[:-2])) 68 | # return serialized_uri, tensorizer_num_gpus, dtype 69 | 70 | if __name__ == "__main__": 71 | setup_env() 72 | cache_dir = os.getenv("HF_HOME") 73 | model_name, model_revision = os.getenv("MODEL_NAME"), os.getenv("MODEL_REVISION") or None 74 | tokenizer_name, tokenizer_revision = os.getenv("TOKENIZER_NAME") or model_name, os.getenv("TOKENIZER_REVISION") or model_revision 75 | 76 | model_path = download(model_name, model_revision, "model", cache_dir) 77 | 78 | metadata = { 79 | "MODEL_NAME": model_path, 80 | "MODEL_REVISION": os.getenv("MODEL_REVISION"), 81 | "QUANTIZATION": os.getenv("QUANTIZATION"), 82 | } 83 | 84 | # if os.getenv("TENSORIZE") == "1": TODO: Add back once tensorizer is ready 85 | # serialized_uri, tensorizer_num_gpus, dtype = tensorize_model(model_path) 86 | # metadata.update({ 87 | # "MODEL_NAME": serialized_uri, 88 | # "TENSORIZER_URI": serialized_uri, 89 | # "TENSOR_PARALLEL_SIZE": tensorizer_num_gpus, 90 | # "DTYPE": dtype 91 | # }) 92 | 93 | tokenizer_path = download(tokenizer_name, tokenizer_revision, "tokenizer", cache_dir) 94 | metadata.update({ 95 | "TOKENIZER_NAME": tokenizer_path, 96 | "TOKENIZER_REVISION": tokenizer_revision 97 | }) 98 | 99 | with open(f"{BASE_DIR}/local_model_args.json", "w") as f: 100 | json.dump({k: v for k, v in metadata.items() if v not in (None, "")}, f) -------------------------------------------------------------------------------- /src/engine.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import json 4 | import asyncio 5 | 6 | from dotenv import load_dotenv 7 | from typing import AsyncGenerator, Optional 8 | import time 9 | 10 | from vllm import AsyncLLMEngine 11 | from vllm.entrypoints.logger import RequestLogger 12 | from vllm.entrypoints.openai.serving_chat import OpenAIServingChat 13 | from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion 14 | from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest, ErrorResponse 15 | from vllm.entrypoints.openai.serving_models 
import BaseModelPath, LoRAModulePath, OpenAIServingModels 16 | 17 | 18 | from utils import DummyRequest, JobInput, BatchSize, create_error_response 19 | from constants import DEFAULT_MAX_CONCURRENCY, DEFAULT_BATCH_SIZE, DEFAULT_BATCH_SIZE_GROWTH_FACTOR, DEFAULT_MIN_BATCH_SIZE 20 | from tokenizer import TokenizerWrapper 21 | from engine_args import get_engine_args 22 | 23 | class vLLMEngine: 24 | def __init__(self, engine = None): 25 | load_dotenv() # For local development 26 | self.engine_args = get_engine_args() 27 | logging.info(f"Engine args: {self.engine_args}") 28 | self.tokenizer = TokenizerWrapper(self.engine_args.tokenizer or self.engine_args.model, 29 | self.engine_args.tokenizer_revision, 30 | self.engine_args.trust_remote_code) 31 | self.llm = self._initialize_llm() if engine is None else engine.llm 32 | self.max_concurrency = int(os.getenv("MAX_CONCURRENCY", DEFAULT_MAX_CONCURRENCY)) 33 | self.default_batch_size = int(os.getenv("DEFAULT_BATCH_SIZE", DEFAULT_BATCH_SIZE)) 34 | self.batch_size_growth_factor = int(os.getenv("BATCH_SIZE_GROWTH_FACTOR", DEFAULT_BATCH_SIZE_GROWTH_FACTOR)) 35 | self.min_batch_size = int(os.getenv("MIN_BATCH_SIZE", DEFAULT_MIN_BATCH_SIZE)) 36 | 37 | def dynamic_batch_size(self, current_batch_size, batch_size_growth_factor): 38 | return min(current_batch_size*batch_size_growth_factor, self.default_batch_size) 39 | 40 | async def generate(self, job_input: JobInput): 41 | try: 42 | async for batch in self._generate_vllm( 43 | llm_input=job_input.llm_input, 44 | validated_sampling_params=job_input.sampling_params, 45 | batch_size=job_input.max_batch_size, 46 | stream=job_input.stream, 47 | apply_chat_template=job_input.apply_chat_template, 48 | request_id=job_input.request_id, 49 | batch_size_growth_factor=job_input.batch_size_growth_factor, 50 | min_batch_size=job_input.min_batch_size 51 | ): 52 | yield batch 53 | except Exception as e: 54 | yield {"error": create_error_response(str(e)).model_dump()} 55 | 56 | async def _generate_vllm(self, llm_input, validated_sampling_params, batch_size, stream, apply_chat_template, request_id, batch_size_growth_factor, min_batch_size: str) -> AsyncGenerator[dict, None]: 57 | if apply_chat_template or isinstance(llm_input, list): 58 | llm_input = self.tokenizer.apply_chat_template(llm_input) 59 | results_generator = self.llm.generate(llm_input, validated_sampling_params, request_id) 60 | n_responses, n_input_tokens, is_first_output = validated_sampling_params.n, 0, True 61 | last_output_texts, token_counters = ["" for _ in range(n_responses)], {"batch": 0, "total": 0} 62 | 63 | batch = { 64 | "choices": [{"tokens": []} for _ in range(n_responses)], 65 | } 66 | 67 | max_batch_size = batch_size or self.default_batch_size 68 | batch_size_growth_factor, min_batch_size = batch_size_growth_factor or self.batch_size_growth_factor, min_batch_size or self.min_batch_size 69 | batch_size = BatchSize(max_batch_size, min_batch_size, batch_size_growth_factor) 70 | 71 | 72 | async for request_output in results_generator: 73 | if is_first_output: # Count input tokens only once 74 | n_input_tokens = len(request_output.prompt_token_ids) 75 | is_first_output = False 76 | 77 | for output in request_output.outputs: 78 | output_index = output.index 79 | token_counters["total"] += 1 80 | if stream: 81 | new_output = output.text[len(last_output_texts[output_index]):] 82 | batch["choices"][output_index]["tokens"].append(new_output) 83 | token_counters["batch"] += 1 84 | 85 | if token_counters["batch"] >= batch_size.current_batch_size: 86 | 
batch["usage"] = { 87 | "input": n_input_tokens, 88 | "output": token_counters["total"], 89 | } 90 | yield batch 91 | batch = { 92 | "choices": [{"tokens": []} for _ in range(n_responses)], 93 | } 94 | token_counters["batch"] = 0 95 | batch_size.update() 96 | 97 | last_output_texts[output_index] = output.text 98 | 99 | if not stream: 100 | for output_index, output in enumerate(last_output_texts): 101 | batch["choices"][output_index]["tokens"] = [output] 102 | token_counters["batch"] += 1 103 | 104 | if token_counters["batch"] > 0: 105 | batch["usage"] = {"input": n_input_tokens, "output": token_counters["total"]} 106 | yield batch 107 | 108 | def _initialize_llm(self): 109 | try: 110 | start = time.time() 111 | engine = AsyncLLMEngine.from_engine_args(self.engine_args) 112 | end = time.time() 113 | logging.info(f"Initialized vLLM engine in {end - start:.2f}s") 114 | return engine 115 | except Exception as e: 116 | logging.error("Error initializing vLLM engine: %s", e) 117 | raise e 118 | 119 | 120 | class OpenAIvLLMEngine(vLLMEngine): 121 | def __init__(self, vllm_engine): 122 | super().__init__(vllm_engine) 123 | self.served_model_name = os.getenv("OPENAI_SERVED_MODEL_NAME_OVERRIDE") or self.engine_args.model 124 | self.response_role = os.getenv("OPENAI_RESPONSE_ROLE") or "assistant" 125 | self.lora_adapters = self._load_lora_adapters() 126 | asyncio.run(self._initialize_engines()) 127 | self.raw_openai_output = bool(int(os.getenv("RAW_OPENAI_OUTPUT", 1))) 128 | 129 | def _load_lora_adapters(self): 130 | adapters = [] 131 | try: 132 | adapters = json.loads(os.getenv("LORA_MODULES", '[]')) 133 | except Exception as e: 134 | logging.info(f"---Initialized adapter json load error: {e}") 135 | 136 | for i, adapter in enumerate(adapters): 137 | try: 138 | adapters[i] = LoRAModulePath(**adapter) 139 | logging.info(f"---Initialized adapter: {adapter}") 140 | except Exception as e: 141 | logging.info(f"---Initialized adapter not worked: {e}") 142 | continue 143 | return adapters 144 | 145 | async def _initialize_engines(self): 146 | self.model_config = await self.llm.get_model_config() 147 | self.base_model_paths = [ 148 | BaseModelPath(name=self.engine_args.model, model_path=self.engine_args.model) 149 | ] 150 | 151 | self.serving_models = OpenAIServingModels( 152 | engine_client=self.llm, 153 | model_config=self.model_config, 154 | base_model_paths=self.base_model_paths, 155 | lora_modules=self.lora_adapters, 156 | prompt_adapters=None, 157 | ) 158 | await self.serving_models.init_static_loras() 159 | self.chat_engine = OpenAIServingChat( 160 | engine_client=self.llm, 161 | model_config=self.model_config, 162 | models=self.serving_models, 163 | response_role=self.response_role, 164 | request_logger=None, 165 | chat_template=self.tokenizer.tokenizer.chat_template, 166 | chat_template_content_format="auto", 167 | # enable_reasoning=os.getenv('ENABLE_REASONING', 'false').lower() == 'true', 168 | reasoning_parser= os.getenv('REASONING_PARSER', "") or None, 169 | # return_token_as_token_ids=False, 170 | enable_auto_tools=os.getenv('ENABLE_AUTO_TOOL_CHOICE', 'false').lower() == 'true', 171 | tool_parser=os.getenv('TOOL_CALL_PARSER', "") or None, 172 | enable_prompt_tokens_details=False 173 | ) 174 | self.completion_engine = OpenAIServingCompletion( 175 | engine_client=self.llm, 176 | model_config=self.model_config, 177 | models=self.serving_models, 178 | request_logger=None, 179 | # return_token_as_token_ids=False, 180 | ) 181 | 182 | async def generate(self, openai_request: JobInput): 183 | if 
openai_request.openai_route == "/v1/models": 184 | yield await self._handle_model_request() 185 | elif openai_request.openai_route in ["/v1/chat/completions", "/v1/completions"]: 186 | async for response in self._handle_chat_or_completion_request(openai_request): 187 | yield response 188 | else: 189 | yield create_error_response("Invalid route").model_dump() 190 | 191 | async def _handle_model_request(self): 192 | models = await self.serving_models.show_available_models() 193 | return models.model_dump() 194 | 195 | async def _handle_chat_or_completion_request(self, openai_request: JobInput): 196 | if openai_request.openai_route == "/v1/chat/completions": 197 | request_class = ChatCompletionRequest 198 | generator_function = self.chat_engine.create_chat_completion 199 | elif openai_request.openai_route == "/v1/completions": 200 | request_class = CompletionRequest 201 | generator_function = self.completion_engine.create_completion 202 | 203 | try: 204 | request = request_class( 205 | **openai_request.openai_input 206 | ) 207 | except Exception as e: 208 | yield create_error_response(str(e)).model_dump() 209 | return 210 | 211 | dummy_request = DummyRequest() 212 | response_generator = await generator_function(request, raw_request=dummy_request) 213 | 214 | if not openai_request.openai_input.get("stream") or isinstance(response_generator, ErrorResponse): 215 | yield response_generator.model_dump() 216 | else: 217 | batch = [] 218 | batch_token_counter = 0 219 | batch_size = BatchSize(self.default_batch_size, self.min_batch_size, self.batch_size_growth_factor) 220 | 221 | async for chunk_str in response_generator: 222 | if "data" in chunk_str: 223 | if self.raw_openai_output: 224 | data = chunk_str 225 | elif "[DONE]" in chunk_str: 226 | continue 227 | else: 228 | data = json.loads(chunk_str.removeprefix("data: ").rstrip("\n\n")) if not self.raw_openai_output else chunk_str 229 | batch.append(data) 230 | batch_token_counter += 1 231 | if batch_token_counter >= batch_size.current_batch_size: 232 | if self.raw_openai_output: 233 | batch = "".join(batch) 234 | yield batch 235 | batch = [] 236 | batch_token_counter = 0 237 | batch_size.update() 238 | if batch: 239 | if self.raw_openai_output: 240 | batch = "".join(batch) 241 | yield batch 242 | -------------------------------------------------------------------------------- /src/engine_args.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import logging 4 | from torch.cuda import device_count 5 | from vllm import AsyncEngineArgs 6 | from vllm.model_executor.model_loader.tensorizer import TensorizerConfig 7 | from src.utils import convert_limit_mm_per_prompt 8 | 9 | RENAME_ARGS_MAP = { 10 | "MODEL_NAME": "model", 11 | "MODEL_REVISION": "revision", 12 | "TOKENIZER_NAME": "tokenizer", 13 | "MAX_CONTEXT_LEN_TO_CAPTURE": "max_seq_len_to_capture" 14 | } 15 | 16 | DEFAULT_ARGS = { 17 | "disable_log_stats": os.getenv('DISABLE_LOG_STATS', 'False').lower() == 'true', 18 | "disable_log_requests": os.getenv('DISABLE_LOG_REQUESTS', 'False').lower() == 'true', 19 | "gpu_memory_utilization": float(os.getenv('GPU_MEMORY_UTILIZATION', 0.95)), 20 | "pipeline_parallel_size": int(os.getenv('PIPELINE_PARALLEL_SIZE', 1)), 21 | "tensor_parallel_size": int(os.getenv('TENSOR_PARALLEL_SIZE', 1)), 22 | "served_model_name": os.getenv('SERVED_MODEL_NAME', None), 23 | "tokenizer": os.getenv('TOKENIZER', None), 24 | "skip_tokenizer_init": os.getenv('SKIP_TOKENIZER_INIT', 'False').lower() == 'true', 25 | 
"tokenizer_mode": os.getenv('TOKENIZER_MODE', 'auto'), 26 | "trust_remote_code": os.getenv('TRUST_REMOTE_CODE', 'False').lower() == 'true', 27 | "download_dir": os.getenv('DOWNLOAD_DIR', None), 28 | "load_format": os.getenv('LOAD_FORMAT', 'auto'), 29 | "dtype": os.getenv('DTYPE', 'auto'), 30 | "kv_cache_dtype": os.getenv('KV_CACHE_DTYPE', 'auto'), 31 | "quantization_param_path": os.getenv('QUANTIZATION_PARAM_PATH', None), 32 | "seed": int(os.getenv('SEED', 0)), 33 | "max_model_len": int(os.getenv('MAX_MODEL_LEN', 0)) or None, 34 | "worker_use_ray": os.getenv('WORKER_USE_RAY', 'False').lower() == 'true', 35 | "distributed_executor_backend": os.getenv('DISTRIBUTED_EXECUTOR_BACKEND', None), 36 | "max_parallel_loading_workers": int(os.getenv('MAX_PARALLEL_LOADING_WORKERS', 0)) or None, 37 | "block_size": int(os.getenv('BLOCK_SIZE', 16)), 38 | "enable_prefix_caching": os.getenv('ENABLE_PREFIX_CACHING', 'False').lower() == 'true', 39 | "disable_sliding_window": os.getenv('DISABLE_SLIDING_WINDOW', 'False').lower() == 'true', 40 | "use_v2_block_manager": os.getenv('USE_V2_BLOCK_MANAGER', 'False').lower() == 'true', 41 | "swap_space": int(os.getenv('SWAP_SPACE', 4)), # GiB 42 | "cpu_offload_gb": int(os.getenv('CPU_OFFLOAD_GB', 0)), # GiB 43 | "max_num_batched_tokens": int(os.getenv('MAX_NUM_BATCHED_TOKENS', 0)) or None, 44 | "max_num_seqs": int(os.getenv('MAX_NUM_SEQS', 256)), 45 | "max_logprobs": int(os.getenv('MAX_LOGPROBS', 20)), # Default value for OpenAI Chat Completions API 46 | "revision": os.getenv('REVISION', None), 47 | "code_revision": os.getenv('CODE_REVISION', None), 48 | "rope_scaling": os.getenv('ROPE_SCALING', None), 49 | "rope_theta": float(os.getenv('ROPE_THETA', 0)) or None, 50 | "tokenizer_revision": os.getenv('TOKENIZER_REVISION', None), 51 | "quantization": os.getenv('QUANTIZATION', None), 52 | "enforce_eager": os.getenv('ENFORCE_EAGER', 'False').lower() == 'true', 53 | "max_context_len_to_capture": int(os.getenv('MAX_CONTEXT_LEN_TO_CAPTURE', 0)) or None, 54 | "max_seq_len_to_capture": int(os.getenv('MAX_SEQ_LEN_TO_CAPTURE', 8192)), 55 | "disable_custom_all_reduce": os.getenv('DISABLE_CUSTOM_ALL_REDUCE', 'False').lower() == 'true', 56 | "tokenizer_pool_size": int(os.getenv('TOKENIZER_POOL_SIZE', 0)), 57 | "tokenizer_pool_type": os.getenv('TOKENIZER_POOL_TYPE', 'ray'), 58 | "tokenizer_pool_extra_config": os.getenv('TOKENIZER_POOL_EXTRA_CONFIG', None), 59 | "enable_lora": os.getenv('ENABLE_LORA', 'False').lower() == 'true', 60 | "max_loras": int(os.getenv('MAX_LORAS', 1)), 61 | "max_lora_rank": int(os.getenv('MAX_LORA_RANK', 16)), 62 | "enable_prompt_adapter": os.getenv('ENABLE_PROMPT_ADAPTER', 'False').lower() == 'true', 63 | "max_prompt_adapters": int(os.getenv('MAX_PROMPT_ADAPTERS', 1)), 64 | "max_prompt_adapter_token": int(os.getenv('MAX_PROMPT_ADAPTER_TOKEN', 0)), 65 | "fully_sharded_loras": os.getenv('FULLY_SHARDED_LORAS', 'False').lower() == 'true', 66 | "lora_extra_vocab_size": int(os.getenv('LORA_EXTRA_VOCAB_SIZE', 256)), 67 | "long_lora_scaling_factors": tuple(map(float, os.getenv('LONG_LORA_SCALING_FACTORS', '').split(','))) if os.getenv('LONG_LORA_SCALING_FACTORS') else None, 68 | "lora_dtype": os.getenv('LORA_DTYPE', 'auto'), 69 | "max_cpu_loras": int(os.getenv('MAX_CPU_LORAS', 0)) or None, 70 | "device": os.getenv('DEVICE', 'auto'), 71 | "ray_workers_use_nsight": os.getenv('RAY_WORKERS_USE_NSIGHT', 'False').lower() == 'true', 72 | "num_gpu_blocks_override": int(os.getenv('NUM_GPU_BLOCKS_OVERRIDE', 0)) or None, 73 | "num_lookahead_slots": 
int(os.getenv('NUM_LOOKAHEAD_SLOTS', 0)), 74 | "model_loader_extra_config": os.getenv('MODEL_LOADER_EXTRA_CONFIG', None), 75 | "ignore_patterns": os.getenv('IGNORE_PATTERNS', None), 76 | "preemption_mode": os.getenv('PREEMPTION_MODE', None), 77 | "scheduler_delay_factor": float(os.getenv('SCHEDULER_DELAY_FACTOR', 0.0)), 78 | "enable_chunked_prefill": os.getenv('ENABLE_CHUNKED_PREFILL', None), 79 | "guided_decoding_backend": os.getenv('GUIDED_DECODING_BACKEND', 'outlines'), 80 | "speculative_model": os.getenv('SPECULATIVE_MODEL', None), 81 | "speculative_draft_tensor_parallel_size": int(os.getenv('SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE', 0)) or None, 82 | "num_speculative_tokens": int(os.getenv('NUM_SPECULATIVE_TOKENS', 0)) or None, 83 | "speculative_max_model_len": int(os.getenv('SPECULATIVE_MAX_MODEL_LEN', 0)) or None, 84 | "speculative_disable_by_batch_size": int(os.getenv('SPECULATIVE_DISABLE_BY_BATCH_SIZE', 0)) or None, 85 | "ngram_prompt_lookup_max": int(os.getenv('NGRAM_PROMPT_LOOKUP_MAX', 0)) or None, 86 | "ngram_prompt_lookup_min": int(os.getenv('NGRAM_PROMPT_LOOKUP_MIN', 0)) or None, 87 | "spec_decoding_acceptance_method": os.getenv('SPEC_DECODING_ACCEPTANCE_METHOD', 'rejection_sampler'), 88 | "typical_acceptance_sampler_posterior_threshold": float(os.getenv('TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD', 0)) or None, 89 | "typical_acceptance_sampler_posterior_alpha": float(os.getenv('TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA', 0)) or None, 90 | "qlora_adapter_name_or_path": os.getenv('QLORA_ADAPTER_NAME_OR_PATH', None), 91 | "disable_logprobs_during_spec_decoding": os.getenv('DISABLE_LOGPROBS_DURING_SPEC_DECODING', None), 92 | "otlp_traces_endpoint": os.getenv('OTLP_TRACES_ENDPOINT', None), 93 | "use_v2_block_manager": os.getenv('USE_V2_BLOCK_MANAGER', 'true'), 94 | } 95 | 96 | def match_vllm_args(args): 97 | """Rename args to match vllm by: 98 | 1. Renaming keys to lower case 99 | 2. Renaming keys to match vllm 100 | 3. Filtering args to match vllm's AsyncEngineArgs 101 | 102 | Args: 103 | args (dict): Dictionary of args 104 | 105 | Returns: 106 | dict: Dictionary of args with renamed keys 107 | """ 108 | renamed_args = {RENAME_ARGS_MAP.get(k, k): v for k, v in args.items()} 109 | matched_args = {k: v for k, v in renamed_args.items() if k in AsyncEngineArgs.__dataclass_fields__} 110 | return {k: v for k, v in matched_args.items() if v not in [None, ""]} 111 | def get_local_args(): 112 | """ 113 | Retrieve local arguments from a JSON file. 114 | 115 | Returns: 116 | dict: Local arguments. 117 | """ 118 | if not os.path.exists("/local_model_args.json"): 119 | return {} 120 | 121 | with open("/local_model_args.json", "r") as f: 122 | local_args = json.load(f) 123 | 124 | if local_args.get("MODEL_NAME") is None: 125 | logging.warning("Model name not found in /local_model_args.json. 
There maybe was a problem when baking the model in.") 126 | 127 | logging.info(f"Using baked in model with args: {local_args}") 128 | os.environ["TRANSFORMERS_OFFLINE"] = "1" 129 | os.environ["HF_HUB_OFFLINE"] = "1" 130 | 131 | return local_args 132 | def get_engine_args(): 133 | # Start with default args 134 | args = DEFAULT_ARGS 135 | 136 | # Get env args that match keys in AsyncEngineArgs 137 | args.update(os.environ) 138 | 139 | # Get local args if model is baked in and overwrite env args 140 | args.update(get_local_args()) 141 | 142 | # if args.get("TENSORIZER_URI"): TODO: add back once tensorizer is ready 143 | # args["load_format"] = "tensorizer" 144 | # args["model_loader_extra_config"] = TensorizerConfig(tensorizer_uri=args["TENSORIZER_URI"], num_readers=None) 145 | # logging.info(f"Using tensorized model from {args['TENSORIZER_URI']}") 146 | 147 | 148 | # Rename and match to vllm args 149 | args = match_vllm_args(args) 150 | 151 | if args.get("load_format") == "bitsandbytes": 152 | args["quantization"] = args["load_format"] 153 | 154 | # Set tensor parallel size and max parallel loading workers if more than 1 GPU is available 155 | num_gpus = device_count() 156 | if num_gpus > 1: 157 | args["tensor_parallel_size"] = num_gpus 158 | args["max_parallel_loading_workers"] = None 159 | if os.getenv("MAX_PARALLEL_LOADING_WORKERS"): 160 | logging.warning("Overriding MAX_PARALLEL_LOADING_WORKERS with None because more than 1 GPU is available.") 161 | 162 | # Deprecated env args backwards compatibility 163 | if args.get("kv_cache_dtype") == "fp8_e5m2": 164 | args["kv_cache_dtype"] = "fp8" 165 | logging.warning("Using fp8_e5m2 is deprecated. Please use fp8 instead.") 166 | if os.getenv("MAX_CONTEXT_LEN_TO_CAPTURE"): 167 | args["max_seq_len_to_capture"] = int(os.getenv("MAX_CONTEXT_LEN_TO_CAPTURE")) 168 | logging.warning("Using MAX_CONTEXT_LEN_TO_CAPTURE is deprecated. 
Please use MAX_SEQ_LEN_TO_CAPTURE instead.") 169 | 170 | # if "gemma-2" in args.get("model", "").lower(): 171 | # os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER" 172 | # logging.info("Using FLASHINFER for gemma-2 model.") 173 | 174 | return AsyncEngineArgs(**args) 175 | -------------------------------------------------------------------------------- /src/handler.py: -------------------------------------------------------------------------------- 1 | import os 2 | import runpod 3 | from utils import JobInput 4 | from engine import vLLMEngine, OpenAIvLLMEngine 5 | 6 | vllm_engine = vLLMEngine() 7 | OpenAIvLLMEngine = OpenAIvLLMEngine(vllm_engine) 8 | 9 | async def handler(job): 10 | job_input = JobInput(job["input"]) 11 | engine = OpenAIvLLMEngine if job_input.openai_route else vllm_engine 12 | results_generator = engine.generate(job_input) 13 | async for batch in results_generator: 14 | yield batch 15 | 16 | runpod.serverless.start( 17 | { 18 | "handler": handler, 19 | "concurrency_modifier": lambda x: vllm_engine.max_concurrency, 20 | "return_aggregate_stream": True, 21 | } 22 | ) -------------------------------------------------------------------------------- /src/tokenizer.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer 2 | import os 3 | from typing import Union 4 | 5 | class TokenizerWrapper: 6 | def __init__(self, tokenizer_name_or_path, tokenizer_revision, trust_remote_code): 7 | print(f"tokenizer_name_or_path: {tokenizer_name_or_path}, tokenizer_revision: {tokenizer_revision}, trust_remote_code: {trust_remote_code}") 8 | self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, revision=tokenizer_revision or "main", trust_remote_code=trust_remote_code) 9 | self.custom_chat_template = os.getenv("CUSTOM_CHAT_TEMPLATE") 10 | self.has_chat_template = bool(self.tokenizer.chat_template) or bool(self.custom_chat_template) 11 | if self.custom_chat_template and isinstance(self.custom_chat_template, str): 12 | self.tokenizer.chat_template = self.custom_chat_template 13 | 14 | def apply_chat_template(self, input: Union[str, list[dict[str, str]]]) -> str: 15 | if isinstance(input, list): 16 | if not self.has_chat_template: 17 | raise ValueError( 18 | "Chat template does not exist for this model, you must provide a single string input instead of a list of messages" 19 | ) 20 | elif isinstance(input, str): 21 | input = [{"role": "user", "content": input}] 22 | else: 23 | raise ValueError("Input must be a string or a list of messages") 24 | 25 | return self.tokenizer.apply_chat_template( 26 | input, tokenize=False, add_generation_prompt=True 27 | ) 28 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | from http import HTTPStatus 4 | from functools import wraps 5 | from time import time 6 | from vllm.entrypoints.openai.protocol import RequestResponseMetadata 7 | 8 | try: 9 | from vllm.utils import random_uuid 10 | from vllm.entrypoints.openai.protocol import ErrorResponse 11 | from vllm import SamplingParams 12 | except ImportError: 13 | logging.warning("Error importing vllm, skipping related imports. 
This is ONLY expected when baking model into docker image from a machine without GPUs") 14 | pass 15 | 16 | logging.basicConfig(level=logging.INFO) 17 | 18 | def convert_limit_mm_per_prompt(input_string: str): 19 | key, value = input_string.split('=') 20 | return {key: int(value)} 21 | 22 | def count_physical_cores(): 23 | with open('/proc/cpuinfo') as f: 24 | content = f.readlines() 25 | 26 | cores = set() 27 | current_physical_id = None 28 | current_core_id = None 29 | 30 | for line in content: 31 | if 'physical id' in line: 32 | current_physical_id = line.strip().split(': ')[1] 33 | elif 'core id' in line: 34 | current_core_id = line.strip().split(': ')[1] 35 | cores.add((current_physical_id, current_core_id)) 36 | 37 | return len(cores) 38 | 39 | 40 | class JobInput: 41 | def __init__(self, job): 42 | self.llm_input = job.get("messages", job.get("prompt")) 43 | self.stream = job.get("stream", False) 44 | self.max_batch_size = job.get("max_batch_size") 45 | self.apply_chat_template = job.get("apply_chat_template", False) 46 | self.use_openai_format = job.get("use_openai_format", False) 47 | samp_param = job.get("sampling_params", {}) 48 | if "max_tokens" not in samp_param: 49 | samp_param["max_tokens"] = 100 50 | self.sampling_params = SamplingParams(**samp_param) 51 | # self.sampling_params = SamplingParams(max_tokens=100, **job.get("sampling_params", {})) 52 | self.request_id = random_uuid() 53 | batch_size_growth_factor = job.get("batch_size_growth_factor") 54 | self.batch_size_growth_factor = float(batch_size_growth_factor) if batch_size_growth_factor else None 55 | min_batch_size = job.get("min_batch_size") 56 | self.min_batch_size = int(min_batch_size) if min_batch_size else None 57 | self.openai_route = job.get("openai_route") 58 | self.openai_input = job.get("openai_input") 59 | class DummyState: 60 | def __init__(self): 61 | self.request_metadata = None 62 | 63 | class DummyRequest: 64 | def __init__(self): 65 | self.headers = {} 66 | self.state = DummyState() 67 | async def is_disconnected(self): 68 | return False 69 | 70 | class BatchSize: 71 | def __init__(self, max_batch_size, min_batch_size, batch_size_growth_factor): 72 | self.max_batch_size = max_batch_size 73 | self.batch_size_growth_factor = batch_size_growth_factor 74 | self.min_batch_size = min_batch_size 75 | self.is_dynamic = batch_size_growth_factor > 1 and min_batch_size >= 1 and max_batch_size > min_batch_size 76 | if self.is_dynamic: 77 | self.current_batch_size = min_batch_size 78 | else: 79 | self.current_batch_size = max_batch_size 80 | 81 | def update(self): 82 | if self.is_dynamic: 83 | self.current_batch_size = min(self.current_batch_size*self.batch_size_growth_factor, self.max_batch_size) 84 | 85 | def create_error_response(message: str, err_type: str = "BadRequestError", status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> ErrorResponse: 86 | return ErrorResponse(message=message, 87 | type=err_type, 88 | code=status_code.value) 89 | 90 | def get_int_bool_env(env_var: str, default: bool) -> bool: 91 | return int(os.getenv(env_var, int(default))) == 1 92 | 93 | def timer_decorator(func): 94 | @wraps(func) 95 | def wrapper(*args, **kwargs): 96 | start = time() 97 | result = func(*args, **kwargs) 98 | end = time() 99 | logging.info(f"{func.__name__} completed in {end - start:.2f} seconds") 100 | return result 101 | return wrapper -------------------------------------------------------------------------------- /worker-config.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "versions": { 3 | "0.9.0": { 4 | "imageName": "runpod/worker-v1-vllm:v2.6.0stable-cuda12.1.0", 5 | "minimumCudaVersion": "12.1", 6 | "categories": [ 7 | { 8 | "title": "LLM Settings", 9 | "settings": [ 10 | "TOKENIZER", "TOKENIZER_MODE", "SKIP_TOKENIZER_INIT", "TRUST_REMOTE_CODE", 11 | "DOWNLOAD_DIR", "LOAD_FORMAT", "DTYPE", "KV_CACHE_DTYPE", "QUANTIZATION_PARAM_PATH", 12 | "MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND", 13 | "WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE", 14 | "TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING", 15 | "DISABLE_SLIDING_WINDOW", "NUM_LOOKAHEAD_SLOTS", 16 | "SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS", 17 | "MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA", 18 | "TOKENIZER_POOL_SIZE", "TOKENIZER_POOL_TYPE", "TOKENIZER_POOL_EXTRA_CONFIG", 19 | "ENABLE_LORA", "MAX_LORAS", "MAX_LORA_RANK", "LORA_EXTRA_VOCAB_SIZE", 20 | "LORA_DTYPE", "LONG_LORA_SCALING_FACTORS", "MAX_CPU_LORAS", "FULLY_SHARDED_LORAS", 21 | "DEVICE", "SCHEDULER_DELAY_FACTOR", "ENABLE_CHUNKED_PREFILL", "SPECULATIVE_MODEL", 22 | "NUM_SPECULATIVE_TOKENS", "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE", 23 | "SPECULATIVE_MAX_MODEL_LEN", "SPECULATIVE_DISABLE_BY_BATCH_SIZE", 24 | "NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD", 25 | "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA", 26 | "MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD", 27 | "PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST", 28 | "ENABLE_AUTO_TOOL_CHOICE", "TOOL_CALL_PARSER" 29 | ] 30 | }, 31 | { 32 | "title": "Tokenizer Settings", 33 | "settings": [ 34 | "TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE" 35 | ] 36 | }, 37 | { 38 | "title": "System Settings", 39 | "settings": [ 40 | "GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE", 41 | "SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE" 42 | ] 43 | }, 44 | { 45 | "title": "Streaming Settings", 46 | "settings": [ 47 | "DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR" 48 | ] 49 | }, 50 | { 51 | "title": "OpenAI Settings", 52 | "settings": [ 53 | "RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE" 54 | ] 55 | }, 56 | { 57 | "title": "Serverless Settings", 58 | "settings": [ 59 | "MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS" 60 | ] 61 | } 62 | ] 63 | }, 64 | "0.8.5": { 65 | "imageName": "runpod/worker-v1-vllm:v2.5.0stable-cuda12.1.0", 66 | "minimumCudaVersion": "12.1", 67 | "categories": [ 68 | { 69 | "title": "LLM Settings", 70 | "settings": [ 71 | "TOKENIZER", "TOKENIZER_MODE", "SKIP_TOKENIZER_INIT", "TRUST_REMOTE_CODE", 72 | "DOWNLOAD_DIR", "LOAD_FORMAT", "DTYPE", "KV_CACHE_DTYPE", "QUANTIZATION_PARAM_PATH", 73 | "MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND", 74 | "WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE", 75 | "TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING", 76 | "DISABLE_SLIDING_WINDOW", "NUM_LOOKAHEAD_SLOTS", 77 | "SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS", 78 | "MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA", 79 | "TOKENIZER_POOL_SIZE", 
"TOKENIZER_POOL_TYPE", "TOKENIZER_POOL_EXTRA_CONFIG", 80 | "ENABLE_LORA", "MAX_LORAS", "MAX_LORA_RANK", "LORA_EXTRA_VOCAB_SIZE", 81 | "LORA_DTYPE", "LONG_LORA_SCALING_FACTORS", "MAX_CPU_LORAS", "FULLY_SHARDED_LORAS", 82 | "DEVICE", "SCHEDULER_DELAY_FACTOR", "ENABLE_CHUNKED_PREFILL", "SPECULATIVE_MODEL", 83 | "NUM_SPECULATIVE_TOKENS", "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE", 84 | "SPECULATIVE_MAX_MODEL_LEN", "SPECULATIVE_DISABLE_BY_BATCH_SIZE", 85 | "NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD", 86 | "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA", 87 | "MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD", 88 | "PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST", 89 | "ENABLE_AUTO_TOOL_CHOICE", "TOOL_CALL_PARSER" 90 | ] 91 | }, 92 | { 93 | "title": "Tokenizer Settings", 94 | "settings": [ 95 | "TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE" 96 | ] 97 | }, 98 | { 99 | "title": "System Settings", 100 | "settings": [ 101 | "GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE", 102 | "SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE" 103 | ] 104 | }, 105 | { 106 | "title": "Streaming Settings", 107 | "settings": [ 108 | "DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR" 109 | ] 110 | }, 111 | { 112 | "title": "OpenAI Settings", 113 | "settings": [ 114 | "RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE" 115 | ] 116 | }, 117 | { 118 | "title": "Serverless Settings", 119 | "settings": [ 120 | "MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS" 121 | ] 122 | } 123 | ] 124 | }, 125 | "0.8.4": { 126 | "imageName": "runpod/worker-v1-vllm:v2.4.0stable-cuda12.1.0", 127 | "minimumCudaVersion": "12.1", 128 | "categories": [ 129 | { 130 | "title": "LLM Settings", 131 | "settings": [ 132 | "TOKENIZER", "TOKENIZER_MODE", "SKIP_TOKENIZER_INIT", "TRUST_REMOTE_CODE", 133 | "DOWNLOAD_DIR", "LOAD_FORMAT", "DTYPE", "KV_CACHE_DTYPE", "QUANTIZATION_PARAM_PATH", 134 | "MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND", 135 | "WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE", 136 | "TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING", 137 | "DISABLE_SLIDING_WINDOW", "NUM_LOOKAHEAD_SLOTS", 138 | "SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS", 139 | "MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA", 140 | "TOKENIZER_POOL_SIZE", "TOKENIZER_POOL_TYPE", "TOKENIZER_POOL_EXTRA_CONFIG", 141 | "ENABLE_LORA", "MAX_LORAS", "MAX_LORA_RANK", "LORA_EXTRA_VOCAB_SIZE", 142 | "LORA_DTYPE", "LONG_LORA_SCALING_FACTORS", "MAX_CPU_LORAS", "FULLY_SHARDED_LORAS", 143 | "DEVICE", "SCHEDULER_DELAY_FACTOR", "ENABLE_CHUNKED_PREFILL", "SPECULATIVE_MODEL", 144 | "NUM_SPECULATIVE_TOKENS", "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE", 145 | "SPECULATIVE_MAX_MODEL_LEN", "SPECULATIVE_DISABLE_BY_BATCH_SIZE", 146 | "NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD", 147 | "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA", 148 | "MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD", 149 | "PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST", 150 | "ENABLE_AUTO_TOOL_CHOICE", "TOOL_CALL_PARSER" 151 | ] 152 | }, 153 | { 154 | "title": "Tokenizer 
Settings", 155 | "settings": [ 156 | "TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE" 157 | ] 158 | }, 159 | { 160 | "title": "System Settings", 161 | "settings": [ 162 | "GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE", 163 | "SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE" 164 | ] 165 | }, 166 | { 167 | "title": "Streaming Settings", 168 | "settings": [ 169 | "DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR" 170 | ] 171 | }, 172 | { 173 | "title": "OpenAI Settings", 174 | "settings": [ 175 | "RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE" 176 | ] 177 | }, 178 | { 179 | "title": "Serverless Settings", 180 | "settings": [ 181 | "MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS" 182 | ] 183 | } 184 | ] 185 | }, 186 | "0.8.3": { 187 | "imageName": "runpod/worker-v1-vllm:v2.3.0stable-cuda12.1.0", 188 | "minimumCudaVersion": "12.1", 189 | "categories": [ 190 | { 191 | "title": "LLM Settings", 192 | "settings": [ 193 | "TOKENIZER", "TOKENIZER_MODE", "SKIP_TOKENIZER_INIT", "TRUST_REMOTE_CODE", 194 | "DOWNLOAD_DIR", "LOAD_FORMAT", "DTYPE", "KV_CACHE_DTYPE", "QUANTIZATION_PARAM_PATH", 195 | "MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND", 196 | "WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE", 197 | "TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING", 198 | "DISABLE_SLIDING_WINDOW", "NUM_LOOKAHEAD_SLOTS", 199 | "SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS", 200 | "MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA", 201 | "TOKENIZER_POOL_SIZE", "TOKENIZER_POOL_TYPE", "TOKENIZER_POOL_EXTRA_CONFIG", 202 | "ENABLE_LORA", "MAX_LORAS", "MAX_LORA_RANK", "LORA_EXTRA_VOCAB_SIZE", 203 | "LORA_DTYPE", "LONG_LORA_SCALING_FACTORS", "MAX_CPU_LORAS", "FULLY_SHARDED_LORAS", 204 | "DEVICE", "SCHEDULER_DELAY_FACTOR", "ENABLE_CHUNKED_PREFILL", "SPECULATIVE_MODEL", 205 | "NUM_SPECULATIVE_TOKENS", "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE", 206 | "SPECULATIVE_MAX_MODEL_LEN", "SPECULATIVE_DISABLE_BY_BATCH_SIZE", 207 | "NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD", 208 | "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA", 209 | "MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD", 210 | "PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST", 211 | "ENABLE_AUTO_TOOL_CHOICE", "TOOL_CALL_PARSER" 212 | ] 213 | }, 214 | { 215 | "title": "Tokenizer Settings", 216 | "settings": [ 217 | "TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE" 218 | ] 219 | }, 220 | { 221 | "title": "System Settings", 222 | "settings": [ 223 | "GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE", 224 | "SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE" 225 | ] 226 | }, 227 | { 228 | "title": "Streaming Settings", 229 | "settings": [ 230 | "DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR" 231 | ] 232 | }, 233 | { 234 | "title": "OpenAI Settings", 235 | "settings": [ 236 | "RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE" 237 | ] 238 | }, 239 | { 240 | "title": "Serverless Settings", 241 | "settings": [ 242 | "MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS" 243 | ] 244 | } 245 | ] 246 | }, 247 | "0.8.2": { 248 | 
"imageName": "runpod/worker-v1-vllm:v2.2.0stable-cuda12.1.0", 249 | "minimumCudaVersion": "12.1", 250 | "categories": [ 251 | { 252 | "title": "LLM Settings", 253 | "settings": [ 254 | "TOKENIZER", "TOKENIZER_MODE", "SKIP_TOKENIZER_INIT", "TRUST_REMOTE_CODE", 255 | "DOWNLOAD_DIR", "LOAD_FORMAT", "DTYPE", "KV_CACHE_DTYPE", "QUANTIZATION_PARAM_PATH", 256 | "MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND", 257 | "WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE", 258 | "TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING", 259 | "DISABLE_SLIDING_WINDOW", "NUM_LOOKAHEAD_SLOTS", 260 | "SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS", 261 | "MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA", 262 | "TOKENIZER_POOL_SIZE", "TOKENIZER_POOL_TYPE", "TOKENIZER_POOL_EXTRA_CONFIG", 263 | "ENABLE_LORA", "MAX_LORAS", "MAX_LORA_RANK", "LORA_EXTRA_VOCAB_SIZE", 264 | "LORA_DTYPE", "LONG_LORA_SCALING_FACTORS", "MAX_CPU_LORAS", "FULLY_SHARDED_LORAS", 265 | "DEVICE", "SCHEDULER_DELAY_FACTOR", "ENABLE_CHUNKED_PREFILL", "SPECULATIVE_MODEL", 266 | "NUM_SPECULATIVE_TOKENS", "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE", 267 | "SPECULATIVE_MAX_MODEL_LEN", "SPECULATIVE_DISABLE_BY_BATCH_SIZE", 268 | "NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD", 269 | "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA", 270 | "MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD", 271 | "PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST", 272 | "ENABLE_AUTO_TOOL_CHOICE", "TOOL_CALL_PARSER" 273 | ] 274 | }, 275 | { 276 | "title": "Tokenizer Settings", 277 | "settings": [ 278 | "TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE" 279 | ] 280 | }, 281 | { 282 | "title": "System Settings", 283 | "settings": [ 284 | "GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE", 285 | "SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE" 286 | ] 287 | }, 288 | { 289 | "title": "Streaming Settings", 290 | "settings": [ 291 | "DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR" 292 | ] 293 | }, 294 | { 295 | "title": "OpenAI Settings", 296 | "settings": [ 297 | "RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE" 298 | ] 299 | }, 300 | { 301 | "title": "Serverless Settings", 302 | "settings": [ 303 | "MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS" 304 | ] 305 | } 306 | ] 307 | }, 308 | "0.7.3": { 309 | "imageName": "runpod/worker-v1-vllm:v2.1.0stable-cuda12.1.0", 310 | "minimumCudaVersion": "12.1", 311 | "categories": [ 312 | { 313 | "title": "LLM Settings", 314 | "settings": [ 315 | "TOKENIZER", "TOKENIZER_MODE", "SKIP_TOKENIZER_INIT", "TRUST_REMOTE_CODE", 316 | "DOWNLOAD_DIR", "LOAD_FORMAT", "DTYPE", "KV_CACHE_DTYPE", "QUANTIZATION_PARAM_PATH", 317 | "MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND", 318 | "WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE", 319 | "TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING", 320 | "DISABLE_SLIDING_WINDOW", "NUM_LOOKAHEAD_SLOTS", 321 | "SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS", 322 | "MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA", 323 | "TOKENIZER_POOL_SIZE", "TOKENIZER_POOL_TYPE", 
"TOKENIZER_POOL_EXTRA_CONFIG", 324 | "ENABLE_LORA", "MAX_LORAS", "MAX_LORA_RANK", "LORA_EXTRA_VOCAB_SIZE", 325 | "LORA_DTYPE", "LONG_LORA_SCALING_FACTORS", "MAX_CPU_LORAS", "FULLY_SHARDED_LORAS", 326 | "DEVICE", "SCHEDULER_DELAY_FACTOR", "ENABLE_CHUNKED_PREFILL", "SPECULATIVE_MODEL", 327 | "NUM_SPECULATIVE_TOKENS", "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE", 328 | "SPECULATIVE_MAX_MODEL_LEN", "SPECULATIVE_DISABLE_BY_BATCH_SIZE", 329 | "NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD", 330 | "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA", 331 | "MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD", 332 | "PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST", 333 | "ENABLE_AUTO_TOOL_CHOICE", "TOOL_CALL_PARSER" 334 | ] 335 | }, 336 | { 337 | "title": "Tokenizer Settings", 338 | "settings": [ 339 | "TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE" 340 | ] 341 | }, 342 | { 343 | "title": "System Settings", 344 | "settings": [ 345 | "GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE", 346 | "SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE" 347 | ] 348 | }, 349 | { 350 | "title": "Streaming Settings", 351 | "settings": [ 352 | "DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR" 353 | ] 354 | }, 355 | { 356 | "title": "OpenAI Settings", 357 | "settings": [ 358 | "RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE" 359 | ] 360 | }, 361 | { 362 | "title": "Serverless Settings", 363 | "settings": [ 364 | "MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS" 365 | ] 366 | } 367 | ] 368 | }, 369 | "0.6.6": { 370 | "imageName": "runpod/worker-v1-vllm:v1.8.0stable-cuda12.1.0", 371 | "minimumCudaVersion": "12.1", 372 | "categories": [ 373 | { 374 | "title": "LLM Settings", 375 | "settings": [ 376 | "TOKENIZER", "TOKENIZER_MODE", "SKIP_TOKENIZER_INIT", "TRUST_REMOTE_CODE", 377 | "DOWNLOAD_DIR", "LOAD_FORMAT", "DTYPE", "KV_CACHE_DTYPE", "QUANTIZATION_PARAM_PATH", 378 | "MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND", 379 | "WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE", 380 | "TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING", 381 | "DISABLE_SLIDING_WINDOW", "NUM_LOOKAHEAD_SLOTS", 382 | "SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS", 383 | "MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA", 384 | "TOKENIZER_POOL_SIZE", "TOKENIZER_POOL_TYPE", "TOKENIZER_POOL_EXTRA_CONFIG", 385 | "ENABLE_LORA", "MAX_LORAS", "MAX_LORA_RANK", "LORA_EXTRA_VOCAB_SIZE", 386 | "LORA_DTYPE", "LONG_LORA_SCALING_FACTORS", "MAX_CPU_LORAS", "FULLY_SHARDED_LORAS", 387 | "DEVICE", "SCHEDULER_DELAY_FACTOR", "ENABLE_CHUNKED_PREFILL", "SPECULATIVE_MODEL", 388 | "NUM_SPECULATIVE_TOKENS", "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE", 389 | "SPECULATIVE_MAX_MODEL_LEN", "SPECULATIVE_DISABLE_BY_BATCH_SIZE", 390 | "NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD", 391 | "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA", 392 | "MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD", 393 | "PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST", 394 | "ENABLE_AUTO_TOOL_CHOICE", "TOOL_CALL_PARSER" 395 | ] 396 | }, 397 | { 398 | "title": "Tokenizer Settings", 
399 | "settings": [ 400 | "TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE" 401 | ] 402 | }, 403 | { 404 | "title": "System Settings", 405 | "settings": [ 406 | "GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE", 407 | "SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE" 408 | ] 409 | }, 410 | { 411 | "title": "Streaming Settings", 412 | "settings": [ 413 | "DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR" 414 | ] 415 | }, 416 | { 417 | "title": "OpenAI Settings", 418 | "settings": [ 419 | "RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE" 420 | ] 421 | }, 422 | { 423 | "title": "Serverless Settings", 424 | "settings": [ 425 | "MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS" 426 | ] 427 | } 428 | ] 429 | }, 430 | "0.7.0": { 431 | "imageName": "runpod/worker-v1-vllm:v1.9.0stable-cuda12.1.0", 432 | "minimumCudaVersion": "12.1", 433 | "categories": [ 434 | { 435 | "title": "LLM Settings", 436 | "settings": [ 437 | "TOKENIZER", "TOKENIZER_MODE", "SKIP_TOKENIZER_INIT", "TRUST_REMOTE_CODE", 438 | "DOWNLOAD_DIR", "LOAD_FORMAT", "DTYPE", "KV_CACHE_DTYPE", "QUANTIZATION_PARAM_PATH", 439 | "MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND", 440 | "WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE", 441 | "TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING", 442 | "DISABLE_SLIDING_WINDOW", "NUM_LOOKAHEAD_SLOTS", 443 | "SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS", 444 | "MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA", 445 | "TOKENIZER_POOL_SIZE", "TOKENIZER_POOL_TYPE", "TOKENIZER_POOL_EXTRA_CONFIG", 446 | "ENABLE_LORA", "MAX_LORAS", "MAX_LORA_RANK", "LORA_EXTRA_VOCAB_SIZE", 447 | "LORA_DTYPE", "LONG_LORA_SCALING_FACTORS", "MAX_CPU_LORAS", "FULLY_SHARDED_LORAS", 448 | "DEVICE", "SCHEDULER_DELAY_FACTOR", "ENABLE_CHUNKED_PREFILL", "SPECULATIVE_MODEL", 449 | "NUM_SPECULATIVE_TOKENS", "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE", 450 | "SPECULATIVE_MAX_MODEL_LEN", "SPECULATIVE_DISABLE_BY_BATCH_SIZE", 451 | "NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD", 452 | "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA", 453 | "MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD", 454 | "PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST", 455 | "ENABLE_AUTO_TOOL_CHOICE", "TOOL_CALL_PARSER" 456 | ] 457 | }, 458 | { 459 | "title": "Tokenizer Settings", 460 | "settings": [ 461 | "TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE" 462 | ] 463 | }, 464 | { 465 | "title": "System Settings", 466 | "settings": [ 467 | "GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE", 468 | "SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE" 469 | ] 470 | }, 471 | { 472 | "title": "Streaming Settings", 473 | "settings": [ 474 | "DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR" 475 | ] 476 | }, 477 | { 478 | "title": "OpenAI Settings", 479 | "settings": [ 480 | "RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE" 481 | ] 482 | }, 483 | { 484 | "title": "Serverless Settings", 485 | "settings": [ 486 | "MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS" 487 | ] 488 | } 489 | ] 490 | }, 491 | "0.6.4": { 492 | "imageName": 
"runpod/worker-v1-vllm:v1.7.0stable-cuda12.1.0", 493 | "minimumCudaVersion": "12.1", 494 | "categories": [ 495 | { 496 | "title": "LLM Settings", 497 | "settings": [ 498 | "TOKENIZER", "TOKENIZER_MODE", "SKIP_TOKENIZER_INIT", "TRUST_REMOTE_CODE", 499 | "DOWNLOAD_DIR", "LOAD_FORMAT", "DTYPE", "KV_CACHE_DTYPE", "QUANTIZATION_PARAM_PATH", 500 | "MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND", 501 | "WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE", 502 | "TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING", 503 | "DISABLE_SLIDING_WINDOW", "NUM_LOOKAHEAD_SLOTS", 504 | "SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS", 505 | "MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA", 506 | "TOKENIZER_POOL_SIZE", "TOKENIZER_POOL_TYPE", "TOKENIZER_POOL_EXTRA_CONFIG", 507 | "ENABLE_LORA", "MAX_LORAS", "MAX_LORA_RANK", "LORA_EXTRA_VOCAB_SIZE", 508 | "LORA_DTYPE", "LONG_LORA_SCALING_FACTORS", "MAX_CPU_LORAS", "FULLY_SHARDED_LORAS", 509 | "DEVICE", "SCHEDULER_DELAY_FACTOR", "ENABLE_CHUNKED_PREFILL", "SPECULATIVE_MODEL", 510 | "NUM_SPECULATIVE_TOKENS", "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE", 511 | "SPECULATIVE_MAX_MODEL_LEN", "SPECULATIVE_DISABLE_BY_BATCH_SIZE", 512 | "NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD", 513 | "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA", 514 | "MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD", 515 | "PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST", 516 | "ENABLE_AUTO_TOOL_CHOICE", "TOOL_CALL_PARSER" 517 | ] 518 | }, 519 | { 520 | "title": "Tokenizer Settings", 521 | "settings": [ 522 | "TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE" 523 | ] 524 | }, 525 | { 526 | "title": "System Settings", 527 | "settings": [ 528 | "GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE", 529 | "SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE" 530 | ] 531 | }, 532 | { 533 | "title": "Streaming Settings", 534 | "settings": [ 535 | "DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR" 536 | ] 537 | }, 538 | { 539 | "title": "OpenAI Settings", 540 | "settings": [ 541 | "RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE" 542 | ] 543 | }, 544 | { 545 | "title": "Serverless Settings", 546 | "settings": [ 547 | "MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS" 548 | ] 549 | } 550 | ] 551 | }, 552 | "0.6.3": { 553 | "imageName": "runpod/worker-v1-vllm:v1.6.0stable-cuda12.1.0", 554 | "minimumCudaVersion": "12.1", 555 | "categories": [ 556 | { 557 | "title": "LLM Settings", 558 | "settings": [ 559 | "TOKENIZER", "TOKENIZER_MODE", "SKIP_TOKENIZER_INIT", "TRUST_REMOTE_CODE", 560 | "DOWNLOAD_DIR", "LOAD_FORMAT", "DTYPE", "KV_CACHE_DTYPE", "QUANTIZATION_PARAM_PATH", 561 | "MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND", 562 | "WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE", 563 | "TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING", 564 | "DISABLE_SLIDING_WINDOW", "NUM_LOOKAHEAD_SLOTS", 565 | "SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS", 566 | "MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA", 567 | "TOKENIZER_POOL_SIZE", "TOKENIZER_POOL_TYPE", "TOKENIZER_POOL_EXTRA_CONFIG", 568 | 
"ENABLE_LORA", "MAX_LORAS", "MAX_LORA_RANK", "LORA_EXTRA_VOCAB_SIZE", 569 | "LORA_DTYPE", "LONG_LORA_SCALING_FACTORS", "MAX_CPU_LORAS", "FULLY_SHARDED_LORAS", 570 | "DEVICE", "SCHEDULER_DELAY_FACTOR", "ENABLE_CHUNKED_PREFILL", "SPECULATIVE_MODEL", 571 | "NUM_SPECULATIVE_TOKENS", "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE", 572 | "SPECULATIVE_MAX_MODEL_LEN", "SPECULATIVE_DISABLE_BY_BATCH_SIZE", 573 | "NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD", 574 | "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA", 575 | "MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD", 576 | "PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST", 577 | "ENABLE_AUTO_TOOL_CHOICE", "TOOL_CALL_PARSER" 578 | ] 579 | }, 580 | { 581 | "title": "Tokenizer Settings", 582 | "settings": [ 583 | "TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE" 584 | ] 585 | }, 586 | { 587 | "title": "System Settings", 588 | "settings": [ 589 | "GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE", 590 | "SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE" 591 | ] 592 | }, 593 | { 594 | "title": "Streaming Settings", 595 | "settings": [ 596 | "DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR" 597 | ] 598 | }, 599 | { 600 | "title": "OpenAI Settings", 601 | "settings": [ 602 | "RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE" 603 | ] 604 | }, 605 | { 606 | "title": "Serverless Settings", 607 | "settings": [ 608 | "MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS" 609 | ] 610 | } 611 | ] 612 | }, 613 | "0.6.2": { 614 | "imageName": "runpod/worker-v1-vllm:v1.5.0stable-cuda12.1.0", 615 | "minimumCudaVersion": "12.1", 616 | "categories": [ 617 | { 618 | "title": "LLM Settings", 619 | "settings": [ 620 | "TOKENIZER", "TOKENIZER_MODE", "SKIP_TOKENIZER_INIT", "TRUST_REMOTE_CODE", 621 | "DOWNLOAD_DIR", "LOAD_FORMAT", "DTYPE", "KV_CACHE_DTYPE", "QUANTIZATION_PARAM_PATH", 622 | "MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND", 623 | "WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE", 624 | "TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING", 625 | "DISABLE_SLIDING_WINDOW", "USE_V2_BLOCK_MANAGER", "NUM_LOOKAHEAD_SLOTS", 626 | "SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS", 627 | "MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA", 628 | "TOKENIZER_POOL_SIZE", "TOKENIZER_POOL_TYPE", "TOKENIZER_POOL_EXTRA_CONFIG", 629 | "ENABLE_LORA", "MAX_LORAS", "MAX_LORA_RANK", "LORA_EXTRA_VOCAB_SIZE", 630 | "LORA_DTYPE", "LONG_LORA_SCALING_FACTORS", "MAX_CPU_LORAS", "FULLY_SHARDED_LORAS", 631 | "DEVICE", "SCHEDULER_DELAY_FACTOR", "ENABLE_CHUNKED_PREFILL", "SPECULATIVE_MODEL", 632 | "NUM_SPECULATIVE_TOKENS", "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE", 633 | "SPECULATIVE_MAX_MODEL_LEN", "SPECULATIVE_DISABLE_BY_BATCH_SIZE", 634 | "NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD", 635 | "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA", 636 | "MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD", 637 | "PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST" 638 | ] 639 | }, 640 | { 641 | "title": "Tokenizer Settings", 642 | "settings": [ 643 | "TOKENIZER_NAME", "TOKENIZER_REVISION", 
"CUSTOM_CHAT_TEMPLATE" 644 | ] 645 | }, 646 | { 647 | "title": "System Settings", 648 | "settings": [ 649 | "GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE", 650 | "SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE" 651 | ] 652 | }, 653 | { 654 | "title": "Streaming Settings", 655 | "settings": [ 656 | "DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR" 657 | ] 658 | }, 659 | { 660 | "title": "OpenAI Settings", 661 | "settings": [ 662 | "RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE" 663 | ] 664 | }, 665 | { 666 | "title": "Serverless Settings", 667 | "settings": [ 668 | "MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS" 669 | ] 670 | } 671 | ] 672 | }, 673 | "0.6.1": { 674 | "imageName": "runpod/worker-v1-vllm:v1.4.0stable-cuda12.1.0", 675 | "minimumCudaVersion": "12.1", 676 | "categories": [ 677 | { 678 | "title": "LLM Settings", 679 | "settings": [ 680 | "TOKENIZER", "TOKENIZER_MODE", "SKIP_TOKENIZER_INIT", "TRUST_REMOTE_CODE", 681 | "DOWNLOAD_DIR", "LOAD_FORMAT", "DTYPE", "KV_CACHE_DTYPE", "QUANTIZATION_PARAM_PATH", 682 | "MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND", 683 | "WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE", 684 | "TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING", 685 | "DISABLE_SLIDING_WINDOW", "USE_V2_BLOCK_MANAGER", "NUM_LOOKAHEAD_SLOTS", 686 | "SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS", 687 | "MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA", 688 | "TOKENIZER_POOL_SIZE", "TOKENIZER_POOL_TYPE", "TOKENIZER_POOL_EXTRA_CONFIG", 689 | "ENABLE_LORA", "MAX_LORAS", "MAX_LORA_RANK", "LORA_EXTRA_VOCAB_SIZE", 690 | "LORA_DTYPE", "LONG_LORA_SCALING_FACTORS", "MAX_CPU_LORAS", "FULLY_SHARDED_LORAS", 691 | "DEVICE", "SCHEDULER_DELAY_FACTOR", "ENABLE_CHUNKED_PREFILL", "SPECULATIVE_MODEL", 692 | "NUM_SPECULATIVE_TOKENS", "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE", 693 | "SPECULATIVE_MAX_MODEL_LEN", "SPECULATIVE_DISABLE_BY_BATCH_SIZE", 694 | "NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD", 695 | "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA", 696 | "MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD", 697 | "PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST" 698 | ] 699 | }, 700 | { 701 | "title": "Tokenizer Settings", 702 | "settings": [ 703 | "TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE" 704 | ] 705 | }, 706 | { 707 | "title": "System Settings", 708 | "settings": [ 709 | "GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE", 710 | "SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE" 711 | ] 712 | }, 713 | { 714 | "title": "Streaming Settings", 715 | "settings": [ 716 | "DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR" 717 | ] 718 | }, 719 | { 720 | "title": "OpenAI Settings", 721 | "settings": [ 722 | "RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE" 723 | ] 724 | }, 725 | { 726 | "title": "Serverless Settings", 727 | "settings": [ 728 | "MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS" 729 | ] 730 | } 731 | ] 732 | } 733 | }, 734 | "schema": { 735 | "TOKENIZER": { 736 | "env_var_name": "TOKENIZER", 737 | "value": "", 738 | "title": "Tokenizer", 739 | 
"description": "Name or path of the Hugging Face tokenizer to use.", 740 | "required": false, 741 | "type": "text" 742 | }, 743 | "TOKENIZER_MODE": { 744 | "env_var_name": "TOKENIZER_MODE", 745 | "value": "auto", 746 | "title": "Tokenizer Mode", 747 | "description": "The tokenizer mode.", 748 | "required": false, 749 | "type": "select", 750 | "options": [ 751 | { "value": "auto", "label": "auto" }, 752 | { "value": "slow", "label": "slow" } 753 | ] 754 | }, 755 | "SKIP_TOKENIZER_INIT": { 756 | "env_var_name": "SKIP_TOKENIZER_INIT", 757 | "value": false, 758 | "title": "Skip Tokenizer Init", 759 | "description": "Skip initialization of tokenizer and detokenizer.", 760 | "required": false, 761 | "type": "toggle" 762 | }, 763 | "TRUST_REMOTE_CODE": { 764 | "env_var_name": "TRUST_REMOTE_CODE", 765 | "value": false, 766 | "title": "Trust Remote Code", 767 | "description": "Trust remote code from Hugging Face.", 768 | "required": false, 769 | "type": "toggle" 770 | }, 771 | "DOWNLOAD_DIR": { 772 | "env_var_name": "DOWNLOAD_DIR", 773 | "value": "", 774 | "title": "Download Directory", 775 | "description": "Directory to download and load the weights.", 776 | "required": false, 777 | "type": "text" 778 | }, 779 | "LOAD_FORMAT": { 780 | "env_var_name": "LOAD_FORMAT", 781 | "value": "auto", 782 | "title": "Load Format", 783 | "description": "The format of the model weights to load.", 784 | "required": false, 785 | "type": "select", 786 | "options": [ 787 | { "value": "auto", "label": "auto" }, 788 | { "value": "pt", "label": "pt" }, 789 | { "value": "safetensors", "label": "safetensors" }, 790 | { "value": "npcache", "label": "npcache" }, 791 | { "value": "dummy", "label": "dummy" }, 792 | { "value": "tensorizer", "label": "tensorizer" }, 793 | { "value": "bitsandbytes", "label": "bitsandbytes" } 794 | ] 795 | }, 796 | "DTYPE": { 797 | "env_var_name": "DTYPE", 798 | "value": "auto", 799 | "title": "Data Type", 800 | "description": "Data type for model weights and activations.", 801 | "required": false, 802 | "type": "select", 803 | "options": [ 804 | { "value": "auto", "label": "auto" }, 805 | { "value": "half", "label": "half" }, 806 | { "value": "float16", "label": "float16" }, 807 | { "value": "bfloat16", "label": "bfloat16" }, 808 | { "value": "float", "label": "float" }, 809 | { "value": "float32", "label": "float32" } 810 | ] 811 | }, 812 | "KV_CACHE_DTYPE": { 813 | "env_var_name": "KV_CACHE_DTYPE", 814 | "value": "auto", 815 | "title": "KV Cache Data Type", 816 | "description": "Data type for KV cache storage.", 817 | "required": false, 818 | "type": "select", 819 | "options": [ 820 | { "value": "auto", "label": "auto" }, 821 | { "value": "fp8", "label": "fp8" } 822 | ] 823 | }, 824 | "QUANTIZATION_PARAM_PATH": { 825 | "env_var_name": "QUANTIZATION_PARAM_PATH", 826 | "value": "", 827 | "title": "Quantization Param Path", 828 | "description": "Path to the JSON file containing the KV cache scaling factors.", 829 | "required": false, 830 | "type": "text" 831 | }, 832 | "MAX_MODEL_LEN": { 833 | "env_var_name": "MAX_MODEL_LEN", 834 | "value": "", 835 | "title": "Max Model Length", 836 | "description": "Model context length.", 837 | "required": false, 838 | "type": "number" 839 | }, 840 | "GUIDED_DECODING_BACKEND": { 841 | "env_var_name": "GUIDED_DECODING_BACKEND", 842 | "value": "outlines", 843 | "title": "Guided Decoding Backend", 844 | "description": "Which engine will be used for guided decoding by default.", 845 | "required": false, 846 | "type": "select", 847 | "options": [ 848 | { "value": 
"outlines", "label": "outlines" }, 849 | { "value": "lm-format-enforcer", "label": "lm-format-enforcer" } 850 | ] 851 | }, 852 | "DISTRIBUTED_EXECUTOR_BACKEND": { 853 | "env_var_name": "DISTRIBUTED_EXECUTOR_BACKEND", 854 | "value": "", 855 | "title": "Distributed Executor Backend", 856 | "description": "Backend to use for distributed serving.", 857 | "required": false, 858 | "type": "select", 859 | "options": [ 860 | { "value": "ray", "label": "ray" }, 861 | { "value": "mp", "label": "mp" } 862 | ] 863 | }, 864 | "WORKER_USE_RAY": { 865 | "env_var_name": "WORKER_USE_RAY", 866 | "value": false, 867 | "title": "Worker Use Ray", 868 | "description": "Deprecated, use --distributed-executor-backend=ray.", 869 | "required": false, 870 | "type": "toggle" 871 | }, 872 | "RAY_WORKERS_USE_NSIGHT": { 873 | "env_var_name": "RAY_WORKERS_USE_NSIGHT", 874 | "value": false, 875 | "title": "Ray Workers Use Nsight", 876 | "description": "If specified, use nsight to profile Ray workers.", 877 | "required": false, 878 | "type": "toggle" 879 | }, 880 | "PIPELINE_PARALLEL_SIZE": { 881 | "env_var_name": "PIPELINE_PARALLEL_SIZE", 882 | "value": 1, 883 | "title": "Pipeline Parallel Size", 884 | "description": "Number of pipeline stages.", 885 | "required": false, 886 | "type": "number" 887 | }, 888 | "TENSOR_PARALLEL_SIZE": { 889 | "env_var_name": "TENSOR_PARALLEL_SIZE", 890 | "value": 1, 891 | "title": "Tensor Parallel Size", 892 | "description": "Number of tensor parallel replicas.", 893 | "required": false, 894 | "type": "number" 895 | }, 896 | "MAX_PARALLEL_LOADING_WORKERS": { 897 | "env_var_name": "MAX_PARALLEL_LOADING_WORKERS", 898 | "value": "", 899 | "title": "Max Parallel Loading Workers", 900 | "description": "Load model sequentially in multiple batches.", 901 | "required": false, 902 | "type": "number" 903 | }, 904 | "ENABLE_PREFIX_CACHING": { 905 | "env_var_name": "ENABLE_PREFIX_CACHING", 906 | "value": false, 907 | "title": "Enable Prefix Caching", 908 | "description": "Enables automatic prefix caching.", 909 | "required": false, 910 | "type": "toggle" 911 | }, 912 | "DISABLE_SLIDING_WINDOW": { 913 | "env_var_name": "DISABLE_SLIDING_WINDOW", 914 | "value": false, 915 | "title": "Disable Sliding Window", 916 | "description": "Disables sliding window, capping to sliding window size.", 917 | "required": false, 918 | "type": "toggle" 919 | }, 920 | "USE_V2_BLOCK_MANAGER": { 921 | "env_var_name": "USE_V2_BLOCK_MANAGER", 922 | "value": false, 923 | "title": "Use V2 Block Manager", 924 | "description": "Use BlockSpaceMangerV2.", 925 | "required": false, 926 | "type": "toggle" 927 | }, 928 | "NUM_LOOKAHEAD_SLOTS": { 929 | "env_var_name": "NUM_LOOKAHEAD_SLOTS", 930 | "value": 0, 931 | "title": "Num Lookahead Slots", 932 | "description": "Experimental scheduling config necessary for speculative decoding.", 933 | "required": false, 934 | "type": "number" 935 | }, 936 | "SEED": { 937 | "env_var_name": "SEED", 938 | "value": 0, 939 | "title": "Seed", 940 | "description": "Random seed for operations.", 941 | "required": false, 942 | "type": "number" 943 | }, 944 | "NUM_GPU_BLOCKS_OVERRIDE": { 945 | "env_var_name": "NUM_GPU_BLOCKS_OVERRIDE", 946 | "value": "", 947 | "title": "Num GPU Blocks Override", 948 | "description": "If specified, ignore GPU profiling result and use this number of GPU blocks.", 949 | "required": false, 950 | "type": "number" 951 | }, 952 | "MAX_NUM_BATCHED_TOKENS": { 953 | "env_var_name": "MAX_NUM_BATCHED_TOKENS", 954 | "value": "", 955 | "title": "Max Num Batched Tokens", 956 | "description": 
"Maximum number of batched tokens per iteration.", 957 | "required": false, 958 | "type": "number" 959 | }, 960 | "MAX_NUM_SEQS": { 961 | "env_var_name": "MAX_NUM_SEQS", 962 | "value": 256, 963 | "title": "Max Num Seqs", 964 | "description": "Maximum number of sequences per iteration.", 965 | "required": false, 966 | "type": "number" 967 | }, 968 | "MAX_LOGPROBS": { 969 | "env_var_name": "MAX_LOGPROBS", 970 | "value": 20, 971 | "title": "Max Logprobs", 972 | "description": "Max number of log probs to return when logprobs is specified in SamplingParams.", 973 | "required": false, 974 | "type": "number" 975 | }, 976 | "DISABLE_LOG_STATS": { 977 | "env_var_name": "DISABLE_LOG_STATS", 978 | "value": false, 979 | "title": "Disable Log Stats", 980 | "description": "Disable logging statistics.", 981 | "required": false, 982 | "type": "toggle" 983 | }, 984 | "QUANTIZATION": { 985 | "env_var_name": "QUANTIZATION", 986 | "value": "", 987 | "title": "Quantization", 988 | "description": "Method used to quantize the weights.\nif the `Load Format` is 'bitsandbytes' then `Quantization` will be forced to 'bitsandbytes'", 989 | "required": false, 990 | "type": "select", 991 | "options": [ 992 | { "value": "None", "label": "None" }, 993 | { "value": "awq", "label": "AWQ" }, 994 | { "value": "squeezellm", "label": "SqueezeLLM" }, 995 | { "value": "gptq", "label": "GPTQ" }, 996 | { "value": "bitsandbytes", "label": "bitsandbytes" } 997 | ] 998 | }, 999 | "ROPE_SCALING": { 1000 | "env_var_name": "ROPE_SCALING", 1001 | "value": "", 1002 | "title": "RoPE Scaling", 1003 | "description": "RoPE scaling configuration in JSON format.", 1004 | "required": false, 1005 | "type": "text" 1006 | }, 1007 | "ROPE_THETA": { 1008 | "env_var_name": "ROPE_THETA", 1009 | "value": "", 1010 | "title": "RoPE Theta", 1011 | "description": "RoPE theta. 
Use with rope_scaling.", 1012 | "required": false, 1013 | "type": "number" 1014 | }, 1015 | "TOKENIZER_POOL_SIZE": { 1016 | "env_var_name": "TOKENIZER_POOL_SIZE", 1017 | "value": 0, 1018 | "title": "Tokenizer Pool Size", 1019 | "description": "Size of tokenizer pool to use for asynchronous tokenization.", 1020 | "required": false, 1021 | "type": "number" 1022 | }, 1023 | "TOKENIZER_POOL_TYPE": { 1024 | "env_var_name": "TOKENIZER_POOL_TYPE", 1025 | "value": "ray", 1026 | "title": "Tokenizer Pool Type", 1027 | "description": "Type of tokenizer pool to use for asynchronous tokenization.", 1028 | "required": false, 1029 | "type": "text" 1030 | }, 1031 | "TOKENIZER_POOL_EXTRA_CONFIG": { 1032 | "env_var_name": "TOKENIZER_POOL_EXTRA_CONFIG", 1033 | "value": "", 1034 | "title": "Tokenizer Pool Extra Config", 1035 | "description": "Extra config for tokenizer pool.", 1036 | "required": false, 1037 | "type": "text" 1038 | }, 1039 | "ENABLE_LORA": { 1040 | "env_var_name": "ENABLE_LORA", 1041 | "value": false, 1042 | "title": "Enable LoRA", 1043 | "description": "If True, enable handling of LoRA adapters.", 1044 | "required": false, 1045 | "type": "toggle" 1046 | }, 1047 | "MAX_LORAS": { 1048 | "env_var_name": "MAX_LORAS", 1049 | "value": 1, 1050 | "title": "Max LoRAs", 1051 | "description": "Max number of LoRAs in a single batch.", 1052 | "required": false, 1053 | "type": "number" 1054 | }, 1055 | "MAX_LORA_RANK": { 1056 | "env_var_name": "MAX_LORA_RANK", 1057 | "value": 16, 1058 | "title": "Max LoRA Rank", 1059 | "description": "Max LoRA rank.", 1060 | "required": false, 1061 | "type": "number" 1062 | }, 1063 | "LORA_EXTRA_VOCAB_SIZE": { 1064 | "env_var_name": "LORA_EXTRA_VOCAB_SIZE", 1065 | "value": 256, 1066 | "title": "LoRA Extra Vocab Size", 1067 | "description": "Maximum size of extra vocabulary for LoRA adapters.", 1068 | "required": false, 1069 | "type": "number" 1070 | }, 1071 | "LORA_DTYPE": { 1072 | "env_var_name": "LORA_DTYPE", 1073 | "value": "auto", 1074 | "title": "LoRA Data Type", 1075 | "description": "Data type for LoRA.", 1076 | "required": false, 1077 | "type": "select", 1078 | "options": [ 1079 | { "value": "auto", "label": "auto" }, 1080 | { "value": "float16", "label": "float16" }, 1081 | { "value": "bfloat16", "label": "bfloat16" }, 1082 | { "value": "float32", "label": "float32" } 1083 | ] 1084 | }, 1085 | "LONG_LORA_SCALING_FACTORS": { 1086 | "env_var_name": "LONG_LORA_SCALING_FACTORS", 1087 | "value": "", 1088 | "title": "Long LoRA Scaling Factors", 1089 | "description": "Specify multiple scaling factors for LoRA adapters.", 1090 | "required": false, 1091 | "type": "text" 1092 | }, 1093 | "MAX_CPU_LORAS": { 1094 | "env_var_name": "MAX_CPU_LORAS", 1095 | "value": "", 1096 | "title": "Max CPU LoRAs", 1097 | "description": "Maximum number of LoRAs to store in CPU memory.", 1098 | "required": false, 1099 | "type": "number" 1100 | }, 1101 | "FULLY_SHARDED_LORAS": { 1102 | "env_var_name": "FULLY_SHARDED_LORAS", 1103 | "value": false, 1104 | "title": "Fully Sharded LoRAs", 1105 | "description": "Enable fully sharded LoRA layers.", 1106 | "required": false, 1107 | "type": "toggle" 1108 | }, 1109 | "DEVICE": { 1110 | "env_var_name": "DEVICE", 1111 | "value": "auto", 1112 | "title": "Device", 1113 | "description": "Device type for vLLM execution.", 1114 | "required": false, 1115 | "type": "select", 1116 | "options": [ 1117 | { "value": "auto", "label": "auto" }, 1118 | { "value": "cuda", "label": "cuda" }, 1119 | { "value": "neuron", "label": "neuron" }, 1120 | { "value": "cpu", 
"label": "cpu" }, 1121 | { "value": "openvino", "label": "openvino" }, 1122 | { "value": "tpu", "label": "tpu" }, 1123 | { "value": "xpu", "label": "xpu" } 1124 | ] 1125 | }, 1126 | "SCHEDULER_DELAY_FACTOR": { 1127 | "env_var_name": "SCHEDULER_DELAY_FACTOR", 1128 | "value": 0.0, 1129 | "title": "Scheduler Delay Factor", 1130 | "description": "Apply a delay before scheduling next prompt.", 1131 | "required": false, 1132 | "type": "number" 1133 | }, 1134 | "ENABLE_CHUNKED_PREFILL": { 1135 | "env_var_name": "ENABLE_CHUNKED_PREFILL", 1136 | "value": false, 1137 | "title": "Enable Chunked Prefill", 1138 | "description": "Enable chunked prefill requests.", 1139 | "required": false, 1140 | "type": "toggle" 1141 | }, 1142 | "SPECULATIVE_MODEL": { 1143 | "env_var_name": "SPECULATIVE_MODEL", 1144 | "value": "", 1145 | "title": "Speculative Model", 1146 | "description": "The name of the draft model to be used in speculative decoding.", 1147 | "required": false, 1148 | "type": "text" 1149 | }, 1150 | "NUM_SPECULATIVE_TOKENS": { 1151 | "env_var_name": "NUM_SPECULATIVE_TOKENS", 1152 | "value": "", 1153 | "title": "Num Speculative Tokens", 1154 | "description": "The number of speculative tokens to sample from the draft model.", 1155 | "required": false, 1156 | "type": "number" 1157 | }, 1158 | "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE": { 1159 | "env_var_name": "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE", 1160 | "value": "", 1161 | "title": "Speculative Draft Tensor Parallel Size", 1162 | "description": "Number of tensor parallel replicas for the draft model.", 1163 | "required": false, 1164 | "type": "number" 1165 | }, 1166 | "SPECULATIVE_MAX_MODEL_LEN": { 1167 | "env_var_name": "SPECULATIVE_MAX_MODEL_LEN", 1168 | "value": "", 1169 | "title": "Speculative Max Model Length", 1170 | "description": "The maximum sequence length supported by the draft model.", 1171 | "required": false, 1172 | "type": "number" 1173 | }, 1174 | "SPECULATIVE_DISABLE_BY_BATCH_SIZE": { 1175 | "env_var_name": "SPECULATIVE_DISABLE_BY_BATCH_SIZE", 1176 | "value": "", 1177 | "title": "Speculative Disable by Batch Size", 1178 | "description": "Disable speculative decoding if the number of enqueue requests is larger than this value.", 1179 | "required": false, 1180 | "type": "number" 1181 | }, 1182 | "NGRAM_PROMPT_LOOKUP_MAX": { 1183 | "env_var_name": "NGRAM_PROMPT_LOOKUP_MAX", 1184 | "value": "", 1185 | "title": "Ngram Prompt Lookup Max", 1186 | "description": "Max size of window for ngram prompt lookup in speculative decoding.", 1187 | "required": false, 1188 | "type": "number" 1189 | }, 1190 | "NGRAM_PROMPT_LOOKUP_MIN": { 1191 | "env_var_name": "NGRAM_PROMPT_LOOKUP_MIN", 1192 | "value": "", 1193 | "title": "Ngram Prompt Lookup Min", 1194 | "description": "Min size of window for ngram prompt lookup in speculative decoding.", 1195 | "required": false, 1196 | "type": "number" 1197 | }, 1198 | "SPEC_DECODING_ACCEPTANCE_METHOD": { 1199 | "env_var_name": "SPEC_DECODING_ACCEPTANCE_METHOD", 1200 | "value": "rejection_sampler", 1201 | "title": "Speculative Decoding Acceptance Method", 1202 | "description": "Specify the acceptance method for draft token verification in speculative decoding.", 1203 | "required": false, 1204 | "type": "select", 1205 | "options": [ 1206 | { "value": "rejection_sampler", "label": "rejection_sampler" }, 1207 | { "value": "typical_acceptance_sampler", "label": "typical_acceptance_sampler" } 1208 | ] 1209 | }, 1210 | "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD": { 1211 | "env_var_name": 
"TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", 1212 | "value": "", 1213 | "title": "Typical Acceptance Sampler Posterior Threshold", 1214 | "description": "Set the lower bound threshold for the posterior probability of a token to be accepted.", 1215 | "required": false, 1216 | "type": "number" 1217 | }, 1218 | "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA": { 1219 | "env_var_name": "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA", 1220 | "value": "", 1221 | "title": "Typical Acceptance Sampler Posterior Alpha", 1222 | "description": "A scaling factor for the entropy-based threshold for token acceptance.", 1223 | "required": false, 1224 | "type": "number" 1225 | }, 1226 | "MODEL_LOADER_EXTRA_CONFIG": { 1227 | "env_var_name": "MODEL_LOADER_EXTRA_CONFIG", 1228 | "value": "", 1229 | "title": "Model Loader Extra Config", 1230 | "description": "Extra config for model loader.", 1231 | "required": false, 1232 | "type": "text" 1233 | }, 1234 | "PREEMPTION_MODE": { 1235 | "env_var_name": "PREEMPTION_MODE", 1236 | "value": "", 1237 | "title": "Preemption Mode", 1238 | "description": "If 'recompute', the engine performs preemption-aware recomputation. If 'save', the engine saves activations into the CPU memory as preemption happens.", 1239 | "required": false, 1240 | "type": "text" 1241 | }, 1242 | "PREEMPTION_CHECK_PERIOD": { 1243 | "env_var_name": "PREEMPTION_CHECK_PERIOD", 1244 | "value": 1.0, 1245 | "title": "Preemption Check Period", 1246 | "description": "How frequently the engine checks if a preemption happens.", 1247 | "required": false, 1248 | "type": "number" 1249 | }, 1250 | "PREEMPTION_CPU_CAPACITY": { 1251 | "env_var_name": "PREEMPTION_CPU_CAPACITY", 1252 | "value": 2, 1253 | "title": "Preemption CPU Capacity", 1254 | "description": "The percentage of CPU memory used for the saved activations.", 1255 | "required": false, 1256 | "type": "number" 1257 | }, 1258 | "MAX_LOG_LEN": { 1259 | "env_var_name": "MAX_LOG_LEN", 1260 | "value": "", 1261 | "title": "Max Log Length", 1262 | "description": "Max number of characters or ID numbers being printed in log.", 1263 | "required": false, 1264 | "type": "number" 1265 | }, 1266 | "DISABLE_LOGGING_REQUEST": { 1267 | "env_var_name": "DISABLE_LOGGING_REQUEST", 1268 | "value": false, 1269 | "title": "Disable Logging Request", 1270 | "description": "Disable logging requests.", 1271 | "required": false, 1272 | "type": "toggle" 1273 | }, 1274 | "TOKENIZER_NAME": { 1275 | "env_var_name": "TOKENIZER_NAME", 1276 | "value": "", 1277 | "title": "Tokenizer Name", 1278 | "description": "Tokenizer repo to use a different tokenizer than the model's default", 1279 | "required": false, 1280 | "type": "text" 1281 | }, 1282 | "TOKENIZER_REVISION": { 1283 | "env_var_name": "TOKENIZER_REVISION", 1284 | "value": "", 1285 | "title": "Tokenizer Revision", 1286 | "description": "Tokenizer revision to load", 1287 | "required": false, 1288 | "type": "text" 1289 | }, 1290 | "CUSTOM_CHAT_TEMPLATE": { 1291 | "env_var_name": "CUSTOM_CHAT_TEMPLATE", 1292 | "value": "", 1293 | "title": "Custom Chat Template", 1294 | "description": "Custom chat jinja template", 1295 | "required": false, 1296 | "type": "text" 1297 | }, 1298 | "GPU_MEMORY_UTILIZATION": { 1299 | "env_var_name": "GPU_MEMORY_UTILIZATION", 1300 | "value": "0.95", 1301 | "title": "GPU Memory Utilization", 1302 | "description": "Sets GPU VRAM utilization", 1303 | "required": false, 1304 | "type": "number" 1305 | }, 1306 | "BLOCK_SIZE": { 1307 | "env_var_name": "BLOCK_SIZE", 1308 | "value": "16", 1309 | "title": "Block Size", 
1310 | "description": "Token block size for contiguous chunks of tokens", 1311 | "required": false, 1312 | "type": "number" 1313 | }, 1314 | "SWAP_SPACE": { 1315 | "env_var_name": "SWAP_SPACE", 1316 | "value": "4", 1317 | "title": "Swap Space", 1318 | "description": "CPU swap space size (GiB) per GPU", 1319 | "required": false, 1320 | "type": "number" 1321 | }, 1322 | "ENFORCE_EAGER": { 1323 | "env_var_name": "ENFORCE_EAGER", 1324 | "value": false, 1325 | "title": "Enforce Eager", 1326 | "description": "Always use eager-mode PyTorch. If False (0), will use eager mode and CUDA graph in hybrid for maximal performance and flexibility", 1327 | "required": false, 1328 | "type": "toggle" 1329 | }, 1330 | "MAX_SEQ_LEN_TO_CAPTURE": { 1331 | "env_var_name": "MAX_SEQ_LEN_TO_CAPTURE", 1332 | "value": "8192", 1333 | "title": "CUDA Graph Max Content Length", 1334 | "description": "Maximum context length covered by CUDA graphs. If a sequence has context length larger than this, we fall back to eager mode", 1335 | "required": false, 1336 | "type": "number" 1337 | }, 1338 | "DISABLE_CUSTOM_ALL_REDUCE": { 1339 | "env_var_name": "DISABLE_CUSTOM_ALL_REDUCE", 1340 | "value": false, 1341 | "title": "Disable Custom All Reduce", 1342 | "description": "Enables or disables custom all reduce", 1343 | "required": false, 1344 | "type": "toggle" 1345 | }, 1346 | "DEFAULT_BATCH_SIZE": { 1347 | "env_var_name": "DEFAULT_BATCH_SIZE", 1348 | "value": "50", 1349 | "title": "Default Final Batch Size", 1350 | "description": "Default and Maximum batch size for token streaming to reduce HTTP calls", 1351 | "required": false, 1352 | "type": "number" 1353 | }, 1354 | "DEFAULT_MIN_BATCH_SIZE": { 1355 | "env_var_name": "DEFAULT_MIN_BATCH_SIZE", 1356 | "value": "1", 1357 | "title": "Default Starting Batch Size", 1358 | "description": "Batch size for the first request, which will be multiplied by the growth factor every subsequent request", 1359 | "required": false, 1360 | "type": "number" 1361 | }, 1362 | "DEFAULT_BATCH_SIZE_GROWTH_FACTOR": { 1363 | "env_var_name": "DEFAULT_BATCH_SIZE_GROWTH_FACTOR", 1364 | "value": "3", 1365 | "title": "Default Batch Size Growth Factor", 1366 | "description": "Growth factor for dynamic batch size", 1367 | "required": false, 1368 | "type": "number" 1369 | }, 1370 | "RAW_OPENAI_OUTPUT": { 1371 | "env_var_name": "RAW_OPENAI_OUTPUT", 1372 | "value": true, 1373 | "title": "Raw OpenAI Output", 1374 | "description": "Raw OpenAI output instead of just the text", 1375 | "required": false, 1376 | "type": "toggle" 1377 | }, 1378 | "OPENAI_RESPONSE_ROLE": { 1379 | "env_var_name": "OPENAI_RESPONSE_ROLE", 1380 | "value": "assistant", 1381 | "title": "OpenAI Response Role", 1382 | "description": "Role of the LLM's Response in OpenAI Chat Completions", 1383 | "required": false, 1384 | "type": "text" 1385 | }, 1386 | "OPENAI_SERVED_MODEL_NAME_OVERRIDE": { 1387 | "env_var_name": "OPENAI_SERVED_MODEL_NAME_OVERRIDE", 1388 | "value": "", 1389 | "title": "OpenAI Served Model Name Override", 1390 | "description": "Overrides the name of the served model from model repo/path to specified name, which you will then be able to use the value for the `model` parameter when making OpenAI requests", 1391 | "required": false, 1392 | "type": "text" 1393 | }, 1394 | "MAX_CONCURRENCY": { 1395 | "env_var_name": "MAX_CONCURRENCY", 1396 | "value": "300", 1397 | "title": "Max Concurrency", 1398 | "description": "Max concurrent requests per worker. 
vLLM has an internal queue, so you don't have to worry about limiting by VRAM, this is for improving scaling/load balancing efficiency", 1399 | "required": false, 1400 | "type": "number" 1401 | }, 1402 | "MODEL_REVISION": { 1403 | "env_var_name": "MODEL_REVISION", 1404 | "value": "", 1405 | "title": "Model Revision", 1406 | "description": "Model revision (branch) to load", 1407 | "required": false, 1408 | "type": "text" 1409 | }, 1410 | "BASE_PATH": { 1411 | "env_var_name": "BASE_PATH", 1412 | "value": "/runpod-volume", 1413 | "title": "Base Path", 1414 | "description": "Storage directory for Huggingface cache and model", 1415 | "required": false, 1416 | "type": "text" 1417 | }, 1418 | "DISABLE_LOG_REQUESTS": { 1419 | "env_var_name": "DISABLE_LOG_REQUESTS", 1420 | "value": true, 1421 | "title": "Disable Log Requests", 1422 | "description": "Enables or disables vLLM request logging", 1423 | "required": false, 1424 | "type": "toggle" 1425 | }, 1426 | "ENABLE_AUTO_TOOL_CHOICE": { 1427 | "env_var_name": "ENABLE_AUTO_TOOL_CHOICE", 1428 | "value": false, 1429 | "title": "Enable Auto Tool Choice", 1430 | "description": "Enables or disables auto tool choice", 1431 | "required": false, 1432 | "type": "toggle" 1433 | }, 1434 | "TOOL_CALL_PARSER": { 1435 | "env_var_name": "TOOL_CALL_PARSER", 1436 | "value": "", 1437 | "title": "Tool Call Parser", 1438 | "description": "Tool call parser", 1439 | "required": false, 1440 | "type": "select", 1441 | "options": [ 1442 | { "value": "", "label": "None" }, 1443 | { "value": "hermes", "label": "Hermes" }, 1444 | { "value": "mistral", "label": "Mistral" }, 1445 | { "value": "llama3_json", "label": "Llama3 JSON" }, 1446 | { "value": "pythonic", "label": "Pythonic" }, 1447 | { "value": "internlm", "label": "InternLM" } 1448 | ] 1449 | } 1450 | } 1451 | } 1452 | --------------------------------------------------------------------------------
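The schema in worker-config.json maps each UI setting to an environment variable ("env_var_name"), a default ("value"), and an input "type" (text, number, toggle, select). Below is a minimal sketch, under stated assumptions, of how such a schema could be resolved into a flat settings dict at container start; the helper names and casting rules (resolve_settings, _cast) are illustrative only and are not the repo's actual logic in src/engine_args.py, which may differ.

import json
import os
from typing import Any, Dict


def _cast(raw: str, type_name: str) -> Any:
    """Coerce a raw environment-variable string using the schema's declared type."""
    if type_name == "number":
        # The schema mixes integer and float defaults (e.g. BLOCK_SIZE vs SCHEDULER_DELAY_FACTOR).
        return float(raw) if "." in raw else int(raw)
    if type_name == "toggle":
        return raw.strip().lower() in ("1", "true", "yes")
    # "text" and "select" values stay as plain strings.
    return raw


def resolve_settings(config_path: str = "worker-config.json") -> Dict[str, Any]:
    """Resolve every schema key from the environment, falling back to the schema default."""
    with open(config_path) as f:
        schema = json.load(f)["schema"]

    settings: Dict[str, Any] = {}
    for key, spec in schema.items():
        raw = os.environ.get(spec["env_var_name"], "")
        settings[key] = _cast(raw, spec["type"]) if raw != "" else spec["value"]
    return settings


if __name__ == "__main__":
    resolved = resolve_settings()
    print(resolved["MAX_NUM_SEQS"], resolved["GPU_MEMORY_UTILIZATION"])

For example, with MAX_NUM_SEQS=128 exported, resolve_settings()["MAX_NUM_SEQS"] returns the integer 128, while unset keys keep the defaults shown in the schema (GPU_MEMORY_UTILIZATION stays "0.95" in this sketch because defaults are taken verbatim from the JSON).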