├── .buildkite ├── run-amd-test.sh ├── run-benchmarks.sh ├── test-pipeline.yaml └── test-template.j2 ├── .dockerignore ├── .github ├── ISSUE_TEMPLATE │ ├── 100-documentation.yml │ ├── 200-installation.yml │ ├── 300-usage.yml │ ├── 400-bug report.yml │ ├── 500-feature request.yml │ ├── 600-new model.yml │ ├── 700-performance discussion.yml │ ├── 800-misc discussion.yml │ └── config.yml ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── publish.yml │ ├── ruff.yml │ ├── scripts │ ├── build.sh │ ├── create_release.js │ ├── cuda-install.sh │ ├── env.sh │ └── pytorch-install.sh │ └── yapf.yml ├── .gitignore ├── .readthedocs.yaml ├── .yapfignore ├── CMakeLists.txt ├── CONTRIBUTING.md ├── Dockerfile ├── Dockerfile.rocm ├── LICENSE ├── MANIFEST.in ├── README.md ├── benchmarks ├── README.md ├── backend_request_func.py ├── benchmark_latency.py ├── benchmark_prefix_caching.py ├── benchmark_serving.py ├── benchmark_throughput.py ├── kernels │ ├── benchmark_mixtral_moe.py │ ├── benchmark_paged_attention.py │ └── benchmark_rope.py └── launch_tgi_server.sh ├── cmake ├── hipify.py └── utils.cmake ├── collect_env.py ├── csrc ├── activation_kernels.cu ├── attention │ ├── attention_dtypes.h │ ├── attention_generic.cuh │ ├── attention_kernels.cu │ ├── attention_utils.cuh │ ├── dtype_bfloat16.cuh │ ├── dtype_float16.cuh │ ├── dtype_float32.cuh │ └── dtype_fp8_e5m2.cuh ├── cache.h ├── cache_kernels.cu ├── cuda_compat.h ├── cuda_utils.h ├── cuda_utils_kernels.cu ├── custom_all_reduce.cu ├── custom_all_reduce.cuh ├── custom_all_reduce_test.cu ├── dispatch_utils.h ├── layernorm_kernels.cu ├── moe │ ├── moe_ops.cpp │ ├── moe_ops.h │ └── topk_softmax_kernels.cu ├── moe_align_block_size_kernels.cu ├── ops.h ├── pos_encoding_kernels.cu ├── punica │ ├── LICENSE │ ├── bgmv │ │ ├── bgmv_bf16_bf16_bf16.cu │ │ ├── bgmv_bf16_bf16_fp16.cu │ │ ├── bgmv_bf16_fp16_bf16.cu │ │ ├── bgmv_bf16_fp16_fp16.cu │ │ ├── bgmv_bf16_fp32_bf16.cu │ │ ├── bgmv_bf16_fp32_fp16.cu │ │ ├── bgmv_config.h │ │ ├── bgmv_fp16_bf16_bf16.cu │ │ ├── bgmv_fp16_bf16_fp16.cu │ │ ├── bgmv_fp16_fp16_bf16.cu │ │ ├── bgmv_fp16_fp16_fp16.cu │ │ ├── bgmv_fp16_fp32_bf16.cu │ │ ├── bgmv_fp16_fp32_fp16.cu │ │ ├── bgmv_fp32_bf16_bf16.cu │ │ ├── bgmv_fp32_bf16_fp16.cu │ │ ├── bgmv_fp32_fp16_bf16.cu │ │ ├── bgmv_fp32_fp16_fp16.cu │ │ ├── bgmv_fp32_fp32_bf16.cu │ │ ├── bgmv_fp32_fp32_fp16.cu │ │ ├── bgmv_impl.cuh │ │ ├── generator.py │ │ └── vec_dtypes.cuh │ └── punica_ops.cc ├── pybind.cpp ├── quantization │ ├── awq │ │ ├── dequantize.cuh │ │ └── gemm_kernels.cu │ ├── fp8_e5m2_kvcache │ │ └── quant_utils.cuh │ ├── gptq │ │ ├── compat.cuh │ │ ├── matrix_view.cuh │ │ ├── q_gemm.cu │ │ ├── qdq_2.cuh │ │ ├── qdq_3.cuh │ │ ├── qdq_4.cuh │ │ ├── qdq_8.cuh │ │ └── qdq_util.cuh │ ├── marlin │ │ ├── LICENSE │ │ └── marlin_cuda_kernel.cu │ └── squeezellm │ │ └── quant_cuda_kernel.cu └── reduction_utils.cuh ├── docs ├── Makefile ├── README.md ├── make.bat ├── requirements-docs.txt └── source │ ├── assets │ ├── kernel │ │ ├── k_vecs.png │ │ ├── key.png │ │ ├── logits_vec.png │ │ ├── q_vecs.png │ │ ├── query.png │ │ ├── v_vec.png │ │ └── value.png │ └── logos │ │ ├── vllm-logo-only-light.png │ │ ├── vllm-logo-text-dark.png │ │ └── vllm-logo-text-light.png │ ├── conf.py │ ├── dev │ ├── engine │ │ ├── async_llm_engine.rst │ │ ├── engine_index.rst │ │ └── llm_engine.rst │ ├── kernel │ │ └── paged_attention.rst │ └── sampling_params.rst │ ├── getting_started │ ├── amd-installation.rst │ ├── installation.rst │ ├── neuron-installation.rst │ └── quickstart.rst │ ├── index.rst │ ├── models │ 
├── adding_model.rst │ ├── engine_args.rst │ ├── lora.rst │ └── supported_models.rst │ ├── quantization │ ├── auto_awq.rst │ └── fp8_e5m2_kv_cache.rst │ └── serving │ ├── deploying_with_bentoml.rst │ ├── deploying_with_docker.rst │ ├── deploying_with_kserve.rst │ ├── deploying_with_triton.rst │ ├── distributed_serving.rst │ ├── integrations.rst │ ├── metrics.rst │ ├── openai_compatible_server.md │ ├── run_on_sky.rst │ └── serving_with_langchain.rst ├── examples ├── api_client.py ├── gradio_openai_chatbot_webserver.py ├── gradio_webserver.py ├── llm_engine_example.py ├── multilora_inference.py ├── offline_inference.py ├── offline_inference_distributed.py ├── offline_inference_neuron.py ├── offline_inference_with_prefix.py ├── openai_chatcompletion_client.py ├── openai_completion_client.py ├── production_monitoring │ ├── README.md │ ├── docker-compose.yaml │ ├── grafana.json │ └── prometheus.yaml ├── template_alpaca.jinja ├── template_baichuan.jinja ├── template_chatglm.jinja ├── template_chatglm2.jinja ├── template_chatml.jinja ├── template_falcon.jinja ├── template_falcon_180b.jinja └── template_inkbot.jinja ├── format.sh ├── patch_xformers.rocm.sh ├── pyproject.toml ├── requirements-build.txt ├── requirements-dev.txt ├── requirements-neuron.txt ├── requirements-rocm.txt ├── requirements.txt ├── rocm_patch ├── commonpy_xformers-0.0.23.rocm.patch ├── flashpy_xformers-0.0.23.rocm.patch └── rocm_bf16.patch ├── setup.py ├── tests ├── __init__.py ├── async_engine │ ├── api_server_async_engine.py │ ├── test_api_server.py │ ├── test_async_llm_engine.py │ ├── test_chat_template.py │ └── test_request_tracker.py ├── basic_correctness │ └── test_basic_correctness.py ├── conftest.py ├── core │ ├── __init__.py │ ├── test_block_manager.py │ ├── test_scheduler.py │ └── utils.py ├── distributed │ ├── test_basic_distributed_correctness.py │ ├── test_comm_ops.py │ └── test_custom_all_reduce.py ├── engine │ └── test_computed_prefix_blocks.py ├── entrypoints │ ├── test_guided_processors.py │ └── test_openai_server.py ├── kernels │ ├── allclose_default.py │ ├── conftest.py │ ├── test_activation.py │ ├── test_attention.py │ ├── test_cache.py │ ├── test_layernorm.py │ ├── test_moe.py │ ├── test_pos_encoding.py │ ├── test_prefix_prefill.py │ ├── test_rand.py │ └── test_sampler.py ├── lora │ ├── __init__.py │ ├── conftest.py │ ├── test_gemma.py │ ├── test_layer_variation.py │ ├── test_layers.py │ ├── test_llama.py │ ├── test_lora.py │ ├── test_lora_manager.py │ ├── test_mixtral.py │ ├── test_punica.py │ ├── test_tokenizer_group.py │ ├── test_utils.py │ ├── test_worker.py │ └── utils.py ├── metrics │ └── test_metrics.py ├── models │ ├── test_marlin.py │ ├── test_mistral.py │ └── test_models.py ├── prefix_caching │ └── test_prefix_caching.py ├── prompts │ ├── example.txt │ └── summary.txt ├── samplers │ ├── test_beam_search.py │ ├── test_logprobs.py │ ├── test_rejection_sampler.py │ ├── test_sampler.py │ └── test_seeded_generate.py ├── spec_decode │ ├── __init__.py │ ├── test_batch_expansion.py │ ├── test_metrics.py │ ├── test_multi_step_worker.py │ ├── test_spec_decode_worker.py │ ├── test_utils.py │ └── utils.py ├── test_cache_block_hashing.py ├── test_config.py ├── test_logits_processor.py ├── test_regression.py ├── test_sampling_params.py ├── test_sequence.py ├── tokenization │ ├── __init__.py │ ├── test_cached_tokenizer.py │ ├── test_detokenize.py │ └── test_tokenizer_group.py └── worker │ ├── __init__.py │ ├── test_model_runner.py │ └── test_swap.py └── vllm ├── __init__.py ├── block.py ├── config.py ├── core 
├── __init__.py ├── block_manager.py ├── evictor.py ├── policy.py └── scheduler.py ├── engine ├── __init__.py ├── arg_utils.py ├── async_llm_engine.py ├── llm_engine.py ├── metrics.py └── ray_utils.py ├── entrypoints ├── __init__.py ├── api_server.py ├── llm.py └── openai │ ├── __init__.py │ ├── api_server.py │ ├── cli_args.py │ ├── protocol.py │ ├── serving_chat.py │ ├── serving_completion.py │ └── serving_engine.py ├── executor ├── __init__.py ├── executor_base.py ├── gpu_executor.py ├── ray_gpu_executor.py └── utils.py ├── logger.py ├── lora ├── __init__.py ├── layers.py ├── lora.py ├── models.py ├── punica.py ├── request.py ├── utils.py └── worker_manager.py ├── model_executor ├── __init__.py ├── guided_decoding.py ├── guided_logits_processors.py ├── input_metadata.py ├── layers │ ├── __init__.py │ ├── activation.py │ ├── attention │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── backends │ │ │ ├── __init__.py │ │ │ ├── flash_attn.py │ │ │ └── xformers.py │ │ └── ops │ │ │ ├── __init__.py │ │ │ ├── paged_attn.py │ │ │ └── prefix_prefill.py │ ├── fused_moe │ │ ├── __init__.py │ │ ├── configs │ │ │ ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ └── README │ │ └── fused_moe.py │ ├── layernorm.py │ ├── linear.py │ ├── logits_processor.py │ ├── ops │ │ ├── __init__.py │ │ ├── rand.py │ │ └── sample.py │ ├── quantization │ │ ├── __init__.py │ │ ├── awq.py │ │ ├── base_config.py │ │ ├── gptq.py │ │ ├── marlin.py │ │ └── squeezellm.py │ ├── rejection_sampler.py │ ├── rotary_embedding.py │ ├── sampler.py │ └── vocab_parallel_embedding.py ├── model_loader.py ├── models │ ├── __init__.py │ ├── baichuan.py │ ├── bloom.py │ ├── chatglm.py │ ├── decilm.py │ ├── deepseek.py │ ├── falcon.py │ ├── gemma.py │ ├── gpt2.py │ ├── gpt_bigcode.py │ ├── gpt_j.py │ ├── gpt_neox.py │ ├── internlm2.py │ ├── llama.py │ ├── mixtral.py │ ├── mixtral_quant.py │ ├── mpt.py │ ├── neuron │ │ ├── llama.py │ │ └── mistral.py │ ├── olmo.py │ ├── opt.py │ ├── orion.py │ ├── phi.py │ ├── qwen.py │ ├── qwen2.py │ ├── stablelm.py │ └── starcoder2.py ├── neuron_model_loader.py ├── parallel_utils │ ├── README.md │ ├── __init__.py │ ├── communication_op.py │ ├── cupy_utils.py │ ├── custom_all_reduce.py │ ├── parallel_state.py │ └── utils.py ├── sampling_metadata.py ├── utils.py └── weight_utils.py ├── outputs.py ├── py.typed ├── sampling_params.py ├── sequence.py ├── spec_decode ├── batch_expansion.py ├── interfaces.py ├── metrics.py ├── multi_step_worker.py ├── spec_decode_worker.py └── util.py ├── test_utils.py ├── transformers_utils ├── __init__.py ├── config.py ├── configs │ ├── __init__.py │ ├── chatglm.py │ ├── falcon.py │ ├── mpt.py │ └── starcoder2.py ├── tokenizer.py ├── tokenizer_group │ ├── __init__.py │ ├── base_tokenizer_group.py │ ├── ray_tokenizer_group.py │ └── tokenizer_group.py └── tokenizers │ ├── __init__.py │ └── baichuan.py ├── utils.py └── worker ├── __init__.py ├── cache_engine.py ├── model_runner.py ├── neuron_worker.py └── worker.py /.buildkite/run-amd-test.sh: -------------------------------------------------------------------------------- 1 | # This script build the ROCm docker image and run the API server inside the container. 
2 | # It serves as a sanity check for compilation and basic model usage. 3 | set -ex 4 | 5 | # Print ROCm version 6 | rocminfo 7 | 8 | # Try building the docker image 9 | docker build -t rocm -f Dockerfile.rocm . 10 | 11 | # Setup cleanup 12 | remove_docker_container() { docker rm -f rocm || true; } 13 | trap remove_docker_container EXIT 14 | remove_docker_container 15 | 16 | # Run the image 17 | docker run --device /dev/kfd --device /dev/dri --network host --name rocm rocm python3 -m vllm.entrypoints.api_server & 18 | 19 | # Wait for the server to start 20 | wait_for_server_to_start() { 21 | timeout=300 22 | counter=0 23 | 24 | while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do 25 | sleep 1 26 | counter=$((counter + 1)) 27 | if [ $counter -ge $timeout ]; then 28 | echo "Timeout after $timeout seconds" 29 | break 30 | fi 31 | done 32 | } 33 | wait_for_server_to_start 34 | 35 | # Test a simple prompt 36 | curl -X POST -H "Content-Type: application/json" \ 37 | localhost:8000/generate \ 38 | -d '{"prompt": "San Francisco is a"}' 39 | -------------------------------------------------------------------------------- /.buildkite/run-benchmarks.sh: -------------------------------------------------------------------------------- 1 | # This script is run by buildkite to run the benchmarks and upload the results to buildkite 2 | 3 | set -ex 4 | set -o pipefail 5 | 6 | # cd into parent directory of this file 7 | cd "$(dirname "${BASH_SOURCE[0]}")/.." 8 | 9 | (which wget && which curl) || (apt-get update && apt-get install -y wget curl) 10 | 11 | # run python-based benchmarks and upload the result to buildkite 12 | python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt 13 | bench_latency_exit_code=$? 14 | 15 | python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt 16 | bench_throughput_exit_code=$? 17 | 18 | # run server-based benchmarks and upload the result to buildkite 19 | python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf & 20 | server_pid=$! 21 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json 22 | 23 | # wait for server to start, timeout after 600 seconds 24 | timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 25 | python3 benchmarks/benchmark_serving.py \ 26 | --backend openai \ 27 | --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json \ 28 | --model meta-llama/Llama-2-7b-chat-hf \ 29 | --num-prompts 20 \ 30 | --endpoint /v1/completions \ 31 | --tokenizer meta-llama/Llama-2-7b-chat-hf \ 32 | --save-result \ 33 | 2>&1 | tee benchmark_serving.txt 34 | bench_serving_exit_code=$?
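(run-benchmarks.sh continues below.) For readers who would rather drive the same smoke test from Python than from curl, here is a minimal client sketch mirroring the health-check loop and the /generate request in run-amd-test.sh above. It assumes only what that script shows — a /health endpoint that returns HTTP 200 once the server is up and a /generate endpoint that accepts a JSON body with a "prompt" field — and it prints the response body verbatim rather than assuming its schema.

```python
# Minimal sketch of the same smoke test run-amd-test.sh performs with curl.
# Assumes only what the script above shows: a /health endpoint that returns
# HTTP 200 once the server is up, and a /generate endpoint that accepts a
# JSON body with a "prompt" field. The response schema is not assumed.
import json
import time
import urllib.error
import urllib.request

BASE_URL = "http://localhost:8000"


def wait_for_server(timeout_s: int = 300) -> None:
    """Poll /health once per second until it returns 200 or we time out."""
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(f"{BASE_URL}/health", timeout=5) as resp:
                if resp.status == 200:
                    return
        except (urllib.error.URLError, OSError):
            pass  # server not ready yet
        time.sleep(1)
    raise TimeoutError(f"Server did not become healthy within {timeout_s}s")


def generate(prompt: str) -> str:
    """POST a prompt to /generate and return the raw response body."""
    req = urllib.request.Request(
        f"{BASE_URL}/generate",
        data=json.dumps({"prompt": prompt}).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req, timeout=60) as resp:
        return resp.read().decode("utf-8")


if __name__ == "__main__":
    wait_for_server()
    print(generate("San Francisco is a"))
```

Run it while the container started by the script is up; it raises a TimeoutError if the server never becomes healthy.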
35 | kill $server_pid 36 | 37 | # write the results into a markdown file 38 | echo "### Latency Benchmarks" >> benchmark_results.md 39 | sed -n '1p' benchmark_latency.txt >> benchmark_results.md # first line 40 | echo "" >> benchmark_results.md 41 | sed -n '$p' benchmark_latency.txt >> benchmark_results.md # last line 42 | 43 | echo "### Throughput Benchmarks" >> benchmark_results.md 44 | sed -n '1p' benchmark_throughput.txt >> benchmark_results.md # first line 45 | echo "" >> benchmark_results.md 46 | sed -n '$p' benchmark_throughput.txt >> benchmark_results.md # last line 47 | 48 | echo "### Serving Benchmarks" >> benchmark_results.md 49 | sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line 50 | echo "" >> benchmark_results.md 51 | tail -n 13 benchmark_serving.txt >> benchmark_results.md # last 13 lines 52 | 53 | # upload the results to buildkite 54 | /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md 55 | 56 | # exit with the exit code of the benchmarks 57 | if [ $bench_latency_exit_code -ne 0 ]; then 58 | exit $bench_latency_exit_code 59 | fi 60 | 61 | if [ $bench_throughput_exit_code -ne 0 ]; then 62 | exit $bench_throughput_exit_code 63 | fi 64 | 65 | if [ $bench_serving_exit_code -ne 0 ]; then 66 | exit $bench_serving_exit_code 67 | fi 68 | 69 | /workspace/buildkite-agent artifact upload openai-*.json 70 | -------------------------------------------------------------------------------- /.buildkite/test-pipeline.yaml: -------------------------------------------------------------------------------- 1 | # In this file, you can add more tests to run either by adding a new step or 2 | # adding a new command to an existing step. See different options here for examples. 3 | # This script will be fed into the Jinja template in `test-template.j2` to generate 4 | # the final pipeline yaml file. 5 | 6 | steps: 7 | - label: Regression Test 8 | command: pytest -v -s test_regression.py 9 | working_dir: "/vllm-workspace/tests" # optional 10 | 11 | - label: AsyncEngine Test 12 | command: pytest -v -s async_engine 13 | 14 | - label: Basic Correctness Test 15 | command: pytest -v -s --forked basic_correctness 16 | 17 | - label: Core Test 18 | command: pytest -v -s core 19 | 20 | - label: Distributed Comm Ops Test 21 | command: pytest -v -s --forked test_comm_ops.py 22 | working_dir: "/vllm-workspace/tests/distributed" 23 | num_gpus: 2 # only support 1 or 2 for now. 24 | 25 | - label: Distributed Correctness Test 26 | command: pytest -v -s --forked test_basic_distributed_correctness.py 27 | working_dir: "/vllm-workspace/tests/distributed" 28 | num_gpus: 2 # only support 1 or 2 for now.
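(The step list, and the test-template.j2 it is rendered with, continue below.) As a rough illustration of the flow the header comment above describes — this step list being fed into the Jinja template to produce the final pipeline — the sketch below renders the two files locally. It assumes PyYAML and Jinja2 are installed; the exact command the CI uses to perform this render is not part of this excerpt.

```python
# Illustrative sketch of the flow described in the test-pipeline.yaml header:
# the step list is fed into the Jinja template test-template.j2 (shown further
# below) to produce the final Buildkite pipeline. The real CI glue that does
# this is not part of this dump; PyYAML and Jinja2 are assumed to be installed.
import yaml
from jinja2 import Template

with open(".buildkite/test-pipeline.yaml") as f:
    steps = yaml.safe_load(f)["steps"]

with open(".buildkite/test-template.j2") as f:
    template = Template(f.read())

# The template iterates over `steps` and turns each entry into a pod spec,
# using `step.command` or joining `step.commands` with " && ".
rendered = template.render(steps=steps)

with open("pipeline.yaml", "w") as f:
    f.write(rendered)
print(rendered[:500])
```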
29 | 30 | - label: Engine Test 31 | command: pytest -v -s engine tokenization test_sequence.py test_config.py 32 | 33 | - label: Entrypoints Test 34 | command: pytest -v -s entrypoints 35 | 36 | - label: Kernels Test %N 37 | command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT 38 | parallelism: 4 39 | 40 | - label: Models Test 41 | commands: 42 | - pytest -v -s models --forked 43 | soft_fail: true 44 | 45 | - label: Prefix Caching Test 46 | commands: 47 | - pytest -v -s prefix_caching 48 | 49 | - label: Samplers Test 50 | command: pytest -v -s samplers 51 | 52 | - label: LogitsProcessor Test 53 | command: pytest -v -s test_logits_processor.py 54 | 55 | - label: Worker Test 56 | command: pytest -v -s worker 57 | 58 | - label: Speculative decoding tests 59 | command: pytest -v -s spec_decode 60 | 61 | - label: LoRA Test %N 62 | command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT 63 | parallelism: 4 64 | 65 | - label: Metrics Test 66 | command: pytest -v -s metrics 67 | 68 | - label: Benchmarks 69 | working_dir: "/vllm-workspace/.buildkite" 70 | commands: 71 | - pip install aiohttp 72 | - bash run-benchmarks.sh 73 | 74 | - label: Documentation Build 75 | working_dir: "/vllm-workspace/docs" 76 | no_gpu: True 77 | commands: 78 | - pip install -r requirements-docs.txt 79 | - SPHINXOPTS=\"-W\" make html 80 | -------------------------------------------------------------------------------- /.buildkite/test-template.j2: -------------------------------------------------------------------------------- 1 | {% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %} 2 | {% set default_num_gpu = 1 %} 3 | {% set default_working_dir = "/vllm-workspace/tests" %} 4 | 5 | steps: 6 | - label: "AMD Test" 7 | agents: 8 | queue: amd 9 | command: bash .buildkite/run-amd-test.sh 10 | 11 | - label: ":docker: build image" 12 | commands: 13 | - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ." 
14 | - "docker push {{ docker_image }}" 15 | env: 16 | DOCKER_BUILDKIT: "1" 17 | retry: 18 | automatic: 19 | - exit_status: -1 # Agent was lost 20 | limit: 5 21 | - wait 22 | 23 | {% for step in steps %} 24 | - label: "{{ step.label }}" 25 | agents: 26 | queue: kubernetes 27 | soft_fail: {{ step.soft_fail or false }} 28 | {% if step.parallelism %} 29 | parallelism: {{ step.parallelism }} 30 | {% endif %} 31 | retry: 32 | automatic: 33 | - exit_status: -1 # Agent was lost 34 | limit: 5 35 | plugins: 36 | - kubernetes: 37 | podSpec: 38 | volumes: 39 | - name: dshm 40 | emptyDir: 41 | medium: Memory 42 | containers: 43 | - image: "{{ docker_image }}" 44 | command: ["bash"] 45 | args: 46 | - '-c' 47 | - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'" 48 | {% if not step.no_gpu %} 49 | resources: 50 | requests: 51 | nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}" 52 | limits: 53 | nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}" 54 | {% endif %} 55 | env: 56 | - name: HF_TOKEN 57 | valueFrom: 58 | secretKeyRef: 59 | name: hf-token-secret 60 | key: token 61 | volumeMounts: 62 | - mountPath: /dev/shm 63 | name: dshm 64 | {% endfor %} 65 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | vllm/*.so 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/100-documentation.yml: -------------------------------------------------------------------------------- 1 | name: 📚 Documentation 2 | description: Report an issue related to https://docs.vllm.ai/ 3 | title: "[Doc]: " 4 | labels: ["documentation"] 5 | 6 | body: 7 | - type: textarea 8 | attributes: 9 | label: 📚 The doc issue 10 | description: > 11 | A clear and concise description of what content in https://docs.vllm.ai/ is an issue. 12 | validations: 13 | required: true 14 | - type: textarea 15 | attributes: 16 | label: Suggest a potential alternative/fix 17 | description: > 18 | Tell us how we could improve the documentation in this regard. 19 | - type: markdown 20 | attributes: 21 | value: > 22 | Thanks for contributing 🎉! 23 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/200-installation.yml: -------------------------------------------------------------------------------- 1 | name: 🛠️ Installation 2 | description: Report an issue here when you hit errors during installation. 3 | title: "[Installation]: " 4 | labels: ["installation"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | - type: textarea 12 | attributes: 13 | label: Your current environment 14 | description: | 15 | Please run the following and paste the output below. 16 | ```sh 17 | wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py 18 | # For security purposes, please feel free to check the contents of collect_env.py before running it. 
19 | python collect_env.py 20 | ``` 21 | value: | 22 | ```text 23 | The output of `python collect_env.py` 24 | ``` 25 | validations: 26 | required: true 27 | - type: textarea 28 | attributes: 29 | label: How you are installing vllm 30 | description: | 31 | Paste the full command you are trying to execute. 32 | value: | 33 | ```sh 34 | pip install -vvv vllm 35 | ``` 36 | - type: markdown 37 | attributes: 38 | value: > 39 | Thanks for contributing 🎉! 40 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/300-usage.yml: -------------------------------------------------------------------------------- 1 | name: 💻 Usage 2 | description: Raise an issue here if you don't know how to use vllm. 3 | title: "[Usage]: " 4 | labels: ["usage"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | - type: textarea 12 | attributes: 13 | label: Your current environment 14 | description: | 15 | Please run the following and paste the output below. 16 | ```sh 17 | wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py 18 | # For security purposes, please feel free to check the contents of collect_env.py before running it. 19 | python collect_env.py 20 | ``` 21 | value: | 22 | ```text 23 | The output of `python collect_env.py` 24 | ``` 25 | validations: 26 | required: true 27 | - type: textarea 28 | attributes: 29 | label: How would you like to use vllm 30 | description: | 31 | A detailed description of how you want to use vllm. 32 | value: | 33 | I want to run inference of a [specific model](put link here). I don't know how to integrate it with vllm. 34 | - type: markdown 35 | attributes: 36 | value: > 37 | Thanks for contributing 🎉! 38 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/500-feature request.yml: -------------------------------------------------------------------------------- 1 | name: 🚀 Feature request 2 | description: Submit a proposal/request for a new vllm feature 3 | title: "[Feature]: " 4 | labels: ["feature request"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | - type: textarea 12 | attributes: 13 | label: 🚀 The feature, motivation and pitch 14 | description: > 15 | A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too. 16 | validations: 17 | required: true 18 | - type: textarea 19 | attributes: 20 | label: Alternatives 21 | description: > 22 | A description of any alternative solutions or features you've considered, if any. 23 | - type: textarea 24 | attributes: 25 | label: Additional context 26 | description: > 27 | Add any other context or screenshots about the feature request. 28 | - type: markdown 29 | attributes: 30 | value: > 31 | Thanks for contributing 🎉! 
32 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/600-new model.yml: -------------------------------------------------------------------------------- 1 | name: 🤗 Support request for a new model from huggingface 2 | description: Submit a proposal/request for a new model from huggingface 3 | title: "[New Model]: " 4 | labels: ["new model"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | 12 | #### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model. 13 | - type: textarea 14 | attributes: 15 | label: The model to consider. 16 | description: > 17 | A huggingface url, pointing to the model, e.g. https://huggingface.co/openai-community/gpt2 . 18 | validations: 19 | required: true 20 | - type: textarea 21 | attributes: 22 | label: The closest model vllm already supports. 23 | description: > 24 | Here is the list of models already supported by vllm: https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models . Which model is the most similar to the model you want to add support for? 25 | - type: textarea 26 | attributes: 27 | label: What's your difficulty of supporting the model you want? 28 | description: > 29 | For example, any new operators or new architecture? 30 | - type: markdown 31 | attributes: 32 | value: > 33 | Thanks for contributing 🎉! 34 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/700-performance discussion.yml: -------------------------------------------------------------------------------- 1 | name: ⚡ Discussion on the performance of vllm 2 | description: Submit a proposal/discussion about the performance of vllm 3 | title: "[Performance]: " 4 | labels: ["performance"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | - type: textarea 12 | attributes: 13 | label: Proposal to improve performance 14 | description: > 15 | How do you plan to improve vllm's performance? 16 | validations: 17 | required: false 18 | - type: textarea 19 | attributes: 20 | label: Report of performance regression 21 | description: > 22 | Please provide detailed description of performance comparison to confirm the regression. You may want to run the benchmark script at https://github.com/vllm-project/vllm/tree/main/benchmarks . 23 | validations: 24 | required: false 25 | - type: textarea 26 | attributes: 27 | label: Misc discussion on performance 28 | description: > 29 | Anything about the performance. 30 | validations: 31 | required: false 32 | - type: textarea 33 | attributes: 34 | label: Your current environment (if you think it is necessary) 35 | description: | 36 | Please run the following and paste the output below. 37 | ```sh 38 | wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py 39 | # For security purposes, please feel free to check the contents of collect_env.py before running it. 
40 | python collect_env.py 41 | ``` 42 | value: | 43 | ```text 44 | The output of `python collect_env.py` 45 | ``` 46 | validations: 47 | required: false 48 | - type: markdown 49 | attributes: 50 | value: > 51 | Thanks for contributing 🎉! 52 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/800-misc discussion.yml: -------------------------------------------------------------------------------- 1 | name: 🎲 Misc/random discussions that do not fit into the above categories. 2 | description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues. 3 | title: "[Misc]: " 4 | labels: ["misc"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | - type: textarea 12 | attributes: 13 | label: Anything you want to discuss about vllm. 14 | description: > 15 | Anything you want to discuss about vllm. 16 | validations: 17 | required: true 18 | - type: markdown 19 | attributes: 20 | value: > 21 | Thanks for contributing 🎉! 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | -------------------------------------------------------------------------------- /.github/workflows/ruff.yml: -------------------------------------------------------------------------------- 1 | name: ruff 2 | 3 | on: 4 | # Trigger the workflow on push or pull request, 5 | # but only for the main branch 6 | push: 7 | branches: 8 | - main 9 | pull_request: 10 | branches: 11 | - main 12 | 13 | jobs: 14 | ruff: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: ["3.10"] 19 | steps: 20 | - uses: actions/checkout@v2 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v2 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 29 | - name: Analysing the code with ruff 30 | run: | 31 | ruff . 
32 | - name: Spelling check with codespell 33 | run: | 34 | codespell --toml pyproject.toml -------------------------------------------------------------------------------- /.github/workflows/scripts/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python_executable=python$1 4 | cuda_home=/usr/local/cuda-$2 5 | 6 | # Update paths 7 | PATH=${cuda_home}/bin:$PATH 8 | LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH 9 | 10 | # Install requirements 11 | $python_executable -m pip install wheel packaging 12 | $python_executable -m pip install -r requirements.txt 13 | 14 | # Limit the number of parallel jobs to avoid OOM 15 | export MAX_JOBS=1 16 | # Make sure punica is built for the release (for LoRA) 17 | export VLLM_INSTALL_PUNICA_KERNELS=1 18 | 19 | # Build 20 | $python_executable setup.py bdist_wheel --dist-dir=dist 21 | -------------------------------------------------------------------------------- /.github/workflows/scripts/create_release.js: -------------------------------------------------------------------------------- 1 | // Uses Github's API to create the release and wait for result. 2 | // We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately. 3 | 4 | module.exports = async (github, context, core) => { 5 | try { 6 | const response = await github.rest.repos.createRelease({ 7 | draft: false, 8 | generate_release_notes: true, 9 | name: process.env.RELEASE_TAG, 10 | owner: context.repo.owner, 11 | prerelease: false, 12 | repo: context.repo.repo, 13 | tag_name: process.env.RELEASE_TAG, 14 | }); 15 | 16 | core.setOutput('upload_url', response.data.upload_url); 17 | } catch (error) { 18 | core.setFailed(error.message); 19 | } 20 | } -------------------------------------------------------------------------------- /.github/workflows/scripts/cuda-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Replace '.' with '-' ex: 11.8 -> 11-8 4 | cuda_version=$(echo $1 | tr "." "-") 5 | # Removes '-' and '.' 
ex: ubuntu-20.04 -> ubuntu2004 6 | OS=$(echo $2 | tr -d ".\-") 7 | 8 | # Installs CUDA 9 | wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb 10 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 11 | rm cuda-keyring_1.1-1_all.deb 12 | sudo apt -qq update 13 | sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version} 14 | sudo apt clean 15 | 16 | # Test nvcc 17 | PATH=/usr/local/cuda-$1/bin:${PATH} 18 | nvcc --version 19 | 20 | # Log gcc, g++, c++ versions 21 | gcc --version 22 | g++ --version 23 | c++ --version 24 | -------------------------------------------------------------------------------- /.github/workflows/scripts/env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This file installs common linux environment tools 4 | 5 | export LANG C.UTF-8 6 | 7 | # python_version=$1 8 | 9 | sudo apt-get update && \ 10 | sudo apt-get install -y --no-install-recommends \ 11 | software-properties-common \ 12 | 13 | sudo apt-get install -y --no-install-recommends \ 14 | build-essential \ 15 | apt-utils \ 16 | ca-certificates \ 17 | wget \ 18 | git \ 19 | vim \ 20 | libssl-dev \ 21 | curl \ 22 | unzip \ 23 | unrar \ 24 | cmake \ 25 | net-tools \ 26 | sudo \ 27 | autotools-dev \ 28 | rsync \ 29 | jq \ 30 | openssh-server \ 31 | tmux \ 32 | screen \ 33 | htop \ 34 | pdsh \ 35 | openssh-client \ 36 | lshw \ 37 | dmidecode \ 38 | util-linux \ 39 | automake \ 40 | autoconf \ 41 | libtool \ 42 | net-tools \ 43 | pciutils \ 44 | libpci-dev \ 45 | libaio-dev \ 46 | libcap2 \ 47 | libtinfo5 \ 48 | fakeroot \ 49 | devscripts \ 50 | debhelper \ 51 | nfs-common 52 | 53 | # Remove github bloat files to free up disk space 54 | sudo rm -rf "/usr/local/share/boost" 55 | sudo rm -rf "$AGENT_TOOLSDIRECTORY" 56 | sudo rm -rf "/usr/share/dotnet" 57 | -------------------------------------------------------------------------------- /.github/workflows/scripts/pytorch-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python_executable=python$1 4 | pytorch_version=$2 5 | cuda_version=$3 6 | 7 | # Install torch 8 | $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya 9 | $python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./} 10 | 11 | # Print version information 12 | $python_executable --version 13 | $python_executable -c "import torch; print('PyTorch:', torch.__version__)" 14 | $python_executable -c "import torch; print('CUDA:', torch.version.cuda)" 15 | $python_executable -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)" 16 | -------------------------------------------------------------------------------- /.github/workflows/yapf.yml: -------------------------------------------------------------------------------- 1 | name: yapf 2 | 3 | on: 4 | # Trigger the workflow on push or pull request, 5 | # but only for the main branch 6 | push: 7 | branches: 8 | - main 9 | pull_request: 10 | branches: 11 | - main 12 | jobs: 13 | yapf: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: ["3.10"] 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: 
${{ matrix.python-version }} 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install yapf==0.32.0 28 | pip install toml==0.10.2 29 | - name: Running yapf 30 | run: | 31 | yapf --diff --recursive . 32 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | version: 2 5 | 6 | build: 7 | os: ubuntu-22.04 8 | tools: 9 | python: "3.8" 10 | 11 | sphinx: 12 | configuration: docs/source/conf.py 13 | 14 | # If using Sphinx, optionally build your docs in additional formats such as PDF 15 | formats: 16 | - pdf 17 | 18 | # Optionally declare the Python requirements required to build your docs 19 | python: 20 | install: 21 | - requirements: docs/requirements-docs.txt 22 | -------------------------------------------------------------------------------- /.yapfignore: -------------------------------------------------------------------------------- 1 | collect_env.py 2 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to vLLM 2 | 3 | Thank you for your interest in contributing to vLLM! 4 | Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. 5 | There are several ways you can contribute to the project: 6 | 7 | - Identify and report any issues or bugs. 8 | - Request or add a new model. 9 | - Suggest or implement new features. 10 | 11 | However, remember that contributions aren't just about code. 12 | We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions. 13 | 14 | Finally, one of the most impactful ways to support us is by raising awareness about vLLM. 15 | Talk about it in your blog posts, highlighting how it's driving your incredible projects. 16 | Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository. 17 | 18 | 19 | ## Setup for development 20 | 21 | ### Build from source 22 | 23 | ```bash 24 | pip install -r requirements.txt 25 | pip install -e . # This may take several minutes. 26 | ``` 27 | 28 | ### Testing 29 | 30 | ```bash 31 | pip install -r requirements-dev.txt 32 | 33 | # Static type checking 34 | mypy 35 | # Unit tests 36 | pytest tests/ 37 | ``` 38 | **Note:** Currently, the repository does not pass the mypy tests. 39 | 40 | 41 | ## Contributing Guidelines 42 | 43 | ### Issue Reporting 44 | 45 | If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it. 46 | If not, please file a new issue, providing as much relevant information as possible. 47 | 48 | ### Pull Requests & Code Reviews 49 | 50 | Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE.md) for detailed guide for contribution. 51 | 52 | ### Thank You 53 | 54 | Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. 55 | Your contributions make vLLM a great tool for everyone! 
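Before opening a PR, it can help to run locally the same checks that the ruff and yapf workflows above run in CI. The repository's format.sh (listed in the tree, but not included in this excerpt) is the canonical way to do that; the sketch below is only a rough stand-in that replays the workflow commands verbatim and assumes the pinned tools are installed.

```python
# Rough local stand-in for the CI checks defined in .github/workflows/ruff.yml
# and yapf.yml above. This is NOT the repository's format.sh (whose contents
# are not shown here); it simply replays the same commands. Assumes the tools
# are installed, e.g.:
#   pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 yapf==0.32.0 toml==0.10.2
import subprocess
import sys

CHECKS = [
    ["ruff", "."],                              # lint (ruff.yml)
    ["codespell", "--toml", "pyproject.toml"],  # spelling (ruff.yml)
    ["yapf", "--diff", "--recursive", "."],     # formatting (yapf.yml)
]

failed = False
for cmd in CHECKS:
    print(f"$ {' '.join(cmd)}")
    if subprocess.run(cmd).returncode != 0:
        failed = True

sys.exit(1 if failed else 0)
```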
56 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include requirements.txt 3 | include CMakeLists.txt 4 | 5 | recursive-include cmake * 6 | recursive-include csrc * 7 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking vLLM 2 | 3 | ## Downloading the ShareGPT dataset 4 | 5 | You can download the dataset by running: 6 | ```bash 7 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json 8 | ``` 9 | -------------------------------------------------------------------------------- /benchmarks/launch_tgi_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PORT=8000 4 | MODEL=$1 5 | TOKENS=$2 6 | 7 | docker run --gpus all --shm-size 1g -p $PORT:80 \ 8 | -v $PWD/data:/data \ 9 | ghcr.io/huggingface/text-generation-inference:1.4.0 \ 10 | --model-id $MODEL \ 11 | --sharded false \ 12 | --max-input-length 1024 \ 13 | --max-total-tokens 2048 \ 14 | --max-best-of 5 \ 15 | --max-concurrent-requests 5000 \ 16 | --max-batch-total-tokens $TOKENS 17 | -------------------------------------------------------------------------------- /cmake/hipify.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # 4 | # A command line tool for running pytorch's hipify preprocessor on CUDA 5 | # source files. 6 | # 7 | # See https://github.com/ROCm/hipify_torch 8 | # and /utils/hipify/hipify_python.py 9 | # 10 | 11 | import argparse 12 | import shutil 13 | import os 14 | 15 | from torch.utils.hipify.hipify_python import hipify 16 | 17 | if __name__ == '__main__': 18 | parser = argparse.ArgumentParser() 19 | 20 | # Project directory where all the source + include files live. 21 | parser.add_argument( 22 | "-p", 23 | "--project_dir", 24 | help="The project directory.", 25 | ) 26 | 27 | # Directory where hipified files are written. 28 | parser.add_argument( 29 | "-o", 30 | "--output_dir", 31 | help="The output directory.", 32 | ) 33 | 34 | # Source files to convert. 35 | parser.add_argument("sources", 36 | help="Source files to hipify.", 37 | nargs="*", 38 | default=[]) 39 | 40 | args = parser.parse_args() 41 | 42 | # Limit include scope to project_dir only 43 | includes = [os.path.join(args.project_dir, '*')] 44 | 45 | # Get absolute path for all source files. 46 | extra_files = [os.path.abspath(s) for s in args.sources] 47 | 48 | # Copy sources from project directory to output directory. 49 | # The directory might already exist to hold object files so we ignore that. 
50 | shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True) 51 | 52 | hipify_result = hipify(project_directory=args.project_dir, 53 | output_directory=args.output_dir, 54 | header_include_dirs=[], 55 | includes=includes, 56 | extra_files=extra_files, 57 | show_detailed=True, 58 | is_pytorch_extension=True, 59 | hipify_extra_files_only=True) 60 | 61 | hipified_sources = [] 62 | for source in args.sources: 63 | s_abs = os.path.abspath(source) 64 | hipified_s_abs = (hipify_result[s_abs].hipified_path if 65 | (s_abs in hipify_result 66 | and hipify_result[s_abs].hipified_path is not None) 67 | else s_abs) 68 | hipified_sources.append(hipified_s_abs) 69 | 70 | assert (len(hipified_sources) == len(args.sources)) 71 | 72 | # Print hipified source files. 73 | print("\n".join(hipified_sources)) 74 | -------------------------------------------------------------------------------- /csrc/attention/attention_dtypes.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "attention_generic.cuh" 4 | #include "dtype_float16.cuh" 5 | #include "dtype_float32.cuh" 6 | #include "dtype_bfloat16.cuh" 7 | #include "dtype_fp8_e5m2.cuh" 8 | -------------------------------------------------------------------------------- /csrc/attention/attention_generic.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h 3 | * Copyright (c) 2023, The vLLM team. 4 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | #pragma once 19 | 20 | #include 21 | 22 | namespace vllm { 23 | 24 | // A vector type to store Q, K, V elements. 25 | template 26 | struct Vec {}; 27 | 28 | // A vector type to store FP32 accumulators. 29 | template 30 | struct FloatVec {}; 31 | 32 | // Template vector operations. 
33 | template 34 | inline __device__ Acc mul(A a, B b); 35 | 36 | template 37 | inline __device__ float sum(T v); 38 | 39 | template 40 | inline __device__ float dot(T a, T b) { 41 | return sum(mul(a, b)); 42 | } 43 | 44 | template 45 | inline __device__ float dot(T a, T b) { 46 | return sum(mul(a, b)); 47 | } 48 | 49 | template 50 | inline __device__ void zero(T& dst) { 51 | constexpr int WORDS = sizeof(T) / 4; 52 | union { 53 | T raw; 54 | uint32_t words[WORDS]; 55 | } tmp; 56 | 57 | #pragma unroll 58 | for (int ii = 0; ii < WORDS; ++ii) { 59 | tmp.words[ii] = 0u; 60 | } 61 | dst = tmp.raw; 62 | } 63 | 64 | } // namespace vllm 65 | -------------------------------------------------------------------------------- /csrc/attention/attention_utils.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp 3 | * Copyright (c) 2023, The vLLM team. 4 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | #pragma once 19 | 20 | #include "../cuda_compat.h" 21 | #include "attention_dtypes.h" 22 | 23 | #include 24 | #include 25 | 26 | namespace vllm { 27 | 28 | // Q*K^T operation. 29 | template 30 | inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) { 31 | using A_vec = typename FloatVec::Type; 32 | // Compute the parallel products for Q*K^T (treat vector lanes separately). 33 | A_vec qk_vec = mul(q[0], k[0]); 34 | #pragma unroll 35 | for (int ii = 1; ii < N; ++ii) { 36 | qk_vec = fma(q[ii], k[ii], qk_vec); 37 | } 38 | 39 | // Finalize the reduction across lanes. 
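(The cross-lane reduction itself follows just below.) As a plain-Python reference for what qk_dot_ computes — per-lane partial dot products accumulated in fp32, then an XOR butterfly reduction so that every lane in the thread group ends up with the full Q·K value — here is a NumPy sketch. The element-to-lane assignment is simplified relative to the kernel's interleaved vector layout; this models the arithmetic, not the CUDA code.

```python
# NumPy reference model (not CUDA) of the Q*K^T pattern in qk_dot_ above.
# Each thread in a group of THREAD_GROUP_SIZE lanes holds N small vectors of
# Q and K, accumulates a partial dot product in fp32, and the partial sums are
# combined across lanes with an XOR butterfly reduction, which is what the
# VLLM_SHFL_XOR_SYNC loop just below implements on the GPU.
import numpy as np

THREAD_GROUP_SIZE = 4   # lanes cooperating on one token position
N = 2                   # vectors per lane
VEC_SIZE = 4            # elements per vector (Vec<scalar_t, VEC_SIZE>)

rng = np.random.default_rng(0)
head_size = THREAD_GROUP_SIZE * N * VEC_SIZE
q = rng.standard_normal(head_size, dtype=np.float32)
k = rng.standard_normal(head_size, dtype=np.float32)

# Per-lane partial dot products (fp32 accumulation, like FloatVec).
# Note: this contiguous split per lane is a simplification of the kernel's
# actual interleaved layout; the sum is the same either way.
lanes_q = q.reshape(THREAD_GROUP_SIZE, N * VEC_SIZE)
lanes_k = k.reshape(THREAD_GROUP_SIZE, N * VEC_SIZE)
qk = np.einsum("ld,ld->l", lanes_q, lanes_k)  # one partial sum per lane

# XOR butterfly: after log2(THREAD_GROUP_SIZE) rounds every lane holds the
# full sum, mirroring `qk += VLLM_SHFL_XOR_SYNC(qk, mask)`.
mask = THREAD_GROUP_SIZE // 2
while mask >= 1:
    qk = qk + qk[np.arange(THREAD_GROUP_SIZE) ^ mask]
    mask //= 2

assert np.allclose(qk, np.dot(q, k), rtol=1e-4, atol=1e-4)
print("per-lane result:", qk)
```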
40 | float qk = sum(qk_vec); 41 | #pragma unroll 42 | for (int mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) { 43 | qk += VLLM_SHFL_XOR_SYNC(qk, mask); 44 | } 45 | return qk; 46 | } 47 | 48 | template 49 | struct Qk_dot { 50 | template 51 | static inline __device__ float dot(const Vec (&q)[N], const Vec (&k)[N]) { 52 | return qk_dot_(q, k); 53 | } 54 | }; 55 | 56 | } // namespace vllm 57 | -------------------------------------------------------------------------------- /csrc/attention/dtype_fp8_e5m2.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "attention_generic.cuh" 4 | 5 | #include 6 | #ifdef ENABLE_FP8_E5M2 7 | #include 8 | #endif 9 | 10 | namespace vllm { 11 | #ifdef ENABLE_FP8_E5M2 12 | // fp8 vector types for quantization of kv cache 13 | 14 | template<> 15 | struct Vec { 16 | using Type = uint8_t; 17 | }; 18 | 19 | template<> 20 | struct Vec { 21 | using Type = uint16_t; 22 | }; 23 | 24 | template<> 25 | struct Vec { 26 | using Type = uint32_t; 27 | }; 28 | 29 | template<> 30 | struct Vec { 31 | using Type = uint2; 32 | }; 33 | #endif // ENABLE_FP8_E5M2 34 | 35 | } // namespace vllm 36 | -------------------------------------------------------------------------------- /csrc/cache.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | void swap_blocks( 9 | torch::Tensor& src, 10 | torch::Tensor& dst, 11 | const std::map& block_mapping); 12 | 13 | void copy_blocks( 14 | std::vector& key_caches, 15 | std::vector& value_caches, 16 | const std::map>& block_mapping); 17 | 18 | void reshape_and_cache( 19 | torch::Tensor& key, 20 | torch::Tensor& value, 21 | torch::Tensor& key_cache, 22 | torch::Tensor& value_cache, 23 | torch::Tensor& slot_mapping, 24 | const std::string& kv_cache_dtype); 25 | 26 | // Just for unittest 27 | void convert_fp8_e5m2( 28 | torch::Tensor& src_cache, 29 | torch::Tensor& dst_cache); 30 | -------------------------------------------------------------------------------- /csrc/cuda_compat.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef USE_ROCM 4 | #include 5 | #endif 6 | 7 | #ifndef USE_ROCM 8 | #define WARP_SIZE 32 9 | #else 10 | #define WARP_SIZE warpSize 11 | #endif 12 | 13 | #ifndef USE_ROCM 14 | #define VLLM_LDG(arg) __ldg(arg) 15 | #else 16 | #define VLLM_LDG(arg) *(arg) 17 | #endif 18 | 19 | #ifndef USE_ROCM 20 | #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor_sync(uint32_t(-1), var, lane_mask) 21 | #else 22 | #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask) 23 | #endif 24 | 25 | #ifndef USE_ROCM 26 | #define VLLM_SHFL_SYNC(var, src_lane) __shfl_sync(uint32_t(-1), var, src_lane) 27 | #else 28 | #define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane) 29 | #endif 30 | 31 | #ifndef USE_ROCM 32 | #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ 33 | cudaFuncSetAttribute(FUNC, cudaFuncAttributeMaxDynamicSharedMemorySize, VAL) 34 | #else 35 | #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ 36 | hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL) 37 | #endif 38 | 39 | -------------------------------------------------------------------------------- /csrc/cuda_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | int get_device_attribute( 6 | 
int attribute, 7 | int device_id); 8 | 9 | int get_max_shared_memory_per_block_device_attribute( 10 | int device_id); 11 | -------------------------------------------------------------------------------- /csrc/cuda_utils_kernels.cu: -------------------------------------------------------------------------------- 1 | #ifdef USE_ROCM 2 | #include 3 | #include 4 | #endif 5 | int get_device_attribute( 6 | int attribute, 7 | int device_id) 8 | { 9 | int device, value; 10 | if (device_id < 0) { 11 | cudaGetDevice(&device); 12 | } 13 | else { 14 | device = device_id; 15 | } 16 | cudaDeviceGetAttribute(&value, static_cast(attribute), device); 17 | return value; 18 | } 19 | 20 | 21 | int get_max_shared_memory_per_block_device_attribute( 22 | int device_id) 23 | { 24 | int attribute; 25 | // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html 26 | // cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74 27 | 28 | #ifdef USE_ROCM 29 | attribute = hipDeviceAttributeMaxSharedMemoryPerBlock; 30 | #else 31 | attribute = cudaDevAttrMaxSharedMemoryPerBlockOptin; 32 | #endif 33 | 34 | return get_device_attribute(attribute, device_id); 35 | } 36 | -------------------------------------------------------------------------------- /csrc/dispatch_utils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from 3 | * https://github.com/pytorch/pytorch/blob/v2.0.1/aten/src/ATen/Dispatch.h 4 | */ 5 | #pragma once 6 | 7 | #include 8 | 9 | #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ 10 | AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ 11 | AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ 12 | AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) 13 | 14 | #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ 15 | AT_DISPATCH_SWITCH( \ 16 | TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) 17 | 18 | #define VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(...) \ 19 | AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ 20 | AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ 21 | AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ 22 | AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) 23 | 24 | #define VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(TYPE, NAME, ...) \ 25 | AT_DISPATCH_SWITCH( \ 26 | TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(__VA_ARGS__)) 27 | 28 | #define VLLM_DISPATCH_CASE_INTEGRAL_TYPES(...) \ 29 | AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) \ 30 | AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \ 31 | AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \ 32 | AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \ 33 | AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) 34 | 35 | #define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) 
\ 36 | AT_DISPATCH_SWITCH( \ 37 | TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__)) 38 | -------------------------------------------------------------------------------- /csrc/moe/moe_ops.cpp: -------------------------------------------------------------------------------- 1 | #include "moe_ops.h" 2 | 3 | #include 4 | 5 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 6 | m.def("topk_softmax", &topk_softmax, "Apply topk softmax to the gating outputs."); 7 | } 8 | -------------------------------------------------------------------------------- /csrc/moe/moe_ops.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | void topk_softmax( 6 | torch::Tensor& topk_weights, 7 | torch::Tensor& topk_indices, 8 | torch::Tensor& token_expert_indices, 9 | torch::Tensor& gating_output); 10 | -------------------------------------------------------------------------------- /csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_bfloat16, nv_bfloat16) 5 | -------------------------------------------------------------------------------- /csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_bfloat16, nv_half) 5 | -------------------------------------------------------------------------------- /csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_half, nv_bfloat16) 5 | -------------------------------------------------------------------------------- /csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_half, nv_half) 5 | -------------------------------------------------------------------------------- /csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, float, nv_bfloat16) 5 | -------------------------------------------------------------------------------- /csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, float, nv_half) 5 | -------------------------------------------------------------------------------- /csrc/punica/bgmv/bgmv_config.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | template 5 | void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, 6 | const W_T *__restrict__ W, 7 | const int64_t *__restrict__ indicies, int64_t y_offset, 8 | int64_t full_y_size, int64_t batch_size, int64_t num_layers, 9 | int64_t layer_idx, float scale); 10 | 11 | // clang-format off 12 | 13 | #define FOR_BGMV_WIDE(f, in_T, out_T, W_T, narrow) \ 14 | f(in_T, 
out_T, W_T, narrow, 128) \ 15 | f(in_T, out_T, W_T, narrow, 256) \ 16 | f(in_T, out_T, W_T, narrow, 512) \ 17 | f(in_T, out_T, W_T, narrow, 768) \ 18 | f(in_T, out_T, W_T, narrow, 1024) \ 19 | f(in_T, out_T, W_T, narrow, 1280) \ 20 | f(in_T, out_T, W_T, narrow, 1728) \ 21 | f(in_T, out_T, W_T, narrow, 1792) \ 22 | f(in_T, out_T, W_T, narrow, 2048) \ 23 | f(in_T, out_T, W_T, narrow, 2560) \ 24 | f(in_T, out_T, W_T, narrow, 2752) \ 25 | f(in_T, out_T, W_T, narrow, 2816) \ 26 | f(in_T, out_T, W_T, narrow, 3072) \ 27 | f(in_T, out_T, W_T, narrow, 3456) \ 28 | f(in_T, out_T, W_T, narrow, 3584) \ 29 | f(in_T, out_T, W_T, narrow, 4096) \ 30 | f(in_T, out_T, W_T, narrow, 5120) \ 31 | f(in_T, out_T, W_T, narrow, 5504) \ 32 | f(in_T, out_T, W_T, narrow, 5632) \ 33 | f(in_T, out_T, W_T, narrow, 6144) \ 34 | f(in_T, out_T, W_T, narrow, 6912) \ 35 | f(in_T, out_T, W_T, narrow, 7168) \ 36 | f(in_T, out_T, W_T, narrow, 8192) \ 37 | f(in_T, out_T, W_T, narrow, 9216) \ 38 | f(in_T, out_T, W_T, narrow, 10240) \ 39 | f(in_T, out_T, W_T, narrow, 11008) \ 40 | f(in_T, out_T, W_T, narrow, 12288) \ 41 | f(in_T, out_T, W_T, narrow, 13696) \ 42 | f(in_T, out_T, W_T, narrow, 13824) \ 43 | f(in_T, out_T, W_T, narrow, 14336) \ 44 | f(in_T, out_T, W_T, narrow, 16384) \ 45 | f(in_T, out_T, W_T, narrow, 20480) \ 46 | f(in_T, out_T, W_T, narrow, 22016) \ 47 | f(in_T, out_T, W_T, narrow, 24576) \ 48 | f(in_T, out_T, W_T, narrow, 28672) \ 49 | f(in_T, out_T, W_T, narrow, 32000) \ 50 | f(in_T, out_T, W_T, narrow, 32256) \ 51 | f(in_T, out_T, W_T, narrow, 32512) \ 52 | f(in_T, out_T, W_T, narrow, 32768) \ 53 | f(in_T, out_T, W_T, narrow, 33024) \ 54 | f(in_T, out_T, W_T, narrow, 36864) \ 55 | f(in_T, out_T, W_T, narrow, 49152) \ 56 | // Keep above in sync with vllm/lora/layers::SamplerWithLoRA 57 | 58 | // Keep this in sync with vllm/config::LoRAConfig 59 | #define FOR_BGMV_WIDE_NARROW(f, in_T, out_T, W_T) \ 60 | FOR_BGMV_WIDE(f, in_T, out_T, W_T, 8) \ 61 | FOR_BGMV_WIDE(f, in_T, out_T, W_T, 16) \ 62 | FOR_BGMV_WIDE(f, in_T, out_T, W_T, 32) \ 63 | FOR_BGMV_WIDE(f, in_T, out_T, W_T, 64) 64 | 65 | // clang-format on 66 | -------------------------------------------------------------------------------- /csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_bfloat16, nv_bfloat16) 5 | -------------------------------------------------------------------------------- /csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_bfloat16, nv_half) 5 | -------------------------------------------------------------------------------- /csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_half, nv_bfloat16) 5 | -------------------------------------------------------------------------------- /csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_half, nv_half) 5 | 
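// ---------------------------------------------------------------------------
// Note: every bgmv_*.cu file in csrc/punica/bgmv/ is a thin translation unit
// whose single FOR_BGMV_WIDE_NARROW(...) line stamps out the templated
// bgmv_kernel declared in bgmv_config.h for one (input dtype, output dtype,
// weight dtype) combination. FOR_BGMV_WIDE_NARROW crosses each LoRA rank in
// {8, 16, 32, 64} with every hidden size listed in FOR_BGMV_WIDE, so the
// fp16/fp16/fp16 file above unrolls into one
// INST_BGMV_TWOSIDE(nv_half, nv_half, nv_half, rank, width) invocation per
// pair: (8, 128), (8, 256), ..., (64, 49152). INST_BGMV_TWOSIDE itself is
// defined in bgmv_impl.cuh, which is not included here; a minimal sketch,
// assuming it emits explicit template instantiations for both the
// (rank, width) and (width, rank) shapes, would look roughly like:
//
//   #define INST_BGMV(feat_in, feat_out, in_T, out_T, W_T)                   \
//     template void bgmv_kernel<feat_in, feat_out, in_T, out_T, W_T>(        \
//         out_T * __restrict__ Y, const in_T *__restrict__ X,                \
//         const W_T *__restrict__ W, const int64_t *__restrict__ indicies,   \
//         int64_t y_offset, int64_t full_y_size, int64_t batch_size,         \
//         int64_t num_layers, int64_t layer_idx, float scale);
//
//   #define INST_BGMV_TWOSIDE(in_T, out_T, W_T, narrow, wide)                \
//     INST_BGMV(narrow, wide, in_T, out_T, W_T)                              \
//     INST_BGMV(wide, narrow, in_T, out_T, W_T)
//
// Splitting the instantiations across one file per dtype combination keeps
// the compile time and memory footprint of each translation unit manageable;
// generator.py (further down in csrc/punica/bgmv/) regenerates these files.
// ---------------------------------------------------------------------------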
-------------------------------------------------------------------------------- /csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, float, nv_bfloat16) 5 | -------------------------------------------------------------------------------- /csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, float, nv_half) 5 | -------------------------------------------------------------------------------- /csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_bfloat16, nv_bfloat16) 5 | -------------------------------------------------------------------------------- /csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_bfloat16, nv_half) 5 | -------------------------------------------------------------------------------- /csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_half, nv_bfloat16) 5 | -------------------------------------------------------------------------------- /csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_half, nv_half) 5 | -------------------------------------------------------------------------------- /csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, float, nv_bfloat16) 5 | -------------------------------------------------------------------------------- /csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, float, nv_half) 5 | -------------------------------------------------------------------------------- /csrc/punica/bgmv/generator.py: -------------------------------------------------------------------------------- 1 | DTYPES = ["fp16", "bf16", "fp32"] 2 | DTYPE_MAP = { 3 | "fp16": "nv_half", 4 | "bf16": "nv_bfloat16", 5 | "fp32": "float", 6 | } 7 | 8 | TEMPLATE = """ 9 | #include "bgmv_config.h" 10 | #include "bgmv_impl.cuh" 11 | 12 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, {input_dtype}, {output_dtype}, {weight_dtype}) 13 | """.lstrip() # noqa: E501 14 | 15 | for input_dtype in DTYPES: 16 | for output_dtype in DTYPES: 17 | for weight_dtype in DTYPES: 18 | if weight_dtype == "fp32": 19 | # FP32 weights are not supported. 
20 | continue 21 | kernel_definition = TEMPLATE.format( 22 | input_dtype=DTYPE_MAP[input_dtype], 23 | output_dtype=DTYPE_MAP[output_dtype], 24 | weight_dtype=DTYPE_MAP[weight_dtype]) 25 | filename = f"bgmv_{input_dtype}_{output_dtype}_{weight_dtype}.cu" 26 | with open(filename, "w") as f: 27 | f.write(kernel_definition) 28 | -------------------------------------------------------------------------------- /csrc/quantization/gptq/compat.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Copied from https://github.com/turboderp/exllamav2 3 | */ 4 | 5 | #ifndef _compat_cuh 6 | #define _compat_cuh 7 | 8 | namespace vllm { 9 | namespace gptq { 10 | // atomicAdd for half types, to support CC < 7.x 11 | 12 | __device__ __forceinline__ void atomicAdd_half(half* address, half val) 13 | { 14 | unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2)); 15 | unsigned int old = *address_as_ui; 16 | unsigned int assumed; 17 | 18 | do 19 | { 20 | assumed = old; 21 | __half_raw hsum; 22 | hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); 23 | half tmpres = __hadd(hsum, val); 24 | hsum = __half_raw(tmpres); 25 | old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; 26 | old = atomicCAS(address_as_ui, assumed, old); 27 | } 28 | while (assumed != old); 29 | } 30 | 31 | // atomicAdd for half2 types 32 | 33 | __device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val) 34 | { 35 | unsigned int* address_as_ui = (unsigned int*)address; 36 | unsigned int old = *address_as_ui; 37 | unsigned int assumed; 38 | do 39 | { 40 | assumed = old; 41 | half2 old_val = *((half2*)&old); 42 | half2 new_val = __hadd2(old_val, val); 43 | old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val)); 44 | } 45 | while (assumed != old); 46 | } 47 | 48 | // 49 | 50 | #if defined(__CUDA_ARCH__) || defined(USE_ROCM) 51 | #if __CUDA_ARCH__ < 700 || defined(USE_ROCM) 52 | 53 | __device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); } 54 | 55 | #if __CUDA_ARCH__ < 600 || defined(USE_ROCM) 56 | __device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); } 57 | #endif 58 | 59 | #endif 60 | #endif 61 | 62 | } // namespace gptq 63 | } // namespace vllm 64 | #endif 65 | -------------------------------------------------------------------------------- /csrc/quantization/gptq/qdq_2.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Copied from https://github.com/turboderp/exllamav2 3 | */ 4 | 5 | #ifndef _qdq_2_cuh 6 | #define _qdq_2_cuh 7 | 8 | #include "qdq_util.cuh" 9 | 10 | namespace vllm { 11 | namespace gptq { 12 | 13 | // Permutation: 14 | // 15 | // ffddbb99 77553311 eeccaa88 66442200 16 | 17 | __forceinline__ __device__ void shuffle_2bit_16 18 | ( 19 | uint32_t* q, 20 | int stride 21 | ) 22 | { 23 | uint32_t qa = q[0]; 24 | uint32_t qb = 0; 25 | 26 | #pragma unroll 27 | for (int i = 0; i < 8; i++) 28 | { 29 | uint32_t qa0 = qa & 0x03; 30 | uint32_t qa1 = (qa & 0x0c) >> 2; 31 | qa >>= 4; 32 | qb |= (qa1 << (i * 2 + 16)); 33 | qb |= (qa0 << (i * 2)); 34 | } 35 | q[0] = qb; 36 | } 37 | 38 | __forceinline__ __device__ void dequant_2bit_16 39 | ( 40 | const uint32_t q_0, 41 | half2 (&dq)[8], 42 | int stride, 43 | const uint32_t zero 44 | ) 45 | { 46 | const uint32_t c0 = 0x64006400; 47 | const half y4_ = __float2half_rn(1.0f / 4.0f); 48 | const half y16_ = 
__float2half_rn(1.0f / 16.0f); 49 | const half y64_ = __float2half_rn(1.0f / 64.0f); 50 | const half2 y4 = __halves2half2(y4_, y4_); 51 | const half2 y16 = __halves2half2(y16_, y16_); 52 | const half2 y64 = __halves2half2(y64_, y64_); 53 | 54 | const half_uint16 z1_(0xe400 | zero); // half(-1024.0f - zero); 55 | const half z4_ = __hsub(__int2half_rn(-256), __int2half_rn(zero)); 56 | const half z16_ = __hsub(__int2half_rn(-64), __int2half_rn(zero)); 57 | const half z64_ = __hsub(__int2half_rn(-16), __int2half_rn(zero)); 58 | const half2 z1 = __half2half2(z1_.as_half); 59 | const half2 z4 = __half2half2(z4_); 60 | const half2 z16 = __half2half2(z16_); 61 | const half2 z64 = __half2half2(z64_); 62 | 63 | uint32_t qa = q_0; 64 | half2_uint32 q0((qa & 0x00030003) | c0); // half2(q[ 0], q[ 1]) + 1024 65 | half2_uint32 q1((qa & 0x000c000c) | c0); // half2(q[ 2], q[ 3]) * 4 + 1024 66 | half2_uint32 q2((qa & 0x00300030) | c0); // half2(q[ 4], q[ 5]) * 16 + 1024 67 | half2_uint32 q3((qa & 0x00c000c0) | c0); // half2(q[ 6], q[ 7]) * 64 + 1024 68 | qa >>= 8; 69 | half2_uint32 q4((qa & 0x00030003) | c0); // half2(q[ 8], q[ 8]) + 1024 70 | half2_uint32 q5((qa & 0x000c000c) | c0); // half2(q[10], q[11]) * 4 + 1024 71 | half2_uint32 q6((qa & 0x00300030) | c0); // half2(q[12], q[13]) * 16 + 1024 72 | half2_uint32 q7((qa & 0x00c000c0) | c0); // half2(q[14], q[15]) * 64 + 1024 73 | 74 | dq[0] = __hadd2(q0.as_half2, z1); 75 | dq[1] = __hfma2(q1.as_half2, y4, z4); 76 | dq[2] = __hfma2(q2.as_half2, y16, z16); 77 | dq[3] = __hfma2(q3.as_half2, y64, z64); 78 | dq[4] = __hadd2(q4.as_half2, z1); 79 | dq[5] = __hfma2(q5.as_half2, y4, z4); 80 | dq[6] = __hfma2(q6.as_half2, y16, z16); 81 | dq[7] = __hfma2(q7.as_half2, y64, z64); 82 | } 83 | 84 | } // namespace gptq 85 | } // namespace vllm 86 | 87 | #endif 88 | -------------------------------------------------------------------------------- /csrc/quantization/gptq/qdq_8.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Copied from https://github.com/turboderp/exllamav2 3 | */ 4 | 5 | #ifndef _qdq_8_cuh 6 | #define _qdq_8_cuh 7 | 8 | #include "qdq_util.cuh" 9 | 10 | namespace vllm { 11 | namespace gptq { 12 | 13 | __forceinline__ __device__ void shuffle_8bit_4 14 | ( 15 | uint32_t* q, 16 | int stride 17 | ) 18 | { 19 | } 20 | 21 | __forceinline__ __device__ void dequant_8bit_8 22 | ( 23 | const uint32_t q_0, 24 | const uint32_t q_1, 25 | half2 (&dq)[4], 26 | int stride, 27 | const uint32_t zero 28 | ) 29 | { 30 | half dqh[8]; 31 | for (int i = 0; i < 4; i++) dqh[i ] = dq_ns(exb(q_0, i * 8, 0xff), zero); 32 | for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), zero); 33 | 34 | for (int i = 0; i < 4; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); 35 | } 36 | 37 | } // namespace gptq 38 | } // namespace vllm 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /csrc/quantization/gptq/qdq_util.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Copied from https://github.com/turboderp/exllamav2 3 | */ 4 | 5 | #ifndef _qdq_util_cuh 6 | #define _qdq_util_cuh 7 | 8 | namespace vllm { 9 | namespace gptq { 10 | 11 | union half2_uint32 12 | { 13 | uint32_t as_uint32; 14 | half2 as_half2; 15 | __device__ half2_uint32(uint32_t val) : as_uint32(val) {} 16 | __device__ half2_uint32(half2 val) : as_half2(val) {} 17 | }; 18 | 19 | union half_uint16 20 | { 21 | uint16_t as_uint16; 22 | half as_half; 23 | __device__ 
half_uint16(uint16_t val) : as_uint16(val) {} 24 | __device__ half_uint16(half val) : as_half(val) {} 25 | }; 26 | 27 | // Max_scale premultiplied by 1/256 28 | 29 | __forceinline__ __device__ half dq_scale(const int qs, const half max_scale) 30 | { 31 | int qs_i = qs + 1; 32 | half qs_h = __int2half_rn(qs_i * qs_i); 33 | qs_h = __hmul(qs_h, max_scale); 34 | return qs_h; 35 | } 36 | 37 | __forceinline__ __device__ half dq(const int q, const int qzero, const half scale) 38 | { 39 | return __hmul(__int2half_rn(q - qzero), scale); 40 | } 41 | 42 | __forceinline__ __device__ half dq_ns(const int q, const int qzero) 43 | { 44 | //return __hsub(__int2half_rn(q), __int2half_rn(qzero)); 45 | return __int2half_rn(q - qzero); 46 | } 47 | 48 | __forceinline__ __device__ int exb(const uint32_t q, const int shift, const int mask) 49 | { 50 | return (int)((q >> shift) & mask); 51 | } 52 | 53 | __forceinline__ __device__ int exb(const uint32_t q1, const uint32_t q0, const int shift, const int mask) 54 | { 55 | return (int)(__funnelshift_rc(q0, q1, shift) & mask); 56 | } 57 | 58 | } // namespace gptq 59 | } // namespace vllm 60 | #endif 61 | -------------------------------------------------------------------------------- /csrc/reduction_utils.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/reduce_kernel_utils.cuh 3 | * Copyright (c) 2023, The vLLM team. 4 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | #pragma once 19 | 20 | #include "cuda_compat.h" 21 | 22 | namespace vllm { 23 | 24 | template 25 | __inline__ __device__ T warpReduceSum(T val) { 26 | #pragma unroll 27 | for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1) 28 | val += VLLM_SHFL_XOR_SYNC(val, mask); 29 | return val; 30 | } 31 | 32 | __inline__ __device__ constexpr int _calculateLaneMask(int warp_size) { 33 | return warp_size - 1; 34 | } 35 | 36 | __inline__ __device__ constexpr int _calculateWidShift(int warp_size) { 37 | return 5 + (warp_size >> 6); 38 | } 39 | 40 | /* Calculate the sum of all elements in a block */ 41 | template 42 | __inline__ __device__ T blockReduceSum(T val) { 43 | static __shared__ T shared[WARP_SIZE]; 44 | constexpr auto LANE_MASK = _calculateLaneMask(WARP_SIZE); 45 | constexpr auto WID_SHIFT = _calculateWidShift(WARP_SIZE); 46 | int lane = threadIdx.x & LANE_MASK; 47 | int wid = threadIdx.x >> WID_SHIFT; 48 | 49 | val = warpReduceSum(val); 50 | 51 | if (lane == 0) 52 | shared[wid] = val; 53 | 54 | __syncthreads(); 55 | 56 | // Modify from blockDim.x << 5 to blockDim.x / 32. to prevent 57 | // blockDim.x is not divided by 32 58 | val = (threadIdx.x < (blockDim.x / (WARP_SIZE * 1.0f))) ? 
shared[lane] : (T)(0.0f); 59 | val = warpReduceSum(val); 60 | return val; 61 | } 62 | 63 | } // namespace vllm 64 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # vLLM documents 2 | 3 | ## Build the docs 4 | 5 | ```bash 6 | # Install dependencies. 7 | pip install -r requirements-docs.txt 8 | 9 | # Build the docs. 10 | make clean 11 | make html 12 | ``` 13 | 14 | ## Open the docs with your browser 15 | 16 | ```bash 17 | python -m http.server -d build/html/ 18 | ``` 19 | Launch your browser and open localhost:8000. 20 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 
21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | sphinx == 6.2.1 2 | sphinx-book-theme == 1.0.1 3 | sphinx-copybutton == 0.5.2 4 | myst-parser == 2.0.0 5 | sphinx-argparse 6 | 7 | # packages to install to build the documentation 8 | pydantic 9 | -f https://download.pytorch.org/whl/cpu 10 | torch -------------------------------------------------------------------------------- /docs/source/assets/kernel/k_vecs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/docs/source/assets/kernel/k_vecs.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/key.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/docs/source/assets/kernel/key.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/logits_vec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/docs/source/assets/kernel/logits_vec.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/q_vecs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/docs/source/assets/kernel/q_vecs.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/query.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/docs/source/assets/kernel/query.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/v_vec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/docs/source/assets/kernel/v_vec.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/value.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/docs/source/assets/kernel/value.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-only-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/docs/source/assets/logos/vllm-logo-only-light.png -------------------------------------------------------------------------------- 
/docs/source/assets/logos/vllm-logo-text-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/docs/source/assets/logos/vllm-logo-text-dark.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-text-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/docs/source/assets/logos/vllm-logo-text-light.png -------------------------------------------------------------------------------- /docs/source/dev/engine/async_llm_engine.rst: -------------------------------------------------------------------------------- 1 | 2 | AsyncLLMEngine 3 | ================================= 4 | 5 | .. autoclass:: vllm.engine.async_llm_engine.AsyncLLMEngine 6 | :members: generate, abort 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/dev/engine/engine_index.rst: -------------------------------------------------------------------------------- 1 | vLLM Engine 2 | ================================= 3 | 4 | .. automodule:: vllm.engine 5 | .. currentmodule:: vllm.engine 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | :caption: Engines 10 | 11 | llm_engine 12 | async_llm_engine 13 | 14 | -------------------------------------------------------------------------------- /docs/source/dev/engine/llm_engine.rst: -------------------------------------------------------------------------------- 1 | LLMEngine 2 | ================================= 3 | 4 | .. autoclass:: vllm.engine.llm_engine.LLMEngine 5 | :members: add_request, abort_request, step 6 | :show-inheritance: -------------------------------------------------------------------------------- /docs/source/dev/sampling_params.rst: -------------------------------------------------------------------------------- 1 | Sampling Params 2 | =============== 3 | 4 | .. automodule:: vllm.sampling_params.SamplingParams -------------------------------------------------------------------------------- /docs/source/getting_started/installation.rst: -------------------------------------------------------------------------------- 1 | .. _installation: 2 | 3 | Installation 4 | ============ 5 | 6 | vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries. 7 | 8 | Requirements 9 | ------------ 10 | 11 | * OS: Linux 12 | * Python: 3.8 -- 3.11 13 | * GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) 14 | 15 | Install with pip 16 | ---------------- 17 | 18 | You can install vLLM using pip: 19 | 20 | .. code-block:: console 21 | 22 | $ # (Optional) Create a new conda environment. 23 | $ conda create -n myenv python=3.9 -y 24 | $ conda activate myenv 25 | 26 | $ # Install vLLM with CUDA 12.1. 27 | $ pip install vllm 28 | 29 | .. note:: 30 | 31 | As of now, vLLM's binaries are compiled on CUDA 12.1 by default. 32 | However, you can install vLLM with CUDA 11.8 by running: 33 | 34 | .. code-block:: console 35 | 36 | $ # Install vLLM with CUDA 11.8. 37 | $ export VLLM_VERSION=0.2.4 38 | $ export PYTHON_VERSION=39 39 | $ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl 40 | 41 | $ # Re-install PyTorch with CUDA 11.8. 
42 | $ pip uninstall torch -y 43 | $ pip install torch --upgrade --index-url https://download.pytorch.org/whl/cu118 44 | 45 | $ # Re-install xFormers with CUDA 11.8. 46 | $ pip uninstall xformers -y 47 | $ pip install --upgrade xformers --index-url https://download.pytorch.org/whl/cu118 48 | 49 | 50 | .. _build_from_source: 51 | 52 | Build from source 53 | ----------------- 54 | 55 | You can also build and install vLLM from source: 56 | 57 | .. code-block:: console 58 | 59 | $ git clone https://github.com/vllm-project/vllm.git 60 | $ cd vllm 61 | $ pip install -e . # This may take 5-10 minutes. 62 | 63 | .. tip:: 64 | If you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. 65 | 66 | .. code-block:: console 67 | 68 | $ # Use `--ipc=host` to make sure the shared memory is large enough. 69 | $ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 70 | 71 | .. note:: 72 | If you are developing the C++ backend of vLLM, consider building vLLM with 73 | 74 | .. code-block:: console 75 | 76 | $ python setup.py develop 77 | 78 | since it will give you incremental builds. The downside is that this method 79 | is `deprecated by setuptools `_. 80 | -------------------------------------------------------------------------------- /docs/source/quantization/fp8_e5m2_kv_cache.rst: -------------------------------------------------------------------------------- 1 | .. _fp8_e5m2_kv_cache: 2 | 3 | FP8 E5M2 KV Cache 4 | ================== 5 | 6 | The int8/int4 quantization scheme requires additional scale GPU memory storage, which reduces the expected GPU memory benefits. 7 | The FP8 data format retains 2~3 mantissa bits and can convert float/fp16/bfloat16 and fp8 to each other. 8 | 9 | Here is an example of how to enable this feature: 10 | 11 | .. code-block:: python 12 | 13 | from vllm import LLM, SamplingParams 14 | # Sample prompts. 15 | prompts = [ 16 | "Hello, my name is", 17 | "The president of the United States is", 18 | "The capital of France is", 19 | "The future of AI is", 20 | ] 21 | # Create a sampling params object. 22 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 23 | # Create an LLM. 24 | llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8_e5m2") 25 | # Generate texts from the prompts. The output is a list of RequestOutput objects 26 | # that contain the prompt, generated text, and other information. 27 | outputs = llm.generate(prompts, sampling_params) 28 | # Print the outputs. 29 | for output in outputs: 30 | prompt = output.prompt 31 | generated_text = output.outputs[0].text 32 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 33 | 34 | -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_bentoml.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_bentoml: 2 | 3 | Deploying with BentoML 4 | ====================== 5 | 6 | `BentoML `_ allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes. 7 | 8 | For details, see the tutorial `vLLM inference in the BentoML documentation `_. -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_docker.rst: -------------------------------------------------------------------------------- 1 | .. 
_deploying_with_docker: 2 | 3 | Deploying with Docker 4 | ============================ 5 | 6 | vLLM offers official docker image for deployment. 7 | The image can be used to run OpenAI compatible server. 8 | The image is available on Docker Hub as `vllm/vllm-openai `_. 9 | 10 | .. code-block:: console 11 | 12 | $ docker run --runtime nvidia --gpus all \ 13 | -v ~/.cache/huggingface:/root/.cache/huggingface \ 14 | --env "HUGGING_FACE_HUB_TOKEN=" \ 15 | -p 8000:8000 \ 16 | --ipc=host \ 17 | vllm/vllm-openai:latest \ 18 | --model mistralai/Mistral-7B-v0.1 19 | 20 | 21 | .. note:: 22 | 23 | You can either use the ``ipc=host`` flag or ``--shm-size`` flag to allow the 24 | container to access the host's shared memory. vLLM uses PyTorch, which uses shared 25 | memory to share data between processes under the hood, particularly for tensor parallel inference. 26 | 27 | 28 | You can build and run vLLM from source via the provided dockerfile. To build vLLM: 29 | 30 | .. code-block:: console 31 | 32 | $ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 33 | 34 | 35 | .. note:: 36 | 37 | By default vLLM will build for all GPU types for widest distribution. If you are just building for the 38 | current GPU type the machine is running on, you can add the argument ``--build-arg torch_cuda_arch_list=""`` 39 | for vLLM to find the current GPU type and build for that. 40 | 41 | 42 | To run vLLM: 43 | 44 | .. code-block:: console 45 | 46 | $ docker run --runtime nvidia --gpus all \ 47 | -v ~/.cache/huggingface:/root/.cache/huggingface \ 48 | -p 8000:8000 \ 49 | --env "HUGGING_FACE_HUB_TOKEN=" \ 50 | vllm/vllm-openai 51 | 52 | -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_kserve.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_kserve: 2 | 3 | Deploying with KServe 4 | ============================ 5 | 6 | vLLM can be deployed with `KServe `_ on Kubernetes for highly scalable distributed model serving. 7 | 8 | Please see `this guide `_ for more details on using vLLM with KServe. 9 | -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_triton.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_triton: 2 | 3 | Deploying with NVIDIA Triton 4 | ============================ 5 | 6 | The `Triton Inference Server `_ hosts a tutorial demonstrating how to quickly deploy a simple `facebook/opt-125m `_ model using vLLM. Please see `Deploying a vLLM model in Triton `_ for more details. 7 | -------------------------------------------------------------------------------- /docs/source/serving/distributed_serving.rst: -------------------------------------------------------------------------------- 1 | .. _distributed_serving: 2 | 3 | Distributed Inference and Serving 4 | ================================= 5 | 6 | vLLM supports distributed tensor-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm `_. We manage the distributed runtime with `Ray `_. To run distributed inference, install Ray with: 7 | 8 | .. code-block:: console 9 | 10 | $ pip install ray 11 | 12 | To run multi-GPU inference with the :code:`LLM` class, set the :code:`tensor_parallel_size` argument to the number of GPUs you want to use. 
For example, to run inference on 4 GPUs: 13 | 14 | .. code-block:: python 15 | 16 | from vllm import LLM 17 | llm = LLM("facebook/opt-13b", tensor_parallel_size=4) 18 | output = llm.generate("San Franciso is a") 19 | 20 | To run multi-GPU serving, pass in the :code:`--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs: 21 | 22 | .. code-block:: console 23 | 24 | $ python -m vllm.entrypoints.api_server \ 25 | $ --model facebook/opt-13b \ 26 | $ --tensor-parallel-size 4 27 | 28 | To scale vLLM beyond a single machine, start a `Ray runtime `_ via CLI before running vLLM: 29 | 30 | .. code-block:: console 31 | 32 | $ # On head node 33 | $ ray start --head 34 | 35 | $ # On worker nodes 36 | $ ray start --address= 37 | 38 | After that, you can run inference and serving on multiple machines by launching the vLLM process on the head node by setting :code:`tensor_parallel_size` to the number of GPUs to be the total number of GPUs across all machines. -------------------------------------------------------------------------------- /docs/source/serving/integrations.rst: -------------------------------------------------------------------------------- 1 | Integrations 2 | ------------ 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | 7 | run_on_sky 8 | deploying_with_kserve 9 | deploying_with_triton 10 | deploying_with_bentoml 11 | serving_with_langchain 12 | -------------------------------------------------------------------------------- /docs/source/serving/metrics.rst: -------------------------------------------------------------------------------- 1 | Production Metrics 2 | ================== 3 | 4 | vLLM exposes a number of metrics that can be used to monitor the health of the 5 | system. These metrics are exposed via the `/metrics` endpoint on the vLLM 6 | OpenAI compatible API server. 7 | 8 | The following metrics are exposed: 9 | 10 | .. literalinclude:: ../../../vllm/engine/metrics.py 11 | :language: python 12 | :start-after: begin-metrics-definitions 13 | :end-before: end-metrics-definitions 14 | -------------------------------------------------------------------------------- /docs/source/serving/run_on_sky.rst: -------------------------------------------------------------------------------- 1 | .. _on_cloud: 2 | 3 | Running on clouds with SkyPilot 4 | =============================== 5 | 6 | .. raw:: html 7 | 8 |

9 |      [vLLM logo image placeholder (raw HTML)] 10 |

11 | 12 | vLLM can be run on the cloud to scale to multiple GPUs with `SkyPilot `__, an open-source framework for running LLMs on any cloud. 13 | 14 | To install SkyPilot and setup your cloud credentials, run: 15 | 16 | .. code-block:: console 17 | 18 | $ pip install skypilot 19 | $ sky check 20 | 21 | See the vLLM SkyPilot YAML for serving, `serving.yaml `__. 22 | 23 | .. code-block:: yaml 24 | 25 | resources: 26 | accelerators: A100 27 | 28 | envs: 29 | MODEL_NAME: decapoda-research/llama-13b-hf 30 | TOKENIZER: hf-internal-testing/llama-tokenizer 31 | 32 | setup: | 33 | conda create -n vllm python=3.9 -y 34 | conda activate vllm 35 | git clone https://github.com/vllm-project/vllm.git 36 | cd vllm 37 | pip install . 38 | pip install gradio 39 | 40 | run: | 41 | conda activate vllm 42 | echo 'Starting vllm api server...' 43 | python -u -m vllm.entrypoints.api_server \ 44 | --model $MODEL_NAME \ 45 | --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ 46 | --tokenizer $TOKENIZER 2>&1 | tee api_server.log & 47 | echo 'Waiting for vllm api server to start...' 48 | while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done 49 | echo 'Starting gradio server...' 50 | python vllm/examples/gradio_webserver.py 51 | 52 | Start the serving the LLaMA-13B model on an A100 GPU: 53 | 54 | .. code-block:: console 55 | 56 | $ sky launch serving.yaml 57 | 58 | Check the output of the command. There will be a shareable gradio link (like the last line of the following). Open it in your browser to use the LLaMA model to do the text completion. 59 | 60 | .. code-block:: console 61 | 62 | (task, pid=7431) Running on public URL: https://.gradio.live 63 | 64 | **Optional**: Serve the 65B model instead of the default 13B and use more GPU: 65 | 66 | .. code-block:: console 67 | 68 | sky launch -c vllm-serve-new -s serve.yaml --gpus A100:8 --env MODEL_NAME=decapoda-research/llama-65b-hf 69 | 70 | -------------------------------------------------------------------------------- /docs/source/serving/serving_with_langchain.rst: -------------------------------------------------------------------------------- 1 | .. _run_on_langchain: 2 | 3 | Serving with Langchain 4 | ============================ 5 | 6 | vLLM is also available via `Langchain `_ . 7 | 8 | To install langchain, run 9 | 10 | .. code-block:: console 11 | 12 | $ pip install langchain langchain_community -q 13 | 14 | To run inference on a single or multiple GPUs, use ``VLLM`` class from ``langchain``. 15 | 16 | .. code-block:: python 17 | 18 | from langchain_community.llms import VLLM 19 | 20 | llm = VLLM(model="mosaicml/mpt-7b", 21 | trust_remote_code=True, # mandatory for hf models 22 | max_new_tokens=128, 23 | top_k=10, 24 | top_p=0.95, 25 | temperature=0.8, 26 | # tensor_parallel_size=... # for distributed inference 27 | ) 28 | 29 | print(llm("What is the capital of France ?")) 30 | 31 | Please refer to this `Tutorial `_ for more details. 
32 | -------------------------------------------------------------------------------- /examples/api_client.py: -------------------------------------------------------------------------------- 1 | """Example Python client for vllm.entrypoints.api_server""" 2 | 3 | import argparse 4 | import json 5 | from typing import Iterable, List 6 | 7 | import requests 8 | 9 | 10 | def clear_line(n: int = 1) -> None: 11 | LINE_UP = '\033[1A' 12 | LINE_CLEAR = '\x1b[2K' 13 | for _ in range(n): 14 | print(LINE_UP, end=LINE_CLEAR, flush=True) 15 | 16 | 17 | def post_http_request(prompt: str, 18 | api_url: str, 19 | n: int = 1, 20 | stream: bool = False) -> requests.Response: 21 | headers = {"User-Agent": "Test Client"} 22 | pload = { 23 | "prompt": prompt, 24 | "n": n, 25 | "use_beam_search": True, 26 | "temperature": 0.0, 27 | "max_tokens": 16, 28 | "stream": stream, 29 | } 30 | response = requests.post(api_url, headers=headers, json=pload, stream=True) 31 | return response 32 | 33 | 34 | def get_streaming_response(response: requests.Response) -> Iterable[List[str]]: 35 | for chunk in response.iter_lines(chunk_size=8192, 36 | decode_unicode=False, 37 | delimiter=b"\0"): 38 | if chunk: 39 | data = json.loads(chunk.decode("utf-8")) 40 | output = data["text"] 41 | yield output 42 | 43 | 44 | def get_response(response: requests.Response) -> List[str]: 45 | data = json.loads(response.content) 46 | output = data["text"] 47 | return output 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument("--host", type=str, default="localhost") 53 | parser.add_argument("--port", type=int, default=8000) 54 | parser.add_argument("--n", type=int, default=4) 55 | parser.add_argument("--prompt", type=str, default="San Francisco is a") 56 | parser.add_argument("--stream", action="store_true") 57 | args = parser.parse_args() 58 | prompt = args.prompt 59 | api_url = f"http://{args.host}:{args.port}/generate" 60 | n = args.n 61 | stream = args.stream 62 | 63 | print(f"Prompt: {prompt!r}\n", flush=True) 64 | response = post_http_request(prompt, api_url, n, stream) 65 | 66 | if stream: 67 | num_printed_lines = 0 68 | for h in get_streaming_response(response): 69 | clear_line(num_printed_lines) 70 | num_printed_lines = 0 71 | for i, line in enumerate(h): 72 | num_printed_lines += 1 73 | print(f"Beam candidate {i}: {line!r}", flush=True) 74 | else: 75 | output = get_response(response) 76 | for i, line in enumerate(output): 77 | print(f"Beam candidate {i}: {line!r}", flush=True) 78 | -------------------------------------------------------------------------------- /examples/gradio_webserver.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import gradio as gr 5 | import requests 6 | 7 | 8 | def http_bot(prompt): 9 | headers = {"User-Agent": "vLLM Client"} 10 | pload = { 11 | "prompt": prompt, 12 | "stream": True, 13 | "max_tokens": 128, 14 | } 15 | response = requests.post(args.model_url, 16 | headers=headers, 17 | json=pload, 18 | stream=True) 19 | 20 | for chunk in response.iter_lines(chunk_size=8192, 21 | decode_unicode=False, 22 | delimiter=b"\0"): 23 | if chunk: 24 | data = json.loads(chunk.decode("utf-8")) 25 | output = data["text"][0] 26 | yield output 27 | 28 | 29 | def build_demo(): 30 | with gr.Blocks() as demo: 31 | gr.Markdown("# vLLM text completion demo\n") 32 | inputbox = gr.Textbox(label="Input", 33 | placeholder="Enter text and press ENTER") 34 | outputbox = gr.Textbox(label="Output", 35 | 
placeholder="Generated result from the model") 36 | inputbox.submit(http_bot, [inputbox], [outputbox]) 37 | return demo 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--host", type=str, default=None) 43 | parser.add_argument("--port", type=int, default=8001) 44 | parser.add_argument("--model-url", 45 | type=str, 46 | default="http://localhost:8000/generate") 47 | args = parser.parse_args() 48 | 49 | demo = build_demo() 50 | demo.queue().launch(server_name=args.host, 51 | server_port=args.port, 52 | share=True) 53 | -------------------------------------------------------------------------------- /examples/llm_engine_example.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import List, Tuple 3 | 4 | from vllm import EngineArgs, LLMEngine, SamplingParams, RequestOutput 5 | 6 | 7 | def create_test_prompts() -> List[Tuple[str, SamplingParams]]: 8 | """Create a list of test prompts with their sampling parameters.""" 9 | return [ 10 | ("A robot may not injure a human being", 11 | SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1)), 12 | ("To be or not to be,", 13 | SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)), 14 | ("What is the meaning of life?", 15 | SamplingParams(n=2, 16 | best_of=5, 17 | temperature=0.8, 18 | top_p=0.95, 19 | frequency_penalty=0.1)), 20 | ("It is only with the heart that one can see rightly", 21 | SamplingParams(n=3, best_of=3, use_beam_search=True, 22 | temperature=0.0)), 23 | ] 24 | 25 | 26 | def process_requests(engine: LLMEngine, 27 | test_prompts: List[Tuple[str, SamplingParams]]): 28 | """Continuously process a list of prompts and handle the outputs.""" 29 | request_id = 0 30 | 31 | while test_prompts or engine.has_unfinished_requests(): 32 | if test_prompts: 33 | prompt, sampling_params = test_prompts.pop(0) 34 | engine.add_request(str(request_id), prompt, sampling_params) 35 | request_id += 1 36 | 37 | request_outputs: List[RequestOutput] = engine.step() 38 | 39 | for request_output in request_outputs: 40 | if request_output.finished: 41 | print(request_output) 42 | 43 | 44 | def initialize_engine(args: argparse.Namespace) -> LLMEngine: 45 | """Initialize the LLMEngine from the command line arguments.""" 46 | engine_args = EngineArgs.from_cli_args(args) 47 | return LLMEngine.from_engine_args(engine_args) 48 | 49 | 50 | def main(args: argparse.Namespace): 51 | """Main function that sets up and runs the prompt processing.""" 52 | engine = initialize_engine(args) 53 | test_prompts = create_test_prompts() 54 | process_requests(engine, test_prompts) 55 | 56 | 57 | if __name__ == '__main__': 58 | parser = argparse.ArgumentParser( 59 | description='Demo on using the LLMEngine class directly') 60 | parser = EngineArgs.add_cli_args(parser) 61 | args = parser.parse_args() 62 | main(args) 63 | -------------------------------------------------------------------------------- /examples/offline_inference.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | 3 | # Sample prompts. 4 | prompts = [ 5 | "Hello, my name is", 6 | "The president of the United States is", 7 | "The capital of France is", 8 | "The future of AI is", 9 | ] 10 | # Create a sampling params object. 11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 12 | 13 | # Create an LLM. 14 | llm = LLM(model="facebook/opt-125m") 15 | # Generate texts from the prompts. 
The output is a list of RequestOutput objects 16 | # that contain the prompt, generated text, and other information. 17 | outputs = llm.generate(prompts, sampling_params) 18 | # Print the outputs. 19 | for output in outputs: 20 | prompt = output.prompt 21 | generated_text = output.outputs[0].text 22 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 23 | -------------------------------------------------------------------------------- /examples/offline_inference_distributed.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example shows how to use Ray Data for running offline batch inference 3 | distributively on a multi-nodes cluster. 4 | 5 | Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html 6 | """ 7 | 8 | from vllm import LLM, SamplingParams 9 | from typing import Dict 10 | import numpy as np 11 | import ray 12 | 13 | # Create a sampling params object. 14 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 15 | 16 | 17 | # Create a class to do batch inference. 18 | class LLMPredictor: 19 | 20 | def __init__(self): 21 | # Create an LLM. 22 | self.llm = LLM(model="meta-llama/Llama-2-7b-chat-hf") 23 | 24 | def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, list]: 25 | # Generate texts from the prompts. 26 | # The output is a list of RequestOutput objects that contain the prompt, 27 | # generated text, and other information. 28 | outputs = self.llm.generate(batch["text"], sampling_params) 29 | prompt = [] 30 | generated_text = [] 31 | for output in outputs: 32 | prompt.append(output.prompt) 33 | generated_text.append(' '.join([o.text for o in output.outputs])) 34 | return { 35 | "prompt": prompt, 36 | "generated_text": generated_text, 37 | } 38 | 39 | 40 | # Read one text file from S3. Ray Data supports reading multiple files 41 | # from cloud storage (such as JSONL, Parquet, CSV, binary format). 42 | ds = ray.data.read_text("s3://anonymous@air-example-data/prompts.txt") 43 | 44 | # Apply batch inference for all input data. 45 | ds = ds.map_batches( 46 | LLMPredictor, 47 | # Set the concurrency to the number of LLM instances. 48 | concurrency=10, 49 | # Specify the number of GPUs required per LLM instance. 50 | # NOTE: Do NOT set `num_gpus` when using vLLM with tensor-parallelism 51 | # (i.e., `tensor_parallel_size`). 52 | num_gpus=1, 53 | # Specify the batch size for inference. 54 | batch_size=32, 55 | ) 56 | 57 | # Peek first 10 results. 58 | # NOTE: This is for local testing and debugging. For production use case, 59 | # one should write full result out as shown below. 60 | outputs = ds.take(limit=10) 61 | for output in outputs: 62 | prompt = output["prompt"] 63 | generated_text = output["generated_text"] 64 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 65 | 66 | # Write inference output data out as Parquet files to S3. 67 | # Multiple files would be written to the output destination, 68 | # and each task would write one or more files separately. 69 | # 70 | # ds.write_parquet("s3://") 71 | -------------------------------------------------------------------------------- /examples/offline_inference_neuron.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | 3 | # Sample prompts. 4 | prompts = [ 5 | "Hello, my name is", 6 | "The president of the United States is", 7 | "The capital of France is", 8 | "The future of AI is", 9 | ] 10 | # Create a sampling params object. 
11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 12 | 13 | # Create an LLM. 14 | llm = LLM( 15 | model="openlm-research/open_llama_3b", 16 | max_num_seqs=8, 17 | # The max_model_len and block_size arguments are required to be same as 18 | # max sequence length when targeting neuron device. 19 | # Currently, this is a known limitation in continuous batching support 20 | # in transformers-neuronx. 21 | # TODO(liangfu): Support paged-attention in transformers-neuronx. 22 | max_model_len=128, 23 | block_size=128, 24 | # The device can be automatically detected when AWS Neuron SDK is installed. 25 | # The device argument can be either unspecified for automated detection, 26 | # or explicitly assigned. 27 | device="neuron") 28 | # Generate texts from the prompts. The output is a list of RequestOutput objects 29 | # that contain the prompt, generated text, and other information. 30 | outputs = llm.generate(prompts, sampling_params) 31 | # Print the outputs. 32 | for output in outputs: 33 | prompt = output.prompt 34 | generated_text = output.outputs[0].text 35 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 36 | -------------------------------------------------------------------------------- /examples/offline_inference_with_prefix.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | 3 | prefix = ( 4 | "You are an expert school principal, skilled in effectively managing " 5 | "faculty and staff. Draft 10-15 questions for a potential first grade " 6 | "Head Teacher for my K-12, all-girls', independent school that emphasizes " 7 | "community, joyful discovery, and life-long learning. The candidate is " 8 | "coming in for a first-round panel interview for a 8th grade Math " 9 | "teaching role. They have 5 years of previous teaching experience " 10 | "as an assistant teacher at a co-ed, public school with experience " 11 | "in middle school math teaching. Based on these information, fulfill " 12 | "the following paragraph: ") 13 | 14 | # Sample prompts. 15 | prompts = [ 16 | "Hello, my name is", 17 | "The president of the United States is", 18 | "The capital of France is", 19 | "The future of AI is", 20 | ] 21 | # Create a sampling params object. 22 | sampling_params = SamplingParams(temperature=0.0) 23 | 24 | # Create an LLM. 25 | llm = LLM(model="facebook/opt-125m") 26 | 27 | generating_prompts = [prefix + prompt for prompt in prompts] 28 | 29 | # Generate texts from the prompts. The output is a list of RequestOutput objects 30 | # that contain the prompt, generated text, and other information. 31 | outputs = llm.generate(generating_prompts, sampling_params) 32 | # Print the outputs. 33 | for output in outputs: 34 | prompt = output.prompt 35 | generated_text = output.outputs[0].text 36 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 37 | 38 | print("-" * 80) 39 | 40 | # The llm.generate call will batch all prompts and send the batch at once 41 | # if resources allow. The prefix will only be cached after the first batch 42 | # is processed, so we need to call generate once to calculate the prefix 43 | # and cache it. 44 | outputs = llm.generate(generating_prompts[0], sampling_params) 45 | 46 | # Subsequent batches can leverage the cached prefix 47 | outputs = llm.generate(generating_prompts, sampling_params) 48 | 49 | # Print the outputs. 
You should see the same outputs as before 50 | for output in outputs: 51 | prompt = output.prompt 52 | generated_text = output.outputs[0].text 53 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 54 | -------------------------------------------------------------------------------- /examples/openai_chatcompletion_client.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server. 4 | openai_api_key = "EMPTY" 5 | openai_api_base = "http://localhost:8000/v1" 6 | 7 | client = OpenAI( 8 | # defaults to os.environ.get("OPENAI_API_KEY") 9 | api_key=openai_api_key, 10 | base_url=openai_api_base, 11 | ) 12 | 13 | models = client.models.list() 14 | model = models.data[0].id 15 | 16 | chat_completion = client.chat.completions.create( 17 | messages=[{ 18 | "role": "system", 19 | "content": "You are a helpful assistant." 20 | }, { 21 | "role": "user", 22 | "content": "Who won the world series in 2020?" 23 | }, { 24 | "role": 25 | "assistant", 26 | "content": 27 | "The Los Angeles Dodgers won the World Series in 2020." 28 | }, { 29 | "role": "user", 30 | "content": "Where was it played?" 31 | }], 32 | model=model, 33 | ) 34 | 35 | print("Chat completion results:") 36 | print(chat_completion) 37 | -------------------------------------------------------------------------------- /examples/openai_completion_client.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server. 4 | openai_api_key = "EMPTY" 5 | openai_api_base = "http://localhost:8000/v1" 6 | 7 | client = OpenAI( 8 | # defaults to os.environ.get("OPENAI_API_KEY") 9 | api_key=openai_api_key, 10 | base_url=openai_api_base, 11 | ) 12 | 13 | models = client.models.list() 14 | model = models.data[0].id 15 | 16 | # Completion API 17 | stream = False 18 | completion = client.completions.create( 19 | model=model, 20 | prompt="A robot may not injure a human being", 21 | echo=False, 22 | n=2, 23 | stream=stream, 24 | logprobs=3) 25 | 26 | print("Completion results:") 27 | if stream: 28 | for c in completion: 29 | print(c) 30 | else: 31 | print(completion) 32 | -------------------------------------------------------------------------------- /examples/production_monitoring/README.md: -------------------------------------------------------------------------------- 1 | # vLLM + Prometheus/Grafana 2 | 3 | This is a simple example that shows you how to connect vLLM metric logging to the Prometheus/Grafana stack. For this example, we launch Prometheus and Grafana via Docker. You can checkout other methods through [Prometheus](https://prometheus.io/) and [Grafana](https://grafana.com/) websites. 4 | 5 | Install: 6 | - [`docker`](https://docs.docker.com/engine/install/) 7 | - [`docker compose`](https://docs.docker.com/compose/install/linux/#install-using-the-repository) 8 | 9 | ### Launch 10 | 11 | Prometheus metric logging is enabled by default in the OpenAI-compatible server. 
Launch via the entrypoint: 12 | ```bash 13 | python3 -m vllm.entrypoints.openai.api_server \ 14 | --model mistralai/Mistral-7B-v0.1 \ 15 | --max-model-len 2048 \ 16 | --disable-log-requests 17 | ``` 18 | 19 | Launch Prometheus and Grafana servers with `docker compose`: 20 | ```bash 21 | docker compose up 22 | ``` 23 | 24 | Submit some sample requests to the server: 25 | ```bash 26 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json 27 | 28 | python3 ../../benchmarks/benchmark_serving.py \ 29 | --model mistralai/Mistral-7B-v0.1 \ 30 | --tokenizer mistralai/Mistral-7B-v0.1 \ 31 | --endpoint /v1/completions \ 32 | --dataset ShareGPT_V3_unfiltered_cleaned_split.json \ 33 | --request-rate 3.0 34 | ``` 35 | 36 | Navigating to [`http://localhost:8000/metrics`](http://localhost:8000/metrics) will show the raw Prometheus metrics being exposed by vLLM. 37 | 38 | ### Grafana Dashboard 39 | 40 | Navigate to [`http://localhost:3000`](http://localhost:3000). Log in with the default username (`admin`) and password (`admin`). 41 | 42 | #### Add Prometheus Data Source 43 | 44 | Navigate to [`http://localhost:3000/connections/datasources/new`](http://localhost:3000/connections/datasources/new) and select Prometheus. 45 | 46 | On Prometheus configuration page, we need to add the `Prometheus Server URL` in `Connection`. For this setup, Grafana and Prometheus are running in separate containers, but Docker creates DNS name for each containers. You can just use `http://prometheus:9090`. 47 | 48 | Click `Save & Test`. You should get a green check saying "Successfully queried the Prometheus API.". 49 | 50 | #### Import Dashboard 51 | 52 | Navigate to [`http://localhost:3000/dashboard/import`](http://localhost:3000/dashboard/import), upload `grafana.json`, and select the `prometheus` datasource. 
You should see a screen that looks like the following: 53 | 54 | ![Grafana Dashboard Image](https://i.imgur.com/R2vH9VW.png) 55 | -------------------------------------------------------------------------------- /examples/production_monitoring/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | # docker-compose.yaml 2 | version: "3" 3 | 4 | services: 5 | prometheus: 6 | image: prom/prometheus:latest 7 | extra_hosts: 8 | - "host.docker.internal:host-gateway" # allow a direct connection from container to the local machine 9 | ports: 10 | - "9090:9090" # the default port used by Prometheus 11 | volumes: 12 | - ${PWD}/prometheus.yaml:/etc/prometheus/prometheus.yml # mount Prometheus config file 13 | 14 | grafana: 15 | image: grafana/grafana:latest 16 | depends_on: 17 | - prometheus 18 | ports: 19 | - "3000:3000" # the default port used by Grafana 20 | -------------------------------------------------------------------------------- /examples/production_monitoring/prometheus.yaml: -------------------------------------------------------------------------------- 1 | # prometheus.yaml 2 | global: 3 | scrape_interval: 5s 4 | evaluation_interval: 30s 5 | 6 | scrape_configs: 7 | - job_name: vllm 8 | static_configs: 9 | - targets: 10 | - 'host.docker.internal:8000' 11 | -------------------------------------------------------------------------------- /examples/template_alpaca.jinja: -------------------------------------------------------------------------------- 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 2 | 3 | {% for message in messages %} 4 | {% if message['role'] == 'user' %} 5 | ### Instruction: 6 | {{ message['content']|trim -}} 7 | {% if not loop.last %} 8 | 9 | 10 | {% endif %} 11 | {% elif message['role'] == 'assistant' %} 12 | ### Response: 13 | {{ message['content']|trim -}} 14 | {% if not loop.last %} 15 | 16 | 17 | {% endif %} 18 | {% elif message['role'] == 'user_context' %} 19 | ### Input: 20 | {{ message['content']|trim -}} 21 | {% if not loop.last %} 22 | 23 | 24 | {% endif %} 25 | {% endif %} 26 | {% endfor %} 27 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} 28 | ### Response: 29 | {% endif %} -------------------------------------------------------------------------------- /examples/template_baichuan.jinja: -------------------------------------------------------------------------------- 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 2 | 3 | {%- for message in messages -%} 4 | {%- if message['role'] == 'user' -%} 5 | {{- '' + message['content'] -}} 6 | {%- elif message['role'] == 'assistant' -%} 7 | {{- '' + message['content'] -}} 8 | {%- endif -%} 9 | {%- endfor -%} 10 | 11 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 12 | {{- '' -}} 13 | {% endif %} -------------------------------------------------------------------------------- /examples/template_chatglm.jinja: -------------------------------------------------------------------------------- 1 | {%- set counter = namespace(index=0) -%} 2 | {%- for message in messages -%} 3 | {%- if message['role'] == 'user' -%} 4 | {{- '[Round ' + counter.index|string + ']\n问:' + message['content'] -}} 5 | {%- set counter.index = counter.index + 1 -%} 6 | {%- endif -%} 7 | {%- if message['role'] == 'assistant' -%} 8 | {{- '\n答:' + 
message['content'] -}} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | {{- '\n' -}} 11 | {%- endif -%} 12 | {%- endif -%} 13 | {%- endfor -%} 14 | 15 | 16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 17 | {{- '\n答:' -}} 18 | {%- endif -%} -------------------------------------------------------------------------------- /examples/template_chatglm2.jinja: -------------------------------------------------------------------------------- 1 | {%- set counter = namespace(index=1) -%} 2 | {%- for message in messages -%} 3 | {%- if message['role'] == 'user' -%} 4 | {{- '[Round ' + counter.index|string + ']\n\n问:' + message['content'] -}} 5 | {%- set counter.index = counter.index + 1 -%} 6 | {%- endif -%} 7 | {%- if message['role'] == 'assistant' -%} 8 | {{- '\n\n答:' + message['content'] -}} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | {{- '\n\n' -}} 11 | {%- endif -%} 12 | {%- endif -%} 13 | {%- endfor -%} 14 | 15 | 16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 17 | {{- '\n\n答:' -}} 18 | {%- endif -%} -------------------------------------------------------------------------------- /examples/template_chatml.jinja: -------------------------------------------------------------------------------- 1 | {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} 2 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} -------------------------------------------------------------------------------- /examples/template_falcon.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {%- if message['role'] == 'user' -%} 3 | {{- 'User: ' + message['content'] -}} 4 | {%- elif message['role'] == 'assistant' -%} 5 | {{- 'Assistant: ' + message['content'] -}} 6 | {%- endif -%} 7 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 8 | {{- '\n' -}} 9 | {%- endif -%} 10 | {%- endfor -%} 11 | 12 | 13 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 14 | {{- 'Assistant:' -}} 15 | {% endif %} -------------------------------------------------------------------------------- /examples/template_falcon_180b.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {%- if message['role'] == 'system' -%} 3 | {{- 'System: ' + message['content'] -}} 4 | {%- elif message['role'] == 'user' -%} 5 | {{- 'User: ' + message['content'] -}} 6 | {%- elif message['role'] == 'assistant' -%} 7 | {{- 'Falcon: ' + message['content'] -}} 8 | {%- endif -%} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | {{- '\n' -}} 11 | {%- endif -%} 12 | {%- endfor -%} 13 | 14 | 15 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 16 | {{- 'Falcon:' -}} 17 | {% endif %} -------------------------------------------------------------------------------- /examples/template_inkbot.jinja: -------------------------------------------------------------------------------- 1 | <#meta#> 2 | - Date: {{ (messages|selectattr('role', 'equalto', 'meta-current_date')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-current_date')|list) else '' }} 3 | - Task: {{ (messages|selectattr('role', 'equalto', 
'meta-task_name')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-task_name')|list) else '' }} 4 | <#system#> 5 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 6 | <#chat#> 7 | {% for message in messages %} 8 | {% if message['role'] == 'user' %} 9 | <#user#> 10 | {{ message['content']|trim -}} 11 | {% if not loop.last %} 12 | 13 | {% endif %} 14 | {% elif message['role'] == 'assistant' %} 15 | <#bot#> 16 | {{ message['content']|trim -}} 17 | {% if not loop.last %} 18 | 19 | {% endif %} 20 | {% elif message['role'] == 'user_context' %} 21 | <#user_context#> 22 | {{ message['content']|trim -}} 23 | {% if not loop.last %} 24 | 25 | {% endif %} 26 | {% endif %} 27 | {% endfor %} 28 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} 29 | <#bot#> 30 | {% endif %} -------------------------------------------------------------------------------- /patch_xformers.rocm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | XFORMERS_VERSION="0.0.23" 5 | 6 | export XFORMERS_INSTALLED_VERSION=$(python -c 'import xformers; print(xformers.__version__)') 7 | 8 | if [ "$XFORMERS_INSTALLED_VERSION" != "$XFORMERS_VERSION" ]; then 9 | echo "ERROR: xformers version must be ${XFORMERS_VERSION}. ${XFORMERS_INSTALLED_VERSION} is installed" 10 | exit 1 11 | fi 12 | 13 | export XFORMERS_FMHA_FLASH_PATH=$(python -c 'from xformers import ops as xops; print(xops.fmha.flash.__file__)') 14 | export XFORMERS_FMHA_COMMON_PATH=$(python -c 'from xformers import ops as xops; print(xops.fmha.common.__file__)') 15 | 16 | echo "XFORMERS_FMHA_FLASH_PATH = ${XFORMERS_FMHA_FLASH_PATH}" 17 | echo "XFORMERS_FMHA_COMMON_PATH = ${XFORMERS_FMHA_COMMON_PATH}" 18 | 19 | if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-${XFORMERS_VERSION}.rocm.patch"; then 20 | echo "Applying patch to ${XFORMERS_FMHA_FLASH_PATH}" 21 | patch -p0 $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-${XFORMERS_VERSION}.rocm.patch" 22 | echo "Successfully patched ${XFORMERS_FMHA_FLASH_PATH}" 23 | else 24 | echo "${XFORMERS_FMHA_FLASH_PATH} was already patched" 25 | fi 26 | 27 | if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-${XFORMERS_VERSION}.rocm.patch"; then 28 | echo "Applying patch to ${XFORMERS_FMHA_COMMON_PATH}" 29 | patch -p0 $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-${XFORMERS_VERSION}.rocm.patch" 30 | echo "Successfully patched ${XFORMERS_FMHA_COMMON_PATH}" 31 | else 32 | echo "${XFORMERS_FMHA_COMMON_PATH} was already patched" 33 | fi 34 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | # Should be mirrored in requirements-build.txt 3 | requires = [ 4 | "cmake>=3.21", 5 | "ninja", 6 | "packaging", 7 | "setuptools >= 49.4.0", 8 | "torch == 2.1.2", 9 | "wheel", 10 | ] 11 | build-backend = "setuptools.build_meta" 12 | 13 | [tool.ruff] 14 | # Allow lines to be up to 80 characters long.
15 | line-length = 80 16 | 17 | [tool.ruff.lint] 18 | select = [ 19 | # pycodestyle 20 | "E", 21 | # Pyflakes 22 | "F", 23 | # pyupgrade 24 | # "UP", 25 | # flake8-bugbear 26 | "B", 27 | # flake8-simplify 28 | "SIM", 29 | # isort 30 | # "I", 31 | ] 32 | ignore = [ 33 | # star imports 34 | "F405", "F403", 35 | # lambda expression assignment 36 | "E731", 37 | # Loop control variable not used within loop body 38 | "B007", 39 | ] 40 | 41 | [tool.mypy] 42 | python_version = "3.8" 43 | 44 | ignore_missing_imports = true 45 | 46 | files = "vllm" 47 | # TODO(woosuk): Include the code from Megatron and HuggingFace. 48 | exclude = "vllm/model_executor/parallel_utils/|vllm/model_executor/models/" 49 | 50 | 51 | [tool.codespell] 52 | ignore-words-list = "dout, te, indicies" 53 | skip = "./tests/prompts" 54 | -------------------------------------------------------------------------------- /requirements-build.txt: -------------------------------------------------------------------------------- 1 | # Should be mirrored in pyproject.toml 2 | cmake>=3.21 3 | ninja 4 | packaging 5 | setuptools>=49.4.0 6 | torch==2.1.2 7 | wheel 8 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # formatting 2 | yapf==0.32.0 3 | toml==0.10.2 4 | tomli==2.0.1 5 | ruff==0.1.5 6 | codespell==2.2.6 7 | 8 | # type checking 9 | mypy==0.991 10 | types-PyYAML 11 | types-requests 12 | types-setuptools 13 | 14 | # testing 15 | pytest 16 | pytest-forked 17 | pytest-asyncio 18 | pytest-rerunfailures 19 | pytest-shard 20 | httpx 21 | einops # required for MPT 22 | openai 23 | requests 24 | ray 25 | peft 26 | 27 | # Benchmarking 28 | aiohttp 29 | -------------------------------------------------------------------------------- /requirements-neuron.txt: -------------------------------------------------------------------------------- 1 | sentencepiece # Required for LLaMA tokenizer. 2 | numpy 3 | transformers-neuronx >= 0.9.0 4 | torch-neuronx >= 2.1.0 5 | neuronx-cc 6 | fastapi 7 | uvicorn[standard] 8 | pydantic >= 2.0 # Required for OpenAI server. 9 | prometheus_client >= 0.18.0 10 | -------------------------------------------------------------------------------- /requirements-rocm.txt: -------------------------------------------------------------------------------- 1 | cmake>=3.21 2 | ninja # For faster builds. 3 | typing-extensions>=4.8.0 4 | starlette 5 | psutil 6 | ray >= 2.9 7 | sentencepiece # Required for LLaMA tokenizer. 8 | numpy 9 | tokenizers>=0.15.0 10 | transformers >= 4.38.0 # Required for Gemma. 11 | fastapi 12 | uvicorn[standard] 13 | pydantic >= 2.0 # Required for OpenAI server. 14 | prometheus_client >= 0.18.0 15 | outlines == 0.0.34 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cmake>=3.21 2 | ninja # For faster builds. 3 | psutil 4 | ray >= 2.9 5 | sentencepiece # Required for LLaMA tokenizer. 6 | numpy 7 | torch == 2.1.2 8 | transformers >= 4.38.0 # Required for Gemma. 9 | xformers == 0.0.23.post1 # Required for CUDA 12.1. 10 | fastapi 11 | uvicorn[standard] 12 | pydantic >= 2.0 # Required for OpenAI server. 13 | prometheus_client >= 0.18.0 14 | pynvml == 11.5.0 15 | triton >= 2.1.0 16 | outlines == 0.0.34 17 | cupy-cuda12x == 12.1.0 # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead. 
18 | -------------------------------------------------------------------------------- /rocm_patch/commonpy_xformers-0.0.23.rocm.patch: -------------------------------------------------------------------------------- 1 | --- /opt/conda/envs/py_3.10/lib/python3.10/site-packages/xformers/ops/fmha/common.py 2023-11-29 03:17:03.930103539 +0000 2 | +++ common.py 2023-11-28 16:14:19.846233146 +0000 3 | @@ -298,8 +298,8 @@ 4 | dtype = d.query.dtype 5 | if device_type not in cls.SUPPORTED_DEVICES: 6 | reasons.append(f"device={device_type} (supported: {cls.SUPPORTED_DEVICES})") 7 | - if device_type == "cuda" and not _built_with_cuda: 8 | - reasons.append("xFormers wasn't build with CUDA support") 9 | + #if device_type == "cuda" and not _built_with_cuda: 10 | + # reasons.append("xFormers wasn't build with CUDA support") 11 | if device_type == "cuda": 12 | device_capability = torch.cuda.get_device_capability(d.device) 13 | if device_capability < cls.CUDA_MINIMUM_COMPUTE_CAPABILITY: 14 | -------------------------------------------------------------------------------- /rocm_patch/rocm_bf16.patch: -------------------------------------------------------------------------------- 1 | --- amd_hip_bf16.h 2024-02-06 18:28:58.268699142 +0000 2 | +++ amd_hip_bf16.h.new 2024-02-06 18:28:31.988647133 +0000 3 | @@ -90,10 +90,10 @@ 4 | #include "math_fwd.h" // ocml device functions 5 | 6 | #if defined(__HIPCC_RTC__) 7 | -#define __HOST_DEVICE__ __device__ 8 | +#define __HOST_DEVICE__ __device__ static 9 | #else 10 | #include 11 | -#define __HOST_DEVICE__ __host__ __device__ 12 | +#define __HOST_DEVICE__ __host__ __device__ static inline 13 | #endif 14 | 15 | // Since we are using unsigned short to represent data in bfloat16, it can be of different sizes on 16 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/tests/__init__.py -------------------------------------------------------------------------------- /tests/async_engine/api_server_async_engine.py: -------------------------------------------------------------------------------- 1 | """vllm.entrypoints.api_server with some extra logging for testing.""" 2 | import argparse 3 | from typing import Any, Dict 4 | 5 | import uvicorn 6 | from fastapi.responses import JSONResponse, Response 7 | 8 | import vllm.entrypoints.api_server 9 | from vllm.engine.arg_utils import AsyncEngineArgs 10 | from vllm.engine.async_llm_engine import AsyncLLMEngine 11 | 12 | app = vllm.entrypoints.api_server.app 13 | 14 | 15 | class AsyncLLMEngineWithStats(AsyncLLMEngine): 16 | 17 | def __init__(self, *args, **kwargs): 18 | super().__init__(*args, **kwargs) 19 | self._num_aborts = 0 20 | 21 | async def abort(self, request_id: str) -> None: 22 | await super().abort(request_id) 23 | self._num_aborts += 1 24 | 25 | def testing_stats(self) -> Dict[str, Any]: 26 | return {"num_aborted_requests": self._num_aborts} 27 | 28 | 29 | @app.get("/stats") 30 | def stats() -> Response: 31 | """Get the statistics of the engine.""" 32 | return JSONResponse(engine.testing_stats()) 33 | 34 | 35 | if __name__ == "__main__": 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument("--host", type=str, default="localhost") 38 | parser.add_argument("--port", type=int, default=8000) 39 | parser = AsyncEngineArgs.add_cli_args(parser) 40 | args = parser.parse_args() 41 | 42 | engine_args = 
AsyncEngineArgs.from_cli_args(args) 43 | engine = AsyncLLMEngineWithStats.from_engine_args(engine_args) 44 | vllm.entrypoints.api_server.engine = engine 45 | uvicorn.run( 46 | app, 47 | host=args.host, 48 | port=args.port, 49 | log_level="debug", 50 | timeout_keep_alive=vllm.entrypoints.api_server.TIMEOUT_KEEP_ALIVE) 51 | -------------------------------------------------------------------------------- /tests/async_engine/test_async_llm_engine.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from dataclasses import dataclass 3 | 4 | import pytest 5 | 6 | from vllm.engine.async_llm_engine import AsyncLLMEngine 7 | 8 | 9 | @dataclass 10 | class RequestOutput: 11 | request_id: int 12 | finished: bool = False 13 | 14 | 15 | class MockEngine: 16 | 17 | def __init__(self): 18 | self.step_calls = 0 19 | self.add_request_calls = 0 20 | self.abort_request_calls = 0 21 | self.request_id = None 22 | 23 | async def step_async(self): 24 | self.step_calls += 1 25 | return [RequestOutput( 26 | request_id=self.request_id)] if self.request_id else [] 27 | 28 | async def encode_request_async(self, *args, **kwargs): 29 | pass 30 | 31 | def generate(self, request_id): 32 | self.request_id = request_id 33 | 34 | def stop_generating(self): 35 | self.request_id = None 36 | 37 | def add_request(self, **kwargs): 38 | del kwargs # Unused 39 | self.add_request_calls += 1 40 | 41 | async def add_request_async(self, **kwargs): 42 | self.add_request_calls += 1 43 | return 44 | 45 | def abort_request(self, request_id): 46 | del request_id # Unused 47 | self.abort_request_calls += 1 48 | 49 | def has_unfinished_requests(self): 50 | return self.request_id is not None 51 | 52 | 53 | class MockAsyncLLMEngine(AsyncLLMEngine): 54 | 55 | def _init_engine(self, *args, **kwargs): 56 | return MockEngine() 57 | 58 | 59 | @pytest.mark.asyncio 60 | async def test_new_requests_event(): 61 | engine = MockAsyncLLMEngine(worker_use_ray=False, engine_use_ray=False) 62 | engine.start_background_loop() 63 | await asyncio.sleep(0.01) 64 | assert engine.engine.step_calls == 0 65 | 66 | await engine.add_request("1", "", None) 67 | await asyncio.sleep(0.01) 68 | assert engine.engine.add_request_calls == 1 69 | assert engine.engine.step_calls == 1 70 | 71 | await engine.add_request("2", "", None) 72 | engine.engine.generate("2") 73 | await asyncio.sleep(0) 74 | await asyncio.sleep(0) 75 | assert engine.engine.add_request_calls == 2 76 | assert engine.engine.step_calls >= 2 77 | await asyncio.sleep(0.001) 78 | assert engine.engine.step_calls >= 3 79 | engine.engine.stop_generating() 80 | await asyncio.sleep(0.001) 81 | old_step_calls = engine.engine.step_calls 82 | await asyncio.sleep(0.001) 83 | assert engine.engine.step_calls == old_step_calls 84 | 85 | await engine.add_request("3", "", None) 86 | await asyncio.sleep(0.01) 87 | assert engine.engine.add_request_calls == 3 88 | assert engine.engine.step_calls == old_step_calls + 1 89 | await asyncio.sleep(0.01) 90 | assert engine.engine.add_request_calls == 3 91 | assert engine.engine.step_calls == old_step_calls + 1 92 | 93 | engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True) 94 | assert engine.get_tokenizer() is not None 95 | -------------------------------------------------------------------------------- /tests/async_engine/test_request_tracker.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.engine.async_llm_engine import RequestTracker 4 | from 
vllm.outputs import RequestOutput 5 | 6 | 7 | @pytest.mark.asyncio 8 | async def test_request_tracker(): 9 | tracker = RequestTracker() 10 | stream_1 = tracker.add_request("1") 11 | assert tracker.new_requests_event.is_set() 12 | await tracker.wait_for_new_requests() 13 | new, finished = tracker.get_new_and_finished_requests() 14 | assert not tracker.new_requests_event.is_set() 15 | assert len(new) == 1 16 | assert new[0]["request_id"] == "1" 17 | assert not finished 18 | assert not stream_1.finished 19 | 20 | stream_2 = tracker.add_request("2") 21 | stream_3 = tracker.add_request("3") 22 | assert tracker.new_requests_event.is_set() 23 | await tracker.wait_for_new_requests() 24 | new, finished = tracker.get_new_and_finished_requests() 25 | assert not tracker.new_requests_event.is_set() 26 | assert len(new) == 2 27 | assert new[0]["request_id"] == "2" 28 | assert new[1]["request_id"] == "3" 29 | assert not finished 30 | assert not stream_2.finished 31 | assert not stream_3.finished 32 | 33 | # request_ids must be unique 34 | with pytest.raises(KeyError): 35 | tracker.add_request("1") 36 | assert not tracker.new_requests_event.is_set() 37 | 38 | tracker.abort_request("1") 39 | new, finished = tracker.get_new_and_finished_requests() 40 | assert len(finished) == 1 41 | assert "1" in finished 42 | assert not new 43 | assert stream_1.finished 44 | 45 | stream_4 = tracker.add_request("4") 46 | tracker.abort_request("4") 47 | assert tracker.new_requests_event.is_set() 48 | await tracker.wait_for_new_requests() 49 | new, finished = tracker.get_new_and_finished_requests() 50 | assert len(finished) == 1 51 | assert "4" in finished 52 | assert not new 53 | assert stream_4.finished 54 | 55 | stream_5 = tracker.add_request("5") 56 | assert tracker.new_requests_event.is_set() 57 | tracker.process_request_output( 58 | RequestOutput("2", "output", [], [], [], finished=True)) 59 | await tracker.wait_for_new_requests() 60 | new, finished = tracker.get_new_and_finished_requests() 61 | assert not tracker.new_requests_event.is_set() 62 | assert len(finished) == 1 63 | assert "2" in finished 64 | assert len(new) == 1 65 | assert new[0]["request_id"] == "5" 66 | assert stream_2.finished 67 | assert not stream_5.finished 68 | -------------------------------------------------------------------------------- /tests/basic_correctness/test_basic_correctness.py: -------------------------------------------------------------------------------- 1 | """Compare the short outputs of HF and vLLM when using greedy sampling. 2 | 3 | Run `pytest tests/basic_correctness/test_basic_correctness.py --forked`. 
4 | """ 5 | import pytest 6 | 7 | MODELS = [ 8 | "facebook/opt-125m", 9 | "meta-llama/Llama-2-7b-hf", 10 | ] 11 | 12 | 13 | @pytest.mark.parametrize("model", MODELS) 14 | @pytest.mark.parametrize("dtype", ["half"]) 15 | @pytest.mark.parametrize("max_tokens", [5]) 16 | @pytest.mark.parametrize("enforce_eager", [False, True]) 17 | def test_models( 18 | hf_runner, 19 | vllm_runner, 20 | example_prompts, 21 | model: str, 22 | dtype: str, 23 | max_tokens: int, 24 | enforce_eager: bool, 25 | ) -> None: 26 | hf_model = hf_runner(model, dtype=dtype) 27 | hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) 28 | del hf_model 29 | 30 | vllm_model = vllm_runner(model, dtype=dtype, enforce_eager=enforce_eager) 31 | vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) 32 | del vllm_model 33 | 34 | for i in range(len(example_prompts)): 35 | hf_output_ids, hf_output_str = hf_outputs[i] 36 | vllm_output_ids, vllm_output_str = vllm_outputs[i] 37 | assert hf_output_str == vllm_output_str, ( 38 | f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") 39 | assert hf_output_ids == vllm_output_ids, ( 40 | f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") 41 | -------------------------------------------------------------------------------- /tests/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/tests/core/__init__.py -------------------------------------------------------------------------------- /tests/core/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Tuple 3 | 4 | from vllm import SamplingParams 5 | from vllm.sequence import Sequence, SequenceGroup 6 | 7 | 8 | def create_dummy_prompt( 9 | request_id: str, 10 | prompt_length: int, 11 | block_size: int = None) -> Tuple[Sequence, SequenceGroup]: 12 | if not block_size: 13 | block_size = prompt_length 14 | 15 | # Create dummy prompt sequence with tokens 0...block_size-1 16 | # and prompt "0 ... block_size". 17 | prompt_tokens = list(range(prompt_length)) 18 | prompt_str = " ".join([str(t) for t in prompt_tokens]) 19 | prompt = Sequence(int(request_id), prompt_str, prompt_tokens, block_size) 20 | seq_group = SequenceGroup(request_id, [prompt], SamplingParams(), 21 | time.time(), None) 22 | 23 | return prompt, seq_group 24 | 25 | 26 | def round_up_to_next_block(seq_len: int, block_size: int) -> int: 27 | return (seq_len + block_size - 1) // block_size 28 | -------------------------------------------------------------------------------- /tests/distributed/test_basic_distributed_correctness.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and distributed vLLM when using greedy sampling. 2 | 3 | Run `pytest tests/distributed/test_basic_distributed_correctness.py --forked`. 
4 | """ 5 | import pytest 6 | import torch 7 | 8 | MODELS = [ 9 | "facebook/opt-125m", 10 | "meta-llama/Llama-2-7b-hf", 11 | ] 12 | 13 | 14 | @pytest.mark.skipif(torch.cuda.device_count() < 2, 15 | reason="Need at least 2 GPUs to run the test.") 16 | @pytest.mark.parametrize("model", MODELS) 17 | @pytest.mark.parametrize("dtype", ["half"]) 18 | @pytest.mark.parametrize("max_tokens", [5]) 19 | def test_models( 20 | hf_runner, 21 | vllm_runner, 22 | example_prompts, 23 | model: str, 24 | dtype: str, 25 | max_tokens: int, 26 | ) -> None: 27 | hf_model = hf_runner(model, dtype=dtype) 28 | hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) 29 | del hf_model 30 | 31 | vllm_model = vllm_runner(model, dtype=dtype, tensor_parallel_size=2) 32 | vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) 33 | del vllm_model 34 | 35 | for i in range(len(example_prompts)): 36 | hf_output_ids, hf_output_str = hf_outputs[i] 37 | vllm_output_ids, vllm_output_str = vllm_outputs[i] 38 | assert hf_output_str == vllm_output_str, ( 39 | f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") 40 | assert hf_output_ids == vllm_output_ids, ( 41 | f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") 42 | -------------------------------------------------------------------------------- /tests/engine/test_computed_prefix_blocks.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.engine.arg_utils import EngineArgs 4 | from vllm.engine.llm_engine import LLMEngine 5 | from vllm.sampling_params import SamplingParams 6 | 7 | 8 | @pytest.mark.parametrize("model", ["facebook/opt-125m"]) 9 | @pytest.mark.parametrize("block_size", [16]) 10 | def test_computed_prefix_blocks(model: str, block_size: int): 11 | # This test checks if we are able to run the engine to completion 12 | # without triggering asserts. 13 | # We are in a scenario where all blocks from the second request's prompt 14 | # are full and already computed when the second request arrives. 15 | prompt = ( 16 | "You are a helpful assistant. How do I build a car from cardboard and " 17 | "paper clips? Is there an easy to follow video tutorial available " 18 | "online for free?") 19 | prompt2 = ( 20 | " Please recommend to me some resources where I can learn not only to " 21 | "handle technical difficulties of building a car, but also " 22 | "decoration.") 23 | 24 | engine_args = EngineArgs(model=model, 25 | block_size=block_size, 26 | enable_prefix_caching=True) 27 | 28 | engine = LLMEngine.from_engine_args(engine_args) 29 | sampling_params = SamplingParams() 30 | 31 | engine.add_request("0", prompt + prompt2, sampling_params) 32 | engine.step() 33 | engine.add_request("1", prompt, sampling_params) 34 | engine.step() 35 | -------------------------------------------------------------------------------- /tests/entrypoints/test_guided_processors.py: -------------------------------------------------------------------------------- 1 | # This unit test should be moved to a new 2 | # tests/test_guided_decoding directory. 
3 | 4 | from transformers import AutoTokenizer 5 | import torch 6 | 7 | from vllm.model_executor.guided_logits_processors import (RegexLogitsProcessor, 8 | JSONLogitsProcessor) 9 | 10 | TEST_SCHEMA = { 11 | "type": "object", 12 | "properties": { 13 | "name": { 14 | "type": "string" 15 | }, 16 | "age": { 17 | "type": "integer" 18 | }, 19 | "skills": { 20 | "type": "array", 21 | "items": { 22 | "type": "string", 23 | "maxLength": 10 24 | }, 25 | "minItems": 3 26 | }, 27 | "work history": { 28 | "type": "array", 29 | "items": { 30 | "type": "object", 31 | "properties": { 32 | "company": { 33 | "type": "string" 34 | }, 35 | "duration": { 36 | "type": "string" 37 | }, 38 | "position": { 39 | "type": "string" 40 | } 41 | }, 42 | "required": ["company", "position"] 43 | } 44 | } 45 | }, 46 | "required": ["name", "age", "skills", "work history"] 47 | } 48 | 49 | TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" 50 | r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)") 51 | 52 | 53 | def test_guided_logits_processors(): 54 | """Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor.""" 55 | tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta') 56 | regex_LP = RegexLogitsProcessor(TEST_REGEX, tokenizer) 57 | json_LP = JSONLogitsProcessor(TEST_SCHEMA, tokenizer) 58 | 59 | regex_LP.init_state() 60 | token_ids = tokenizer.encode( 61 | f"Give an example IPv4 address with this regex: {TEST_REGEX}") 62 | tensor = torch.rand(32000) 63 | original_tensor = torch.clone(tensor) 64 | regex_LP(token_ids, tensor) 65 | assert tensor.shape == original_tensor.shape 66 | assert not torch.allclose(tensor, original_tensor) 67 | 68 | json_LP.init_state() 69 | token_ids = tokenizer.encode( 70 | f"Give an employee profile that fits this schema: {TEST_SCHEMA}") 71 | tensor = torch.rand(32000) 72 | original_tensor = torch.clone(tensor) 73 | json_LP(token_ids, tensor) 74 | assert tensor.shape == original_tensor.shape 75 | assert not torch.allclose(tensor, original_tensor) 76 | -------------------------------------------------------------------------------- /tests/kernels/allclose_default.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | # Reference default values of atol and rtol are from 4 | # https://github.com/pytorch/pytorch/blob/6d96beb6bec24d73ee3f080bac54d2104068f675/test/test_transformers.py#L67 5 | default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5} 6 | default_rtol = { 7 | torch.float16: 1e-3, 8 | torch.bfloat16: 1.6e-2, 9 | torch.float: 1.3e-6 10 | } 11 | 12 | 13 | def get_default_atol(output) -> float: 14 | return default_atol[output.dtype] 15 | 16 | 17 | def get_default_rtol(output) -> float: 18 | return default_rtol[output.dtype] 19 | -------------------------------------------------------------------------------- /tests/kernels/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from vllm.utils import create_kv_caches_with_random 3 | 4 | 5 | @pytest.fixture() 6 | def kv_cache_factory(): 7 | return create_kv_caches_with_random 8 | -------------------------------------------------------------------------------- /tests/kernels/test_activation.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | 3 | import pytest 4 | import torch 5 | 6 | from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul, 7 | NewGELU, SiluAndMul) 8 | from allclose_default import get_default_atol, 
get_default_rtol 9 | 10 | DTYPES = [torch.half, torch.bfloat16, torch.float] 11 | NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing 12 | D = [512, 4096, 5120, 13824] # Arbitrary values for testing 13 | SEEDS = [0] 14 | CUDA_DEVICES = [ 15 | f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) 16 | ] 17 | 18 | 19 | @pytest.mark.parametrize("activation", ["silu", "gelu", "gelu_tanh"]) 20 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 21 | @pytest.mark.parametrize("d", D) 22 | @pytest.mark.parametrize("dtype", DTYPES) 23 | @pytest.mark.parametrize("seed", SEEDS) 24 | @pytest.mark.parametrize("device", CUDA_DEVICES) 25 | @torch.inference_mode() 26 | def test_act_and_mul( 27 | activation: str, 28 | num_tokens: int, 29 | d: int, 30 | dtype: torch.dtype, 31 | seed: int, 32 | device: str, 33 | ) -> None: 34 | torch.random.manual_seed(seed) 35 | if torch.cuda.is_available(): 36 | torch.cuda.manual_seed(seed) 37 | torch.set_default_device(device) 38 | x = torch.randn(num_tokens, 2 * d, dtype=dtype) 39 | if activation == "silu": 40 | layer = SiluAndMul() 41 | elif activation == "gelu": 42 | layer = GeluAndMul(approximate="none") 43 | elif activation == "gelu_tanh": 44 | layer = GeluAndMul(approximate="tanh") 45 | out = layer(x) 46 | ref_out = layer._forward(x) 47 | # The SiLU and GELU implementations are equivalent to the native PyTorch 48 | # implementations, so we can do exact comparison. 49 | assert torch.allclose(out, ref_out, atol=0.0, rtol=0.0) 50 | 51 | 52 | @pytest.mark.parametrize("activation", [FastGELU, NewGELU]) 53 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 54 | @pytest.mark.parametrize("d", D) 55 | @pytest.mark.parametrize("dtype", DTYPES) 56 | @pytest.mark.parametrize("seed", SEEDS) 57 | @pytest.mark.parametrize("device", CUDA_DEVICES) 58 | @torch.inference_mode() 59 | def test_activation( 60 | activation: Type[torch.nn.Module], 61 | num_tokens: int, 62 | d: int, 63 | dtype: torch.dtype, 64 | seed: int, 65 | device: str, 66 | ) -> None: 67 | torch.random.manual_seed(seed) 68 | if torch.cuda.is_available(): 69 | torch.cuda.manual_seed(seed) 70 | torch.set_default_device(device) 71 | x = torch.randn(num_tokens, d, dtype=dtype) 72 | layer = activation() 73 | out = layer(x) 74 | ref_out = layer._forward(x) 75 | assert torch.allclose(out, 76 | ref_out, 77 | atol=get_default_atol(out), 78 | rtol=get_default_rtol(out)) 79 | -------------------------------------------------------------------------------- /tests/kernels/test_layernorm.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from vllm.model_executor.layers.layernorm import RMSNorm 5 | 6 | DTYPES = [torch.half, torch.bfloat16, torch.float] 7 | NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing 8 | HIDDEN_SIZES = [768, 5120, 8192] # Arbitrary values for testing 9 | ADD_RESIDUAL = [False, True] 10 | SEEDS = [0] 11 | CUDA_DEVICES = [ 12 | f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) 13 | ] 14 | 15 | 16 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 17 | @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) 18 | @pytest.mark.parametrize("add_residual", ADD_RESIDUAL) 19 | @pytest.mark.parametrize("dtype", DTYPES) 20 | @pytest.mark.parametrize("seed", SEEDS) 21 | @pytest.mark.parametrize("device", CUDA_DEVICES) 22 | @torch.inference_mode() 23 | def test_rms_norm( 24 | num_tokens: int, 25 | hidden_size: int, 26 | add_residual: bool, 27 | dtype: torch.dtype, 28 | seed: int, 29 | device: 
str, 30 | ) -> None: 31 | torch.random.manual_seed(seed) 32 | if torch.cuda.is_available(): 33 | torch.cuda.manual_seed(seed) 34 | torch.set_default_device(device) 35 | layer = RMSNorm(hidden_size).to(dtype=dtype) 36 | layer.weight.data.normal_(mean=1.0, std=0.1) 37 | scale = 1 / (2 * hidden_size) 38 | x = torch.randn(num_tokens, hidden_size, dtype=dtype) 39 | x *= scale 40 | residual = torch.randn_like(x) * scale if add_residual else None 41 | 42 | # NOTE(woosuk): The reference implementation should be executed first 43 | # because the custom kernel is in-place. 44 | ref_out = layer._forward(x, residual) 45 | out = layer(x, residual) 46 | # NOTE(woosuk): LayerNorm operators (including RMS) typically have larger 47 | # numerical errors than other operators because they involve reductions. 48 | # Therefore, we use a larger tolerance. 49 | if add_residual: 50 | assert torch.allclose(out[0], ref_out[0], atol=1e-2, rtol=1e-2) 51 | assert torch.allclose(out[1], ref_out[1], atol=1e-2, rtol=1e-2) 52 | else: 53 | assert torch.allclose(out, ref_out, atol=1e-2, rtol=1e-2) 54 | -------------------------------------------------------------------------------- /tests/kernels/test_rand.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pytest 3 | import random 4 | 5 | from vllm.model_executor.layers.ops.rand import seeded_uniform 6 | from vllm.model_executor.utils import set_random_seed 7 | 8 | 9 | @pytest.mark.parametrize("dtype", 10 | [torch.float32, torch.float16, torch.bfloat16]) 11 | @pytest.mark.parametrize("use_3d", [True, False]) 12 | def test_seeded_uniform(dtype: torch.dtype, use_3d: bool): 13 | device = "cuda" 14 | for seed in range(512): 15 | set_random_seed(seed) 16 | rows = random.randint(1, 512) 17 | cols = random.randint(1, 64000) 18 | if use_3d: 19 | third_dim = random.randint(2, 10) 20 | dims = [rows, third_dim, cols] 21 | else: 22 | dims = [rows, cols] 23 | seeds = torch.randint(torch.iinfo(torch.long).min, 24 | torch.iinfo(torch.long).max, (rows, ), 25 | device=device) 26 | 27 | # Test that the same seed produces the same output 28 | out = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device) 29 | out2 = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device) 30 | torch.testing.assert_close(out, out2) 31 | # del to save memory 32 | del out2 33 | 34 | out3 = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device) 35 | torch.testing.assert_close(out, out3) 36 | # del to save memory 37 | del out3 38 | 39 | # Initialize out tensor with garbage to ensure that it is overwritten 40 | out_with_tensor = seeded_uniform( 41 | *dims, 42 | out=torch.full( 43 | (*dims, ), 44 | -1, 45 | dtype=dtype, 46 | device=device, 47 | ), 48 | seeds=seeds, 49 | dtype=dtype, 50 | ) 51 | torch.testing.assert_close(out, out_with_tensor) 52 | -------------------------------------------------------------------------------- /tests/lora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/tests/lora/__init__.py -------------------------------------------------------------------------------- /tests/lora/test_gemma.py: -------------------------------------------------------------------------------- 1 | import vllm 2 | from vllm.lora.request import LoRARequest 3 | 4 | MODEL_PATH = "google/gemma-7b" 5 | 6 | 7 | def do_sample(llm, lora_path: str, lora_id: int) -> str: 8 | prompts = [ 9 | "Quote: Imagination is", 
10 | "Quote: Be yourself;", 11 | "Quote: So many books,", 12 | ] 13 | sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32) 14 | outputs = llm.generate( 15 | prompts, 16 | sampling_params, 17 | lora_request=LoRARequest(str(lora_id), lora_id, lora_path) 18 | if lora_id else None) 19 | # Print the outputs. 20 | generated_texts = [] 21 | for output in outputs: 22 | prompt = output.prompt 23 | generated_text = output.outputs[0].text.strip() 24 | generated_texts.append(generated_text) 25 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 26 | return generated_texts 27 | 28 | 29 | def test_gemma_lora(gemma_lora_files): 30 | llm = vllm.LLM(MODEL_PATH, 31 | max_model_len=1024, 32 | enable_lora=True, 33 | max_loras=4) 34 | 35 | expected_lora_output = [ 36 | "more important than knowledge.\nAuthor: Albert Einstein\n", 37 | "everyone else is already taken.\nAuthor: Oscar Wilde\n", 38 | "so little time\nAuthor: Frank Zappa\n", 39 | ] 40 | 41 | output1 = do_sample(llm, gemma_lora_files, lora_id=1) 42 | for i in range(len(expected_lora_output)): 43 | assert output1[i].startswith(expected_lora_output[i]) 44 | output2 = do_sample(llm, gemma_lora_files, lora_id=2) 45 | for i in range(len(expected_lora_output)): 46 | assert output2[i].startswith(expected_lora_output[i]) 47 | -------------------------------------------------------------------------------- /tests/lora/test_tokenizer_group.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from transformers import AutoTokenizer, PreTrainedTokenizerBase 3 | from vllm.lora.request import LoRARequest 4 | from vllm.transformers_utils.tokenizer_group import get_tokenizer_group 5 | from vllm.transformers_utils.tokenizer import get_lora_tokenizer 6 | from ..conftest import get_tokenizer_pool_config 7 | 8 | 9 | @pytest.mark.asyncio 10 | @pytest.mark.parametrize("tokenizer_group_type", [None, "ray"]) 11 | async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type): 12 | reference_tokenizer = AutoTokenizer.from_pretrained(sql_lora_files) 13 | tokenizer_group = get_tokenizer_group( 14 | get_tokenizer_pool_config(tokenizer_group_type), 15 | tokenizer_id="gpt2", 16 | enable_lora=True, 17 | max_num_seqs=1, 18 | max_input_length=None, 19 | ) 20 | lora_request = LoRARequest("1", 1, sql_lora_files) 21 | assert reference_tokenizer.encode("prompt") == tokenizer_group.encode( 22 | request_id="request_id", prompt="prompt", lora_request=lora_request) 23 | assert reference_tokenizer.encode( 24 | "prompt") == await tokenizer_group.encode_async( 25 | request_id="request_id", 26 | prompt="prompt", 27 | lora_request=lora_request) 28 | assert isinstance(tokenizer_group.get_lora_tokenizer(None), 29 | PreTrainedTokenizerBase) 30 | assert tokenizer_group.get_lora_tokenizer( 31 | None) == await tokenizer_group.get_lora_tokenizer_async(None) 32 | 33 | assert isinstance(tokenizer_group.get_lora_tokenizer(lora_request), 34 | PreTrainedTokenizerBase) 35 | assert tokenizer_group.get_lora_tokenizer( 36 | lora_request) != tokenizer_group.get_lora_tokenizer(None) 37 | assert tokenizer_group.get_lora_tokenizer( 38 | lora_request) == await tokenizer_group.get_lora_tokenizer_async( 39 | lora_request) 40 | 41 | 42 | def test_get_lora_tokenizer(sql_lora_files, tmpdir): 43 | lora_request = None 44 | tokenizer = get_lora_tokenizer(lora_request) 45 | assert not tokenizer 46 | 47 | lora_request = LoRARequest("1", 1, sql_lora_files) 48 | tokenizer = get_lora_tokenizer(lora_request) 49 | assert 
tokenizer.get_added_vocab() 50 | 51 | lora_request = LoRARequest("1", 1, str(tmpdir)) 52 | tokenizer = get_lora_tokenizer(lora_request) 53 | assert not tokenizer 54 | -------------------------------------------------------------------------------- /tests/lora/test_worker.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import tempfile 4 | from unittest.mock import patch 5 | 6 | from vllm.lora.models import LoRAMapping 7 | from vllm.lora.request import LoRARequest 8 | from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig, 9 | DeviceConfig, LoRAConfig) 10 | from vllm.worker.worker import Worker 11 | 12 | 13 | @patch.dict(os.environ, {"RANK": "0"}) 14 | def test_worker_apply_lora(sql_lora_files): 15 | worker = Worker( 16 | model_config=ModelConfig( 17 | "meta-llama/Llama-2-7b-hf", 18 | "meta-llama/Llama-2-7b-hf", 19 | tokenizer_mode="auto", 20 | trust_remote_code=False, 21 | download_dir=None, 22 | load_format="dummy", 23 | seed=0, 24 | dtype="float16", 25 | revision=None, 26 | ), 27 | parallel_config=ParallelConfig(1, 1, False), 28 | scheduler_config=SchedulerConfig(32, 32, 32), 29 | device_config=DeviceConfig("cuda"), 30 | local_rank=0, 31 | rank=0, 32 | lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32, 33 | max_loras=32), 34 | distributed_init_method=f"file://{tempfile.mkstemp()[1]}", 35 | ) 36 | worker.init_model() 37 | worker.load_model() 38 | 39 | worker.model_runner.set_active_loras([], LoRAMapping([], [])) 40 | assert worker.list_loras() == set() 41 | 42 | n_loras = 32 43 | lora_requests = [ 44 | LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(n_loras) 45 | ] 46 | 47 | worker.model_runner.set_active_loras(lora_requests, LoRAMapping([], [])) 48 | assert worker.list_loras() == { 49 | lora_request.lora_int_id 50 | for lora_request in lora_requests 51 | } 52 | 53 | for i in range(32): 54 | random.seed(i) 55 | iter_lora_requests = random.choices(lora_requests, 56 | k=random.randint(1, n_loras)) 57 | random.shuffle(iter_lora_requests) 58 | iter_lora_requests = iter_lora_requests[:-random.randint(0, n_loras)] 59 | worker.model_runner.set_active_loras(iter_lora_requests, 60 | LoRAMapping([], [])) 61 | assert worker.list_loras().issuperset( 62 | {lora_request.lora_int_id 63 | for lora_request in iter_lora_requests}) 64 | -------------------------------------------------------------------------------- /tests/metrics/test_metrics.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | MODELS = [ 4 | "facebook/opt-125m", 5 | ] 6 | 7 | 8 | @pytest.mark.parametrize("model", MODELS) 9 | @pytest.mark.parametrize("dtype", ["float"]) 10 | @pytest.mark.parametrize("max_tokens", [128]) 11 | def test_metric_counter_prompt_tokens( 12 | vllm_runner, 13 | example_prompts, 14 | model: str, 15 | dtype: str, 16 | max_tokens: int, 17 | ) -> None: 18 | vllm_model = vllm_runner(model, 19 | dtype=dtype, 20 | disable_log_stats=False, 21 | gpu_memory_utilization=0.4) 22 | tokenizer = vllm_model.model.get_tokenizer() 23 | prompt_token_counts = [len(tokenizer.encode(p)) for p in example_prompts] 24 | # This test needs at least 2 prompts in a batch of different lengths to 25 | # verify their token count is correct despite padding. 
26 | assert len(example_prompts) > 1, "at least 2 prompts are required" 27 | assert prompt_token_counts[0] != prompt_token_counts[1], ( 28 | "prompts of different lengths are required") 29 | vllm_prompt_token_count = sum(prompt_token_counts) 30 | 31 | _ = vllm_model.generate_greedy(example_prompts, max_tokens) 32 | stat_logger = vllm_model.model.llm_engine.stat_logger 33 | metric_count = stat_logger.metrics.counter_prompt_tokens.labels( 34 | **stat_logger.labels)._value.get() 35 | 36 | assert vllm_prompt_token_count == metric_count, ( 37 | f"prompt token count: {vllm_prompt_token_count!r}\n" 38 | f"metric: {metric_count!r}") 39 | 40 | 41 | @pytest.mark.parametrize("model", MODELS) 42 | @pytest.mark.parametrize("dtype", ["float"]) 43 | @pytest.mark.parametrize("max_tokens", [128]) 44 | def test_metric_counter_generation_tokens( 45 | vllm_runner, 46 | example_prompts, 47 | model: str, 48 | dtype: str, 49 | max_tokens: int, 50 | ) -> None: 51 | vllm_model = vllm_runner(model, 52 | dtype=dtype, 53 | disable_log_stats=False, 54 | gpu_memory_utilization=0.4) 55 | vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) 56 | tokenizer = vllm_model.model.get_tokenizer() 57 | stat_logger = vllm_model.model.llm_engine.stat_logger 58 | metric_count = stat_logger.metrics.counter_generation_tokens.labels( 59 | **stat_logger.labels)._value.get() 60 | vllm_generation_count = 0 61 | for i in range(len(example_prompts)): 62 | vllm_output_ids, vllm_output_str = vllm_outputs[i] 63 | prompt_ids = tokenizer.encode(example_prompts[i]) 64 | # vllm_output_ids contains both prompt tokens and generation tokens. 65 | # We're interested only in the count of the generation tokens. 66 | vllm_generation_count += len(vllm_output_ids) - len(prompt_ids) 67 | 68 | assert vllm_generation_count == metric_count, ( 69 | f"generation token count: {vllm_generation_count!r}\n" 70 | f"metric: {metric_count!r}") 71 | -------------------------------------------------------------------------------- /tests/models/test_mistral.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and vLLM for Mistral models using greedy sampling. 2 | 3 | Run `pytest tests/models/test_mistral.py --forked`. 
4 | """ 5 | import pytest 6 | 7 | MODELS = [ 8 | "mistralai/Mistral-7B-Instruct-v0.1", 9 | ] 10 | 11 | 12 | @pytest.mark.parametrize("model", MODELS) 13 | @pytest.mark.parametrize("dtype", ["bfloat16"]) 14 | @pytest.mark.parametrize("max_tokens", [128]) 15 | def test_models( 16 | hf_runner, 17 | vllm_runner, 18 | example_long_prompts, 19 | model: str, 20 | dtype: str, 21 | max_tokens: int, 22 | ) -> None: 23 | hf_model = hf_runner(model, dtype=dtype) 24 | hf_outputs = hf_model.generate_greedy(example_long_prompts, max_tokens) 25 | del hf_model 26 | 27 | vllm_model = vllm_runner(model, dtype=dtype) 28 | vllm_outputs = vllm_model.generate_greedy(example_long_prompts, max_tokens) 29 | del vllm_model 30 | 31 | for i in range(len(example_long_prompts)): 32 | hf_output_ids, hf_output_str = hf_outputs[i] 33 | vllm_output_ids, vllm_output_str = vllm_outputs[i] 34 | assert hf_output_str == vllm_output_str, ( 35 | f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") 36 | assert hf_output_ids == vllm_output_ids, ( 37 | f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") 38 | -------------------------------------------------------------------------------- /tests/models/test_models.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and vLLM when using greedy sampling. 2 | 3 | Run `pytest tests/models/test_models.py --forked`. 4 | """ 5 | import pytest 6 | 7 | MODELS = [ 8 | "facebook/opt-125m", 9 | "meta-llama/Llama-2-7b-hf", 10 | "mistralai/Mistral-7B-v0.1", 11 | "Deci/DeciLM-7b", 12 | "tiiuae/falcon-7b", 13 | "gpt2", 14 | "bigcode/tiny_starcoder_py", 15 | "EleutherAI/gpt-j-6b", 16 | "EleutherAI/pythia-70m", 17 | "bigscience/bloom-560m", 18 | "mosaicml/mpt-7b", 19 | "microsoft/phi-2", 20 | "stabilityai/stablelm-3b-4e1t", 21 | "allenai/OLMo-1B", 22 | "bigcode/starcoder2-3b", 23 | ] 24 | 25 | 26 | @pytest.mark.parametrize("model", MODELS) 27 | @pytest.mark.parametrize("dtype", ["float"]) 28 | @pytest.mark.parametrize("max_tokens", [128]) 29 | def test_models( 30 | hf_runner, 31 | vllm_runner, 32 | example_prompts, 33 | model: str, 34 | dtype: str, 35 | max_tokens: int, 36 | ) -> None: 37 | hf_model = hf_runner(model, dtype=dtype) 38 | hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) 39 | del hf_model 40 | 41 | vllm_model = vllm_runner(model, dtype=dtype) 42 | vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) 43 | del vllm_model 44 | 45 | for i in range(len(example_prompts)): 46 | hf_output_ids, hf_output_str = hf_outputs[i] 47 | vllm_output_ids, vllm_output_str = vllm_outputs[i] 48 | assert hf_output_str == vllm_output_str, ( 49 | f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") 50 | assert hf_output_ids == vllm_output_ids, ( 51 | f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") 52 | -------------------------------------------------------------------------------- /tests/prefix_caching/test_prefix_caching.py: -------------------------------------------------------------------------------- 1 | """Compare the behavior with and without prefix caching. 2 | 3 | Run `pytest tests/prefix_caching/test_prefix_caching.py`.
4 | """ 5 | import pytest 6 | 7 | from vllm.core.block_manager import CachedBlockAllocator 8 | from vllm.utils import Device 9 | 10 | 11 | @pytest.mark.parametrize("block_size", [16]) 12 | @pytest.mark.parametrize("num_blocks", [16]) 13 | def test_block_allocator( 14 | block_size: int, 15 | num_blocks: int, 16 | ): 17 | block_hash = 1 18 | block_allocator = CachedBlockAllocator(Device.CPU, block_size, num_blocks) 19 | 20 | # Allocate two PhysicalTokenBlocks with the same hash and check 21 | # that they are the same PhysicalTokenBlock 22 | first_block = block_allocator.allocate(block_hash, 0) 23 | second_block = block_allocator.allocate(block_hash, 0) 24 | assert (first_block == second_block) 25 | assert (second_block.ref_count == 2) 26 | 27 | # Free the first_block and confirm that the ref_count is correctly 28 | # decremented on the second block 29 | block_allocator.free(first_block) 30 | assert (second_block.ref_count == 1) 31 | 32 | # Free the second block 33 | block_allocator.free(second_block) 34 | 35 | # Reallocate the first block and confirm that, even after the block 36 | # had its ref_count go to 0, we still get the same block back 37 | first_block = block_allocator.allocate(block_hash, 0) 38 | assert (first_block == second_block) 39 | assert (first_block.block_hash == block_hash) 40 | 41 | 42 | @pytest.mark.parametrize("num_blocks", [16]) 43 | def test_eviction(num_blocks: int, ): 44 | block_size = 16 45 | block_allocator = CachedBlockAllocator(Device.CPU, block_size, num_blocks) 46 | blocks = [] 47 | 48 | for i in range(num_blocks): 49 | # Use i as the block_hash 50 | blocks.append(block_allocator.allocate(i, 0)) 51 | 52 | # Free all blocks 53 | for block in blocks: 54 | block_allocator.free(block) 55 | 56 | # Allocate a new block and confirm that it's the first block freed. 57 | # i.e., the least recently used block 58 | new_block_hash = block_size 59 | new_block = block_allocator.allocate(new_block_hash, 0) 60 | assert (new_block == blocks[0]) 61 | assert (new_block.block_hash == new_block_hash) 62 | 63 | # Reallocate the second block in `blocks` to remove it from the free list 64 | realloc_block_hash = 1 65 | realloc_block = block_allocator.allocate(realloc_block_hash, 0) 66 | assert (realloc_block == blocks[realloc_block_hash]) 67 | assert (realloc_block.block_hash == realloc_block_hash) 68 | 69 | # Allocate a new block and confirm that it's not the realloc_block, 70 | # since the realloc_block shouldn't be in the free list 71 | new_block_hash = block_size + 1 72 | new_block = block_allocator.allocate(new_block_hash, 0) 73 | assert (realloc_block != new_block) 74 | assert (new_block.block_hash == new_block_hash) 75 | assert (new_block.block_number == 2) 76 | -------------------------------------------------------------------------------- /tests/prompts/example.txt: -------------------------------------------------------------------------------- 1 | vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. 2 | Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020. 3 | Compare and contrast artificial intelligence with human intelligence in terms of processing information. 4 | Describe the basic components of a neural network and how it can be trained. 5 | Write a short story about a robot that dreams for the first time. 6 | Analyze the impact of the COVID-19 pandemic on global economic structures and future business models.
7 | Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies. 8 | Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.' 9 | -------------------------------------------------------------------------------- /tests/samplers/test_beam_search.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and vLLM when using beam search. 2 | 3 | Run `pytest tests/samplers/test_beam_search.py --forked`. 4 | """ 5 | import pytest 6 | 7 | # FIXME(zhuohan): The test can not pass if we: 8 | # 1. Increase max_tokens to 256. 9 | # 2. Increase beam_width to 8. 10 | # 3. Use the model "huggyllama/llama-7b". 11 | MAX_TOKENS = [128] 12 | BEAM_WIDTHS = [4] 13 | MODELS = ["facebook/opt-125m"] 14 | 15 | 16 | @pytest.mark.parametrize("model", MODELS) 17 | @pytest.mark.parametrize("dtype", ["half"]) 18 | @pytest.mark.parametrize("max_tokens", MAX_TOKENS) 19 | @pytest.mark.parametrize("beam_width", BEAM_WIDTHS) 20 | def test_beam_search_single_input( 21 | hf_runner, 22 | vllm_runner, 23 | example_prompts, 24 | model: str, 25 | dtype: str, 26 | max_tokens: int, 27 | beam_width: int, 28 | ) -> None: 29 | example_prompts = example_prompts[:1] 30 | hf_model = hf_runner(model, dtype=dtype) 31 | hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width, 32 | max_tokens) 33 | del hf_model 34 | 35 | vllm_model = vllm_runner(model, dtype=dtype) 36 | vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width, 37 | max_tokens) 38 | del vllm_model 39 | 40 | for i in range(len(example_prompts)): 41 | hf_output_ids, _ = hf_outputs[i] 42 | vllm_output_ids, _ = vllm_outputs[i] 43 | assert len(hf_output_ids) == len(vllm_output_ids) 44 | for j in range(len(hf_output_ids)): 45 | assert hf_output_ids[j] == vllm_output_ids[j], ( 46 | f"Test{i} output{j}:\nHF: {hf_output_ids}\n" 47 | f"vLLM: {vllm_output_ids}") 48 | -------------------------------------------------------------------------------- /tests/samplers/test_seeded_generate.py: -------------------------------------------------------------------------------- 1 | """Verify that seeded random sampling is deterministic. 2 | 3 | Run `pytest tests/samplers/test_seeded_generate.py --forked`. 
4 | """ 5 | import copy 6 | import random 7 | from itertools import combinations 8 | 9 | import pytest 10 | 11 | from vllm.model_executor.utils import set_random_seed 12 | from vllm import SamplingParams 13 | 14 | MODEL = "facebook/opt-125m" 15 | RANDOM_SEEDS = list(range(5)) 16 | 17 | 18 | @pytest.fixture 19 | def vllm_model(vllm_runner): 20 | vllm_model = vllm_runner(MODEL, dtype="half") 21 | yield vllm_model 22 | del vllm_model 23 | 24 | 25 | @pytest.mark.parametrize("seed", RANDOM_SEEDS) 26 | def test_random_sample_with_seed( 27 | vllm_model, 28 | example_prompts, 29 | seed: int, 30 | ) -> None: 31 | set_random_seed(seed) 32 | 33 | sampling_params = SamplingParams( 34 | # Parameters to ensure sufficient randomness 35 | temperature=2.0, 36 | top_p=min(random.random() + 0.3, 1), 37 | top_k=random.randint(5, 20), 38 | n=random.randint(1, 10), 39 | presence_penalty=random.randint(0, 1), 40 | max_tokens=8, 41 | ignore_eos=True, 42 | ) 43 | 44 | sampling_params_seed_1 = copy.deepcopy(sampling_params) 45 | sampling_params_seed_1.seed = 100 46 | sampling_params_seed_2 = copy.deepcopy(sampling_params) 47 | sampling_params_seed_2.seed = 200 48 | 49 | llm = vllm_model.model 50 | 51 | for prompt in example_prompts: 52 | for params in ( 53 | sampling_params, 54 | sampling_params_seed_1, 55 | sampling_params_seed_2, 56 | sampling_params, 57 | sampling_params_seed_1, 58 | sampling_params_seed_2, 59 | ): 60 | llm._add_request( 61 | prompt=prompt, 62 | prompt_token_ids=None, 63 | sampling_params=params, 64 | ) 65 | 66 | results = llm._run_engine(use_tqdm=False) 67 | all_outputs = [[out.token_ids for out in output.outputs] 68 | for output in results] 69 | 70 | for i in range(0, len(example_prompts), 6): 71 | outputs = all_outputs[i:i + 6] 72 | 73 | # verify all non-seeded requests differ 74 | for output_a, output_b in combinations( 75 | (outputs[0], outputs[1], outputs[2], outputs[3]), 76 | 2, 77 | ): 78 | assert output_a != output_b 79 | 80 | # verify requests with the same seed match 81 | assert outputs[1] == outputs[4] 82 | assert outputs[2] == outputs[5] 83 | -------------------------------------------------------------------------------- /tests/spec_decode/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/tests/spec_decode/__init__.py -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | from vllm.config import ModelConfig 2 | 3 | 4 | def test_get_sliding_window(): 5 | TEST_SLIDING_WINDOW = 4096 6 | # Test that the sliding window is correctly computed. 7 | # For Qwen1.5/Qwen2, get_sliding_window() should be None 8 | # when use_sliding_window is False. 
9 | qwen2_model_config = ModelConfig( 10 | "Qwen/Qwen1.5-7B", 11 | "Qwen/Qwen1.5-7B", 12 | tokenizer_mode="auto", 13 | trust_remote_code=False, 14 | download_dir=None, 15 | load_format="dummy", 16 | seed=0, 17 | dtype="float16", 18 | revision=None, 19 | ) 20 | 21 | qwen2_model_config.hf_config.use_sliding_window = False 22 | qwen2_model_config.hf_config.sliding_window = TEST_SLIDING_WINDOW 23 | assert qwen2_model_config.get_sliding_window() is None 24 | 25 | qwen2_model_config.hf_config.use_sliding_window = True 26 | assert qwen2_model_config.get_sliding_window() == TEST_SLIDING_WINDOW 27 | 28 | mistral_model_config = ModelConfig( 29 | "mistralai/Mistral-7B-v0.1", 30 | "mistralai/Mistral-7B-v0.1", 31 | tokenizer_mode="auto", 32 | trust_remote_code=False, 33 | download_dir=None, 34 | load_format="dummy", 35 | seed=0, 36 | dtype="float16", 37 | revision=None, 38 | ) 39 | mistral_model_config.hf_config.sliding_window = None 40 | assert mistral_model_config.get_sliding_window() is None 41 | 42 | mistral_model_config.hf_config.sliding_window = TEST_SLIDING_WINDOW 43 | assert mistral_model_config.get_sliding_window() == TEST_SLIDING_WINDOW -------------------------------------------------------------------------------- /tests/test_regression.py: -------------------------------------------------------------------------------- 1 | """Tests that check for regressions in vLLM's behavior. 2 | 3 | These cover issues reported by users, to make sure they 4 | never happen again. 5 | 6 | """ 7 | import gc 8 | 9 | import torch 10 | 11 | from vllm import LLM, SamplingParams 12 | 13 | 14 | def test_duplicated_ignored_sequence_group(): 15 | """https://github.com/vllm-project/vllm/issues/1655""" 16 | 17 | sampling_params = SamplingParams(temperature=0.01, 18 | top_p=0.1, 19 | max_tokens=256) 20 | llm = LLM(model="facebook/opt-125m", 21 | max_num_batched_tokens=4096, 22 | tensor_parallel_size=1) 23 | prompts = ["This is a short prompt", "This is a very long prompt " * 1000] 24 | outputs = llm.generate(prompts, sampling_params=sampling_params) 25 | 26 | assert len(prompts) == len(outputs) 27 | 28 | 29 | def test_max_tokens_none(): 30 | sampling_params = SamplingParams(temperature=0.01, 31 | top_p=0.1, 32 | max_tokens=None) 33 | llm = LLM(model="facebook/opt-125m", 34 | max_num_batched_tokens=4096, 35 | tensor_parallel_size=1) 36 | prompts = ["Just say hello!"] 37 | outputs = llm.generate(prompts, sampling_params=sampling_params) 38 | 39 | assert len(prompts) == len(outputs) 40 | 41 | 42 | def test_gc(): 43 | llm = LLM("facebook/opt-125m", enforce_eager=True) 44 | del llm 45 | 46 | gc.collect() 47 | torch.cuda.empty_cache() 48 | 49 | # The memory allocated for the model and KV cache should be released. 50 | # The memory allocated for PyTorch and others should be less than 50MB. 51 | # Usually, it's around 10MB. 52 | allocated = torch.cuda.memory_allocated() 53 | assert allocated < 50 * 1024 * 1024 54 | 55 | 56 | if __name__ == "__main__": 57 | import pytest 58 | pytest.main([__file__]) 59 | -------------------------------------------------------------------------------- /tests/test_sampling_params.py: -------------------------------------------------------------------------------- 1 | """Tests for the SamplingParams class. 
2 | """ 3 | from vllm import SamplingParams 4 | 5 | 6 | def test_max_tokens_none(): 7 | """max_tokens=None should be allowed""" 8 | SamplingParams(temperature=0.01, top_p=0.1, max_tokens=None) 9 | 10 | 11 | if __name__ == "__main__": 12 | import pytest 13 | pytest.main([__file__]) 14 | -------------------------------------------------------------------------------- /tests/test_sequence.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.sequence import SequenceGroupOutput, SamplerOutput, SequenceOutput 4 | 5 | 6 | @pytest.fixture 7 | def sample_outputs(): 8 | return [ 9 | SequenceGroupOutput(samples=[ 10 | SequenceOutput(parent_seq_id=0, output_token=i, logprobs={}) 11 | ], 12 | prompt_logprobs=None) for i in range(5) 13 | ] 14 | 15 | 16 | @pytest.fixture 17 | def sampler_output(sample_outputs): 18 | return SamplerOutput(outputs=sample_outputs) 19 | 20 | 21 | def test_sampler_output_initialization(sampler_output, sample_outputs): 22 | assert len(sampler_output) == len(sample_outputs) 23 | assert sampler_output.sampled_token_probs is None 24 | assert sampler_output.sampled_token_ids is None 25 | assert sampler_output.spec_decode_worker_metrics is None 26 | 27 | 28 | def test_sampler_output_getitem(sampler_output, sample_outputs): 29 | assert sampler_output[2] == sample_outputs[2] 30 | 31 | 32 | def test_sampler_output_setitem(sampler_output): 33 | new_output = SequenceGroupOutput(samples=[ 34 | SequenceOutput(parent_seq_id=0, output_token=99, logprobs={}) 35 | ], 36 | prompt_logprobs=None) 37 | sampler_output[2] = new_output 38 | assert sampler_output[2] == new_output 39 | 40 | 41 | def test_sampler_output_len(sampler_output, sample_outputs): 42 | assert len(sampler_output) == len(sample_outputs) 43 | 44 | 45 | def test_sampler_output_eq(sample_outputs): 46 | sampler_output1 = SamplerOutput(outputs=sample_outputs) 47 | sampler_output2 = SamplerOutput(outputs=sample_outputs.copy()) 48 | sampler_output3 = SamplerOutput(outputs=sample_outputs[:-1]) 49 | assert sampler_output1 == sampler_output2 50 | assert sampler_output1 != sampler_output3 51 | -------------------------------------------------------------------------------- /tests/tokenization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/tests/tokenization/__init__.py -------------------------------------------------------------------------------- /tests/tokenization/test_cached_tokenizer.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from vllm.transformers_utils.tokenizer import get_cached_tokenizer 3 | from transformers import AutoTokenizer 4 | 5 | 6 | def test_cached_tokenizer(): 7 | reference_tokenizer = AutoTokenizer.from_pretrained("gpt2") 8 | reference_tokenizer.add_special_tokens({"cls_token": ""}) 9 | reference_tokenizer.add_special_tokens( 10 | {"additional_special_tokens": [""]}) 11 | cached_tokenizer = get_cached_tokenizer(deepcopy(reference_tokenizer)) 12 | 13 | assert reference_tokenizer.encode("prompt") == cached_tokenizer.encode( 14 | "prompt") 15 | assert set(reference_tokenizer.all_special_ids) == set( 16 | cached_tokenizer.all_special_ids) 17 | assert set(reference_tokenizer.all_special_tokens) == set( 18 | cached_tokenizer.all_special_tokens) 19 | assert set(reference_tokenizer.all_special_tokens_extended) == set( 20 | 
cached_tokenizer.all_special_tokens_extended) 21 | -------------------------------------------------------------------------------- /tests/tokenization/test_detokenize.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from transformers import AutoTokenizer 4 | 5 | from vllm.transformers_utils.tokenizer import detokenize_incrementally 6 | 7 | TRUTH = [ 8 | "Hello here, this is a simple test", # noqa: E501 9 | "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be used in production environments, where inference and serving", # noqa: E501 10 | "我很感谢你的热情" # noqa: E501 11 | ] 12 | TOKENIZERS = [ 13 | "facebook/opt-125m", 14 | "gpt2", 15 | "bigcode/tiny_starcoder_py", 16 | "EleutherAI/gpt-j-6b", 17 | "EleutherAI/pythia-70m", 18 | "bigscience/bloom-560m", 19 | "mosaicml/mpt-7b", 20 | "tiiuae/falcon-7b", 21 | "meta-llama/Llama-2-7b-hf", 22 | "codellama/CodeLlama-7b-hf", 23 | ] 24 | 25 | 26 | def _run_incremental_decode(tokenizer, all_input_ids, 27 | skip_special_tokens: bool): 28 | decoded_text = "" 29 | offset = 0 30 | token_offset = 0 31 | prev_tokens = None 32 | for i in range(len(all_input_ids)): 33 | new_tokens, text, offset, token_offset = detokenize_incrementally( 34 | tokenizer, 35 | all_input_ids[:i + 1], 36 | prev_tokens, 37 | offset, 38 | token_offset, 39 | skip_special_tokens=skip_special_tokens) 40 | decoded_text += text 41 | if prev_tokens is None: 42 | prev_tokens = new_tokens 43 | else: 44 | prev_tokens += new_tokens 45 | return decoded_text 46 | 47 | 48 | @pytest.mark.parametrize("truth", TRUTH) 49 | @pytest.mark.parametrize("tokenizer_id", TOKENIZERS) 50 | @pytest.mark.parametrize("skip_special_tokens", (True, False)) 51 | def test_decode_streaming(tokenizer_id, truth, skip_special_tokens): 52 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_id) 53 | all_input_ids = tokenizer(truth, add_special_tokens=False)["input_ids"] 54 | if skip_special_tokens: 55 | all_input_ids = ([tokenizer.bos_token_id] 56 | if tokenizer.bos_token_id is not None else 57 | []) + all_input_ids + [tokenizer.eos_token_id] 58 | 59 | decoded_text = _run_incremental_decode( 60 | tokenizer, all_input_ids, skip_special_tokens=skip_special_tokens) 61 | 62 | assert decoded_text == truth 63 | -------------------------------------------------------------------------------- /tests/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/tests/worker/__init__.py -------------------------------------------------------------------------------- /vllm/__init__.py: -------------------------------------------------------------------------------- 1 | """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" 2 | 3 | from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs 4 | from vllm.engine.async_llm_engine import AsyncLLMEngine 5 | from vllm.engine.llm_engine import LLMEngine 6 | from vllm.engine.ray_utils import initialize_ray_cluster 7 | from vllm.entrypoints.llm import LLM 8 | from vllm.outputs import CompletionOutput, RequestOutput 9 | from vllm.sampling_params import SamplingParams 10 | 11 | __version__ = "0.3.3" 12 | 13 | __all__ = [ 14 | "LLM", 15 | "SamplingParams", 16 | "RequestOutput", 17 | "CompletionOutput", 18 | "LLMEngine", 19 | "EngineArgs", 20 | "AsyncLLMEngine", 21 | "AsyncEngineArgs", 22 | "initialize_ray_cluster", 23 | ] 24 | 
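As a quick illustration of the public API re-exported above (LLM and SamplingParams), here is a minimal offline-generation sketch modeled on the calls made in tests/test_regression.py earlier in this listing; the model name and sampling values are only illustrative, not prescribed by the package.

from vllm import LLM, SamplingParams

# Near-greedy sampling, capped at 64 new tokens.
sampling_params = SamplingParams(temperature=0.01, top_p=0.1, max_tokens=64)

# Any model supported by vLLM works here; opt-125m is used only because it is small.
llm = LLM(model="facebook/opt-125m")

outputs = llm.generate(["Hello, my name is"], sampling_params=sampling_params)
for output in outputs:
    # Each RequestOutput holds the prompt and one CompletionOutput per returned sample.
    print(output.prompt, output.outputs[0].text)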
-------------------------------------------------------------------------------- /vllm/block.py: -------------------------------------------------------------------------------- 1 | """Token blocks.""" 2 | from typing import List 3 | 4 | from vllm.utils import Device 5 | 6 | _BLANK_TOKEN_ID = -1 7 | 8 | DEFAULT_LAST_ACCESSED_TIME = -1 9 | 10 | 11 | class LogicalTokenBlock: 12 | """A block that stores a contiguous chunk of tokens from left to right. 13 | 14 | Logical blocks are used to represent the states of the corresponding 15 | physical blocks in the KV cache. 16 | """ 17 | 18 | def __init__( 19 | self, 20 | block_number: int, 21 | block_size: int, 22 | ) -> None: 23 | self.block_number = block_number 24 | self.block_size = block_size 25 | 26 | self.token_ids = [_BLANK_TOKEN_ID] * block_size 27 | self.num_tokens = 0 28 | 29 | def is_empty(self) -> bool: 30 | return self.num_tokens == 0 31 | 32 | def get_num_empty_slots(self) -> int: 33 | return self.block_size - self.num_tokens 34 | 35 | def is_full(self) -> bool: 36 | return self.num_tokens == self.block_size 37 | 38 | def append_tokens(self, token_ids: List[int]) -> None: 39 | assert len(token_ids) <= self.get_num_empty_slots() 40 | curr_idx = self.num_tokens 41 | self.token_ids[curr_idx:curr_idx + len(token_ids)] = token_ids 42 | self.num_tokens += len(token_ids) 43 | 44 | def get_token_ids(self) -> List[int]: 45 | return self.token_ids[:self.num_tokens] 46 | 47 | def get_last_token_id(self) -> int: 48 | assert self.num_tokens > 0 49 | return self.token_ids[self.num_tokens - 1] 50 | 51 | 52 | class PhysicalTokenBlock: 53 | """Represents the state of a block in the KV cache.""" 54 | 55 | def __init__( 56 | self, 57 | device: Device, 58 | block_number: int, 59 | block_size: int, 60 | block_hash: int, 61 | num_hashed_tokens: int, 62 | ) -> None: 63 | self.device = device 64 | self.block_number = block_number 65 | self.block_size = block_size 66 | self.block_hash = block_hash 67 | self.num_hashed_tokens = num_hashed_tokens 68 | 69 | self.ref_count = 0 70 | self.last_accessed = DEFAULT_LAST_ACCESSED_TIME 71 | 72 | self.computed = False 73 | 74 | def __repr__(self) -> str: 75 | return (f'PhysicalTokenBlock(device={self.device}, ' 76 | f'block_number={self.block_number}, ' 77 | f'num_hashed_tokens={self.num_hashed_tokens}, ' 78 | f'ref_count={self.ref_count}, ' 79 | f'last_accessed={self.last_accessed}, ' 80 | f'computed={self.computed})') 81 | 82 | 83 | # Mapping: logical block number -> physical block. 
84 | BlockTable = List[PhysicalTokenBlock] 85 | -------------------------------------------------------------------------------- /vllm/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/vllm/core/__init__.py -------------------------------------------------------------------------------- /vllm/core/policy.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | from typing import Deque 3 | 4 | from vllm.sequence import SequenceGroup 5 | 6 | 7 | class Policy: 8 | 9 | def get_priority( 10 | self, 11 | now: float, 12 | seq_group: SequenceGroup, 13 | ) -> float: 14 | raise NotImplementedError 15 | 16 | def sort_by_priority( 17 | self, 18 | now: float, 19 | seq_groups: Deque[SequenceGroup], 20 | ) -> Deque[SequenceGroup]: 21 | return deque( 22 | sorted( 23 | seq_groups, 24 | key=lambda seq_group: self.get_priority(now, seq_group), 25 | reverse=True, 26 | )) 27 | 28 | 29 | class FCFS(Policy): 30 | 31 | def get_priority( 32 | self, 33 | now: float, 34 | seq_group: SequenceGroup, 35 | ) -> float: 36 | return now - seq_group.metrics.arrival_time 37 | 38 | 39 | class PolicyFactory: 40 | 41 | _POLICY_REGISTRY = { 42 | 'fcfs': FCFS, 43 | } 44 | 45 | @classmethod 46 | def get_policy(cls, policy_name: str, **kwargs) -> Policy: 47 | return cls._POLICY_REGISTRY[policy_name](**kwargs) 48 | -------------------------------------------------------------------------------- /vllm/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/vllm/engine/__init__.py -------------------------------------------------------------------------------- /vllm/entrypoints/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/vllm/entrypoints/__init__.py -------------------------------------------------------------------------------- /vllm/entrypoints/openai/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/vllm/entrypoints/openai/__init__.py -------------------------------------------------------------------------------- /vllm/executor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/vllm/executor/__init__.py -------------------------------------------------------------------------------- /vllm/executor/executor_base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Dict, List, Optional 3 | 4 | from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, 5 | ParallelConfig, SchedulerConfig, LoRAConfig) 6 | from vllm.lora.request import LoRARequest 7 | from vllm.sequence import SamplerOutput, SequenceGroupMetadata 8 | 9 | 10 | class ExecutorBase(ABC): 11 | """Base class for all executors. 12 | 13 | An executor is responsible for executing the model on a specific device 14 | type (e.g., CPU, GPU, Neuron, etc.). Or it can be a distributed executor 15 | that can execute the model on multiple devices. 
16 | """ 17 | 18 | @abstractmethod 19 | def __init__( 20 | self, 21 | model_config: ModelConfig, 22 | cache_config: CacheConfig, 23 | parallel_config: ParallelConfig, 24 | scheduler_config: SchedulerConfig, 25 | device_config: DeviceConfig, 26 | lora_config: Optional[LoRAConfig], 27 | ) -> None: 28 | raise NotImplementedError 29 | 30 | @abstractmethod 31 | def execute_model(self, 32 | seq_group_metadata_list: List[SequenceGroupMetadata], 33 | blocks_to_swap_in: Dict[int, int], 34 | blocks_to_swap_out: Dict[int, int], 35 | blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: 36 | """Executes one model step on the given sequences.""" 37 | raise NotImplementedError 38 | 39 | @abstractmethod 40 | def add_lora(self, lora_request: LoRARequest) -> bool: 41 | raise NotImplementedError 42 | 43 | @abstractmethod 44 | def remove_lora(self, lora_id: int) -> bool: 45 | raise NotImplementedError 46 | 47 | @abstractmethod 48 | def list_loras(self) -> List[int]: 49 | raise NotImplementedError 50 | 51 | @abstractmethod 52 | def check_health(self) -> None: 53 | """Checks if the executor is healthy. If not, it should raise an 54 | exception.""" 55 | raise NotImplementedError 56 | 57 | 58 | class ExecutorAsyncBase(ExecutorBase): 59 | 60 | @abstractmethod 61 | async def execute_model_async( 62 | self, 63 | seq_group_metadata_list: List[SequenceGroupMetadata], 64 | blocks_to_swap_in: Dict[int, int], 65 | blocks_to_swap_out: Dict[int, int], 66 | blocks_to_copy: Dict[int, List[int]], 67 | ) -> SamplerOutput: 68 | """Executes one model step on the given sequences.""" 69 | raise NotImplementedError 70 | 71 | @abstractmethod 72 | async def check_health_async(self) -> None: 73 | """Checks if the executor is healthy. If not, it should raise an 74 | exception.""" 75 | raise NotImplementedError 76 | -------------------------------------------------------------------------------- /vllm/executor/utils.py: -------------------------------------------------------------------------------- 1 | def check_block_size_valid(num_gpu_blocks, block_size, max_model_len) -> None: 2 | if num_gpu_blocks <= 0: 3 | raise ValueError("No available memory for the cache blocks. " 4 | "Try increasing `gpu_memory_utilization` when " 5 | "initializing the engine.") 6 | max_seq_len = block_size * num_gpu_blocks 7 | if max_model_len > max_seq_len: 8 | raise ValueError( 9 | f"The model's max seq len ({max_model_len}) " 10 | "is larger than the maximum number of tokens that can be " 11 | f"stored in KV cache ({max_seq_len}). 
Try increasing " 12 | "`gpu_memory_utilization` or decreasing `max_model_len` when " 13 | "initializing the engine.") 14 | -------------------------------------------------------------------------------- /vllm/logger.py: -------------------------------------------------------------------------------- 1 | # Adapted from 2 | # https://github.com/skypilot-org/skypilot/blob/86dc0f6283a335e4aa37b3c10716f90999f48ab6/sky/sky_logging.py 3 | """Logging configuration for vLLM.""" 4 | import logging 5 | import sys 6 | import os 7 | 8 | VLLM_CONFIGURE_LOGGING = int(os.getenv("VLLM_CONFIGURE_LOGGING", "1")) 9 | 10 | _FORMAT = "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s" 11 | _DATE_FORMAT = "%m-%d %H:%M:%S" 12 | 13 | 14 | class NewLineFormatter(logging.Formatter): 15 | """Adds logging prefix to newlines to align multi-line messages.""" 16 | 17 | def __init__(self, fmt, datefmt=None): 18 | logging.Formatter.__init__(self, fmt, datefmt) 19 | 20 | def format(self, record): 21 | msg = logging.Formatter.format(self, record) 22 | if record.message != "": 23 | parts = msg.split(record.message) 24 | msg = msg.replace("\n", "\r\n" + parts[0]) 25 | return msg 26 | 27 | 28 | _root_logger = logging.getLogger("vllm") 29 | _default_handler = None 30 | 31 | 32 | def _setup_logger(): 33 | _root_logger.setLevel(logging.DEBUG) 34 | global _default_handler 35 | if _default_handler is None: 36 | _default_handler = logging.StreamHandler(sys.stdout) 37 | _default_handler.flush = sys.stdout.flush # type: ignore 38 | _default_handler.setLevel(logging.INFO) 39 | _root_logger.addHandler(_default_handler) 40 | fmt = NewLineFormatter(_FORMAT, datefmt=_DATE_FORMAT) 41 | _default_handler.setFormatter(fmt) 42 | # Setting this will avoid the message 43 | # being propagated to the parent logger. 44 | _root_logger.propagate = False 45 | 46 | 47 | # The logger is initialized when the module is imported. 48 | # This is thread-safe as the module is only imported once, 49 | # guaranteed by the Python GIL. 50 | if VLLM_CONFIGURE_LOGGING: 51 | _setup_logger() 52 | 53 | 54 | def init_logger(name: str): 55 | # Use the same settings as above for root logger 56 | logger = logging.getLogger(name) 57 | logger.setLevel(os.getenv("LOG_LEVEL", "DEBUG")) 58 | if VLLM_CONFIGURE_LOGGING: 59 | logger.addHandler(_default_handler) 60 | logger.propagate = False 61 | return logger 62 | -------------------------------------------------------------------------------- /vllm/lora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/vllm/lora/__init__.py -------------------------------------------------------------------------------- /vllm/lora/request.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class LoRARequest: 6 | """ 7 | Request for a LoRA adapter. 8 | 9 | Note that this class should be be used internally. For online 10 | serving, it is recommended to not allow users to use this class but 11 | instead provide another layer of abstraction to prevent users from 12 | accessing unauthorized LoRA adapters. 13 | 14 | lora_int_id must be globally unique for a given adapter. 15 | This is currently not enforced in vLLM. 
16 | """ 17 | 18 | lora_name: str 19 | lora_int_id: int 20 | lora_local_path: str 21 | 22 | def __post_init__(self): 23 | if self.lora_int_id < 1: 24 | raise ValueError( 25 | f"lora_int_id must be > 0, got {self.lora_int_id}") 26 | 27 | def __eq__(self, value: object) -> bool: 28 | return isinstance( 29 | value, LoRARequest) and self.lora_int_id == value.lora_int_id 30 | 31 | def __hash__(self) -> int: 32 | return self.lora_int_id 33 | -------------------------------------------------------------------------------- /vllm/lora/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Tuple 3 | 4 | from torch import nn 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | def replace_submodule(model: nn.Module, module_name: str, 10 | new_module: nn.Module) -> nn.Module: 11 | """Replace a submodule in a model with a new module.""" 12 | parent = model.get_submodule(".".join(module_name.split(".")[:-1])) 13 | target_name = module_name.split(".")[-1] 14 | setattr(parent, target_name, new_module) 15 | return new_module 16 | 17 | 18 | def parse_fine_tuned_lora_name(name: str) -> Tuple[str, bool]: 19 | """Parse the name of lora weights. 20 | 21 | args: 22 | name: the name of the fine-tuned LoRA, e.g. 23 | base_model.model.dense1.weight 24 | return: 25 | Tuple(module_name, is_lora_a): 26 | module_name: the name of the module, e.g. model.dense1, 27 | is_lora_a whether the tensor is lora_a or lora_b. 28 | """ 29 | parts = name.split(".") 30 | assert parts[0] == "base_model" 31 | assert parts[1] == "model" 32 | if parts[-1] == "weight": 33 | assert parts[-2] == "lora_A" or parts[-2] == "lora_B" 34 | return ".".join(parts[2:-2]), parts[-2] == "lora_A" 35 | 36 | if parts[-1] == "lora_embedding_A" or parts[-1] == "lora_embedding_B": 37 | return ".".join(parts[2:-1]), parts[-1] == "lora_embedding_A" 38 | 39 | raise ValueError(f"{name} is unsupported format") 40 | -------------------------------------------------------------------------------- /vllm/model_executor/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.input_metadata import InputMetadata 2 | from vllm.model_executor.sampling_metadata import SamplingMetadata 3 | from vllm.model_executor.utils import set_random_seed, get_model 4 | 5 | __all__ = [ 6 | "InputMetadata", 7 | "get_model", 8 | "SamplingMetadata", 9 | "set_random_seed", 10 | ] 11 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/vllm/model_executor/layers/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/attention/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.layers.attention.attention import Attention 2 | 3 | __all__ = [ 4 | "Attention", 5 | ] 6 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/attention/backends/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/vllm/model_executor/layers/attention/backends/__init__.py 
-------------------------------------------------------------------------------- /vllm/model_executor/layers/attention/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/vllm/model_executor/layers/attention/ops/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/fused_moe/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.layers.fused_moe.fused_moe import ( 2 | fused_moe, 3 | get_config_file_name, 4 | ) 5 | 6 | __all__ = [ 7 | "fused_moe", 8 | "get_config_file_name", 9 | ] 10 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/fused_moe/configs/README: -------------------------------------------------------------------------------- 1 | This directory contains tuned configurations for different settings of the fused_moe kernel. 2 | For different settings of 3 | - E (number of experts) 4 | - N (intermediate size) 5 | - device_name (torch.cuda.get_device_name()) 6 | the JSON file contains a mapping from M (batch size) to the chosen configuration. 7 | 8 | The example configurations provided are for the Mixtral model for TP2 on H100 9 | and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have 10 | N = 7168 and for TP4 we have N = 3584. 11 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/layernorm.py: -------------------------------------------------------------------------------- 1 | """Custom normalization layers.""" 2 | from typing import Optional, Tuple, Union 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from vllm._C import ops 8 | 9 | 10 | class RMSNorm(nn.Module): 11 | """Root mean square normalization. 12 | 13 | Computes x -> w * x / sqrt(E[x^2] + eps) where w is the learned weight. 
14 | Refer to https://arxiv.org/abs/1910.07467 15 | """ 16 | 17 | def __init__( 18 | self, 19 | hidden_size: int, 20 | eps: float = 1e-6, 21 | ) -> None: 22 | super().__init__() 23 | self.weight = nn.Parameter(torch.ones(hidden_size)) 24 | self.variance_epsilon = eps 25 | 26 | def _forward( 27 | self, 28 | x: torch.Tensor, 29 | residual: Optional[torch.Tensor] = None, 30 | ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: 31 | """PyTorch-native implementation equivalent to forward().""" 32 | orig_dtype = x.dtype 33 | x = x.to(torch.float32) 34 | if residual is not None: 35 | x = x + residual.to(torch.float32) 36 | residual = x.to(orig_dtype) 37 | 38 | variance = x.pow(2).mean(dim=-1, keepdim=True) 39 | x = x * torch.rsqrt(variance + self.variance_epsilon) 40 | x = x.to(orig_dtype) * self.weight 41 | if residual is None: 42 | return x 43 | else: 44 | return x, residual 45 | 46 | def forward( 47 | self, 48 | x: torch.Tensor, 49 | residual: Optional[torch.Tensor] = None, 50 | ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: 51 | if residual is not None: 52 | ops.fused_add_rms_norm( 53 | x, 54 | residual, 55 | self.weight.data, 56 | self.variance_epsilon, 57 | ) 58 | return x, residual 59 | out = torch.empty_like(x) 60 | ops.rms_norm( 61 | out, 62 | x, 63 | self.weight.data, 64 | self.variance_epsilon, 65 | ) 66 | return out 67 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/vllm/model_executor/layers/ops/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | 3 | from vllm.model_executor.layers.quantization.base_config import ( 4 | QuantizationConfig) 5 | from vllm.model_executor.layers.quantization.awq import AWQConfig 6 | from vllm.model_executor.layers.quantization.gptq import GPTQConfig 7 | from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig 8 | from vllm.model_executor.layers.quantization.marlin import MarlinConfig 9 | 10 | _QUANTIZATION_CONFIG_REGISTRY = { 11 | "awq": AWQConfig, 12 | "gptq": GPTQConfig, 13 | "squeezellm": SqueezeLLMConfig, 14 | "marlin": MarlinConfig, 15 | } 16 | 17 | 18 | def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: 19 | if quantization not in _QUANTIZATION_CONFIG_REGISTRY: 20 | raise ValueError(f"Invalid quantization method: {quantization}") 21 | return _QUANTIZATION_CONFIG_REGISTRY[quantization] 22 | 23 | 24 | __all__ = [ 25 | "QuantizationConfig", 26 | "get_quantization_config", 27 | ] 28 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/base_config.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Dict, List 3 | 4 | import torch 5 | 6 | from vllm.model_executor.layers.linear import LinearMethodBase 7 | 8 | 9 | class QuantizationConfig(ABC): 10 | """Base class for quantization configs.""" 11 | 12 | @abstractmethod 13 | def get_name(self) -> str: 14 | """Name of the quantization method.""" 15 | raise NotImplementedError 16 | 17 | @abstractmethod 18 | def 
get_supported_act_dtypes(self) -> List[torch.dtype]: 19 | """List of supported activation dtypes.""" 20 | raise NotImplementedError 21 | 22 | @abstractmethod 23 | def get_min_capability(self) -> int: 24 | """Minimum GPU capability to support the quantization method. 25 | 26 | E.g., 70 for Volta, 75 for Turing, 80 for Ampere. 27 | This requirement is due to the custom CUDA kernels used by the 28 | quantization method. 29 | """ 30 | raise NotImplementedError 31 | 32 | @staticmethod 33 | @abstractmethod 34 | def get_config_filenames() -> List[str]: 35 | """List of filenames to search for in the model directory.""" 36 | raise NotImplementedError 37 | 38 | @classmethod 39 | @abstractmethod 40 | def from_config(cls, config: Dict[str, Any]) -> "QuantizationConfig": 41 | """Create a config class from the model's quantization config.""" 42 | raise NotImplementedError 43 | 44 | @staticmethod 45 | def get_from_keys(config: Dict[str, Any], keys: List[str]) -> Any: 46 | """Get a value from the model's quantization config.""" 47 | for key in keys: 48 | if key in config: 49 | return config[key] 50 | raise ValueError(f"Cannot find any of {keys} in the model's " 51 | "quantization config.") 52 | 53 | @abstractmethod 54 | def get_linear_method(self) -> LinearMethodBase: 55 | """Get the linear method to use for the quantized linear layer.""" 56 | raise NotImplementedError 57 | 58 | @abstractmethod 59 | def get_scaled_act_names(self) -> List[str]: 60 | """Returns the activation function names that should be post-scaled. 61 | 62 | For now, this is only used by AWQ. 63 | """ 64 | raise NotImplementedError 65 | -------------------------------------------------------------------------------- /vllm/model_executor/neuron_model_loader.py: -------------------------------------------------------------------------------- 1 | """Utilities for selecting and loading models.""" 2 | from typing import Type 3 | 4 | import torch 5 | import torch.nn as nn 6 | from transformers import PretrainedConfig 7 | 8 | from vllm.config import ModelConfig, DeviceConfig 9 | from vllm.model_executor.models import ModelRegistry 10 | 11 | TORCH_DTYPE_TO_NEURON_AMP = { 12 | "auto": "f32", 13 | "half": "f16", 14 | "float16": "f16", 15 | "bfloat16": "bf16", 16 | "float": "f32", 17 | "float32": "f32", 18 | torch.float16: "f16", 19 | torch.bfloat16: "bf16", 20 | torch.float32: "f32", 21 | } 22 | 23 | 24 | def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]: 25 | architectures = getattr(config, "architectures", []) 26 | for arch in architectures: 27 | model_cls = ModelRegistry.load_model_cls(arch) 28 | if model_cls is not None: 29 | return model_cls 30 | raise ValueError( 31 | f"Model architectures {architectures} are not supported for now. " 32 | f"Supported architectures: {ModelRegistry.get_supported_archs()}") 33 | 34 | 35 | def get_model(model_config: ModelConfig, device_config: DeviceConfig, 36 | **kwargs) -> nn.Module: 37 | from transformers_neuronx.config import (NeuronConfig, 38 | ContinuousBatchingConfig) 39 | 40 | parallel_config = kwargs.get("parallel_config") 41 | scheduler_config = kwargs.get("scheduler_config") 42 | 43 | model_class = _get_model_architecture(model_config.hf_config) 44 | linear_method = None 45 | 46 | # Create a model instance. 
47 | model = model_class(model_config.hf_config, linear_method) 48 | 49 | continuous_batching_config = ContinuousBatchingConfig( 50 | batch_size_for_shared_caches=scheduler_config.max_num_seqs) 51 | neuron_config = NeuronConfig( 52 | continuous_batching=continuous_batching_config) 53 | 54 | # Load the weights from the cached or downloaded files. 55 | model.load_weights( 56 | model_config.model, 57 | model_config.download_dir, 58 | model_config.load_format, 59 | model_config.revision, 60 | tp_degree=parallel_config.neuron_tp_degree, 61 | amp=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype], 62 | neuron_config=neuron_config, 63 | context_length_estimate=[scheduler_config.max_model_len], 64 | n_positions=[scheduler_config.max_model_len], 65 | batch_size=scheduler_config.max_num_seqs) 66 | 67 | return model.eval() 68 | -------------------------------------------------------------------------------- /vllm/model_executor/parallel_utils/README.md: -------------------------------------------------------------------------------- 1 | The files in this folder are ported from [Megatron-LM](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core). We only keep the codes that are used in inference. -------------------------------------------------------------------------------- /vllm/model_executor/parallel_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/vllm/model_executor/parallel_utils/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/parallel_utils/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The vLLM team. 2 | # Adapted from 3 | # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py 4 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 5 | from typing import Sequence 6 | 7 | import torch 8 | 9 | 10 | def ensure_divisibility(numerator, denominator): 11 | """Ensure that numerator is divisible by the denominator.""" 12 | assert numerator % denominator == 0, "{} is not divisible by {}".format( 13 | numerator, denominator) 14 | 15 | 16 | def divide(numerator, denominator): 17 | """Ensure that numerator is divisible by the denominator and return 18 | the division value.""" 19 | ensure_divisibility(numerator, denominator) 20 | return numerator // denominator 21 | 22 | 23 | def split_tensor_along_last_dim( 24 | tensor: torch.Tensor, 25 | num_partitions: int, 26 | contiguous_split_chunks: bool = False, 27 | ) -> Sequence[torch.Tensor]: 28 | """ Split a tensor along its last dimension. 29 | 30 | Arguments: 31 | tensor: input tensor. 32 | num_partitions: number of partitions to split the tensor 33 | contiguous_split_chunks: If True, make each chunk contiguous 34 | in memory. 35 | 36 | Returns: 37 | A list of Tensors 38 | """ 39 | # Get the size and dimension. 40 | last_dim = tensor.dim() - 1 41 | last_dim_size = divide(tensor.size()[last_dim], num_partitions) 42 | # Split. 43 | tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) 44 | # NOTE: torch.split does not create contiguous tensors by default. 
45 | if contiguous_split_chunks: 46 | return tuple(chunk.contiguous() for chunk in tensor_list) 47 | 48 | return tensor_list 49 | -------------------------------------------------------------------------------- /vllm/model_executor/utils.py: -------------------------------------------------------------------------------- 1 | """Utils for model executor.""" 2 | import random 3 | import importlib 4 | from typing import Any, Dict, Optional 5 | 6 | import numpy as np 7 | import torch 8 | 9 | from vllm.config import DeviceConfig, ModelConfig 10 | 11 | DEVICE_TO_MODEL_LOADER_MAP = { 12 | "cuda": "model_loader", 13 | "neuron": "neuron_model_loader", 14 | } 15 | 16 | 17 | def set_random_seed(seed: int) -> None: 18 | random.seed(seed) 19 | np.random.seed(seed) 20 | torch.manual_seed(seed) 21 | if torch.cuda.is_available(): 22 | torch.cuda.manual_seed_all(seed) 23 | 24 | 25 | def set_weight_attrs( 26 | weight: torch.Tensor, 27 | weight_attrs: Optional[Dict[str, Any]], 28 | ): 29 | """Set attributes on a weight tensor. 30 | 31 | This method is used to set attributes on a weight tensor. This method 32 | will not overwrite existing attributes. 33 | 34 | Args: 35 | weight: The weight tensor. 36 | weight_attrs: A dictionary of attributes to set on the weight tensor. 37 | """ 38 | if weight_attrs is None: 39 | return 40 | for key, value in weight_attrs.items(): 41 | assert not hasattr( 42 | weight, key), (f"Overwriting existing tensor attribute: {key}") 43 | setattr(weight, key, value) 44 | 45 | 46 | def get_model(model_config: ModelConfig, device_config: DeviceConfig, 47 | **kwargs) -> torch.nn.Module: 48 | model_loader_module = DEVICE_TO_MODEL_LOADER_MAP[device_config.device_type] 49 | imported_model_loader = importlib.import_module( 50 | f"vllm.model_executor.{model_loader_module}") 51 | get_model_fn = imported_model_loader.get_model 52 | return get_model_fn(model_config, device_config, **kwargs) 53 | -------------------------------------------------------------------------------- /vllm/py.typed: -------------------------------------------------------------------------------- 1 | # Marker file for PEP 561. 2 | # The vllm package uses inline types. 3 | -------------------------------------------------------------------------------- /vllm/spec_decode/interfaces.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple, Optional, Dict 2 | from dataclasses import dataclass 3 | from abc import ABC, abstractmethod 4 | 5 | import torch 6 | 7 | from vllm.sequence import SequenceGroupMetadata 8 | 9 | 10 | @dataclass 11 | class SpeculativeProposals: 12 | """Datastructure used to represent proposal tokens from some proposer. It 13 | also tracks how many speculative tokens each sequence has. 14 | """ 15 | 16 | # Speculative proposal tokens. 17 | proposal_token_ids: torch.Tensor 18 | 19 | # Probabilities of the proposal tokens according to the proposer. 20 | proposal_probs: torch.Tensor 21 | 22 | # The valid length of each proposal; can be zero. 23 | proposal_lens: torch.Tensor 24 | 25 | def __repr__(self): 26 | return (f"SpeculativeProposals(" 27 | f"proposal_token_ids={self.proposal_token_ids.shape}, " 28 | f"proposal_probs={self.proposal_probs.shape}, " 29 | f"proposal_lens={self.proposal_lens.shape})") 30 | 31 | 32 | @dataclass 33 | class SpeculativeScores: 34 | """Datastructure used to represent the scores of speculative tokens 35 | according to the scoring model. 
36 | """ 37 | 38 | # Probabilities of the speculative tokens according to the scoring model. 39 | probs: torch.Tensor 40 | 41 | # Token ids sampled from the scoring model. Used for speculative bonus 42 | # tokens and also non-speculative normal decoding. 43 | token_ids: torch.Tensor 44 | 45 | def __repr__(self): 46 | return (f"SpeculativeScores(" 47 | f"probs={self.probs.shape}, " 48 | f"token_ids={self.token_ids.shape})") 49 | 50 | 51 | class SpeculativeProposer(ABC): 52 | 53 | @abstractmethod 54 | def get_proposals( 55 | self, 56 | seq_group_metadata_list: List[SequenceGroupMetadata], 57 | blocks_to_swap_in: Dict[int, int], 58 | blocks_to_swap_out: Dict[int, int], 59 | blocks_to_copy: Dict[int, List[int]], 60 | max_proposal_len: int, 61 | ) -> SpeculativeProposals: 62 | raise NotImplementedError 63 | 64 | 65 | class SpeculativeScorer(ABC): 66 | 67 | @abstractmethod 68 | def score_proposals( 69 | self, 70 | seq_group_metadata_list: List[SequenceGroupMetadata], 71 | blocks_to_swap_in: Optional[Dict[int, int]], 72 | blocks_to_swap_out: Optional[Dict[int, int]], 73 | blocks_to_copy: Optional[Dict[int, List[int]]], 74 | k: int, 75 | proposals: SpeculativeProposals, 76 | ) -> Tuple[torch.Tensor, torch.Tensor]: 77 | raise NotImplementedError 78 | -------------------------------------------------------------------------------- /vllm/test_utils.py: -------------------------------------------------------------------------------- 1 | import ray 2 | 3 | from vllm.config import ParallelConfig 4 | from vllm.utils import get_open_port 5 | from vllm.worker.worker import init_distributed_environment 6 | 7 | 8 | def init_test_distributed_environment( 9 | pipeline_parallel_size: int, 10 | tensor_parallel_size: int, 11 | rank: int, 12 | distributed_init_port: str, 13 | ) -> None: 14 | parallel_config = ParallelConfig(pipeline_parallel_size, 15 | tensor_parallel_size, 16 | worker_use_ray=True) 17 | distributed_init_method = f"tcp://localhost:{distributed_init_port}" 18 | init_distributed_environment( 19 | parallel_config, 20 | rank, 21 | cupy_port=None, 22 | distributed_init_method=distributed_init_method) 23 | 24 | 25 | def multi_process_tensor_parallel( 26 | tensor_parallel_size: int, 27 | test_target, 28 | ) -> None: 29 | # Using ray helps debugging the error when it failed 30 | # as compared to multiprocessing. 
31 | ray.init() 32 | 33 | distributed_init_port = get_open_port() 34 | refs = [] 35 | for rank in range(tensor_parallel_size): 36 | refs.append( 37 | test_target.remote(tensor_parallel_size, rank, 38 | distributed_init_port)) 39 | ray.get(refs) 40 | 41 | ray.shutdown() 42 | -------------------------------------------------------------------------------- /vllm/transformers_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/vllm/transformers_utils/__init__.py -------------------------------------------------------------------------------- /vllm/transformers_utils/config.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from transformers import AutoConfig, PretrainedConfig 4 | 5 | from vllm.transformers_utils.configs import * 6 | 7 | _CONFIG_REGISTRY = { 8 | "chatglm": ChatGLMConfig, 9 | "mpt": MPTConfig, 10 | "RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct) 11 | "RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct) 12 | "starcoder2": Starcoder2Config, 13 | } 14 | 15 | 16 | def get_config(model: str, 17 | trust_remote_code: bool, 18 | revision: Optional[str] = None, 19 | code_revision: Optional[str] = None) -> PretrainedConfig: 20 | # FIXME(woosuk): This is a temporary fix for StarCoder2. 21 | # Remove this when the model is supported by HuggingFace transformers. 22 | if "bigcode" in model and "starcoder2" in model: 23 | config_class = _CONFIG_REGISTRY["starcoder2"] 24 | config = config_class.from_pretrained(model, 25 | revision=revision, 26 | code_revision=code_revision) 27 | return config 28 | 29 | try: 30 | config = AutoConfig.from_pretrained( 31 | model, 32 | trust_remote_code=trust_remote_code, 33 | revision=revision, 34 | code_revision=code_revision) 35 | except ValueError as e: 36 | if (not trust_remote_code and 37 | "requires you to execute the configuration file" in str(e)): 38 | err_msg = ( 39 | "Failed to load the model config. If the model is a custom " 40 | "model not yet available in the HuggingFace transformers " 41 | "library, consider setting `trust_remote_code=True` in LLM " 42 | "or using the `--trust-remote-code` flag in the CLI.") 43 | raise RuntimeError(err_msg) from e 44 | else: 45 | raise e 46 | if config.model_type in _CONFIG_REGISTRY: 47 | config_class = _CONFIG_REGISTRY[config.model_type] 48 | config = config_class.from_pretrained(model, 49 | revision=revision, 50 | code_revision=code_revision) 51 | return config 52 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.transformers_utils.configs.chatglm import ChatGLMConfig 2 | from vllm.transformers_utils.configs.mpt import MPTConfig 3 | # RWConfig is for the original tiiuae/falcon-40b(-instruct) and 4 | # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the 5 | # `FalconConfig` class from the official HuggingFace transformers library. 
6 | from vllm.transformers_utils.configs.falcon import RWConfig 7 | from vllm.transformers_utils.configs.starcoder2 import Starcoder2Config 8 | 9 | __all__ = [ 10 | "ChatGLMConfig", 11 | "MPTConfig", 12 | "RWConfig", 13 | "Starcoder2Config", 14 | ] 15 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/chatglm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Adapted from 3 | # https://github.com/THUDM/ChatGLM2-6B 4 | from transformers import PretrainedConfig 5 | 6 | 7 | class ChatGLMConfig(PretrainedConfig): 8 | model_type = "chatglm" 9 | attribute_map = { 10 | "num_hidden_layers": "num_layers", 11 | "n_head_kv": "multi_query_group_num", 12 | } 13 | 14 | def __init__(self, 15 | num_layers=28, 16 | padded_vocab_size=65024, 17 | hidden_size=4096, 18 | ffn_hidden_size=13696, 19 | kv_channels=128, 20 | num_attention_heads=32, 21 | seq_length=2048, 22 | hidden_dropout=0.0, 23 | attention_dropout=0.0, 24 | layernorm_epsilon=1e-5, 25 | rmsnorm=True, 26 | apply_residual_connection_post_layernorm=False, 27 | post_layer_norm=True, 28 | add_bias_linear=False, 29 | add_qkv_bias=False, 30 | interleaved_qkv=False, 31 | bias_dropout_fusion=True, 32 | multi_query_attention=False, 33 | multi_query_group_num=1, 34 | apply_query_key_layer_scaling=True, 35 | attention_softmax_in_fp32=True, 36 | fp32_residual_connection=False, 37 | quantization_bit=0, 38 | pre_seq_len=None, 39 | prefix_projection=False, 40 | **kwargs): 41 | self.num_layers = num_layers 42 | self.vocab_size = padded_vocab_size 43 | self.padded_vocab_size = padded_vocab_size 44 | self.hidden_size = hidden_size 45 | self.ffn_hidden_size = ffn_hidden_size 46 | self.kv_channels = kv_channels 47 | self.num_attention_heads = num_attention_heads 48 | self.seq_length = seq_length 49 | self.hidden_dropout = hidden_dropout 50 | self.attention_dropout = attention_dropout 51 | self.layernorm_epsilon = layernorm_epsilon 52 | self.rmsnorm = rmsnorm 53 | self.apply_residual_connection_post_layernorm = ( 54 | apply_residual_connection_post_layernorm) 55 | self.post_layer_norm = post_layer_norm 56 | self.add_bias_linear = add_bias_linear 57 | self.add_qkv_bias = add_qkv_bias 58 | self.bias_dropout_fusion = bias_dropout_fusion 59 | self.multi_query_attention = multi_query_attention 60 | self.multi_query_group_num = multi_query_group_num 61 | self.apply_query_key_layer_scaling = apply_query_key_layer_scaling 62 | self.attention_softmax_in_fp32 = attention_softmax_in_fp32 63 | self.fp32_residual_connection = fp32_residual_connection 64 | self.quantization_bit = quantization_bit 65 | self.pre_seq_len = pre_seq_len 66 | self.prefix_projection = prefix_projection 67 | self.interleaved_qkv = interleaved_qkv 68 | super().__init__(**kwargs) 69 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/starcoder2.py: -------------------------------------------------------------------------------- 1 | from transformers import PretrainedConfig 2 | 3 | 4 | class Starcoder2Config(PretrainedConfig): 5 | model_type = "starcoder2" 6 | keys_to_ignore_at_inference = ["past_key_values"] 7 | 8 | def __init__( 9 | self, 10 | vocab_size=49152, 11 | hidden_size=3072, 12 | intermediate_size=12288, 13 | num_hidden_layers=30, 14 | num_attention_heads=24, 15 | num_key_value_heads=2, 16 | hidden_act="gelu_pytorch_tanh", 17 | max_position_embeddings=4096, 18 | initializer_range=0.018042, 19 | norm_epsilon=1e-5, 
20 | use_cache=True, 21 | bos_token_id=50256, 22 | eos_token_id=50256, 23 | rope_theta=10000.0, 24 | sliding_window=None, 25 | attention_dropout=0.0, 26 | residual_dropout=0.0, 27 | embedding_dropout=0.0, 28 | use_bias=True, 29 | **kwargs, 30 | ): 31 | self.vocab_size = vocab_size 32 | self.max_position_embeddings = max_position_embeddings 33 | self.hidden_size = hidden_size 34 | self.intermediate_size = intermediate_size 35 | self.num_hidden_layers = num_hidden_layers 36 | self.num_attention_heads = num_attention_heads 37 | self.sliding_window = sliding_window 38 | self.use_bias = use_bias 39 | self.num_key_value_heads = num_key_value_heads 40 | self.hidden_act = hidden_act 41 | self.initializer_range = initializer_range 42 | self.norm_epsilon = norm_epsilon 43 | self.use_cache = use_cache 44 | self.rope_theta = rope_theta 45 | self.attention_dropout = attention_dropout 46 | self.residual_dropout = residual_dropout 47 | self.embedding_dropout = embedding_dropout 48 | 49 | super().__init__( 50 | bos_token_id=bos_token_id, 51 | eos_token_id=eos_token_id, 52 | **kwargs, 53 | ) 54 | if self.architectures is None: 55 | self.architectures = ['Starcoder2ForCausalLM'] 56 | -------------------------------------------------------------------------------- /vllm/transformers_utils/tokenizer_group/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from vllm.config import TokenizerPoolConfig 3 | from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import ( 4 | BaseTokenizerGroup) 5 | from vllm.transformers_utils.tokenizer_group.tokenizer_group import ( 6 | TokenizerGroup) 7 | from vllm.engine.ray_utils import ray 8 | 9 | if ray: 10 | from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import ( 11 | RayTokenizerGroupPool) 12 | else: 13 | RayTokenizerGroupPool = None 14 | 15 | 16 | def get_tokenizer_group(tokenizer_pool_config: Optional[TokenizerPoolConfig], 17 | **init_kwargs) -> BaseTokenizerGroup: 18 | if tokenizer_pool_config is None: 19 | return TokenizerGroup(**init_kwargs) 20 | if tokenizer_pool_config.pool_type == "ray": 21 | if RayTokenizerGroupPool is None: 22 | raise ImportError( 23 | "RayTokenizerGroupPool is not available. 
Please install " 24 | "the ray package to use the Ray tokenizer group pool.") 25 | return RayTokenizerGroupPool.from_config(tokenizer_pool_config, 26 | **init_kwargs) 27 | else: 28 | raise ValueError( 29 | f"Unknown pool type: {tokenizer_pool_config.pool_type}") 30 | 31 | 32 | __all__ = ["get_tokenizer_group", "BaseTokenizerGroup"] 33 | -------------------------------------------------------------------------------- /vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List, Optional 3 | 4 | from transformers import PreTrainedTokenizer 5 | 6 | from vllm.lora.request import LoRARequest 7 | 8 | 9 | class BaseTokenizerGroup(ABC): 10 | """A group of tokenizers that can be used for LoRA adapters.""" 11 | 12 | @abstractmethod 13 | def ping(self) -> bool: 14 | """Check if the tokenizer group is alive.""" 15 | pass 16 | 17 | @abstractmethod 18 | def get_max_input_len(self, 19 | lora_request: Optional[LoRARequest] = None 20 | ) -> Optional[int]: 21 | """Get the maximum input length for the LoRA request.""" 22 | pass 23 | 24 | @abstractmethod 25 | def encode(self, 26 | prompt: str, 27 | request_id: Optional[str] = None, 28 | lora_request: Optional[LoRARequest] = None) -> List[int]: 29 | """Encode a prompt using the tokenizer group.""" 30 | pass 31 | 32 | @abstractmethod 33 | async def encode_async( 34 | self, 35 | prompt: str, 36 | request_id: Optional[str] = None, 37 | lora_request: Optional[LoRARequest] = None) -> List[int]: 38 | """Encode a prompt using the tokenizer group.""" 39 | pass 40 | 41 | @abstractmethod 42 | def get_lora_tokenizer( 43 | self, 44 | lora_request: Optional[LoRARequest] = None 45 | ) -> "PreTrainedTokenizer": 46 | """Get a tokenizer for a LoRA request.""" 47 | pass 48 | 49 | @abstractmethod 50 | async def get_lora_tokenizer_async( 51 | self, 52 | lora_request: Optional[LoRARequest] = None 53 | ) -> "PreTrainedTokenizer": 54 | """Get a tokenizer for a LoRA request.""" 55 | pass 56 | -------------------------------------------------------------------------------- /vllm/transformers_utils/tokenizers/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.transformers_utils.tokenizers.baichuan import BaichuanTokenizer 2 | 3 | __all__ = [ 4 | "BaichuanTokenizer", 5 | ] 6 | -------------------------------------------------------------------------------- /vllm/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/vllm/worker/__init__.py --------------------------------------------------------------------------------