├── .buildkite
    ├── run-amd-test.sh
    ├── run-benchmarks.sh
    ├── test-pipeline.yaml
    └── test-template.j2
├── .dockerignore
├── .github
    ├── ISSUE_TEMPLATE
    │   ├── 100-documentation.yml
    │   ├── 200-installation.yml
    │   ├── 300-usage.yml
    │   ├── 400-bug report.yml
    │   ├── 500-feature request.yml
    │   ├── 600-new model.yml
    │   ├── 700-performance discussion.yml
    │   ├── 800-misc discussion.yml
    │   └── config.yml
    ├── PULL_REQUEST_TEMPLATE.md
    └── workflows
    │   ├── publish.yml
    │   ├── ruff.yml
    │   ├── scripts
    │       ├── build.sh
    │       ├── create_release.js
    │       ├── cuda-install.sh
    │       ├── env.sh
    │       └── pytorch-install.sh
    │   └── yapf.yml
├── .gitignore
├── .readthedocs.yaml
├── .yapfignore
├── CMakeLists.txt
├── CONTRIBUTING.md
├── Dockerfile
├── Dockerfile.rocm
├── LICENSE
├── MANIFEST.in
├── README.md
├── benchmarks
    ├── README.md
    ├── backend_request_func.py
    ├── benchmark_latency.py
    ├── benchmark_prefix_caching.py
    ├── benchmark_serving.py
    ├── benchmark_throughput.py
    ├── kernels
    │   ├── benchmark_mixtral_moe.py
    │   ├── benchmark_paged_attention.py
    │   └── benchmark_rope.py
    └── launch_tgi_server.sh
├── cmake
    ├── hipify.py
    └── utils.cmake
├── collect_env.py
├── csrc
    ├── activation_kernels.cu
    ├── attention
    │   ├── attention_dtypes.h
    │   ├── attention_generic.cuh
    │   ├── attention_kernels.cu
    │   ├── attention_utils.cuh
    │   ├── dtype_bfloat16.cuh
    │   ├── dtype_float16.cuh
    │   ├── dtype_float32.cuh
    │   └── dtype_fp8_e5m2.cuh
    ├── cache.h
    ├── cache_kernels.cu
    ├── cuda_compat.h
    ├── cuda_utils.h
    ├── cuda_utils_kernels.cu
    ├── custom_all_reduce.cu
    ├── custom_all_reduce.cuh
    ├── custom_all_reduce_test.cu
    ├── dispatch_utils.h
    ├── layernorm_kernels.cu
    ├── moe
    │   ├── moe_ops.cpp
    │   ├── moe_ops.h
    │   └── topk_softmax_kernels.cu
    ├── moe_align_block_size_kernels.cu
    ├── ops.h
    ├── pos_encoding_kernels.cu
    ├── punica
    │   ├── LICENSE
    │   ├── bgmv
    │   │   ├── bgmv_bf16_bf16_bf16.cu
    │   │   ├── bgmv_bf16_bf16_fp16.cu
    │   │   ├── bgmv_bf16_fp16_bf16.cu
    │   │   ├── bgmv_bf16_fp16_fp16.cu
    │   │   ├── bgmv_bf16_fp32_bf16.cu
    │   │   ├── bgmv_bf16_fp32_fp16.cu
    │   │   ├── bgmv_config.h
    │   │   ├── bgmv_fp16_bf16_bf16.cu
    │   │   ├── bgmv_fp16_bf16_fp16.cu
    │   │   ├── bgmv_fp16_fp16_bf16.cu
    │   │   ├── bgmv_fp16_fp16_fp16.cu
    │   │   ├── bgmv_fp16_fp32_bf16.cu
    │   │   ├── bgmv_fp16_fp32_fp16.cu
    │   │   ├── bgmv_fp32_bf16_bf16.cu
    │   │   ├── bgmv_fp32_bf16_fp16.cu
    │   │   ├── bgmv_fp32_fp16_bf16.cu
    │   │   ├── bgmv_fp32_fp16_fp16.cu
    │   │   ├── bgmv_fp32_fp32_bf16.cu
    │   │   ├── bgmv_fp32_fp32_fp16.cu
    │   │   ├── bgmv_impl.cuh
    │   │   ├── generator.py
    │   │   └── vec_dtypes.cuh
    │   └── punica_ops.cc
    ├── pybind.cpp
    ├── quantization
    │   ├── awq
    │   │   ├── dequantize.cuh
    │   │   └── gemm_kernels.cu
    │   ├── fp8_e5m2_kvcache
    │   │   └── quant_utils.cuh
    │   ├── gptq
    │   │   ├── compat.cuh
    │   │   ├── matrix_view.cuh
    │   │   ├── q_gemm.cu
    │   │   ├── qdq_2.cuh
    │   │   ├── qdq_3.cuh
    │   │   ├── qdq_4.cuh
    │   │   ├── qdq_8.cuh
    │   │   └── qdq_util.cuh
    │   ├── marlin
    │   │   ├── LICENSE
    │   │   └── marlin_cuda_kernel.cu
    │   └── squeezellm
    │   │   └── quant_cuda_kernel.cu
    └── reduction_utils.cuh
├── docs
    ├── Makefile
    ├── README.md
    ├── make.bat
    ├── requirements-docs.txt
    └── source
    │   ├── assets
    │       ├── kernel
    │       │   ├── k_vecs.png
    │       │   ├── key.png
    │       │   ├── logits_vec.png
    │       │   ├── q_vecs.png
    │       │   ├── query.png
    │       │   ├── v_vec.png
    │       │   └── value.png
    │       └── logos
    │       │   ├── vllm-logo-only-light.png
    │       │   ├── vllm-logo-text-dark.png
    │       │   └── vllm-logo-text-light.png
    │   ├── conf.py
    │   ├── dev
    │       ├── engine
    │       │   ├── async_llm_engine.rst
    │       │   ├── engine_index.rst
    │       │   └── llm_engine.rst
    │       ├── kernel
    │       │   └── paged_attention.rst
    │       └── sampling_params.rst
    │   ├── getting_started
    │       ├── amd-installation.rst
    │       ├── installation.rst
    │       ├── neuron-installation.rst
    │       └── quickstart.rst
    │   ├── index.rst
    │   ├── models
    │       ├── adding_model.rst
    │       ├── engine_args.rst
    │       ├── lora.rst
    │       └── supported_models.rst
    │   ├── quantization
    │       ├── auto_awq.rst
    │       └── fp8_e5m2_kv_cache.rst
    │   └── serving
    │       ├── deploying_with_bentoml.rst
    │       ├── deploying_with_docker.rst
    │       ├── deploying_with_kserve.rst
    │       ├── deploying_with_triton.rst
    │       ├── distributed_serving.rst
    │       ├── integrations.rst
    │       ├── metrics.rst
    │       ├── openai_compatible_server.md
    │       ├── run_on_sky.rst
    │       └── serving_with_langchain.rst
├── examples
    ├── api_client.py
    ├── gradio_openai_chatbot_webserver.py
    ├── gradio_webserver.py
    ├── llm_engine_example.py
    ├── multilora_inference.py
    ├── offline_inference.py
    ├── offline_inference_distributed.py
    ├── offline_inference_neuron.py
    ├── offline_inference_with_prefix.py
    ├── openai_chatcompletion_client.py
    ├── openai_completion_client.py
    ├── production_monitoring
    │   ├── README.md
    │   ├── docker-compose.yaml
    │   ├── grafana.json
    │   └── prometheus.yaml
    ├── template_alpaca.jinja
    ├── template_baichuan.jinja
    ├── template_chatglm.jinja
    ├── template_chatglm2.jinja
    ├── template_chatml.jinja
    ├── template_falcon.jinja
    ├── template_falcon_180b.jinja
    └── template_inkbot.jinja
├── format.sh
├── patch_xformers.rocm.sh
├── pyproject.toml
├── requirements-build.txt
├── requirements-dev.txt
├── requirements-neuron.txt
├── requirements-rocm.txt
├── requirements.txt
├── rocm_patch
    ├── commonpy_xformers-0.0.23.rocm.patch
    ├── flashpy_xformers-0.0.23.rocm.patch
    └── rocm_bf16.patch
├── setup.py
├── tests
    ├── __init__.py
    ├── async_engine
    │   ├── api_server_async_engine.py
    │   ├── test_api_server.py
    │   ├── test_async_llm_engine.py
    │   ├── test_chat_template.py
    │   └── test_request_tracker.py
    ├── basic_correctness
    │   └── test_basic_correctness.py
    ├── conftest.py
    ├── core
    │   ├── __init__.py
    │   ├── test_block_manager.py
    │   ├── test_scheduler.py
    │   └── utils.py
    ├── distributed
    │   ├── test_basic_distributed_correctness.py
    │   ├── test_comm_ops.py
    │   └── test_custom_all_reduce.py
    ├── engine
    │   └── test_computed_prefix_blocks.py
    ├── entrypoints
    │   ├── test_guided_processors.py
    │   └── test_openai_server.py
    ├── kernels
    │   ├── allclose_default.py
    │   ├── conftest.py
    │   ├── test_activation.py
    │   ├── test_attention.py
    │   ├── test_cache.py
    │   ├── test_layernorm.py
    │   ├── test_moe.py
    │   ├── test_pos_encoding.py
    │   ├── test_prefix_prefill.py
    │   ├── test_rand.py
    │   └── test_sampler.py
    ├── lora
    │   ├── __init__.py
    │   ├── conftest.py
    │   ├── test_gemma.py
    │   ├── test_layer_variation.py
    │   ├── test_layers.py
    │   ├── test_llama.py
    │   ├── test_lora.py
    │   ├── test_lora_manager.py
    │   ├── test_mixtral.py
    │   ├── test_punica.py
    │   ├── test_tokenizer_group.py
    │   ├── test_utils.py
    │   ├── test_worker.py
    │   └── utils.py
    ├── metrics
    │   └── test_metrics.py
    ├── models
    │   ├── test_marlin.py
    │   ├── test_mistral.py
    │   └── test_models.py
    ├── prefix_caching
    │   └── test_prefix_caching.py
    ├── prompts
    │   ├── example.txt
    │   └── summary.txt
    ├── samplers
    │   ├── test_beam_search.py
    │   ├── test_logprobs.py
    │   ├── test_rejection_sampler.py
    │   ├── test_sampler.py
    │   └── test_seeded_generate.py
    ├── spec_decode
    │   ├── __init__.py
    │   ├── test_batch_expansion.py
    │   ├── test_metrics.py
    │   ├── test_multi_step_worker.py
    │   ├── test_spec_decode_worker.py
    │   ├── test_utils.py
    │   └── utils.py
    ├── test_cache_block_hashing.py
    ├── test_config.py
    ├── test_logits_processor.py
    ├── test_regression.py
    ├── test_sampling_params.py
    ├── test_sequence.py
    ├── tokenization
    │   ├── __init__.py
    │   ├── test_cached_tokenizer.py
    │   ├── test_detokenize.py
    │   └── test_tokenizer_group.py
    └── worker
    │   ├── __init__.py
    │   ├── test_model_runner.py
    │   └── test_swap.py
└── vllm
    ├── __init__.py
    ├── block.py
    ├── config.py
    ├── core
        ├── __init__.py
        ├── block_manager.py
        ├── evictor.py
        ├── policy.py
        └── scheduler.py
    ├── engine
        ├── __init__.py
        ├── arg_utils.py
        ├── async_llm_engine.py
        ├── llm_engine.py
        ├── metrics.py
        └── ray_utils.py
    ├── entrypoints
        ├── __init__.py
        ├── api_server.py
        ├── llm.py
        └── openai
        │   ├── __init__.py
        │   ├── api_server.py
        │   ├── cli_args.py
        │   ├── protocol.py
        │   ├── serving_chat.py
        │   ├── serving_completion.py
        │   └── serving_engine.py
    ├── executor
        ├── __init__.py
        ├── executor_base.py
        ├── gpu_executor.py
        ├── ray_gpu_executor.py
        └── utils.py
    ├── logger.py
    ├── lora
        ├── __init__.py
        ├── layers.py
        ├── lora.py
        ├── models.py
        ├── punica.py
        ├── request.py
        ├── utils.py
        └── worker_manager.py
    ├── model_executor
        ├── __init__.py
        ├── guided_decoding.py
        ├── guided_logits_processors.py
        ├── input_metadata.py
        ├── layers
        │   ├── __init__.py
        │   ├── activation.py
        │   ├── attention
        │   │   ├── __init__.py
        │   │   ├── attention.py
        │   │   ├── backends
        │   │   │   ├── __init__.py
        │   │   │   ├── flash_attn.py
        │   │   │   └── xformers.py
        │   │   └── ops
        │   │   │   ├── __init__.py
        │   │   │   ├── paged_attn.py
        │   │   │   └── prefix_prefill.py
        │   ├── fused_moe
        │   │   ├── __init__.py
        │   │   ├── configs
        │   │   │   ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
        │   │   │   ├── E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json
        │   │   │   ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
        │   │   │   ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json
        │   │   │   ├── E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
        │   │   │   ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
        │   │   │   └── README
        │   │   └── fused_moe.py
        │   ├── layernorm.py
        │   ├── linear.py
        │   ├── logits_processor.py
        │   ├── ops
        │   │   ├── __init__.py
        │   │   ├── rand.py
        │   │   └── sample.py
        │   ├── quantization
        │   │   ├── __init__.py
        │   │   ├── awq.py
        │   │   ├── base_config.py
        │   │   ├── gptq.py
        │   │   ├── marlin.py
        │   │   └── squeezellm.py
        │   ├── rejection_sampler.py
        │   ├── rotary_embedding.py
        │   ├── sampler.py
        │   └── vocab_parallel_embedding.py
        ├── model_loader.py
        ├── models
        │   ├── __init__.py
        │   ├── baichuan.py
        │   ├── bloom.py
        │   ├── chatglm.py
        │   ├── decilm.py
        │   ├── deepseek.py
        │   ├── falcon.py
        │   ├── gemma.py
        │   ├── gpt2.py
        │   ├── gpt_bigcode.py
        │   ├── gpt_j.py
        │   ├── gpt_neox.py
        │   ├── internlm2.py
        │   ├── llama.py
        │   ├── mixtral.py
        │   ├── mixtral_quant.py
        │   ├── mpt.py
        │   ├── neuron
        │   │   ├── llama.py
        │   │   └── mistral.py
        │   ├── olmo.py
        │   ├── opt.py
        │   ├── orion.py
        │   ├── phi.py
        │   ├── qwen.py
        │   ├── qwen2.py
        │   ├── stablelm.py
        │   └── starcoder2.py
        ├── neuron_model_loader.py
        ├── parallel_utils
        │   ├── README.md
        │   ├── __init__.py
        │   ├── communication_op.py
        │   ├── cupy_utils.py
        │   ├── custom_all_reduce.py
        │   ├── parallel_state.py
        │   └── utils.py
        ├── sampling_metadata.py
        ├── utils.py
        └── weight_utils.py
    ├── outputs.py
    ├── py.typed
    ├── sampling_params.py
    ├── sequence.py
    ├── spec_decode
        ├── batch_expansion.py
        ├── interfaces.py
        ├── metrics.py
        ├── multi_step_worker.py
        ├── spec_decode_worker.py
        └── util.py
    ├── test_utils.py
    ├── transformers_utils
        ├── __init__.py
        ├── config.py
        ├── configs
        │   ├── __init__.py
        │   ├── chatglm.py
        │   ├── falcon.py
        │   ├── mpt.py
        │   └── starcoder2.py
        ├── tokenizer.py
        ├── tokenizer_group
        │   ├── __init__.py
        │   ├── base_tokenizer_group.py
        │   ├── ray_tokenizer_group.py
        │   └── tokenizer_group.py
        └── tokenizers
        │   ├── __init__.py
        │   └── baichuan.py
    ├── utils.py
    └── worker
        ├── __init__.py
        ├── cache_engine.py
        ├── model_runner.py
        ├── neuron_worker.py
        └── worker.py


/.buildkite/run-amd-test.sh:
--------------------------------------------------------------------------------
 1 | # This script build the ROCm docker image and run the API server inside the container.
 2 | # It serves a sanity check for compilation and basic model usage.
 3 | set -ex
 4 | 
 5 | # Print ROCm version
 6 | rocminfo
 7 | 
 8 | # Try building the docker image
 9 | docker build -t rocm -f Dockerfile.rocm .
10 | 
11 | # Setup cleanup
12 | remove_docker_container() { docker rm -f rocm || true; }
13 | trap remove_docker_container EXIT
14 | remove_docker_container
15 | 
16 | # Run the image
17 | docker run --device /dev/kfd --device /dev/dri --network host --name rocm rocm python3 -m vllm.entrypoints.api_server &
18 | 
19 | # Wait for the server to start
20 | wait_for_server_to_start() {
21 |     timeout=300
22 |     counter=0
23 | 
24 |     while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
25 |         sleep 1
26 |         counter=$((counter + 1))
27 |         if [ $counter -ge $timeout ]; then
28 |             echo "Timeout after $timeout seconds"
29 |             break
30 |         fi
31 |     done
32 | }
33 | wait_for_server_to_start
34 | 
35 | # Test a simple prompt
36 | curl -X POST -H "Content-Type: application/json" \
37 |     localhost:8000/generate \
38 |     -d '{"prompt": "San Francisco is a"}'
39 | 


--------------------------------------------------------------------------------
/.buildkite/run-benchmarks.sh:
--------------------------------------------------------------------------------
 1 | # This script is run by buildkite to run the benchmarks and upload the results to buildkite
 2 | 
 3 | set -ex
 4 | set -o pipefail
 5 | 
 6 | # cd into parent directory of this file
 7 | cd "$(dirname "${BASH_SOURCE[0]}")/.."
 8 | 
 9 | (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
10 | 
11 | # run python-based benchmarks and upload the result to buildkite
12 | python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
13 | bench_latency_exit_code=$?
14 | 
15 | python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt
16 | bench_throughput_exit_code=$?
17 | 
18 | # run server-based benchmarks and upload the result to buildkite
19 | python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
20 | server_pid=$!
21 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
22 | 
23 | # wait for server to start, timeout after 600 seconds
24 | timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
25 | python3 benchmarks/benchmark_serving.py \
26 |     --backend openai \
27 |     --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json \
28 |     --model meta-llama/Llama-2-7b-chat-hf \
29 |     --num-prompts 20 \
30 |     --endpoint /v1/completions \
31 |     --tokenizer meta-llama/Llama-2-7b-chat-hf \
32 |     --save-result \
33 |     2>&1 | tee benchmark_serving.txt
34 | bench_serving_exit_code=$?
35 | kill $server_pid
36 | 
37 | # write the results into a markdown file
38 | echo "### Latency Benchmarks" >> benchmark_results.md
39 | sed -n '1p' benchmark_latency.txt >> benchmark_results.md # first line
40 | echo "" >> benchmark_results.md
41 | sed -n '$p' benchmark_latency.txt >> benchmark_results.md # last line
42 | 
43 | echo "### Throughput Benchmarks" >> benchmark_results.md
44 | sed -n '1p' benchmark_throughput.txt >> benchmark_results.md # first line
45 | echo "" >> benchmark_results.md
46 | sed -n '$p' benchmark_throughput.txt >> benchmark_results.md # last line
47 | 
48 | echo "### Serving Benchmarks" >> benchmark_results.md
49 | sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
50 | echo "" >> benchmark_results.md
51 | tail -n 13 benchmark_serving.txt >> benchmark_results.md # last 13 lines
52 | 
53 | # upload the results to buildkite
54 | /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
55 | 
56 | # exit with the exit code of the benchmarks
57 | if [ $bench_latency_exit_code -ne 0 ]; then
58 |     exit $bench_latency_exit_code
59 | fi
60 | 
61 | if [ $bench_throughput_exit_code -ne 0 ]; then
62 |     exit $bench_throughput_exit_code
63 | fi
64 | 
65 | if [ $bench_serving_exit_code -ne 0 ]; then
66 |     exit $bench_serving_exit_code
67 | fi
68 | 
69 | /workspace/buildkite-agent artifact upload openai-*.json
70 | 


--------------------------------------------------------------------------------
/.buildkite/test-pipeline.yaml:
--------------------------------------------------------------------------------
 1 | # In this file, you can add more tests to run either by adding a new step or
 2 | # adding a new command to an existing step. See different options here for examples.
 3 | # This script will be feed into Jinja template in `test-template.j2` to generate
 4 | # the final pipeline yaml file.
 5 | 
 6 | steps:
 7 | - label: Regression Test
 8 |   command: pytest -v -s test_regression.py
 9 |   working_dir: "/vllm-workspace/tests" # optional
10 | 
11 | - label: AsyncEngine Test
12 |   command: pytest -v -s async_engine
13 | 
14 | - label: Basic Correctness Test
15 |   command: pytest -v -s --forked basic_correctness
16 | 
17 | - label: Core Test
18 |   command: pytest -v -s core
19 | 
20 | - label: Distributed Comm Ops Test
21 |   command: pytest -v -s --forked test_comm_ops.py
22 |   working_dir: "/vllm-workspace/tests/distributed"
23 |   num_gpus: 2 # only support 1 or 2 for now.
24 | 
25 | - label: Distributed Correctness Test
26 |   command: pytest -v -s --forked test_basic_distributed_correctness.py
27 |   working_dir: "/vllm-workspace/tests/distributed"
28 |   num_gpus: 2 # only support 1 or 2 for now.
29 | 
30 | - label: Engine Test
31 |   command: pytest -v -s engine tokenization test_sequence.py test_config.py
32 | 
33 | - label: Entrypoints Test
34 |   command: pytest -v -s entrypoints
35 | 
36 | - label: Kernels Test %N
37 |   command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
38 |   parallelism: 4
39 | 
40 | - label: Models Test
41 |   commands:
42 |     - pytest -v -s models --forked
43 |   soft_fail: true
44 | 
45 | - label: Prefix Caching Test
46 |   commands:
47 |     - pytest -v -s prefix_caching
48 | 
49 | - label: Samplers Test
50 |   command: pytest -v -s samplers
51 | 
52 | - label: LogitsProcessor Test
53 |   command: pytest -v -s test_logits_processor.py
54 | 
55 | - label: Worker Test
56 |   command: pytest -v -s worker
57 | 
58 | - label: Speculative decoding tests
59 |   command: pytest -v -s spec_decode
60 | 
61 | - label: LoRA Test %N
62 |   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
63 |   parallelism: 4
64 | 
65 | - label: Metrics Test
66 |   command: pytest -v -s metrics
67 | 
68 | - label: Benchmarks
69 |   working_dir: "/vllm-workspace/.buildkite"
70 |   commands:
71 |   - pip install aiohttp
72 |   - bash run-benchmarks.sh
73 | 
74 | - label: Documentation Build
75 |   working_dir: "/vllm-workspace/docs"
76 |   no_gpu: True
77 |   commands:
78 |   - pip install -r requirements-docs.txt
79 |   - SPHINXOPTS=\"-W\" make html
80 | 


--------------------------------------------------------------------------------
/.buildkite/test-template.j2:
--------------------------------------------------------------------------------
 1 | {% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}
 2 | {% set default_num_gpu = 1 %}
 3 | {% set default_working_dir = "/vllm-workspace/tests" %}
 4 | 
 5 | steps:
 6 |   - label: "AMD Test"
 7 |     agents:
 8 |       queue: amd
 9 |     command: bash .buildkite/run-amd-test.sh
10 | 
11 |   - label: ":docker: build image"
12 |     commands:
13 |       - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
14 |       - "docker push {{ docker_image }}"
15 |     env:
16 |       DOCKER_BUILDKIT: "1"
17 |     retry:
18 |       automatic:
19 |         - exit_status: -1  # Agent was lost
20 |           limit: 5
21 |   - wait
22 | 
23 |   {% for step in steps %}
24 |   - label: "{{ step.label }}"
25 |     agents:
26 |       queue: kubernetes
27 |     soft_fail: {{ step.soft_fail or false }}
28 |     {% if step.parallelism %}
29 |     parallelism: {{ step.parallelism }}
30 |     {% endif %}
31 |     retry:
32 |       automatic:
33 |         - exit_status: -1  # Agent was lost
34 |           limit: 5
35 |     plugins:
36 |       - kubernetes:
37 |           podSpec:
38 |             volumes:
39 |               - name: dshm
40 |                 emptyDir:
41 |                   medium: Memory
42 |             containers:
43 |               - image: "{{ docker_image }}"
44 |                 command: ["bash"]
45 |                 args:
46 |                 - '-c'
47 |                 - "'cd {{ (step.working_dir or default_working_dir) | safe  }} && {{ step.command  or (step.commands | join(' && ')) | safe }}'"
48 |                 {% if not step.no_gpu %}
49 |                 resources:
50 |                   requests:
51 |                     nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
52 |                   limits:
53 |                     nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
54 |                 {% endif %}
55 |                 env:
56 |                   - name: HF_TOKEN
57 |                     valueFrom:
58 |                       secretKeyRef:
59 |                         name: hf-token-secret
60 |                         key: token
61 |                 volumeMounts:
62 |                   - mountPath: /dev/shm
63 |                     name: dshm
64 |   {% endfor %}
65 | 


--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | vllm/*.so
2 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/100-documentation.yml:
--------------------------------------------------------------------------------
 1 | name: 📚 Documentation
 2 | description: Report an issue related to https://docs.vllm.ai/
 3 | title: "[Doc]: "
 4 | labels: ["documentation"]
 5 | 
 6 | body:
 7 | - type: textarea
 8 |   attributes:
 9 |     label: 📚 The doc issue
10 |     description: >
11 |       A clear and concise description of what content in https://docs.vllm.ai/ is an issue.
12 |   validations:
13 |     required: true
14 | - type: textarea
15 |   attributes:
16 |     label: Suggest a potential alternative/fix
17 |     description: >
18 |       Tell us how we could improve the documentation in this regard.
19 | - type: markdown
20 |   attributes:
21 |     value: >
22 |       Thanks for contributing 🎉!
23 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/200-installation.yml:
--------------------------------------------------------------------------------
 1 | name: 🛠️ Installation
 2 | description: Report an issue here when you hit errors during installation.
 3 | title: "[Installation]: "
 4 | labels: ["installation"]
 5 | 
 6 | body:
 7 | - type: markdown
 8 |   attributes:
 9 |     value: >
10 |       #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
11 | - type: textarea
12 |   attributes:
13 |     label: Your current environment
14 |     description: |
15 |       Please run the following and paste the output below.
16 |       ```sh
17 |       wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
18 |       # For security purposes, please feel free to check the contents of collect_env.py before running it.
19 |       python collect_env.py
20 |       ```
21 |     value: |
22 |       ```text
23 |       The output of `python collect_env.py`
24 |       ```
25 |   validations:
26 |     required: true
27 | - type: textarea
28 |   attributes:
29 |     label: How you are installing vllm
30 |     description: |
31 |       Paste the full command you are trying to execute.
32 |     value: |
33 |       ```sh
34 |       pip install -vvv vllm
35 |       ```
36 | - type: markdown
37 |   attributes:
38 |     value: >
39 |       Thanks for contributing 🎉!
40 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/300-usage.yml:
--------------------------------------------------------------------------------
 1 | name: 💻 Usage
 2 | description: Raise an issue here if you don't know how to use vllm.
 3 | title: "[Usage]: "
 4 | labels: ["usage"]
 5 | 
 6 | body:
 7 | - type: markdown
 8 |   attributes:
 9 |     value: >
10 |       #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
11 | - type: textarea
12 |   attributes:
13 |     label: Your current environment
14 |     description: |
15 |       Please run the following and paste the output below.
16 |       ```sh
17 |       wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
18 |       # For security purposes, please feel free to check the contents of collect_env.py before running it.
19 |       python collect_env.py
20 |       ```
21 |     value: |
22 |       ```text
23 |       The output of `python collect_env.py`
24 |       ```
25 |   validations:
26 |     required: true
27 | - type: textarea
28 |   attributes:
29 |     label: How would you like to use vllm
30 |     description: |
31 |       A detailed description of how you want to use vllm.
32 |     value: |
33 |       I want to run inference of a [specific model](put link here). I don't know how to integrate it with vllm.
34 | - type: markdown
35 |   attributes:
36 |     value: >
37 |       Thanks for contributing 🎉!
38 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/500-feature request.yml:
--------------------------------------------------------------------------------
 1 | name: 🚀 Feature request
 2 | description: Submit a proposal/request for a new vllm feature
 3 | title: "[Feature]: "
 4 | labels: ["feature request"]
 5 | 
 6 | body:
 7 | - type: markdown
 8 |   attributes:
 9 |     value: >
10 |       #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
11 | - type: textarea
12 |   attributes:
13 |     label: 🚀 The feature, motivation and pitch
14 |     description: >
15 |       A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too.
16 |   validations:
17 |     required: true
18 | - type: textarea
19 |   attributes:
20 |     label: Alternatives
21 |     description: >
22 |       A description of any alternative solutions or features you've considered, if any.
23 | - type: textarea
24 |   attributes:
25 |     label: Additional context
26 |     description: >
27 |       Add any other context or screenshots about the feature request.
28 | - type: markdown
29 |   attributes:
30 |     value: >
31 |       Thanks for contributing 🎉!
32 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/600-new model.yml:
--------------------------------------------------------------------------------
 1 | name: 🤗 Support request for a new model from huggingface
 2 | description: Submit a proposal/request for a new model from huggingface
 3 | title: "[New Model]: "
 4 | labels: ["new model"]
 5 | 
 6 | body:
 7 | - type: markdown
 8 |   attributes:
 9 |     value: >
10 |       #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
11 | 
12 |       #### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model.
13 | - type: textarea
14 |   attributes:
15 |     label: The model to consider.
16 |     description: >
17 |       A huggingface url, pointing to the model, e.g. https://huggingface.co/openai-community/gpt2 .
18 |   validations:
19 |     required: true
20 | - type: textarea
21 |   attributes:
22 |     label: The closest model vllm already supports.
23 |     description: >
24 |       Here is the list of models already supported by vllm: https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models . Which model is the most similar to the model you want to add support for?
25 | - type: textarea
26 |   attributes:
27 |     label: What's your difficulty of supporting the model you want?
28 |     description: >
29 |       For example, any new operators or new architecture?
30 | - type: markdown
31 |   attributes:
32 |     value: >
33 |       Thanks for contributing 🎉!
34 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/700-performance discussion.yml:
--------------------------------------------------------------------------------
 1 | name: ⚡ Discussion on the performance of vllm
 2 | description: Submit a proposal/discussion about the performance of vllm
 3 | title: "[Performance]: "
 4 | labels: ["performance"]
 5 | 
 6 | body:
 7 | - type: markdown
 8 |   attributes:
 9 |     value: >
10 |       #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
11 | - type: textarea
12 |   attributes:
13 |     label: Proposal to improve performance
14 |     description: >
15 |       How do you plan to improve vllm's performance?
16 |   validations:
17 |     required: false
18 | - type: textarea
19 |   attributes:
20 |     label: Report of performance regression
21 |     description: >
22 |       Please provide detailed description of performance comparison to confirm the regression. You may want to run the benchmark script at https://github.com/vllm-project/vllm/tree/main/benchmarks .
23 |   validations:
24 |     required: false
25 | - type: textarea
26 |   attributes:
27 |     label: Misc discussion on performance
28 |     description: >
29 |       Anything about the performance.
30 |   validations:
31 |     required: false
32 | - type: textarea
33 |   attributes:
34 |     label: Your current environment (if you think it is necessary)
35 |     description: |
36 |       Please run the following and paste the output below.
37 |       ```sh
38 |       wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
39 |       # For security purposes, please feel free to check the contents of collect_env.py before running it.
40 |       python collect_env.py
41 |       ```
42 |     value: |
43 |       ```text
44 |       The output of `python collect_env.py`
45 |       ```
46 |   validations:
47 |     required: false
48 | - type: markdown
49 |   attributes:
50 |     value: >
51 |       Thanks for contributing 🎉!
52 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/800-misc discussion.yml:
--------------------------------------------------------------------------------
 1 | name: 🎲 Misc/random discussions that do not fit into the above categories.
 2 | description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues.
 3 | title: "[Misc]: "
 4 | labels: ["misc"]
 5 | 
 6 | body:
 7 | - type: markdown
 8 |   attributes:
 9 |     value: >
10 |       #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
11 | - type: textarea
12 |   attributes:
13 |     label: Anything you want to discuss about vllm.
14 |     description: >
15 |       Anything you want to discuss about vllm.
16 |   validations:
17 |     required: true
18 | - type: markdown
19 |   attributes:
20 |     value: >
21 |       Thanks for contributing 🎉!
22 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false
2 | 


--------------------------------------------------------------------------------
/.github/workflows/ruff.yml:
--------------------------------------------------------------------------------
 1 | name: ruff
 2 | 
 3 | on:
 4 |   # Trigger the workflow on push or pull request,
 5 |   # but only for the main branch
 6 |   push:
 7 |     branches:
 8 |       - main
 9 |   pull_request:
10 |     branches:
11 |       - main
12 | 
13 | jobs:
14 |   ruff:
15 |     runs-on: ubuntu-latest
16 |     strategy:
17 |       matrix:
18 |         python-version: ["3.10"]
19 |     steps:
20 |     - uses: actions/checkout@v2
21 |     - name: Set up Python ${{ matrix.python-version }}
22 |       uses: actions/setup-python@v2
23 |       with:
24 |         python-version: ${{ matrix.python-version }}
25 |     - name: Install dependencies
26 |       run: |
27 |         python -m pip install --upgrade pip
28 |         pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1
29 |     - name: Analysing the code with ruff
30 |       run: |
31 |         ruff .
32 |     - name: Spelling check with codespell
33 |       run: |
34 |         codespell --toml pyproject.toml


--------------------------------------------------------------------------------
/.github/workflows/scripts/build.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | python_executable=python$1
 4 | cuda_home=/usr/local/cuda-$2
 5 | 
 6 | # Update paths
 7 | PATH=${cuda_home}/bin:$PATH
 8 | LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
 9 | 
10 | # Install requirements
11 | $python_executable -m pip install wheel packaging
12 | $python_executable -m pip install -r requirements.txt
13 | 
14 | # Limit the number of parallel jobs to avoid OOM
15 | export MAX_JOBS=1
16 | # Make sure punica is built for the release (for LoRA)
17 | export VLLM_INSTALL_PUNICA_KERNELS=1
18 | 
19 | # Build
20 | $python_executable setup.py bdist_wheel --dist-dir=dist
21 | 


--------------------------------------------------------------------------------
/.github/workflows/scripts/create_release.js:
--------------------------------------------------------------------------------
 1 | // Uses Github's API to create the release and wait for result.
 2 | // We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately.
 3 | 
 4 | module.exports = async (github, context, core) => {
 5 | 	try {
 6 | 		const response = await github.rest.repos.createRelease({
 7 | 			draft: false,
 8 | 			generate_release_notes: true,
 9 | 			name: process.env.RELEASE_TAG,
10 | 			owner: context.repo.owner,
11 | 			prerelease: false,
12 | 			repo: context.repo.repo,
13 | 			tag_name: process.env.RELEASE_TAG,
14 | 		});
15 | 
16 | 		core.setOutput('upload_url', response.data.upload_url);
17 | 	} catch (error) {
18 | 		core.setFailed(error.message);
19 | 	}
20 | }


--------------------------------------------------------------------------------
/.github/workflows/scripts/cuda-install.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Replace '.' with '-' ex: 11.8 -> 11-8
 4 | cuda_version=$(echo $1 | tr "." "-")
 5 | # Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004
 6 | OS=$(echo $2 | tr -d ".\-")
 7 | 
 8 | # Installs CUDA
 9 | wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb
10 | sudo dpkg -i cuda-keyring_1.1-1_all.deb
11 | rm cuda-keyring_1.1-1_all.deb
12 | sudo apt -qq update
13 | sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version}
14 | sudo apt clean
15 | 
16 | # Test nvcc
17 | PATH=/usr/local/cuda-$1/bin:${PATH}
18 | nvcc --version
19 | 
20 | # Log gcc, g++, c++ versions
21 | gcc --version
22 | g++ --version
23 | c++ --version
24 | 


--------------------------------------------------------------------------------
/.github/workflows/scripts/env.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # This file installs common linux environment tools
 4 | 
 5 | export LANG C.UTF-8
 6 | 
 7 | # python_version=$1
 8 | 
 9 | sudo    apt-get update && \
10 | sudo    apt-get install -y --no-install-recommends \
11 |         software-properties-common \
12 | 
13 | sudo    apt-get install -y --no-install-recommends \
14 |         build-essential \
15 |         apt-utils \
16 |         ca-certificates \
17 |         wget \
18 |         git \
19 |         vim \
20 |         libssl-dev \
21 |         curl \
22 |         unzip \
23 |         unrar \
24 |         cmake \
25 |         net-tools \
26 |         sudo \
27 |         autotools-dev \
28 |         rsync \
29 |         jq \
30 |         openssh-server \
31 |         tmux \
32 |         screen \
33 |         htop \
34 |         pdsh \
35 |         openssh-client \
36 |         lshw \
37 |         dmidecode \
38 |         util-linux \
39 |         automake \
40 |         autoconf \
41 |         libtool \
42 |         net-tools \
43 |         pciutils \
44 |         libpci-dev \
45 |         libaio-dev \
46 |         libcap2 \
47 |         libtinfo5 \
48 |         fakeroot \
49 |         devscripts \
50 |         debhelper \
51 |         nfs-common
52 | 
53 | # Remove github bloat files to free up disk space
54 | sudo rm -rf "/usr/local/share/boost"
55 | sudo rm -rf "$AGENT_TOOLSDIRECTORY"
56 | sudo rm -rf "/usr/share/dotnet"
57 | 


--------------------------------------------------------------------------------
/.github/workflows/scripts/pytorch-install.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | python_executable=python$1
 4 | pytorch_version=$2
 5 | cuda_version=$3
 6 | 
 7 | # Install torch
 8 | $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya
 9 | $python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./}
10 | 
11 | # Print version information
12 | $python_executable --version
13 | $python_executable -c "import torch; print('PyTorch:', torch.__version__)"
14 | $python_executable -c "import torch; print('CUDA:', torch.version.cuda)"
15 | $python_executable -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)"
16 | 


--------------------------------------------------------------------------------
/.github/workflows/yapf.yml:
--------------------------------------------------------------------------------
 1 | name: yapf
 2 | 
 3 | on:
 4 |   # Trigger the workflow on push or pull request,
 5 |   # but only for the main branch
 6 |   push:
 7 |     branches:
 8 |       - main
 9 |   pull_request:
10 |     branches:
11 |       - main
12 | jobs:
13 |   yapf:
14 |     runs-on: ubuntu-latest
15 |     strategy:
16 |       matrix:
17 |         python-version: ["3.10"]
18 |     steps:
19 |     - uses: actions/checkout@v2
20 |     - name: Set up Python ${{ matrix.python-version }}
21 |       uses: actions/setup-python@v2
22 |       with:
23 |         python-version: ${{ matrix.python-version }}
24 |     - name: Install dependencies
25 |       run: |
26 |         python -m pip install --upgrade pip
27 |         pip install yapf==0.32.0
28 |         pip install toml==0.10.2
29 |     - name: Running yapf
30 |       run: |
31 |         yapf --diff --recursive .
32 | 


--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | # Read the Docs configuration file
 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 3 | 
 4 | version: 2
 5 | 
 6 | build:
 7 |   os: ubuntu-22.04
 8 |   tools:
 9 |     python: "3.8"
10 | 
11 | sphinx:
12 |    configuration: docs/source/conf.py
13 | 
14 | # If using Sphinx, optionally build your docs in additional formats such as PDF
15 | formats:
16 |    - pdf
17 | 
18 | # Optionally declare the Python requirements required to build your docs
19 | python:
20 |    install:
21 |    - requirements: docs/requirements-docs.txt
22 | 


--------------------------------------------------------------------------------
/.yapfignore:
--------------------------------------------------------------------------------
1 | collect_env.py
2 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing to vLLM
 2 | 
 3 | Thank you for your interest in contributing to vLLM!
 4 | Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large.
 5 | There are several ways you can contribute to the project:
 6 | 
 7 | - Identify and report any issues or bugs.
 8 | - Request or add a new model.
 9 | - Suggest or implement new features.
10 | 
11 | However, remember that contributions aren't just about code.
12 | We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions.
13 | 
14 | Finally, one of the most impactful ways to support us is by raising awareness about vLLM.
15 | Talk about it in your blog posts, highlighting how it's driving your incredible projects.
16 | Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository.
17 | 
18 | 
19 | ## Setup for development
20 | 
21 | ### Build from source
22 | 
23 | ```bash
24 | pip install -r requirements.txt
25 | pip install -e .  # This may take several minutes.
26 | ```
27 | 
28 | ### Testing
29 | 
30 | ```bash
31 | pip install -r requirements-dev.txt
32 | 
33 | # Static type checking
34 | mypy
35 | # Unit tests
36 | pytest tests/
37 | ```
38 | **Note:** Currently, the repository does not pass the mypy tests.
39 | 
40 | 
41 | ## Contributing Guidelines
42 | 
43 | ### Issue Reporting
44 | 
45 | If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it.
46 | If not, please file a new issue, providing as much relevant information as possible.
47 | 
48 | ### Pull Requests & Code Reviews
49 | 
50 | Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE.md) for detailed guide for contribution.
51 | 
52 | ### Thank You
53 | 
54 | Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM.
55 | Your contributions make vLLM a great tool for everyone!
56 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include requirements.txt
3 | include CMakeLists.txt
4 | 
5 | recursive-include cmake *
6 | recursive-include csrc *
7 | 


--------------------------------------------------------------------------------
/benchmarks/README.md:
--------------------------------------------------------------------------------
1 | # Benchmarking vLLM
2 | 
3 | ## Downloading the ShareGPT dataset
4 | 
5 | You can download the dataset by running:
6 | ```bash
7 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
8 | ```
9 | 


--------------------------------------------------------------------------------
/benchmarks/launch_tgi_server.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | PORT=8000
 4 | MODEL=$1
 5 | TOKENS=$2
 6 | 
 7 | docker run --gpus all --shm-size 1g -p $PORT:80 \
 8 |            -v $PWD/data:/data \
 9 |            ghcr.io/huggingface/text-generation-inference:1.4.0 \
10 |            --model-id $MODEL \
11 |            --sharded false  \
12 |            --max-input-length 1024 \
13 |            --max-total-tokens 2048 \
14 |            --max-best-of 5 \
15 |            --max-concurrent-requests 5000 \
16 |            --max-batch-total-tokens $TOKENS
17 | 


--------------------------------------------------------------------------------
/cmake/hipify.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | #
 4 | # A command line tool for running pytorch's hipify preprocessor on CUDA
 5 | # source files.
 6 | #
 7 | # See https://github.com/ROCm/hipify_torch
 8 | # and <torch install dir>/utils/hipify/hipify_python.py
 9 | #
10 | 
11 | import argparse
12 | import shutil
13 | import os
14 | 
15 | from torch.utils.hipify.hipify_python import hipify
16 | 
17 | if __name__ == '__main__':
18 |     parser = argparse.ArgumentParser()
19 | 
20 |     # Project directory where all the source + include files live.
21 |     parser.add_argument(
22 |         "-p",
23 |         "--project_dir",
24 |         help="The project directory.",
25 |     )
26 | 
27 |     # Directory where hipified files are written.
28 |     parser.add_argument(
29 |         "-o",
30 |         "--output_dir",
31 |         help="The output directory.",
32 |     )
33 | 
34 |     # Source files to convert.
35 |     parser.add_argument("sources",
36 |                         help="Source files to hipify.",
37 |                         nargs="*",
38 |                         default=[])
39 | 
40 |     args = parser.parse_args()
41 | 
42 |     # Limit include scope to project_dir only
43 |     includes = [os.path.join(args.project_dir, '*')]
44 | 
45 |     # Get absolute path for all source files.
46 |     extra_files = [os.path.abspath(s) for s in args.sources]
47 | 
48 |     # Copy sources from project directory to output directory.
49 |     # The directory might already exist to hold object files so we ignore that.
50 |     shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True)
51 | 
52 |     hipify_result = hipify(project_directory=args.project_dir,
53 |                            output_directory=args.output_dir,
54 |                            header_include_dirs=[],
55 |                            includes=includes,
56 |                            extra_files=extra_files,
57 |                            show_detailed=True,
58 |                            is_pytorch_extension=True,
59 |                            hipify_extra_files_only=True)
60 | 
61 |     hipified_sources = []
62 |     for source in args.sources:
63 |         s_abs = os.path.abspath(source)
64 |         hipified_s_abs = (hipify_result[s_abs].hipified_path if
65 |                           (s_abs in hipify_result
66 |                            and hipify_result[s_abs].hipified_path is not None)
67 |                           else s_abs)
68 |         hipified_sources.append(hipified_s_abs)
69 | 
70 |     assert (len(hipified_sources) == len(args.sources))
71 | 
72 |     # Print hipified source files.
73 |     print("\n".join(hipified_sources))
74 | 


--------------------------------------------------------------------------------
/csrc/attention/attention_dtypes.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "attention_generic.cuh"
4 | #include "dtype_float16.cuh"
5 | #include "dtype_float32.cuh"
6 | #include "dtype_bfloat16.cuh"
7 | #include "dtype_fp8_e5m2.cuh"
8 | 


--------------------------------------------------------------------------------
/csrc/attention/attention_generic.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
 3 |  * Copyright (c) 2023, The vLLM team.
 4 |  * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License");
 7 |  * you may not use this file except in compliance with the License.
 8 |  * You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | #pragma once
19 | 
20 | #include <stdint.h>
21 | 
22 | namespace vllm {
23 | 
24 | // A vector type to store Q, K, V elements.
25 | template<typename T, int VEC_SIZE>
26 | struct Vec {};
27 | 
28 | // A vector type to store FP32 accumulators.
29 | template<typename T>
30 | struct FloatVec {};
31 | 
32 | // Template vector operations.
33 | template<typename Acc, typename A, typename B>
34 | inline __device__ Acc mul(A a, B b);
35 | 
36 | template<typename T>
37 | inline __device__ float sum(T v);
38 | 
39 | template<typename T>
40 | inline __device__ float dot(T a, T b) {
41 |   return sum(mul<T, T, T>(a, b));
42 | }
43 | 
44 | template<typename A, typename T>
45 | inline __device__ float dot(T a, T b) {
46 |   return sum(mul<A, T, T>(a, b));
47 | }
48 | 
49 | template<typename T>
50 | inline __device__ void zero(T& dst) {
51 |   constexpr int WORDS = sizeof(T) / 4;
52 |   union {
53 |     T raw;
54 |     uint32_t words[WORDS];
55 |   } tmp;
56 | 
57 | #pragma unroll
58 |   for (int ii = 0; ii < WORDS; ++ii) {
59 |     tmp.words[ii] = 0u;
60 |   }
61 |   dst = tmp.raw;
62 | }
63 | 
64 | } // namespace vllm
65 | 


--------------------------------------------------------------------------------
/csrc/attention/attention_utils.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
 3 |  * Copyright (c) 2023, The vLLM team.
 4 |  * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License");
 7 |  * you may not use this file except in compliance with the License.
 8 |  * You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | #pragma once
19 | 
20 | #include "../cuda_compat.h"
21 | #include "attention_dtypes.h"
22 | 
23 | #include <float.h>
24 | #include <type_traits>
25 | 
26 | namespace vllm {
27 | 
28 | // Q*K^T operation.
29 | template<int THREAD_GROUP_SIZE, typename Vec, int N>
30 | inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
31 |   using A_vec = typename FloatVec<Vec>::Type;
32 |   // Compute the parallel products for Q*K^T (treat vector lanes separately).
33 |   A_vec qk_vec = mul<A_vec, Vec, Vec>(q[0], k[0]);
34 | #pragma unroll
35 |   for (int ii = 1; ii < N; ++ii) {
36 |     qk_vec = fma(q[ii], k[ii], qk_vec);
37 |   }
38 | 
39 |   // Finalize the reduction across lanes.
40 |   float qk = sum(qk_vec);
41 | #pragma unroll
42 |   for (int mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) {
43 |     qk += VLLM_SHFL_XOR_SYNC(qk, mask);
44 |   }
45 |   return qk;
46 | }
47 | 
48 | template<typename T, int THREAD_GROUP_SIZE>
49 | struct Qk_dot {
50 |   template<typename Vec, int N>
51 |   static inline __device__ float dot(const Vec (&q)[N], const Vec (&k)[N]) {
52 |     return qk_dot_<THREAD_GROUP_SIZE>(q, k);
53 |   }
54 | };
55 | 
56 | } // namespace vllm
57 | 


--------------------------------------------------------------------------------
/csrc/attention/dtype_fp8_e5m2.cuh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "attention_generic.cuh"
 4 | 
 5 | #include <stdint.h>
 6 | #ifdef ENABLE_FP8_E5M2
 7 | #include <cuda_fp8.h>
 8 | #endif
 9 | 
10 | namespace vllm {
11 | #ifdef ENABLE_FP8_E5M2
12 | // fp8 vector types for quantization of kv cache
13 | 
14 | template<>
15 | struct Vec<uint8_t, 1> {
16 |     using Type = uint8_t;
17 | };
18 | 
19 | template<>
20 | struct Vec<uint8_t, 2> {
21 |     using Type = uint16_t;
22 | };
23 | 
24 | template<>
25 | struct Vec<uint8_t, 4> {
26 |     using Type = uint32_t;
27 | };
28 | 
29 | template<>
30 | struct Vec<uint8_t, 8> {
31 |     using Type = uint2;
32 | };
33 | #endif // ENABLE_FP8_E5M2
34 | 
35 | } // namespace vllm
36 | 


--------------------------------------------------------------------------------
/csrc/cache.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <torch/extension.h>
 4 | 
 5 | #include <map>
 6 | #include <vector>
 7 | 
 8 | void swap_blocks(
 9 |   torch::Tensor& src,
10 |   torch::Tensor& dst,
11 |   const std::map<int64_t, int64_t>& block_mapping);
12 | 
13 | void copy_blocks(
14 |   std::vector<torch::Tensor>& key_caches,
15 |   std::vector<torch::Tensor>& value_caches,
16 |   const std::map<int64_t, std::vector<int64_t>>& block_mapping);
17 | 
18 | void reshape_and_cache(
19 |   torch::Tensor& key,
20 |   torch::Tensor& value,
21 |   torch::Tensor& key_cache,
22 |   torch::Tensor& value_cache,
23 |   torch::Tensor& slot_mapping,
24 |   const std::string& kv_cache_dtype);
25 | 
26 | // Just for unittest
27 | void convert_fp8_e5m2(
28 |   torch::Tensor& src_cache,
29 |   torch::Tensor& dst_cache);
30 | 


--------------------------------------------------------------------------------
/csrc/cuda_compat.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #ifdef USE_ROCM
 4 | #include <hip/hip_runtime.h>
 5 | #endif
 6 | 
 7 | #ifndef USE_ROCM
 8 |   #define WARP_SIZE 32
 9 | #else
10 |   #define WARP_SIZE warpSize
11 | #endif
12 | 
13 | #ifndef USE_ROCM
14 |   #define VLLM_LDG(arg) __ldg(arg)
15 | #else
16 |   #define VLLM_LDG(arg) *(arg)
17 | #endif
18 | 
19 | #ifndef USE_ROCM
20 |   #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor_sync(uint32_t(-1), var, lane_mask)
21 | #else
22 |   #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask)
23 | #endif
24 | 
25 | #ifndef USE_ROCM
26 |   #define VLLM_SHFL_SYNC(var, src_lane) __shfl_sync(uint32_t(-1), var, src_lane)
27 | #else
28 |   #define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane)
29 | #endif
30 | 
31 | #ifndef USE_ROCM
32 |   #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \
33 |     cudaFuncSetAttribute(FUNC, cudaFuncAttributeMaxDynamicSharedMemorySize, VAL)
34 | #else
35 |   #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \
36 |     hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL)
37 | #endif
38 | 
39 | 


--------------------------------------------------------------------------------
/csrc/cuda_utils.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <torch/extension.h>
 4 | 
 5 | int get_device_attribute(
 6 |     int attribute,
 7 |     int device_id);
 8 | 
 9 | int get_max_shared_memory_per_block_device_attribute(
10 |     int device_id);
11 | 


--------------------------------------------------------------------------------
/csrc/cuda_utils_kernels.cu:
--------------------------------------------------------------------------------
 1 | #ifdef USE_ROCM
 2 |   #include <hip/hip_runtime.h>
 3 |   #include <hip/hip_runtime_api.h>
 4 | #endif
 5 | int get_device_attribute(
 6 |     int attribute,
 7 |     int device_id)
 8 | {
 9 |     int device, value;
10 |     if (device_id < 0) {
11 |         cudaGetDevice(&device);
12 |     }
13 |     else {
14 |         device = device_id;
15 |     }
16 |     cudaDeviceGetAttribute(&value, static_cast<cudaDeviceAttr>(attribute), device);
17 |     return value;
18 | }
19 | 
20 | 
21 | int get_max_shared_memory_per_block_device_attribute(
22 |     int device_id)
23 | {
24 | int attribute;    
25 | // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html
26 | // cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74
27 | 
28 | #ifdef USE_ROCM
29 |     attribute = hipDeviceAttributeMaxSharedMemoryPerBlock;
30 | #else
31 |     attribute = cudaDevAttrMaxSharedMemoryPerBlockOptin;
32 | #endif
33 | 
34 |     return get_device_attribute(attribute, device_id);
35 | }
36 | 


--------------------------------------------------------------------------------
/csrc/dispatch_utils.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Adapted from
 3 |  * https://github.com/pytorch/pytorch/blob/v2.0.1/aten/src/ATen/Dispatch.h
 4 |  */
 5 | #pragma once
 6 | 
 7 | #include <torch/extension.h>
 8 | 
 9 | #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)              \
10 |   AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)      \
11 |   AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)       \
12 |   AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
13 | 
14 | #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...)             \
15 |   AT_DISPATCH_SWITCH(                                             \
16 |     TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
17 | 
18 | #define VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(...)     \
19 |   AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)      \
20 |   AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)       \
21 |   AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)   \
22 |   AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)
23 | 
24 | #define VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(TYPE, NAME, ...)           \
25 |   AT_DISPATCH_SWITCH(                                                    \
26 |     TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(__VA_ARGS__))
27 |     
28 | #define VLLM_DISPATCH_CASE_INTEGRAL_TYPES(...)             \
29 |   AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)      \
30 |   AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__)      \
31 |   AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__)     \
32 |   AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__)       \
33 |   AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)
34 | 
35 | #define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...)             \
36 |   AT_DISPATCH_SWITCH(                                             \
37 |     TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__))
38 | 


--------------------------------------------------------------------------------
/csrc/moe/moe_ops.cpp:
--------------------------------------------------------------------------------
1 | #include "moe_ops.h"
2 | 
3 | #include <torch/extension.h>
4 | 
5 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
6 |   m.def("topk_softmax", &topk_softmax, "Apply topk softmax to the gating outputs.");
7 | }
8 | 


--------------------------------------------------------------------------------
/csrc/moe/moe_ops.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <torch/extension.h>
 4 | 
 5 | void topk_softmax(
 6 |   torch::Tensor& topk_weights,
 7 |   torch::Tensor& topk_indices,
 8 |   torch::Tensor& token_expert_indices,
 9 |   torch::Tensor& gating_output);
10 | 


--------------------------------------------------------------------------------
/csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu:
--------------------------------------------------------------------------------
1 | #include "bgmv_config.h"
2 | #include "bgmv_impl.cuh"
3 | 
4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_bfloat16, nv_bfloat16)
5 | 


--------------------------------------------------------------------------------
/csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu:
--------------------------------------------------------------------------------
1 | #include "bgmv_config.h"
2 | #include "bgmv_impl.cuh"
3 | 
4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_bfloat16, nv_half)
5 | 


--------------------------------------------------------------------------------
/csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu:
--------------------------------------------------------------------------------
1 | #include "bgmv_config.h"
2 | #include "bgmv_impl.cuh"
3 | 
4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_half, nv_bfloat16)
5 | 


--------------------------------------------------------------------------------
/csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu:
--------------------------------------------------------------------------------
1 | #include "bgmv_config.h"
2 | #include "bgmv_impl.cuh"
3 | 
4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_half, nv_half)
5 | 


--------------------------------------------------------------------------------
/csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu:
--------------------------------------------------------------------------------
1 | #include "bgmv_config.h"
2 | #include "bgmv_impl.cuh"
3 | 
4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, float, nv_bfloat16)
5 | 


--------------------------------------------------------------------------------
/csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu:
--------------------------------------------------------------------------------
1 | #include "bgmv_config.h"
2 | #include "bgmv_impl.cuh"
3 | 
4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, float, nv_half)
5 | 


--------------------------------------------------------------------------------
/csrc/punica/bgmv/bgmv_config.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | template <int feat_in, int feat_out, typename in_T, typename out_T,
 4 |           typename W_T>
 5 | void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
 6 |                  const W_T *__restrict__ W,
 7 |                  const int64_t *__restrict__ indicies, int64_t y_offset,
 8 |                  int64_t full_y_size, int64_t batch_size, int64_t num_layers,
 9 |                  int64_t layer_idx, float scale);
10 | 
11 | // clang-format off
12 | 
13 | #define FOR_BGMV_WIDE(f, in_T, out_T, W_T, narrow) \
14 |     f(in_T, out_T, W_T, narrow, 128) \
15 |     f(in_T, out_T, W_T, narrow, 256) \
16 |     f(in_T, out_T, W_T, narrow, 512) \
17 |     f(in_T, out_T, W_T, narrow, 768) \
18 |     f(in_T, out_T, W_T, narrow, 1024) \
19 |     f(in_T, out_T, W_T, narrow, 1280) \
20 |     f(in_T, out_T, W_T, narrow, 1728) \
21 |     f(in_T, out_T, W_T, narrow, 1792) \
22 |     f(in_T, out_T, W_T, narrow, 2048) \
23 |     f(in_T, out_T, W_T, narrow, 2560) \
24 |     f(in_T, out_T, W_T, narrow, 2752) \
25 |     f(in_T, out_T, W_T, narrow, 2816) \
26 |     f(in_T, out_T, W_T, narrow, 3072) \
27 |     f(in_T, out_T, W_T, narrow, 3456) \
28 |     f(in_T, out_T, W_T, narrow, 3584) \
29 |     f(in_T, out_T, W_T, narrow, 4096) \
30 |     f(in_T, out_T, W_T, narrow, 5120) \
31 |     f(in_T, out_T, W_T, narrow, 5504) \
32 |     f(in_T, out_T, W_T, narrow, 5632) \
33 |     f(in_T, out_T, W_T, narrow, 6144) \
34 |     f(in_T, out_T, W_T, narrow, 6912) \
35 |     f(in_T, out_T, W_T, narrow, 7168) \
36 |     f(in_T, out_T, W_T, narrow, 8192) \
37 |     f(in_T, out_T, W_T, narrow, 9216) \
38 |     f(in_T, out_T, W_T, narrow, 10240) \
39 |     f(in_T, out_T, W_T, narrow, 11008) \
40 |     f(in_T, out_T, W_T, narrow, 12288) \
41 |     f(in_T, out_T, W_T, narrow, 13696) \
42 |     f(in_T, out_T, W_T, narrow, 13824) \
43 |     f(in_T, out_T, W_T, narrow, 14336) \
44 |     f(in_T, out_T, W_T, narrow, 16384) \
45 |     f(in_T, out_T, W_T, narrow, 20480) \
46 |     f(in_T, out_T, W_T, narrow, 22016) \
47 |     f(in_T, out_T, W_T, narrow, 24576) \
48 |     f(in_T, out_T, W_T, narrow, 28672) \
49 |     f(in_T, out_T, W_T, narrow, 32000) \
50 |     f(in_T, out_T, W_T, narrow, 32256) \
51 |     f(in_T, out_T, W_T, narrow, 32512) \
52 |     f(in_T, out_T, W_T, narrow, 32768) \
53 |     f(in_T, out_T, W_T, narrow, 33024) \
54 |     f(in_T, out_T, W_T, narrow, 36864) \
55 |     f(in_T, out_T, W_T, narrow, 49152) \
56 | // Keep above in sync with vllm/lora/layers::SamplerWithLoRA
57 | 
58 | // Keep this in sync with vllm/config::LoRAConfig
59 | #define FOR_BGMV_WIDE_NARROW(f, in_T, out_T, W_T) \
60 |     FOR_BGMV_WIDE(f, in_T, out_T, W_T, 8)  \
61 |     FOR_BGMV_WIDE(f, in_T, out_T, W_T, 16) \
62 |     FOR_BGMV_WIDE(f, in_T, out_T, W_T, 32) \
63 |     FOR_BGMV_WIDE(f, in_T, out_T, W_T, 64)
64 | 
65 | // clang-format on
66 | 


--------------------------------------------------------------------------------
/csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu:
--------------------------------------------------------------------------------
1 | #include "bgmv_config.h"
2 | #include "bgmv_impl.cuh"
3 | 
4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_bfloat16, nv_bfloat16)
5 | 


--------------------------------------------------------------------------------
/csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu:
--------------------------------------------------------------------------------
1 | #include "bgmv_config.h"
2 | #include "bgmv_impl.cuh"
3 | 
4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_bfloat16, nv_half)
5 | 


--------------------------------------------------------------------------------
/csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu:
--------------------------------------------------------------------------------
1 | #include "bgmv_config.h"
2 | #include "bgmv_impl.cuh"
3 | 
4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_half, nv_bfloat16)
5 | 


--------------------------------------------------------------------------------
/csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu:
--------------------------------------------------------------------------------
1 | #include "bgmv_config.h"
2 | #include "bgmv_impl.cuh"
3 | 
4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_half, nv_half)
5 | 


--------------------------------------------------------------------------------
/csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu:
--------------------------------------------------------------------------------
1 | #include "bgmv_config.h"
2 | #include "bgmv_impl.cuh"
3 | 
4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, float, nv_bfloat16)
5 | 


--------------------------------------------------------------------------------
/csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu:
--------------------------------------------------------------------------------
1 | #include "bgmv_config.h"
2 | #include "bgmv_impl.cuh"
3 | 
4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, float, nv_half)
5 | 


--------------------------------------------------------------------------------
/csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu:
--------------------------------------------------------------------------------
1 | #include "bgmv_config.h"
2 | #include "bgmv_impl.cuh"
3 | 
4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_bfloat16, nv_bfloat16)
5 | 


--------------------------------------------------------------------------------
/csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu:
--------------------------------------------------------------------------------
1 | #include "bgmv_config.h"
2 | #include "bgmv_impl.cuh"
3 | 
4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_bfloat16, nv_half)
5 | 


--------------------------------------------------------------------------------
/csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu:
--------------------------------------------------------------------------------
1 | #include "bgmv_config.h"
2 | #include "bgmv_impl.cuh"
3 | 
4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_half, nv_bfloat16)
5 | 


--------------------------------------------------------------------------------
/csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu:
--------------------------------------------------------------------------------
1 | #include "bgmv_config.h"
2 | #include "bgmv_impl.cuh"
3 | 
4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_half, nv_half)
5 | 


--------------------------------------------------------------------------------
/csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu:
--------------------------------------------------------------------------------
1 | #include "bgmv_config.h"
2 | #include "bgmv_impl.cuh"
3 | 
4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, float, nv_bfloat16)
5 | 


--------------------------------------------------------------------------------
/csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu:
--------------------------------------------------------------------------------
1 | #include "bgmv_config.h"
2 | #include "bgmv_impl.cuh"
3 | 
4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, float, nv_half)
5 | 


--------------------------------------------------------------------------------
/csrc/punica/bgmv/generator.py:
--------------------------------------------------------------------------------
 1 | DTYPES = ["fp16", "bf16", "fp32"]
 2 | DTYPE_MAP = {
 3 |     "fp16": "nv_half",
 4 |     "bf16": "nv_bfloat16",
 5 |     "fp32": "float",
 6 | }
 7 | 
 8 | TEMPLATE = """
 9 | #include "bgmv_config.h"
10 | #include "bgmv_impl.cuh"
11 | 
12 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, {input_dtype}, {output_dtype}, {weight_dtype})
13 | """.lstrip()  # noqa: E501
14 | 
15 | for input_dtype in DTYPES:
16 |     for output_dtype in DTYPES:
17 |         for weight_dtype in DTYPES:
18 |             if weight_dtype == "fp32":
19 |                 # FP32 weights are not supported.
20 |                 continue
21 |             kernel_definition = TEMPLATE.format(
22 |                 input_dtype=DTYPE_MAP[input_dtype],
23 |                 output_dtype=DTYPE_MAP[output_dtype],
24 |                 weight_dtype=DTYPE_MAP[weight_dtype])
25 |             filename = f"bgmv_{input_dtype}_{output_dtype}_{weight_dtype}.cu"
26 |             with open(filename, "w") as f:
27 |                 f.write(kernel_definition)
28 | 


--------------------------------------------------------------------------------
/csrc/quantization/gptq/compat.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copied from https://github.com/turboderp/exllamav2
 3 | */
 4 | 
 5 | #ifndef _compat_cuh
 6 | #define _compat_cuh
 7 | 
 8 | namespace vllm {
 9 | namespace gptq {
10 | // atomicAdd for half types, to support CC < 7.x
11 | 
12 | __device__ __forceinline__ void atomicAdd_half(half* address, half val)
13 | {
14 |     unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));
15 |     unsigned int old = *address_as_ui;
16 |     unsigned int assumed;
17 | 
18 |     do
19 |     {
20 |         assumed = old;
21 |         __half_raw hsum;
22 |         hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
23 |         half tmpres = __hadd(hsum, val);
24 |         hsum = __half_raw(tmpres);
25 |         old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
26 |         old = atomicCAS(address_as_ui, assumed, old);
27 |     }
28 |     while (assumed != old);
29 | }
30 | 
31 | // atomicAdd for half2 types
32 | 
33 | __device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val)
34 | {
35 |     unsigned int* address_as_ui = (unsigned int*)address;
36 |     unsigned int old = *address_as_ui;
37 |     unsigned int assumed;
38 |     do
39 |     {
40 |         assumed = old;
41 |         half2 old_val = *((half2*)&old);
42 |         half2 new_val = __hadd2(old_val, val);
43 |         old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val));
44 |     }
45 |     while (assumed != old);
46 | }
47 | 
48 | //
49 | 
50 | #if defined(__CUDA_ARCH__) || defined(USE_ROCM)
51 | #if __CUDA_ARCH__ < 700 || defined(USE_ROCM)
52 | 
53 | __device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
54 | 
55 | #if __CUDA_ARCH__ < 600 || defined(USE_ROCM)
56 | __device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
57 | #endif
58 | 
59 | #endif
60 | #endif
61 | 
62 | }  // namespace gptq
63 | }  // namespace vllm
64 | #endif
65 | 


--------------------------------------------------------------------------------
/csrc/quantization/gptq/qdq_2.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copied from https://github.com/turboderp/exllamav2
 3 | */
 4 | 
 5 | #ifndef _qdq_2_cuh
 6 | #define _qdq_2_cuh
 7 | 
 8 | #include "qdq_util.cuh"
 9 | 
10 | namespace vllm {
11 | namespace gptq {
12 | 
13 | // Permutation:
14 | //
15 | // ffddbb99 77553311  eeccaa88 66442200
16 | 
17 | __forceinline__ __device__ void shuffle_2bit_16
18 | (
19 |     uint32_t* q,
20 |     int stride
21 | )
22 | {
23 |     uint32_t qa = q[0];
24 |     uint32_t qb = 0;
25 | 
26 |     #pragma unroll
27 |     for (int i = 0; i < 8; i++)
28 |     {
29 |         uint32_t qa0 = qa & 0x03;
30 |         uint32_t qa1 = (qa & 0x0c) >> 2;
31 |         qa >>= 4;
32 |         qb |= (qa1 << (i * 2 + 16));
33 |         qb |= (qa0 << (i * 2));
34 |     }
35 |     q[0] = qb;
36 | }
37 | 
38 | __forceinline__ __device__ void dequant_2bit_16
39 | (
40 |     const uint32_t q_0,
41 |     half2 (&dq)[8],
42 |     int stride,
43 |     const uint32_t zero
44 | )
45 | {
46 |     const uint32_t c0 = 0x64006400;
47 |     const half y4_  = __float2half_rn(1.0f /  4.0f);
48 |     const half y16_ = __float2half_rn(1.0f / 16.0f);
49 |     const half y64_ = __float2half_rn(1.0f / 64.0f);
50 |     const half2 y4  = __halves2half2(y4_,  y4_);
51 |     const half2 y16 = __halves2half2(y16_, y16_);
52 |     const half2 y64 = __halves2half2(y64_, y64_);
53 | 
54 |     const half_uint16 z1_(0xe400 | zero); // half(-1024.0f - zero);
55 |     const half z4_ = __hsub(__int2half_rn(-256), __int2half_rn(zero));
56 |     const half z16_ = __hsub(__int2half_rn(-64), __int2half_rn(zero));
57 |     const half z64_ = __hsub(__int2half_rn(-16), __int2half_rn(zero));
58 |     const half2 z1 = __half2half2(z1_.as_half);
59 |     const half2 z4 = __half2half2(z4_);
60 |     const half2 z16 = __half2half2(z16_);
61 |     const half2 z64 = __half2half2(z64_);
62 | 
63 |     uint32_t qa = q_0;
64 |     half2_uint32 q0((qa & 0x00030003) | c0); // half2(q[ 0], q[ 1])      + 1024
65 |     half2_uint32 q1((qa & 0x000c000c) | c0); // half2(q[ 2], q[ 3]) *  4 + 1024
66 |     half2_uint32 q2((qa & 0x00300030) | c0); // half2(q[ 4], q[ 5]) * 16 + 1024
67 |     half2_uint32 q3((qa & 0x00c000c0) | c0); // half2(q[ 6], q[ 7]) * 64 + 1024
68 |     qa >>= 8;
69 |     half2_uint32 q4((qa & 0x00030003) | c0); // half2(q[ 8], q[ 8])      + 1024
70 |     half2_uint32 q5((qa & 0x000c000c) | c0); // half2(q[10], q[11]) *  4 + 1024
71 |     half2_uint32 q6((qa & 0x00300030) | c0); // half2(q[12], q[13]) * 16 + 1024
72 |     half2_uint32 q7((qa & 0x00c000c0) | c0); // half2(q[14], q[15]) * 64 + 1024
73 | 
74 |     dq[0] = __hadd2(q0.as_half2, z1);
75 |     dq[1] = __hfma2(q1.as_half2, y4,  z4);
76 |     dq[2] = __hfma2(q2.as_half2, y16, z16);
77 |     dq[3] = __hfma2(q3.as_half2, y64, z64);
78 |     dq[4] = __hadd2(q4.as_half2, z1);
79 |     dq[5] = __hfma2(q5.as_half2, y4,  z4);
80 |     dq[6] = __hfma2(q6.as_half2, y16, z16);
81 |     dq[7] = __hfma2(q7.as_half2, y64, z64);
82 | }
83 | 
84 | }  // namespace gptq
85 | }  // namespace vllm
86 | 
87 | #endif
88 | 


--------------------------------------------------------------------------------
/csrc/quantization/gptq/qdq_8.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copied from https://github.com/turboderp/exllamav2
 3 | */
 4 | 
 5 | #ifndef _qdq_8_cuh
 6 | #define _qdq_8_cuh
 7 | 
 8 | #include "qdq_util.cuh"
 9 | 
10 | namespace vllm {
11 | namespace gptq {
12 | 
13 | __forceinline__ __device__ void shuffle_8bit_4
14 | (
15 |     uint32_t* q,
16 |     int stride
17 | )
18 | {
19 | }
20 | 
21 | __forceinline__ __device__ void dequant_8bit_8
22 | (
23 |     const uint32_t q_0,
24 |     const uint32_t q_1,
25 |     half2 (&dq)[4],
26 |     int stride,
27 |     const uint32_t zero
28 | )
29 | {
30 |     half dqh[8];
31 |     for (int i = 0; i < 4; i++) dqh[i    ] = dq_ns(exb(q_0, i * 8, 0xff), zero);
32 |     for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), zero);
33 | 
34 |     for (int i = 0; i < 4; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]);
35 | }
36 | 
37 | }  // namespace gptq
38 | }  // namespace vllm
39 | 
40 | #endif
41 | 


--------------------------------------------------------------------------------
/csrc/quantization/gptq/qdq_util.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copied from https://github.com/turboderp/exllamav2
 3 | */
 4 | 
 5 | #ifndef _qdq_util_cuh
 6 | #define _qdq_util_cuh
 7 | 
 8 | namespace vllm {
 9 | namespace gptq {
10 | 
11 | union half2_uint32
12 | {
13 |     uint32_t as_uint32;
14 |     half2 as_half2;
15 |     __device__ half2_uint32(uint32_t val) : as_uint32(val) {}
16 |     __device__ half2_uint32(half2 val) : as_half2(val) {}
17 | };
18 | 
19 | union half_uint16
20 | {
21 |     uint16_t as_uint16;
22 |     half as_half;
23 |     __device__ half_uint16(uint16_t val) : as_uint16(val) {}
24 |     __device__ half_uint16(half val) : as_half(val) {}
25 | };
26 | 
27 | // Max_scale premultiplied by 1/256
28 | 
29 | __forceinline__ __device__ half dq_scale(const int qs, const half max_scale)
30 | {
31 |     int qs_i = qs + 1;
32 |     half qs_h = __int2half_rn(qs_i * qs_i);
33 |     qs_h = __hmul(qs_h, max_scale);
34 |     return qs_h;
35 | }
36 | 
37 | __forceinline__ __device__ half dq(const int q, const int qzero, const half scale)
38 | {
39 |     return __hmul(__int2half_rn(q - qzero), scale);
40 | }
41 | 
42 | __forceinline__ __device__ half dq_ns(const int q, const int qzero)
43 | {
44 |     //return __hsub(__int2half_rn(q), __int2half_rn(qzero));
45 |     return __int2half_rn(q - qzero);
46 | }
47 | 
48 | __forceinline__ __device__ int exb(const uint32_t q, const int shift, const int mask)
49 | {
50 |     return (int)((q >> shift) & mask);
51 | }
52 | 
53 | __forceinline__ __device__ int exb(const uint32_t q1, const uint32_t q0, const int shift, const int mask)
54 | {
55 |     return (int)(__funnelshift_rc(q0, q1, shift) & mask);
56 | }
57 | 
58 | }  // namespace gptq
59 | }  // namespace vllm
60 | #endif
61 | 


--------------------------------------------------------------------------------
/csrc/reduction_utils.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/reduce_kernel_utils.cuh
 3 |  * Copyright (c) 2023, The vLLM team.
 4 |  * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License");
 7 |  * you may not use this file except in compliance with the License.
 8 |  * You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | #pragma once
19 | 
20 | #include "cuda_compat.h"
21 | 
22 | namespace vllm {
23 | 
24 | template<typename T>
25 | __inline__ __device__ T warpReduceSum(T val) {
26 | #pragma unroll
27 |   for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1)
28 |     val += VLLM_SHFL_XOR_SYNC(val, mask);
29 |   return val;
30 | }
31 | 
32 | __inline__ __device__ constexpr int _calculateLaneMask(int warp_size) {
33 |   return warp_size - 1;
34 | }
35 | 
36 | __inline__ __device__ constexpr int _calculateWidShift(int warp_size) {
37 |   return 5 + (warp_size >> 6);
38 | }
39 | 
40 | /* Calculate the sum of all elements in a block */
41 | template<typename T>
42 | __inline__ __device__ T blockReduceSum(T val) {
43 |   static __shared__ T shared[WARP_SIZE];
44 |   constexpr auto LANE_MASK = _calculateLaneMask(WARP_SIZE);
45 |   constexpr auto WID_SHIFT = _calculateWidShift(WARP_SIZE);
46 |   int lane = threadIdx.x & LANE_MASK;
47 |   int wid = threadIdx.x >> WID_SHIFT;
48 | 
49 |   val = warpReduceSum<T>(val);
50 | 
51 |   if (lane == 0)
52 |     shared[wid] = val;
53 | 
54 |   __syncthreads();
55 | 
56 |   // Modify from blockDim.x << 5 to blockDim.x / 32. to prevent
57 |   // blockDim.x is not divided by 32
58 |   val = (threadIdx.x < (blockDim.x / (WARP_SIZE * 1.0f))) ? shared[lane] : (T)(0.0f);
59 |   val = warpReduceSum<T>(val);
60 |   return val;
61 | }
62 | 
63 | } // namespace vllm
64 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
 1 | # vLLM documents
 2 | 
 3 | ## Build the docs
 4 | 
 5 | ```bash
 6 | # Install dependencies.
 7 | pip install -r requirements-docs.txt
 8 | 
 9 | # Build the docs.
10 | make clean
11 | make html
12 | ```
13 | 
14 | ## Open the docs with your browser
15 | 
16 | ```bash
17 | python -m http.server -d build/html/
18 | ```
19 | Launch your browser and open localhost:8000.
20 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | 
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | 	echo.
16 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | 	echo.installed, then set the SPHINXBUILD environment variable to point
18 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | 	echo.may add the Sphinx directory to PATH.
20 | 	echo.
21 | 	echo.If you don't have Sphinx installed, grab it from
22 | 	echo.https://www.sphinx-doc.org/
23 | 	exit /b 1
24 | )
25 | 
26 | if "%1" == "" goto help
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/docs/requirements-docs.txt:
--------------------------------------------------------------------------------
 1 | sphinx == 6.2.1
 2 | sphinx-book-theme == 1.0.1
 3 | sphinx-copybutton == 0.5.2
 4 | myst-parser == 2.0.0
 5 | sphinx-argparse
 6 | 
 7 | # packages to install to build the documentation
 8 | pydantic
 9 | -f https://download.pytorch.org/whl/cpu
10 | torch


--------------------------------------------------------------------------------
/docs/source/assets/kernel/k_vecs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/docs/source/assets/kernel/k_vecs.png


--------------------------------------------------------------------------------
/docs/source/assets/kernel/key.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/docs/source/assets/kernel/key.png


--------------------------------------------------------------------------------
/docs/source/assets/kernel/logits_vec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/docs/source/assets/kernel/logits_vec.png


--------------------------------------------------------------------------------
/docs/source/assets/kernel/q_vecs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/docs/source/assets/kernel/q_vecs.png


--------------------------------------------------------------------------------
/docs/source/assets/kernel/query.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/docs/source/assets/kernel/query.png


--------------------------------------------------------------------------------
/docs/source/assets/kernel/v_vec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/docs/source/assets/kernel/v_vec.png


--------------------------------------------------------------------------------
/docs/source/assets/kernel/value.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/docs/source/assets/kernel/value.png


--------------------------------------------------------------------------------
/docs/source/assets/logos/vllm-logo-only-light.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/docs/source/assets/logos/vllm-logo-only-light.png


--------------------------------------------------------------------------------
/docs/source/assets/logos/vllm-logo-text-dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/docs/source/assets/logos/vllm-logo-text-dark.png


--------------------------------------------------------------------------------
/docs/source/assets/logos/vllm-logo-text-light.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/docs/source/assets/logos/vllm-logo-text-light.png


--------------------------------------------------------------------------------
/docs/source/dev/engine/async_llm_engine.rst:
--------------------------------------------------------------------------------
1 | 
2 | AsyncLLMEngine
3 | =================================
4 | 
5 | .. autoclass:: vllm.engine.async_llm_engine.AsyncLLMEngine
6 |     :members: generate, abort
7 |     :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/dev/engine/engine_index.rst:
--------------------------------------------------------------------------------
 1 | vLLM Engine
 2 | =================================
 3 | 
 4 | .. automodule:: vllm.engine
 5 | .. currentmodule:: vllm.engine
 6 | 
 7 | .. toctree::
 8 |    :maxdepth: 2
 9 |    :caption: Engines
10 | 
11 |    llm_engine
12 |    async_llm_engine
13 | 
14 | 


--------------------------------------------------------------------------------
/docs/source/dev/engine/llm_engine.rst:
--------------------------------------------------------------------------------
1 | LLMEngine
2 | =================================
3 | 
4 | .. autoclass:: vllm.engine.llm_engine.LLMEngine
5 |     :members: add_request, abort_request, step
6 |     :show-inheritance:


--------------------------------------------------------------------------------
/docs/source/dev/sampling_params.rst:
--------------------------------------------------------------------------------
1 | Sampling Params
2 | ===============
3 | 
4 | .. automodule:: vllm.sampling_params.SamplingParams


--------------------------------------------------------------------------------
/docs/source/getting_started/installation.rst:
--------------------------------------------------------------------------------
 1 | .. _installation:
 2 | 
 3 | Installation
 4 | ============
 5 | 
 6 | vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries.
 7 | 
 8 | Requirements
 9 | ------------
10 | 
11 | * OS: Linux
12 | * Python: 3.8 -- 3.11
13 | * GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
14 | 
15 | Install with pip
16 | ----------------
17 | 
18 | You can install vLLM using pip:
19 | 
20 | .. code-block:: console
21 | 
22 |     $ # (Optional) Create a new conda environment.
23 |     $ conda create -n myenv python=3.9 -y
24 |     $ conda activate myenv
25 | 
26 |     $ # Install vLLM with CUDA 12.1.
27 |     $ pip install vllm
28 | 
29 | .. note::
30 | 
31 |     As of now, vLLM's binaries are compiled on CUDA 12.1 by default.
32 |     However, you can install vLLM with CUDA 11.8 by running:
33 | 
34 |     .. code-block:: console
35 | 
36 |         $ # Install vLLM with CUDA 11.8.
37 |         $ export VLLM_VERSION=0.2.4
38 |         $ export PYTHON_VERSION=39
39 |         $ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl
40 | 
41 |         $ # Re-install PyTorch with CUDA 11.8.
42 |         $ pip uninstall torch -y
43 |         $ pip install torch --upgrade --index-url https://download.pytorch.org/whl/cu118
44 | 
45 |         $ # Re-install xFormers with CUDA 11.8.
46 |         $ pip uninstall xformers -y
47 |         $ pip install --upgrade xformers --index-url https://download.pytorch.org/whl/cu118
48 | 
49 | 
50 | .. _build_from_source:
51 | 
52 | Build from source
53 | -----------------
54 | 
55 | You can also build and install vLLM from source:
56 | 
57 | .. code-block:: console
58 | 
59 |     $ git clone https://github.com/vllm-project/vllm.git
60 |     $ cd vllm
61 |     $ pip install -e .  # This may take 5-10 minutes.
62 | 
63 | .. tip::
64 |     If you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image.
65 | 
66 |     .. code-block:: console
67 | 
68 |         $ # Use `--ipc=host` to make sure the shared memory is large enough.
69 |         $ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3
70 | 
71 | .. note::
72 |     If you are developing the C++ backend of vLLM, consider building vLLM with
73 | 
74 |     .. code-block:: console
75 | 
76 |         $ python setup.py develop
77 | 
78 |     since it will give you incremental builds. The downside is that this method
79 |     is `deprecated by setuptools <https://github.com/pypa/setuptools/issues/917>`_.
80 | 


--------------------------------------------------------------------------------
/docs/source/quantization/fp8_e5m2_kv_cache.rst:
--------------------------------------------------------------------------------
 1 | .. _fp8_e5m2_kv_cache:
 2 | 
 3 | FP8 E5M2 KV Cache
 4 | ==================
 5 | 
 6 | The int8/int4 quantization scheme requires additional scale GPU memory storage, which reduces the expected GPU memory benefits.
 7 | The FP8 data format retains 2~3 mantissa bits and can convert float/fp16/bflaot16 and fp8 to each other.
 8 | 
 9 | Here is an example of how to enable this feature:
10 | 
11 | .. code-block:: python
12 | 
13 |     from vllm import LLM, SamplingParams
14 |     # Sample prompts.
15 |     prompts = [
16 |         "Hello, my name is",
17 |         "The president of the United States is",
18 |         "The capital of France is",
19 |         "The future of AI is",
20 |     ]
21 |     # Create a sampling params object.
22 |     sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
23 |     # Create an LLM.
24 |     llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8_e5m2")
25 |     # Generate texts from the prompts. The output is a list of RequestOutput objects
26 |     # that contain the prompt, generated text, and other information.
27 |     outputs = llm.generate(prompts, sampling_params)
28 |     # Print the outputs.
29 |     for output in outputs:
30 |         prompt = output.prompt
31 |         generated_text = output.outputs[0].text
32 |         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
33 | 
34 | 


--------------------------------------------------------------------------------
/docs/source/serving/deploying_with_bentoml.rst:
--------------------------------------------------------------------------------
1 | .. _deploying_with_bentoml:
2 | 
3 | Deploying with BentoML
4 | ======================
5 | 
6 | `BentoML <https://github.com/bentoml/BentoML>`_ allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-complicant image and deploy it on Kubernetes.
7 | 
8 | For details, see the tutorial `vLLM inference in the BentoML documentation <https://docs.bentoml.com/en/latest/use-cases/large-language-models/vllm.html>`_.


--------------------------------------------------------------------------------
/docs/source/serving/deploying_with_docker.rst:
--------------------------------------------------------------------------------
 1 | .. _deploying_with_docker:
 2 | 
 3 | Deploying with Docker
 4 | ============================
 5 | 
 6 | vLLM offers official docker image for deployment.
 7 | The image can be used to run OpenAI compatible server.
 8 | The image is available on Docker Hub as `vllm/vllm-openai <https://hub.docker.com/r/vllm/vllm-openai/tags>`_.
 9 | 
10 | .. code-block:: console
11 | 
12 |     $ docker run --runtime nvidia --gpus all \
13 |         -v ~/.cache/huggingface:/root/.cache/huggingface \
14 |         --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
15 |         -p 8000:8000 \
16 |         --ipc=host \
17 |         vllm/vllm-openai:latest \
18 |         --model mistralai/Mistral-7B-v0.1
19 | 
20 | 
21 | .. note::
22 | 
23 |         You can either use the ``ipc=host`` flag or ``--shm-size`` flag to allow the
24 |         container to access the host's shared memory. vLLM uses PyTorch, which uses shared
25 |         memory to share data between processes under the hood, particularly for tensor parallel inference.
26 | 
27 | 
28 | You can build and run vLLM from source via the provided dockerfile. To build vLLM:
29 | 
30 | .. code-block:: console
31 | 
32 |     $ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
33 | 
34 | 
35 | .. note::
36 | 
37 |         By default vLLM will build for all GPU types for widest distribution. If you are just building for the
38 |         current GPU type the machine is running on, you can add the argument ``--build-arg torch_cuda_arch_list=""``
39 |         for vLLM to find the current GPU type and build for that.
40 | 
41 | 
42 | To run vLLM:
43 | 
44 | .. code-block:: console
45 | 
46 |     $ docker run --runtime nvidia --gpus all \
47 |         -v ~/.cache/huggingface:/root/.cache/huggingface \
48 |         -p 8000:8000 \
49 |         --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
50 |         vllm/vllm-openai <args...>
51 | 
52 | 


--------------------------------------------------------------------------------
/docs/source/serving/deploying_with_kserve.rst:
--------------------------------------------------------------------------------
1 | .. _deploying_with_kserve:
2 | 
3 | Deploying with KServe
4 | ============================
5 | 
6 | vLLM can be deployed with `KServe <https://github.com/kserve/kserve>`_ on Kubernetes for highly scalable distributed model serving.
7 | 
8 | Please see `this guide <https://kserve.github.io/website/latest/modelserving/v1beta1/llm/vllm/>`_ for more details on using vLLM with KServe.
9 | 


--------------------------------------------------------------------------------
/docs/source/serving/deploying_with_triton.rst:
--------------------------------------------------------------------------------
1 | .. _deploying_with_triton:
2 | 
3 | Deploying with NVIDIA Triton
4 | ============================
5 | 
6 | The `Triton Inference Server <https://github.com/triton-inference-server>`_ hosts a tutorial demonstrating how to quickly deploy a simple `facebook/opt-125m <https://huggingface.co/facebook/opt-125m>`_ model using vLLM. Please see `Deploying a vLLM model in Triton <https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton>`_ for more details.
7 | 


--------------------------------------------------------------------------------
/docs/source/serving/distributed_serving.rst:
--------------------------------------------------------------------------------
 1 | .. _distributed_serving:
 2 | 
 3 | Distributed Inference and Serving
 4 | =================================
 5 | 
 6 | vLLM supports distributed tensor-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm <https://arxiv.org/pdf/1909.08053.pdf>`_. We manage the distributed runtime with `Ray <https://github.com/ray-project/ray>`_. To run distributed inference, install Ray with:
 7 | 
 8 | .. code-block:: console
 9 | 
10 |     $ pip install ray
11 | 
12 | To run multi-GPU inference with the :code:`LLM` class, set the :code:`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs:
13 | 
14 | .. code-block:: python
15 | 
16 |     from vllm import LLM
17 |     llm = LLM("facebook/opt-13b", tensor_parallel_size=4)
18 |     output = llm.generate("San Franciso is a")
19 | 
20 | To run multi-GPU serving, pass in the :code:`--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs:
21 | 
22 | .. code-block:: console
23 | 
24 |     $ python -m vllm.entrypoints.api_server \
25 |     $     --model facebook/opt-13b \
26 |     $     --tensor-parallel-size 4
27 | 
28 | To scale vLLM beyond a single machine, start a `Ray runtime <https://docs.ray.io/en/latest/ray-core/starting-ray.html>`_ via CLI before running vLLM:
29 | 
30 | .. code-block:: console
31 | 
32 |     $ # On head node
33 |     $ ray start --head
34 | 
35 |     $ # On worker nodes
36 |     $ ray start --address=<ray-head-address>
37 | 
38 | After that, you can run inference and serving on multiple machines by launching the vLLM process on the head node by setting :code:`tensor_parallel_size` to the number of GPUs to be the total number of GPUs across all machines.


--------------------------------------------------------------------------------
/docs/source/serving/integrations.rst:
--------------------------------------------------------------------------------
 1 | Integrations
 2 | ------------
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 1
 6 | 
 7 |    run_on_sky
 8 |    deploying_with_kserve
 9 |    deploying_with_triton
10 |    deploying_with_bentoml
11 |    serving_with_langchain
12 | 


--------------------------------------------------------------------------------
/docs/source/serving/metrics.rst:
--------------------------------------------------------------------------------
 1 | Production Metrics
 2 | ==================
 3 | 
 4 | vLLM exposes a number of metrics that can be used to monitor the health of the
 5 | system. These metrics are exposed via the `/metrics` endpoint on the vLLM
 6 | OpenAI compatible API server.
 7 | 
 8 | The following metrics are exposed:
 9 | 
10 | .. literalinclude:: ../../../vllm/engine/metrics.py
11 |     :language: python
12 |     :start-after: begin-metrics-definitions
13 |     :end-before: end-metrics-definitions
14 | 


--------------------------------------------------------------------------------
/docs/source/serving/run_on_sky.rst:
--------------------------------------------------------------------------------
 1 | .. _on_cloud:
 2 | 
 3 | Running on clouds with SkyPilot
 4 | ===============================
 5 | 
 6 | .. raw:: html
 7 | 
 8 |     <p align="center">
 9 |         <img src="https://imgur.com/yxtzPEu.png" alt="vLLM"/>
10 |     </p>
11 | 
12 | vLLM can be run on the cloud to scale to multiple GPUs with `SkyPilot <https://github.com/skypilot-org/skypilot>`__, an open-source framework for running LLMs on any cloud.
13 | 
14 | To install SkyPilot and setup your cloud credentials, run:
15 | 
16 | .. code-block:: console
17 | 
18 |     $ pip install skypilot
19 |     $ sky check
20 | 
21 | See the vLLM SkyPilot YAML for serving, `serving.yaml <https://github.com/skypilot-org/skypilot/blob/master/llm/vllm/serve.yaml>`__.
22 | 
23 | .. code-block:: yaml
24 | 
25 |     resources:
26 |         accelerators: A100
27 | 
28 |     envs:
29 |         MODEL_NAME: decapoda-research/llama-13b-hf
30 |         TOKENIZER: hf-internal-testing/llama-tokenizer
31 | 
32 |     setup: |
33 |         conda create -n vllm python=3.9 -y
34 |         conda activate vllm
35 |         git clone https://github.com/vllm-project/vllm.git
36 |         cd vllm
37 |         pip install .
38 |         pip install gradio
39 | 
40 |     run: |
41 |         conda activate vllm
42 |         echo 'Starting vllm api server...'
43 |         python -u -m vllm.entrypoints.api_server \
44 |                         --model $MODEL_NAME \
45 |                         --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
46 |                         --tokenizer $TOKENIZER 2>&1 | tee api_server.log &
47 |         echo 'Waiting for vllm api server to start...'
48 |         while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
49 |         echo 'Starting gradio server...'
50 |         python vllm/examples/gradio_webserver.py
51 | 
52 | Start the serving the LLaMA-13B model on an A100 GPU:
53 | 
54 | .. code-block:: console
55 | 
56 |     $ sky launch serving.yaml
57 | 
58 | Check the output of the command. There will be a shareable gradio link (like the last line of the following). Open it in your browser to use the LLaMA model to do the text completion.
59 | 
60 | .. code-block:: console
61 | 
62 |     (task, pid=7431) Running on public URL: https://<gradio-hash>.gradio.live
63 | 
64 | **Optional**: Serve the 65B model instead of the default 13B and use more GPU:
65 | 
66 | .. code-block:: console
67 | 
68 |     sky launch -c vllm-serve-new -s serve.yaml --gpus A100:8 --env MODEL_NAME=decapoda-research/llama-65b-hf
69 | 
70 | 


--------------------------------------------------------------------------------
/docs/source/serving/serving_with_langchain.rst:
--------------------------------------------------------------------------------
 1 | .. _run_on_langchain:
 2 | 
 3 | Serving with Langchain
 4 | ============================
 5 | 
 6 | vLLM is also available via `Langchain <https://github.com/langchain-ai/langchain>`_ .
 7 | 
 8 | To install langchain, run
 9 | 
10 | .. code-block:: console
11 | 
12 |     $ pip install langchain langchain_community -q
13 | 
14 | To run inference on a single or multiple GPUs, use ``VLLM`` class from ``langchain``.
15 | 
16 | .. code-block:: python
17 | 
18 |     from langchain_community.llms import VLLM
19 | 
20 |     llm = VLLM(model="mosaicml/mpt-7b",
21 |                trust_remote_code=True,  # mandatory for hf models
22 |                max_new_tokens=128,
23 |                top_k=10,
24 |                top_p=0.95,
25 |                temperature=0.8,
26 |                # tensor_parallel_size=... # for distributed inference
27 |     )
28 | 
29 |     print(llm("What is the capital of France ?"))
30 | 
31 | Please refer to this `Tutorial <https://python.langchain.com/docs/integrations/llms/vllm>`_ for more details.
32 | 


--------------------------------------------------------------------------------
/examples/api_client.py:
--------------------------------------------------------------------------------
 1 | """Example Python client for vllm.entrypoints.api_server"""
 2 | 
 3 | import argparse
 4 | import json
 5 | from typing import Iterable, List
 6 | 
 7 | import requests
 8 | 
 9 | 
10 | def clear_line(n: int = 1) -> None:
11 |     LINE_UP = '\033[1A'
12 |     LINE_CLEAR = '\x1b[2K'
13 |     for _ in range(n):
14 |         print(LINE_UP, end=LINE_CLEAR, flush=True)
15 | 
16 | 
17 | def post_http_request(prompt: str,
18 |                       api_url: str,
19 |                       n: int = 1,
20 |                       stream: bool = False) -> requests.Response:
21 |     headers = {"User-Agent": "Test Client"}
22 |     pload = {
23 |         "prompt": prompt,
24 |         "n": n,
25 |         "use_beam_search": True,
26 |         "temperature": 0.0,
27 |         "max_tokens": 16,
28 |         "stream": stream,
29 |     }
30 |     response = requests.post(api_url, headers=headers, json=pload, stream=True)
31 |     return response
32 | 
33 | 
34 | def get_streaming_response(response: requests.Response) -> Iterable[List[str]]:
35 |     for chunk in response.iter_lines(chunk_size=8192,
36 |                                      decode_unicode=False,
37 |                                      delimiter=b"\0"):
38 |         if chunk:
39 |             data = json.loads(chunk.decode("utf-8"))
40 |             output = data["text"]
41 |             yield output
42 | 
43 | 
44 | def get_response(response: requests.Response) -> List[str]:
45 |     data = json.loads(response.content)
46 |     output = data["text"]
47 |     return output
48 | 
49 | 
50 | if __name__ == "__main__":
51 |     parser = argparse.ArgumentParser()
52 |     parser.add_argument("--host", type=str, default="localhost")
53 |     parser.add_argument("--port", type=int, default=8000)
54 |     parser.add_argument("--n", type=int, default=4)
55 |     parser.add_argument("--prompt", type=str, default="San Francisco is a")
56 |     parser.add_argument("--stream", action="store_true")
57 |     args = parser.parse_args()
58 |     prompt = args.prompt
59 |     api_url = f"http://{args.host}:{args.port}/generate"
60 |     n = args.n
61 |     stream = args.stream
62 | 
63 |     print(f"Prompt: {prompt!r}\n", flush=True)
64 |     response = post_http_request(prompt, api_url, n, stream)
65 | 
66 |     if stream:
67 |         num_printed_lines = 0
68 |         for h in get_streaming_response(response):
69 |             clear_line(num_printed_lines)
70 |             num_printed_lines = 0
71 |             for i, line in enumerate(h):
72 |                 num_printed_lines += 1
73 |                 print(f"Beam candidate {i}: {line!r}", flush=True)
74 |     else:
75 |         output = get_response(response)
76 |         for i, line in enumerate(output):
77 |             print(f"Beam candidate {i}: {line!r}", flush=True)
78 | 


--------------------------------------------------------------------------------
/examples/gradio_webserver.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import json
 3 | 
 4 | import gradio as gr
 5 | import requests
 6 | 
 7 | 
 8 | def http_bot(prompt):
 9 |     headers = {"User-Agent": "vLLM Client"}
10 |     pload = {
11 |         "prompt": prompt,
12 |         "stream": True,
13 |         "max_tokens": 128,
14 |     }
15 |     response = requests.post(args.model_url,
16 |                              headers=headers,
17 |                              json=pload,
18 |                              stream=True)
19 | 
20 |     for chunk in response.iter_lines(chunk_size=8192,
21 |                                      decode_unicode=False,
22 |                                      delimiter=b"\0"):
23 |         if chunk:
24 |             data = json.loads(chunk.decode("utf-8"))
25 |             output = data["text"][0]
26 |             yield output
27 | 
28 | 
29 | def build_demo():
30 |     with gr.Blocks() as demo:
31 |         gr.Markdown("# vLLM text completion demo\n")
32 |         inputbox = gr.Textbox(label="Input",
33 |                               placeholder="Enter text and press ENTER")
34 |         outputbox = gr.Textbox(label="Output",
35 |                                placeholder="Generated result from the model")
36 |         inputbox.submit(http_bot, [inputbox], [outputbox])
37 |     return demo
38 | 
39 | 
40 | if __name__ == "__main__":
41 |     parser = argparse.ArgumentParser()
42 |     parser.add_argument("--host", type=str, default=None)
43 |     parser.add_argument("--port", type=int, default=8001)
44 |     parser.add_argument("--model-url",
45 |                         type=str,
46 |                         default="http://localhost:8000/generate")
47 |     args = parser.parse_args()
48 | 
49 |     demo = build_demo()
50 |     demo.queue().launch(server_name=args.host,
51 |                         server_port=args.port,
52 |                         share=True)
53 | 


--------------------------------------------------------------------------------
/examples/llm_engine_example.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | from typing import List, Tuple
 3 | 
 4 | from vllm import EngineArgs, LLMEngine, SamplingParams, RequestOutput
 5 | 
 6 | 
 7 | def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
 8 |     """Create a list of test prompts with their sampling parameters."""
 9 |     return [
10 |         ("A robot may not injure a human being",
11 |          SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1)),
12 |         ("To be or not to be,",
13 |          SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)),
14 |         ("What is the meaning of life?",
15 |          SamplingParams(n=2,
16 |                         best_of=5,
17 |                         temperature=0.8,
18 |                         top_p=0.95,
19 |                         frequency_penalty=0.1)),
20 |         ("It is only with the heart that one can see rightly",
21 |          SamplingParams(n=3, best_of=3, use_beam_search=True,
22 |                         temperature=0.0)),
23 |     ]
24 | 
25 | 
26 | def process_requests(engine: LLMEngine,
27 |                      test_prompts: List[Tuple[str, SamplingParams]]):
28 |     """Continuously process a list of prompts and handle the outputs."""
29 |     request_id = 0
30 | 
31 |     while test_prompts or engine.has_unfinished_requests():
32 |         if test_prompts:
33 |             prompt, sampling_params = test_prompts.pop(0)
34 |             engine.add_request(str(request_id), prompt, sampling_params)
35 |             request_id += 1
36 | 
37 |         request_outputs: List[RequestOutput] = engine.step()
38 | 
39 |         for request_output in request_outputs:
40 |             if request_output.finished:
41 |                 print(request_output)
42 | 
43 | 
44 | def initialize_engine(args: argparse.Namespace) -> LLMEngine:
45 |     """Initialize the LLMEngine from the command line arguments."""
46 |     engine_args = EngineArgs.from_cli_args(args)
47 |     return LLMEngine.from_engine_args(engine_args)
48 | 
49 | 
50 | def main(args: argparse.Namespace):
51 |     """Main function that sets up and runs the prompt processing."""
52 |     engine = initialize_engine(args)
53 |     test_prompts = create_test_prompts()
54 |     process_requests(engine, test_prompts)
55 | 
56 | 
57 | if __name__ == '__main__':
58 |     parser = argparse.ArgumentParser(
59 |         description='Demo on using the LLMEngine class directly')
60 |     parser = EngineArgs.add_cli_args(parser)
61 |     args = parser.parse_args()
62 |     main(args)
63 | 


--------------------------------------------------------------------------------
/examples/offline_inference.py:
--------------------------------------------------------------------------------
 1 | from vllm import LLM, SamplingParams
 2 | 
 3 | # Sample prompts.
 4 | prompts = [
 5 |     "Hello, my name is",
 6 |     "The president of the United States is",
 7 |     "The capital of France is",
 8 |     "The future of AI is",
 9 | ]
10 | # Create a sampling params object.
11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
12 | 
13 | # Create an LLM.
14 | llm = LLM(model="facebook/opt-125m")
15 | # Generate texts from the prompts. The output is a list of RequestOutput objects
16 | # that contain the prompt, generated text, and other information.
17 | outputs = llm.generate(prompts, sampling_params)
18 | # Print the outputs.
19 | for output in outputs:
20 |     prompt = output.prompt
21 |     generated_text = output.outputs[0].text
22 |     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
23 | 


--------------------------------------------------------------------------------
/examples/offline_inference_distributed.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This example shows how to use Ray Data for running offline batch inference
 3 | distributively on a multi-nodes cluster.
 4 | 
 5 | Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html
 6 | """
 7 | 
 8 | from vllm import LLM, SamplingParams
 9 | from typing import Dict
10 | import numpy as np
11 | import ray
12 | 
13 | # Create a sampling params object.
14 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
15 | 
16 | 
17 | # Create a class to do batch inference.
18 | class LLMPredictor:
19 | 
20 |     def __init__(self):
21 |         # Create an LLM.
22 |         self.llm = LLM(model="meta-llama/Llama-2-7b-chat-hf")
23 | 
24 |     def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, list]:
25 |         # Generate texts from the prompts.
26 |         # The output is a list of RequestOutput objects that contain the prompt,
27 |         # generated text, and other information.
28 |         outputs = self.llm.generate(batch["text"], sampling_params)
29 |         prompt = []
30 |         generated_text = []
31 |         for output in outputs:
32 |             prompt.append(output.prompt)
33 |             generated_text.append(' '.join([o.text for o in output.outputs]))
34 |         return {
35 |             "prompt": prompt,
36 |             "generated_text": generated_text,
37 |         }
38 | 
39 | 
40 | # Read one text file from S3. Ray Data supports reading multiple files
41 | # from cloud storage (such as JSONL, Parquet, CSV, binary format).
42 | ds = ray.data.read_text("s3://anonymous@air-example-data/prompts.txt")
43 | 
44 | # Apply batch inference for all input data.
45 | ds = ds.map_batches(
46 |     LLMPredictor,
47 |     # Set the concurrency to the number of LLM instances.
48 |     concurrency=10,
49 |     # Specify the number of GPUs required per LLM instance.
50 |     # NOTE: Do NOT set `num_gpus` when using vLLM with tensor-parallelism
51 |     # (i.e., `tensor_parallel_size`).
52 |     num_gpus=1,
53 |     # Specify the batch size for inference.
54 |     batch_size=32,
55 | )
56 | 
57 | # Peek first 10 results.
58 | # NOTE: This is for local testing and debugging. For production use case,
59 | # one should write full result out as shown below.
60 | outputs = ds.take(limit=10)
61 | for output in outputs:
62 |     prompt = output["prompt"]
63 |     generated_text = output["generated_text"]
64 |     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
65 | 
66 | # Write inference output data out as Parquet files to S3.
67 | # Multiple files would be written to the output destination,
68 | # and each task would write one or more files separately.
69 | #
70 | # ds.write_parquet("s3://<your-output-bucket>")
71 | 


--------------------------------------------------------------------------------
/examples/offline_inference_neuron.py:
--------------------------------------------------------------------------------
 1 | from vllm import LLM, SamplingParams
 2 | 
 3 | # Sample prompts.
 4 | prompts = [
 5 |     "Hello, my name is",
 6 |     "The president of the United States is",
 7 |     "The capital of France is",
 8 |     "The future of AI is",
 9 | ]
10 | # Create a sampling params object.
11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
12 | 
13 | # Create an LLM.
14 | llm = LLM(
15 |     model="openlm-research/open_llama_3b",
16 |     max_num_seqs=8,
17 |     # The max_model_len and block_size arguments are required to be same as
18 |     # max sequence length when targeting neuron device.
19 |     # Currently, this is a known limitation in continuous batching support
20 |     # in transformers-neuronx.
21 |     # TODO(liangfu): Support paged-attention in transformers-neuronx.
22 |     max_model_len=128,
23 |     block_size=128,
24 |     # The device can be automatically detected when AWS Neuron SDK is installed.
25 |     # The device argument can be either unspecified for automated detection,
26 |     # or explicitly assigned.
27 |     device="neuron")
28 | # Generate texts from the prompts. The output is a list of RequestOutput objects
29 | # that contain the prompt, generated text, and other information.
30 | outputs = llm.generate(prompts, sampling_params)
31 | # Print the outputs.
32 | for output in outputs:
33 |     prompt = output.prompt
34 |     generated_text = output.outputs[0].text
35 |     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
36 | 


--------------------------------------------------------------------------------
/examples/offline_inference_with_prefix.py:
--------------------------------------------------------------------------------
 1 | from vllm import LLM, SamplingParams
 2 | 
 3 | prefix = (
 4 |     "You are an expert school principal, skilled in effectively managing "
 5 |     "faculty and staff. Draft 10-15 questions for a potential first grade "
 6 |     "Head Teacher for my K-12, all-girls', independent school that emphasizes "
 7 |     "community, joyful discovery, and life-long learning. The candidate is "
 8 |     "coming in for a first-round panel interview for a 8th grade Math "
 9 |     "teaching role. They have 5 years of previous teaching experience "
10 |     "as an assistant teacher at a co-ed, public school with experience "
11 |     "in middle school math teaching. Based on these information, fulfill "
12 |     "the following paragraph: ")
13 | 
14 | # Sample prompts.
15 | prompts = [
16 |     "Hello, my name is",
17 |     "The president of the United States is",
18 |     "The capital of France is",
19 |     "The future of AI is",
20 | ]
21 | # Create a sampling params object.
22 | sampling_params = SamplingParams(temperature=0.0)
23 | 
24 | # Create an LLM.
25 | llm = LLM(model="facebook/opt-125m")
26 | 
27 | generating_prompts = [prefix + prompt for prompt in prompts]
28 | 
29 | # Generate texts from the prompts. The output is a list of RequestOutput objects
30 | # that contain the prompt, generated text, and other information.
31 | outputs = llm.generate(generating_prompts, sampling_params)
32 | # Print the outputs.
33 | for output in outputs:
34 |     prompt = output.prompt
35 |     generated_text = output.outputs[0].text
36 |     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
37 | 
38 | print("-" * 80)
39 | 
40 | # The llm.generate call will batch all prompts and send the batch at once
41 | # if resources allow. The prefix will only be cached after the first batch
42 | # is processed, so we need to call generate once to calculate the prefix
43 | # and cache it.
44 | outputs = llm.generate(generating_prompts[0], sampling_params)
45 | 
46 | # Subsequent batches can leverage the cached prefix
47 | outputs = llm.generate(generating_prompts, sampling_params)
48 | 
49 | # Print the outputs. You should see the same outputs as before
50 | for output in outputs:
51 |     prompt = output.prompt
52 |     generated_text = output.outputs[0].text
53 |     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
54 | 


--------------------------------------------------------------------------------
/examples/openai_chatcompletion_client.py:
--------------------------------------------------------------------------------
 1 | from openai import OpenAI
 2 | 
 3 | # Modify OpenAI's API key and API base to use vLLM's API server.
 4 | openai_api_key = "EMPTY"
 5 | openai_api_base = "http://localhost:8000/v1"
 6 | 
 7 | client = OpenAI(
 8 |     # defaults to os.environ.get("OPENAI_API_KEY")
 9 |     api_key=openai_api_key,
10 |     base_url=openai_api_base,
11 | )
12 | 
13 | models = client.models.list()
14 | model = models.data[0].id
15 | 
16 | chat_completion = client.chat.completions.create(
17 |     messages=[{
18 |         "role": "system",
19 |         "content": "You are a helpful assistant."
20 |     }, {
21 |         "role": "user",
22 |         "content": "Who won the world series in 2020?"
23 |     }, {
24 |         "role":
25 |         "assistant",
26 |         "content":
27 |         "The Los Angeles Dodgers won the World Series in 2020."
28 |     }, {
29 |         "role": "user",
30 |         "content": "Where was it played?"
31 |     }],
32 |     model=model,
33 | )
34 | 
35 | print("Chat completion results:")
36 | print(chat_completion)
37 | 


--------------------------------------------------------------------------------
/examples/openai_completion_client.py:
--------------------------------------------------------------------------------
 1 | from openai import OpenAI
 2 | 
 3 | # Modify OpenAI's API key and API base to use vLLM's API server.
 4 | openai_api_key = "EMPTY"
 5 | openai_api_base = "http://localhost:8000/v1"
 6 | 
 7 | client = OpenAI(
 8 |     # defaults to os.environ.get("OPENAI_API_KEY")
 9 |     api_key=openai_api_key,
10 |     base_url=openai_api_base,
11 | )
12 | 
13 | models = client.models.list()
14 | model = models.data[0].id
15 | 
16 | # Completion API
17 | stream = False
18 | completion = client.completions.create(
19 |     model=model,
20 |     prompt="A robot may not injure a human being",
21 |     echo=False,
22 |     n=2,
23 |     stream=stream,
24 |     logprobs=3)
25 | 
26 | print("Completion results:")
27 | if stream:
28 |     for c in completion:
29 |         print(c)
30 | else:
31 |     print(completion)
32 | 


--------------------------------------------------------------------------------
/examples/production_monitoring/README.md:
--------------------------------------------------------------------------------
 1 | # vLLM + Prometheus/Grafana 
 2 | 
 3 | This is a simple example that shows you how to connect vLLM metric logging to the Prometheus/Grafana stack. For this example, we launch Prometheus and Grafana via Docker. You can checkout other methods through [Prometheus](https://prometheus.io/) and [Grafana](https://grafana.com/) websites. 
 4 | 
 5 | Install: 
 6 | - [`docker`](https://docs.docker.com/engine/install/)
 7 | - [`docker compose`](https://docs.docker.com/compose/install/linux/#install-using-the-repository)
 8 | 
 9 | ### Launch
10 | 
11 | Prometheus metric logging is enabled by default in the OpenAI-compatible server. Launch via the entrypoint:
12 | ```bash
13 | python3 -m vllm.entrypoints.openai.api_server \
14 |     --model mistralai/Mistral-7B-v0.1 \
15 |     --max-model-len 2048 \
16 |     --disable-log-requests
17 | ```
18 | 
19 | Launch Prometheus and Grafana servers with `docker compose`:
20 | ```bash
21 | docker compose up
22 | ```
23 | 
24 | Submit some sample requests to the server:
25 | ```bash
26 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
27 | 
28 | python3 ../../benchmarks/benchmark_serving.py \
29 |     --model mistralai/Mistral-7B-v0.1 \
30 |     --tokenizer mistralai/Mistral-7B-v0.1 \
31 |     --endpoint /v1/completions \
32 |     --dataset ShareGPT_V3_unfiltered_cleaned_split.json \
33 |     --request-rate 3.0
34 | ```
35 | 
36 | Navigating to [`http://localhost:8000/metrics`](http://localhost:8000/metrics) will show the raw Prometheus metrics being exposed by vLLM.
37 | 
38 | ### Grafana Dashboard
39 | 
40 | Navigate to [`http://localhost:3000`](http://localhost:3000). Log in with the default username (`admin`) and password (`admin`).
41 | 
42 | #### Add Prometheus Data Source
43 | 
44 | Navigate to [`http://localhost:3000/connections/datasources/new`](http://localhost:3000/connections/datasources/new) and select Prometheus. 
45 | 
46 | On Prometheus configuration page, we need to add the `Prometheus Server URL` in `Connection`. For this setup, Grafana and Prometheus are running in separate containers, but Docker creates DNS name for each containers. You can just use `http://prometheus:9090`.
47 | 
48 | Click `Save & Test`. You should get a green check saying "Successfully queried the Prometheus API.".
49 | 
50 | #### Import Dashboard 
51 | 
52 | Navigate to [`http://localhost:3000/dashboard/import`](http://localhost:3000/dashboard/import), upload `grafana.json`, and select the `prometheus` datasource. You should see a screen that looks like the following:
53 | 
54 | ![Grafana Dashboard Image](https://i.imgur.com/R2vH9VW.png)
55 | 


--------------------------------------------------------------------------------
/examples/production_monitoring/docker-compose.yaml:
--------------------------------------------------------------------------------
 1 | # docker-compose.yaml
 2 | version: "3"
 3 | 
 4 | services:
 5 |   prometheus:
 6 |     image: prom/prometheus:latest
 7 |     extra_hosts:
 8 |       - "host.docker.internal:host-gateway"     # allow a direct connection from container to the local machine
 9 |     ports:
10 |       - "9090:9090"   # the default port used by Prometheus
11 |     volumes:
12 |       - ${PWD}/prometheus.yaml:/etc/prometheus/prometheus.yml # mount Prometheus config file
13 | 
14 |   grafana:
15 |     image: grafana/grafana:latest
16 |     depends_on:
17 |       - prometheus
18 |     ports:
19 |       - "3000:3000" # the default port used by Grafana
20 | 


--------------------------------------------------------------------------------
/examples/production_monitoring/prometheus.yaml:
--------------------------------------------------------------------------------
 1 | # prometheus.yaml
 2 | global:
 3 |   scrape_interval: 5s
 4 |   evaluation_interval: 30s
 5 | 
 6 | scrape_configs:
 7 |   - job_name: vllm
 8 |     static_configs:
 9 |       - targets:
10 |           - 'host.docker.internal:8000'
11 | 


--------------------------------------------------------------------------------
/examples/template_alpaca.jinja:
--------------------------------------------------------------------------------
 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
 2 | 
 3 | {% for message in messages %}
 4 | {% if message['role'] == 'user' %}
 5 | ### Instruction:
 6 | {{ message['content']|trim -}}
 7 | {% if not loop.last %}
 8 | 
 9 | 
10 | {% endif %}
11 | {% elif message['role'] == 'assistant' %}
12 | ### Response:
13 | {{ message['content']|trim -}}
14 | {% if not loop.last %}
15 | 
16 | 
17 | {% endif %}
18 | {% elif message['role'] == 'user_context' %}
19 | ### Input:
20 | {{ message['content']|trim -}}
21 | {% if not loop.last %}
22 | 
23 | 
24 | {% endif %}
25 | {% endif %}
26 | {% endfor %}
27 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}
28 | ### Response:
29 | {% endif %}


--------------------------------------------------------------------------------
/examples/template_baichuan.jinja:
--------------------------------------------------------------------------------
 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
 2 | 
 3 | {%- for message in messages -%}
 4 |     {%- if message['role'] == 'user' -%}
 5 |         {{- '<reserved_106>' + message['content'] -}}
 6 |     {%- elif message['role'] == 'assistant' -%}
 7 |         {{- '<reserved_107>' + message['content'] -}}
 8 |     {%- endif -%}
 9 | {%- endfor -%}
10 | 
11 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
12 |     {{- '<reserved_107>' -}}
13 | {% endif %}


--------------------------------------------------------------------------------
/examples/template_chatglm.jinja:
--------------------------------------------------------------------------------
 1 | {%- set counter = namespace(index=0) -%}
 2 | {%- for message in messages -%}
 3 |     {%- if message['role'] == 'user' -%}
 4 |         {{- '[Round ' + counter.index|string + ']\n问：' + message['content'] -}}
 5 |         {%- set counter.index = counter.index + 1 -%}
 6 |     {%- endif -%}
 7 |     {%- if message['role'] == 'assistant' -%}
 8 |         {{- '\n答：' + message['content'] -}}
 9 |         {%- if (loop.last and add_generation_prompt) or not loop.last -%}
10 |             {{- '\n' -}}
11 |         {%- endif -%}
12 |     {%- endif -%}
13 | {%- endfor -%}
14 | 
15 | 
16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
17 |     {{- '\n答：' -}}
18 | {%- endif -%}


--------------------------------------------------------------------------------
/examples/template_chatglm2.jinja:
--------------------------------------------------------------------------------
 1 | {%- set counter = namespace(index=1) -%}
 2 | {%- for message in messages -%}
 3 |     {%- if message['role'] == 'user' -%}
 4 |         {{- '[Round ' + counter.index|string + ']\n\n问：' + message['content'] -}}
 5 |         {%- set counter.index = counter.index + 1 -%}
 6 |     {%- endif -%}
 7 |     {%- if message['role'] == 'assistant' -%}
 8 |         {{- '\n\n答：' + message['content'] -}}
 9 |         {%- if (loop.last and add_generation_prompt) or not loop.last -%}
10 |             {{- '\n\n' -}}
11 |         {%- endif -%}
12 |     {%- endif -%}
13 | {%- endfor -%}
14 | 
15 | 
16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
17 |     {{- '\n\n答：' -}}
18 | {%- endif -%}


--------------------------------------------------------------------------------
/examples/template_chatml.jinja:
--------------------------------------------------------------------------------
1 | {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
2 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}


--------------------------------------------------------------------------------
/examples/template_falcon.jinja:
--------------------------------------------------------------------------------
 1 | {%- for message in messages -%}
 2 |     {%- if message['role'] == 'user' -%}
 3 |         {{- 'User: ' + message['content'] -}}
 4 |     {%- elif message['role'] == 'assistant' -%}
 5 |         {{- 'Assistant: ' + message['content'] -}}
 6 |     {%- endif -%}
 7 |     {%- if (loop.last and add_generation_prompt) or not loop.last -%}
 8 |         {{- '\n' -}}
 9 |     {%- endif -%}
10 | {%- endfor -%}
11 | 
12 | 
13 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
14 |     {{- 'Assistant:' -}}
15 | {% endif %}


--------------------------------------------------------------------------------
/examples/template_falcon_180b.jinja:
--------------------------------------------------------------------------------
 1 | {%- for message in messages -%}
 2 |     {%- if message['role'] == 'system' -%}
 3 |         {{- 'System: ' + message['content'] -}}
 4 |     {%- elif message['role'] == 'user' -%}
 5 |         {{- 'User: ' + message['content'] -}}
 6 |     {%- elif message['role'] == 'assistant' -%}
 7 |         {{- 'Falcon: ' + message['content'] -}}
 8 |     {%- endif -%}
 9 |     {%- if (loop.last and add_generation_prompt) or not loop.last -%}
10 |         {{- '\n' -}}
11 |     {%- endif -%}
12 | {%- endfor -%}
13 | 
14 | 
15 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
16 |     {{- 'Falcon:' -}}
17 | {% endif %}


--------------------------------------------------------------------------------
/examples/template_inkbot.jinja:
--------------------------------------------------------------------------------
 1 | <#meta#>
 2 | - Date: {{ (messages|selectattr('role', 'equalto', 'meta-current_date')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-current_date')|list) else '' }}
 3 | - Task: {{ (messages|selectattr('role', 'equalto', 'meta-task_name')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-task_name')|list) else '' }}
 4 | <#system#>
 5 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
 6 | <#chat#>
 7 | {% for message in messages %}
 8 | {% if message['role'] == 'user' %}
 9 | <#user#>
10 | {{ message['content']|trim -}}
11 | {% if not loop.last %}
12 | 
13 | {% endif %}
14 | {% elif message['role'] == 'assistant' %}
15 | <#bot#>
16 | {{ message['content']|trim -}}
17 | {% if not loop.last %}
18 | 
19 | {% endif %}
20 | {% elif message['role'] == 'user_context' %}
21 | <#user_context#>
22 | {{ message['content']|trim -}}
23 | {% if not loop.last %}
24 | 
25 | {% endif %}
26 | {% endif %}
27 | {% endfor %}
28 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}
29 | <#bot#>
30 | {% endif %}


--------------------------------------------------------------------------------
/patch_xformers.rocm.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -e
 3 | 
 4 | XFORMERS_VERSION="0.0.23"
 5 | 
 6 | export XFORMERS_INSTALLED_VERSION=$(python -c 'import xformers; print(xformers.__version__)')
 7 | 
 8 | if [ "$XFORMERS_INSTALLED_VERSION" != "$XFORMERS_VERSION" ]; then
 9 |     echo "ERROR: xformers version must be ${XFORMERS_VERSION}. ${XFORMERS_INSTALLED_VERSION} is installed"
10 |     exit 1
11 | fi
12 | 
13 | export XFORMERS_FMHA_FLASH_PATH=$(python -c 'from xformers import ops as xops; print(xops.fmha.flash.__file__)')
14 | export XFORMERS_FMHA_COMMON_PATH=$(python -c 'from xformers import ops as xops; print(xops.fmha.common.__file__)')
15 | 
16 | echo "XFORMERS_FMHA_FLASH_PATH = ${XFORMERS_FMHA_FLASH_PATH}"
17 | echo "XFORMERS_FMHA_COMMON_PATH = ${XFORMERS_FMHA_COMMON_PATH}"
18 | 
19 | if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-${XFORMERS_VERSION}.rocm.patch"; then
20 |     echo "Applying patch to ${XFORMERS_FMHA_FLASH_PATH}"
21 |     patch -p0 $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-${XFORMERS_VERSION}.rocm.patch"
22 |     echo "Successfully patch ${XFORMERS_FMHA_FLASH_PATH}"
23 | else
24 |     echo "${XFORMERS_FMHA_FLASH_PATH} was patched before"
25 | fi
26 | 
27 | if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-${XFORMERS_VERSION}.rocm.patch"; then
28 |     echo "Applying patch to ${XFORMERS_FMHA_COMMON_PATH}"
29 |     patch -p0 $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-${XFORMERS_VERSION}.rocm.patch"
30 |     echo "Successfully patch ${XFORMERS_FMHA_COMMON_PATH}"
31 | else
32 |     echo "${XFORMERS_FMHA_COMMON_PATH} was patched before"
33 | fi
34 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | # Should be mirrored in requirements-build.txt
 3 | requires = [
 4 |     "cmake>=3.21",
 5 |     "ninja",
 6 |     "packaging",
 7 |     "setuptools >= 49.4.0",
 8 |     "torch == 2.1.2",
 9 |     "wheel",
10 | ]
11 | build-backend = "setuptools.build_meta"
12 | 
13 | [tool.ruff]
14 | # Allow lines to be as long as 80.
15 | line-length = 80
16 | 
17 | [tool.ruff.lint]
18 | select = [
19 |     # pycodestyle
20 |     "E",
21 |     # Pyflakes
22 |     "F",
23 |     # pyupgrade
24 |     # "UP",
25 |     # flake8-bugbear
26 |     "B",
27 |     # flake8-simplify
28 |     "SIM",
29 |     # isort
30 |     # "I",
31 | ]
32 | ignore = [
33 |     # star imports
34 |     "F405", "F403",
35 |     # lambda expression assignment
36 |     "E731",
37 |     # Loop control variable not used within loop body
38 |     "B007",
39 | ]
40 | 
41 | [tool.mypy]
42 | python_version = "3.8"
43 | 
44 | ignore_missing_imports = true
45 | 
46 | files = "vllm"
47 | # TODO(woosuk): Include the code from Megatron and HuggingFace.
48 | exclude = "vllm/model_executor/parallel_utils/|vllm/model_executor/models/"
49 | 
50 | 
51 | [tool.codespell]
52 | ignore-words-list = "dout, te, indicies"
53 | skip = "./tests/prompts"
54 | 


--------------------------------------------------------------------------------
/requirements-build.txt:
--------------------------------------------------------------------------------
1 | # Should be mirrored in pyproject.toml
2 | cmake>=3.21
3 | ninja
4 | packaging
5 | setuptools>=49.4.0
6 | torch==2.1.2
7 | wheel
8 | 


--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
 1 | # formatting
 2 | yapf==0.32.0
 3 | toml==0.10.2
 4 | tomli==2.0.1
 5 | ruff==0.1.5
 6 | codespell==2.2.6
 7 | 
 8 | # type checking
 9 | mypy==0.991
10 | types-PyYAML
11 | types-requests
12 | types-setuptools
13 | 
14 | # testing
15 | pytest
16 | pytest-forked
17 | pytest-asyncio
18 | pytest-rerunfailures
19 | pytest-shard
20 | httpx
21 | einops # required for MPT
22 | openai
23 | requests
24 | ray
25 | peft
26 | 
27 | # Benchmarking
28 | aiohttp
29 | 


--------------------------------------------------------------------------------
/requirements-neuron.txt:
--------------------------------------------------------------------------------
 1 | sentencepiece  # Required for LLaMA tokenizer.
 2 | numpy
 3 | transformers-neuronx >= 0.9.0
 4 | torch-neuronx >= 2.1.0
 5 | neuronx-cc
 6 | fastapi
 7 | uvicorn[standard]
 8 | pydantic >= 2.0  # Required for OpenAI server.
 9 | prometheus_client >= 0.18.0
10 | 


--------------------------------------------------------------------------------
/requirements-rocm.txt:
--------------------------------------------------------------------------------
 1 | cmake>=3.21
 2 | ninja  # For faster builds.
 3 | typing-extensions>=4.8.0
 4 | starlette
 5 | psutil
 6 | ray >= 2.9
 7 | sentencepiece  # Required for LLaMA tokenizer.
 8 | numpy
 9 | tokenizers>=0.15.0
10 | transformers >= 4.38.0  # Required for Gemma.
11 | fastapi
12 | uvicorn[standard]
13 | pydantic >= 2.0  # Required for OpenAI server.
14 | prometheus_client >= 0.18.0
15 | outlines == 0.0.34


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | cmake>=3.21
 2 | ninja  # For faster builds.
 3 | psutil
 4 | ray >= 2.9
 5 | sentencepiece  # Required for LLaMA tokenizer.
 6 | numpy
 7 | torch == 2.1.2
 8 | transformers >= 4.38.0  # Required for Gemma.
 9 | xformers == 0.0.23.post1  # Required for CUDA 12.1.
10 | fastapi
11 | uvicorn[standard]
12 | pydantic >= 2.0  # Required for OpenAI server.
13 | prometheus_client >= 0.18.0
14 | pynvml == 11.5.0
15 | triton >= 2.1.0
16 | outlines == 0.0.34
17 | cupy-cuda12x == 12.1.0  # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead.
18 | 


--------------------------------------------------------------------------------
/rocm_patch/commonpy_xformers-0.0.23.rocm.patch:
--------------------------------------------------------------------------------
 1 | --- /opt/conda/envs/py_3.10/lib/python3.10/site-packages/xformers/ops/fmha/common.py	2023-11-29 03:17:03.930103539 +0000
 2 | +++ common.py	2023-11-28 16:14:19.846233146 +0000
 3 | @@ -298,8 +298,8 @@
 4 |          dtype = d.query.dtype
 5 |          if device_type not in cls.SUPPORTED_DEVICES:
 6 |              reasons.append(f"device={device_type} (supported: {cls.SUPPORTED_DEVICES})")
 7 | -        if device_type == "cuda" and not _built_with_cuda:
 8 | -            reasons.append("xFormers wasn't build with CUDA support")
 9 | +        #if device_type == "cuda" and not _built_with_cuda:
10 | +        #    reasons.append("xFormers wasn't build with CUDA support")
11 |          if device_type == "cuda":
12 |              device_capability = torch.cuda.get_device_capability(d.device)
13 |              if device_capability < cls.CUDA_MINIMUM_COMPUTE_CAPABILITY:
14 | 


--------------------------------------------------------------------------------
/rocm_patch/rocm_bf16.patch:
--------------------------------------------------------------------------------
 1 | --- amd_hip_bf16.h	2024-02-06 18:28:58.268699142 +0000
 2 | +++ amd_hip_bf16.h.new	2024-02-06 18:28:31.988647133 +0000
 3 | @@ -90,10 +90,10 @@
 4 |  #include "math_fwd.h"              // ocml device functions
 5 |  
 6 |  #if defined(__HIPCC_RTC__)
 7 | -#define __HOST_DEVICE__ __device__
 8 | +#define __HOST_DEVICE__ __device__ static
 9 |  #else
10 |  #include <climits>
11 | -#define __HOST_DEVICE__ __host__ __device__
12 | +#define __HOST_DEVICE__ __host__ __device__ static inline
13 |  #endif
14 |  
15 |  // Since we are using unsigned short to represent data in bfloat16, it can be of different sizes on
16 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/tests/__init__.py


--------------------------------------------------------------------------------
/tests/async_engine/api_server_async_engine.py:
--------------------------------------------------------------------------------
 1 | """vllm.entrypoints.api_server with some extra logging for testing."""
 2 | import argparse
 3 | from typing import Any, Dict
 4 | 
 5 | import uvicorn
 6 | from fastapi.responses import JSONResponse, Response
 7 | 
 8 | import vllm.entrypoints.api_server
 9 | from vllm.engine.arg_utils import AsyncEngineArgs
10 | from vllm.engine.async_llm_engine import AsyncLLMEngine
11 | 
12 | app = vllm.entrypoints.api_server.app
13 | 
14 | 
15 | class AsyncLLMEngineWithStats(AsyncLLMEngine):
16 | 
17 |     def __init__(self, *args, **kwargs):
18 |         super().__init__(*args, **kwargs)
19 |         self._num_aborts = 0
20 | 
21 |     async def abort(self, request_id: str) -> None:
22 |         await super().abort(request_id)
23 |         self._num_aborts += 1
24 | 
25 |     def testing_stats(self) -> Dict[str, Any]:
26 |         return {"num_aborted_requests": self._num_aborts}
27 | 
28 | 
29 | @app.get("/stats")
30 | def stats() -> Response:
31 |     """Get the statistics of the engine."""
32 |     return JSONResponse(engine.testing_stats())
33 | 
34 | 
35 | if __name__ == "__main__":
36 |     parser = argparse.ArgumentParser()
37 |     parser.add_argument("--host", type=str, default="localhost")
38 |     parser.add_argument("--port", type=int, default=8000)
39 |     parser = AsyncEngineArgs.add_cli_args(parser)
40 |     args = parser.parse_args()
41 | 
42 |     engine_args = AsyncEngineArgs.from_cli_args(args)
43 |     engine = AsyncLLMEngineWithStats.from_engine_args(engine_args)
44 |     vllm.entrypoints.api_server.engine = engine
45 |     uvicorn.run(
46 |         app,
47 |         host=args.host,
48 |         port=args.port,
49 |         log_level="debug",
50 |         timeout_keep_alive=vllm.entrypoints.api_server.TIMEOUT_KEEP_ALIVE)
51 | 


--------------------------------------------------------------------------------
/tests/async_engine/test_async_llm_engine.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from dataclasses import dataclass
 3 | 
 4 | import pytest
 5 | 
 6 | from vllm.engine.async_llm_engine import AsyncLLMEngine
 7 | 
 8 | 
 9 | @dataclass
10 | class RequestOutput:
11 |     request_id: int
12 |     finished: bool = False
13 | 
14 | 
15 | class MockEngine:
16 | 
17 |     def __init__(self):
18 |         self.step_calls = 0
19 |         self.add_request_calls = 0
20 |         self.abort_request_calls = 0
21 |         self.request_id = None
22 | 
23 |     async def step_async(self):
24 |         self.step_calls += 1
25 |         return [RequestOutput(
26 |             request_id=self.request_id)] if self.request_id else []
27 | 
28 |     async def encode_request_async(self, *args, **kwargs):
29 |         pass
30 | 
31 |     def generate(self, request_id):
32 |         self.request_id = request_id
33 | 
34 |     def stop_generating(self):
35 |         self.request_id = None
36 | 
37 |     def add_request(self, **kwargs):
38 |         del kwargs  # Unused
39 |         self.add_request_calls += 1
40 | 
41 |     async def add_request_async(self, **kwargs):
42 |         self.add_request_calls += 1
43 |         return
44 | 
45 |     def abort_request(self, request_id):
46 |         del request_id  # Unused
47 |         self.abort_request_calls += 1
48 | 
49 |     def has_unfinished_requests(self):
50 |         return self.request_id is not None
51 | 
52 | 
53 | class MockAsyncLLMEngine(AsyncLLMEngine):
54 | 
55 |     def _init_engine(self, *args, **kwargs):
56 |         return MockEngine()
57 | 
58 | 
59 | @pytest.mark.asyncio
60 | async def test_new_requests_event():
61 |     engine = MockAsyncLLMEngine(worker_use_ray=False, engine_use_ray=False)
62 |     engine.start_background_loop()
63 |     await asyncio.sleep(0.01)
64 |     assert engine.engine.step_calls == 0
65 | 
66 |     await engine.add_request("1", "", None)
67 |     await asyncio.sleep(0.01)
68 |     assert engine.engine.add_request_calls == 1
69 |     assert engine.engine.step_calls == 1
70 | 
71 |     await engine.add_request("2", "", None)
72 |     engine.engine.generate("2")
73 |     await asyncio.sleep(0)
74 |     await asyncio.sleep(0)
75 |     assert engine.engine.add_request_calls == 2
76 |     assert engine.engine.step_calls >= 2
77 |     await asyncio.sleep(0.001)
78 |     assert engine.engine.step_calls >= 3
79 |     engine.engine.stop_generating()
80 |     await asyncio.sleep(0.001)
81 |     old_step_calls = engine.engine.step_calls
82 |     await asyncio.sleep(0.001)
83 |     assert engine.engine.step_calls == old_step_calls
84 | 
85 |     await engine.add_request("3", "", None)
86 |     await asyncio.sleep(0.01)
87 |     assert engine.engine.add_request_calls == 3
88 |     assert engine.engine.step_calls == old_step_calls + 1
89 |     await asyncio.sleep(0.01)
90 |     assert engine.engine.add_request_calls == 3
91 |     assert engine.engine.step_calls == old_step_calls + 1
92 | 
93 |     engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True)
94 |     assert engine.get_tokenizer() is not None
95 | 


--------------------------------------------------------------------------------
/tests/async_engine/test_request_tracker.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from vllm.engine.async_llm_engine import RequestTracker
 4 | from vllm.outputs import RequestOutput
 5 | 
 6 | 
 7 | @pytest.mark.asyncio
 8 | async def test_request_tracker():
 9 |     tracker = RequestTracker()
10 |     stream_1 = tracker.add_request("1")
11 |     assert tracker.new_requests_event.is_set()
12 |     await tracker.wait_for_new_requests()
13 |     new, finished = tracker.get_new_and_finished_requests()
14 |     assert not tracker.new_requests_event.is_set()
15 |     assert len(new) == 1
16 |     assert new[0]["request_id"] == "1"
17 |     assert not finished
18 |     assert not stream_1.finished
19 | 
20 |     stream_2 = tracker.add_request("2")
21 |     stream_3 = tracker.add_request("3")
22 |     assert tracker.new_requests_event.is_set()
23 |     await tracker.wait_for_new_requests()
24 |     new, finished = tracker.get_new_and_finished_requests()
25 |     assert not tracker.new_requests_event.is_set()
26 |     assert len(new) == 2
27 |     assert new[0]["request_id"] == "2"
28 |     assert new[1]["request_id"] == "3"
29 |     assert not finished
30 |     assert not stream_2.finished
31 |     assert not stream_3.finished
32 | 
33 |     # request_ids must be unique
34 |     with pytest.raises(KeyError):
35 |         tracker.add_request("1")
36 |     assert not tracker.new_requests_event.is_set()
37 | 
38 |     tracker.abort_request("1")
39 |     new, finished = tracker.get_new_and_finished_requests()
40 |     assert len(finished) == 1
41 |     assert "1" in finished
42 |     assert not new
43 |     assert stream_1.finished
44 | 
45 |     stream_4 = tracker.add_request("4")
46 |     tracker.abort_request("4")
47 |     assert tracker.new_requests_event.is_set()
48 |     await tracker.wait_for_new_requests()
49 |     new, finished = tracker.get_new_and_finished_requests()
50 |     assert len(finished) == 1
51 |     assert "4" in finished
52 |     assert not new
53 |     assert stream_4.finished
54 | 
55 |     stream_5 = tracker.add_request("5")
56 |     assert tracker.new_requests_event.is_set()
57 |     tracker.process_request_output(
58 |         RequestOutput("2", "output", [], [], [], finished=True))
59 |     await tracker.wait_for_new_requests()
60 |     new, finished = tracker.get_new_and_finished_requests()
61 |     assert not tracker.new_requests_event.is_set()
62 |     assert len(finished) == 1
63 |     assert "2" in finished
64 |     assert len(new) == 1
65 |     assert new[0]["request_id"] == "5"
66 |     assert stream_2.finished
67 |     assert not stream_5.finished
68 | 


--------------------------------------------------------------------------------
/tests/basic_correctness/test_basic_correctness.py:
--------------------------------------------------------------------------------
 1 | """Compare the short outputs of HF and vLLM when using greedy sampling.
 2 | 
 3 | Run `pytest tests/basic_correctness/test_basic_correctness.py --forked`.
 4 | """
 5 | import pytest
 6 | 
 7 | MODELS = [
 8 |     "facebook/opt-125m",
 9 |     "meta-llama/Llama-2-7b-hf",
10 | ]
11 | 
12 | 
13 | @pytest.mark.parametrize("model", MODELS)
14 | @pytest.mark.parametrize("dtype", ["half"])
15 | @pytest.mark.parametrize("max_tokens", [5])
16 | @pytest.mark.parametrize("enforce_eager", [False, True])
17 | def test_models(
18 |     hf_runner,
19 |     vllm_runner,
20 |     example_prompts,
21 |     model: str,
22 |     dtype: str,
23 |     max_tokens: int,
24 |     enforce_eager: bool,
25 | ) -> None:
26 |     hf_model = hf_runner(model, dtype=dtype)
27 |     hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
28 |     del hf_model
29 | 
30 |     vllm_model = vllm_runner(model, dtype=dtype, enforce_eager=enforce_eager)
31 |     vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
32 |     del vllm_model
33 | 
34 |     for i in range(len(example_prompts)):
35 |         hf_output_ids, hf_output_str = hf_outputs[i]
36 |         vllm_output_ids, vllm_output_str = vllm_outputs[i]
37 |         assert hf_output_str == vllm_output_str, (
38 |             f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
39 |         assert hf_output_ids == vllm_output_ids, (
40 |             f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
41 | 


--------------------------------------------------------------------------------
/tests/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/tests/core/__init__.py


--------------------------------------------------------------------------------
/tests/core/utils.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | from typing import Tuple
 3 | 
 4 | from vllm import SamplingParams
 5 | from vllm.sequence import Sequence, SequenceGroup
 6 | 
 7 | 
 8 | def create_dummy_prompt(
 9 |         request_id: str,
10 |         prompt_length: int,
11 |         block_size: int = None) -> Tuple[Sequence, SequenceGroup]:
12 |     if not block_size:
13 |         block_size = prompt_length
14 | 
15 |     # Create dummy prompt sequence with tokens 0...block_size-1
16 |     # and prompt "0 ... block_size".
17 |     prompt_tokens = list(range(prompt_length))
18 |     prompt_str = " ".join([str(t) for t in prompt_tokens])
19 |     prompt = Sequence(int(request_id), prompt_str, prompt_tokens, block_size)
20 |     seq_group = SequenceGroup(request_id, [prompt], SamplingParams(),
21 |                               time.time(), None)
22 | 
23 |     return prompt, seq_group
24 | 
25 | 
26 | def round_up_to_next_block(seq_len: int, block_size: int) -> int:
27 |     return (seq_len + block_size - 1) // block_size
28 | 


--------------------------------------------------------------------------------
/tests/distributed/test_basic_distributed_correctness.py:
--------------------------------------------------------------------------------
 1 | """Compare the outputs of HF and distributed vLLM when using greedy sampling.
 2 | 
 3 | Run `pytest tests/distributed/test_basic_distributed_correctness.py --forked`.
 4 | """
 5 | import pytest
 6 | import torch
 7 | 
 8 | MODELS = [
 9 |     "facebook/opt-125m",
10 |     "meta-llama/Llama-2-7b-hf",
11 | ]
12 | 
13 | 
14 | @pytest.mark.skipif(torch.cuda.device_count() < 2,
15 |                     reason="Need at least 2 GPUs to run the test.")
16 | @pytest.mark.parametrize("model", MODELS)
17 | @pytest.mark.parametrize("dtype", ["half"])
18 | @pytest.mark.parametrize("max_tokens", [5])
19 | def test_models(
20 |     hf_runner,
21 |     vllm_runner,
22 |     example_prompts,
23 |     model: str,
24 |     dtype: str,
25 |     max_tokens: int,
26 | ) -> None:
27 |     hf_model = hf_runner(model, dtype=dtype)
28 |     hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
29 |     del hf_model
30 | 
31 |     vllm_model = vllm_runner(model, dtype=dtype, tensor_parallel_size=2)
32 |     vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
33 |     del vllm_model
34 | 
35 |     for i in range(len(example_prompts)):
36 |         hf_output_ids, hf_output_str = hf_outputs[i]
37 |         vllm_output_ids, vllm_output_str = vllm_outputs[i]
38 |         assert hf_output_str == vllm_output_str, (
39 |             f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
40 |         assert hf_output_ids == vllm_output_ids, (
41 |             f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
42 | 


--------------------------------------------------------------------------------
/tests/engine/test_computed_prefix_blocks.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from vllm.engine.arg_utils import EngineArgs
 4 | from vllm.engine.llm_engine import LLMEngine
 5 | from vllm.sampling_params import SamplingParams
 6 | 
 7 | 
 8 | @pytest.mark.parametrize("model", ["facebook/opt-125m"])
 9 | @pytest.mark.parametrize("block_size", [16])
10 | def test_computed_prefix_blocks(model: str, block_size: int):
11 |     # This test checks if we are able to run the engine to completion
12 |     # without triggering asserts.
13 |     # We are in a scenario where all blocks from the second request's prompt
14 |     # are full and already computed when the second request arrives.
15 |     prompt = (
16 |         "You are a helpful assistant. How do I build a car from cardboard and "
17 |         "paper clips? Is there an easy to follow video tutorial available "
18 |         "online for free?")
19 |     prompt2 = (
20 |         " Please recommend to me some resources where I can learn not only to "
21 |         "handle technical difficulties of building a car, but also "
22 |         "decoration.")
23 | 
24 |     engine_args = EngineArgs(model=model,
25 |                              block_size=block_size,
26 |                              enable_prefix_caching=True)
27 | 
28 |     engine = LLMEngine.from_engine_args(engine_args)
29 |     sampling_params = SamplingParams()
30 | 
31 |     engine.add_request("0", prompt + prompt2, sampling_params)
32 |     engine.step()
33 |     engine.add_request("1", prompt, sampling_params)
34 |     engine.step()
35 | 


--------------------------------------------------------------------------------
/tests/entrypoints/test_guided_processors.py:
--------------------------------------------------------------------------------
 1 | # This unit test should be moved to a new
 2 | # tests/test_guided_decoding directory.
 3 | 
 4 | from transformers import AutoTokenizer
 5 | import torch
 6 | 
 7 | from vllm.model_executor.guided_logits_processors import (RegexLogitsProcessor,
 8 |                                                           JSONLogitsProcessor)
 9 | 
10 | TEST_SCHEMA = {
11 |     "type": "object",
12 |     "properties": {
13 |         "name": {
14 |             "type": "string"
15 |         },
16 |         "age": {
17 |             "type": "integer"
18 |         },
19 |         "skills": {
20 |             "type": "array",
21 |             "items": {
22 |                 "type": "string",
23 |                 "maxLength": 10
24 |             },
25 |             "minItems": 3
26 |         },
27 |         "work history": {
28 |             "type": "array",
29 |             "items": {
30 |                 "type": "object",
31 |                 "properties": {
32 |                     "company": {
33 |                         "type": "string"
34 |                     },
35 |                     "duration": {
36 |                         "type": "string"
37 |                     },
38 |                     "position": {
39 |                         "type": "string"
40 |                     }
41 |                 },
42 |                 "required": ["company", "position"]
43 |             }
44 |         }
45 |     },
46 |     "required": ["name", "age", "skills", "work history"]
47 | }
48 | 
49 | TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
50 |               r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
51 | 
52 | 
53 | def test_guided_logits_processors():
54 |     """Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor."""
55 |     tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')
56 |     regex_LP = RegexLogitsProcessor(TEST_REGEX, tokenizer)
57 |     json_LP = JSONLogitsProcessor(TEST_SCHEMA, tokenizer)
58 | 
59 |     regex_LP.init_state()
60 |     token_ids = tokenizer.encode(
61 |         f"Give an example IPv4 address with this regex: {TEST_REGEX}")
62 |     tensor = torch.rand(32000)
63 |     original_tensor = torch.clone(tensor)
64 |     regex_LP(token_ids, tensor)
65 |     assert tensor.shape == original_tensor.shape
66 |     assert not torch.allclose(tensor, original_tensor)
67 | 
68 |     json_LP.init_state()
69 |     token_ids = tokenizer.encode(
70 |         f"Give an employee profile that fits this schema: {TEST_SCHEMA}")
71 |     tensor = torch.rand(32000)
72 |     original_tensor = torch.clone(tensor)
73 |     json_LP(token_ids, tensor)
74 |     assert tensor.shape == original_tensor.shape
75 |     assert not torch.allclose(tensor, original_tensor)
76 | 


--------------------------------------------------------------------------------
/tests/kernels/allclose_default.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | # Reference default values of atol and rtol are from
 4 | # https://github.com/pytorch/pytorch/blob/6d96beb6bec24d73ee3f080bac54d2104068f675/test/test_transformers.py#L67
 5 | default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5}
 6 | default_rtol = {
 7 |     torch.float16: 1e-3,
 8 |     torch.bfloat16: 1.6e-2,
 9 |     torch.float: 1.3e-6
10 | }
11 | 
12 | 
13 | def get_default_atol(output) -> float:
14 |     return default_atol[output.dtype]
15 | 
16 | 
17 | def get_default_rtol(output) -> float:
18 |     return default_rtol[output.dtype]
19 | 


--------------------------------------------------------------------------------
/tests/kernels/conftest.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from vllm.utils import create_kv_caches_with_random
3 | 
4 | 
5 | @pytest.fixture()
6 | def kv_cache_factory():
7 |     return create_kv_caches_with_random
8 | 


--------------------------------------------------------------------------------
/tests/kernels/test_activation.py:
--------------------------------------------------------------------------------
 1 | from typing import Type
 2 | 
 3 | import pytest
 4 | import torch
 5 | 
 6 | from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul,
 7 |                                                    NewGELU, SiluAndMul)
 8 | from allclose_default import get_default_atol, get_default_rtol
 9 | 
10 | DTYPES = [torch.half, torch.bfloat16, torch.float]
11 | NUM_TOKENS = [7, 83, 2048]  # Arbitrary values for testing
12 | D = [512, 4096, 5120, 13824]  # Arbitrary values for testing
13 | SEEDS = [0]
14 | CUDA_DEVICES = [
15 |     f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
16 | ]
17 | 
18 | 
19 | @pytest.mark.parametrize("activation", ["silu", "gelu", "gelu_tanh"])
20 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
21 | @pytest.mark.parametrize("d", D)
22 | @pytest.mark.parametrize("dtype", DTYPES)
23 | @pytest.mark.parametrize("seed", SEEDS)
24 | @pytest.mark.parametrize("device", CUDA_DEVICES)
25 | @torch.inference_mode()
26 | def test_act_and_mul(
27 |     activation: str,
28 |     num_tokens: int,
29 |     d: int,
30 |     dtype: torch.dtype,
31 |     seed: int,
32 |     device: str,
33 | ) -> None:
34 |     torch.random.manual_seed(seed)
35 |     if torch.cuda.is_available():
36 |         torch.cuda.manual_seed(seed)
37 |     torch.set_default_device(device)
38 |     x = torch.randn(num_tokens, 2 * d, dtype=dtype)
39 |     if activation == "silu":
40 |         layer = SiluAndMul()
41 |     elif activation == "gelu":
42 |         layer = GeluAndMul(approximate="none")
43 |     elif activation == "gelu_tanh":
44 |         layer = GeluAndMul(approximate="tanh")
45 |     out = layer(x)
46 |     ref_out = layer._forward(x)
47 |     # The SiLU and GELU implementations are equivalent to the native PyTorch
48 |     # implementations, so we can do exact comparison.
49 |     assert torch.allclose(out, ref_out, atol=0.0, rtol=0.0)
50 | 
51 | 
52 | @pytest.mark.parametrize("activation", [FastGELU, NewGELU])
53 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
54 | @pytest.mark.parametrize("d", D)
55 | @pytest.mark.parametrize("dtype", DTYPES)
56 | @pytest.mark.parametrize("seed", SEEDS)
57 | @pytest.mark.parametrize("device", CUDA_DEVICES)
58 | @torch.inference_mode()
59 | def test_activation(
60 |     activation: Type[torch.nn.Module],
61 |     num_tokens: int,
62 |     d: int,
63 |     dtype: torch.dtype,
64 |     seed: int,
65 |     device: str,
66 | ) -> None:
67 |     torch.random.manual_seed(seed)
68 |     if torch.cuda.is_available():
69 |         torch.cuda.manual_seed(seed)
70 |     torch.set_default_device(device)
71 |     x = torch.randn(num_tokens, d, dtype=dtype)
72 |     layer = activation()
73 |     out = layer(x)
74 |     ref_out = layer._forward(x)
75 |     assert torch.allclose(out,
76 |                           ref_out,
77 |                           atol=get_default_atol(out),
78 |                           rtol=get_default_rtol(out))
79 | 


--------------------------------------------------------------------------------
/tests/kernels/test_layernorm.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import torch
 3 | 
 4 | from vllm.model_executor.layers.layernorm import RMSNorm
 5 | 
 6 | DTYPES = [torch.half, torch.bfloat16, torch.float]
 7 | NUM_TOKENS = [7, 83, 4096]  # Arbitrary values for testing
 8 | HIDDEN_SIZES = [768, 5120, 8192]  # Arbitrary values for testing
 9 | ADD_RESIDUAL = [False, True]
10 | SEEDS = [0]
11 | CUDA_DEVICES = [
12 |     f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
13 | ]
14 | 
15 | 
16 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
17 | @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
18 | @pytest.mark.parametrize("add_residual", ADD_RESIDUAL)
19 | @pytest.mark.parametrize("dtype", DTYPES)
20 | @pytest.mark.parametrize("seed", SEEDS)
21 | @pytest.mark.parametrize("device", CUDA_DEVICES)
22 | @torch.inference_mode()
23 | def test_rms_norm(
24 |     num_tokens: int,
25 |     hidden_size: int,
26 |     add_residual: bool,
27 |     dtype: torch.dtype,
28 |     seed: int,
29 |     device: str,
30 | ) -> None:
31 |     torch.random.manual_seed(seed)
32 |     if torch.cuda.is_available():
33 |         torch.cuda.manual_seed(seed)
34 |     torch.set_default_device(device)
35 |     layer = RMSNorm(hidden_size).to(dtype=dtype)
36 |     layer.weight.data.normal_(mean=1.0, std=0.1)
37 |     scale = 1 / (2 * hidden_size)
38 |     x = torch.randn(num_tokens, hidden_size, dtype=dtype)
39 |     x *= scale
40 |     residual = torch.randn_like(x) * scale if add_residual else None
41 | 
42 |     # NOTE(woosuk): The reference implementation should be executed first
43 |     # because the custom kernel is in-place.
44 |     ref_out = layer._forward(x, residual)
45 |     out = layer(x, residual)
46 |     # NOTE(woosuk): LayerNorm operators (including RMS) typically have larger
47 |     # numerical errors than other operators because they involve reductions.
48 |     # Therefore, we use a larger tolerance.
49 |     if add_residual:
50 |         assert torch.allclose(out[0], ref_out[0], atol=1e-2, rtol=1e-2)
51 |         assert torch.allclose(out[1], ref_out[1], atol=1e-2, rtol=1e-2)
52 |     else:
53 |         assert torch.allclose(out, ref_out, atol=1e-2, rtol=1e-2)
54 | 


--------------------------------------------------------------------------------
/tests/kernels/test_rand.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import pytest
 3 | import random
 4 | 
 5 | from vllm.model_executor.layers.ops.rand import seeded_uniform
 6 | from vllm.model_executor.utils import set_random_seed
 7 | 
 8 | 
 9 | @pytest.mark.parametrize("dtype",
10 |                          [torch.float32, torch.float16, torch.bfloat16])
11 | @pytest.mark.parametrize("use_3d", [True, False])
12 | def test_seeded_uniform(dtype: torch.dtype, use_3d: bool):
13 |     device = "cuda"
14 |     for seed in range(512):
15 |         set_random_seed(seed)
16 |         rows = random.randint(1, 512)
17 |         cols = random.randint(1, 64000)
18 |         if use_3d:
19 |             third_dim = random.randint(2, 10)
20 |             dims = [rows, third_dim, cols]
21 |         else:
22 |             dims = [rows, cols]
23 |         seeds = torch.randint(torch.iinfo(torch.long).min,
24 |                               torch.iinfo(torch.long).max, (rows, ),
25 |                               device=device)
26 | 
27 |         # Test that the same seed produces the same output
28 |         out = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device)
29 |         out2 = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device)
30 |         torch.testing.assert_close(out, out2)
31 |         # del to save memory
32 |         del out2
33 | 
34 |         out3 = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device)
35 |         torch.testing.assert_close(out, out3)
36 |         # del to save memory
37 |         del out3
38 | 
39 |         # Initialize out tensor with garbage to ensure that it is overwritten
40 |         out_with_tensor = seeded_uniform(
41 |             *dims,
42 |             out=torch.full(
43 |                 (*dims, ),
44 |                 -1,
45 |                 dtype=dtype,
46 |                 device=device,
47 |             ),
48 |             seeds=seeds,
49 |             dtype=dtype,
50 |         )
51 |         torch.testing.assert_close(out, out_with_tensor)
52 | 


--------------------------------------------------------------------------------
/tests/lora/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/tests/lora/__init__.py


--------------------------------------------------------------------------------
/tests/lora/test_gemma.py:
--------------------------------------------------------------------------------
 1 | import vllm
 2 | from vllm.lora.request import LoRARequest
 3 | 
 4 | MODEL_PATH = "google/gemma-7b"
 5 | 
 6 | 
 7 | def do_sample(llm, lora_path: str, lora_id: int) -> str:
 8 |     prompts = [
 9 |         "Quote: Imagination is",
10 |         "Quote: Be yourself;",
11 |         "Quote: So many books,",
12 |     ]
13 |     sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
14 |     outputs = llm.generate(
15 |         prompts,
16 |         sampling_params,
17 |         lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
18 |         if lora_id else None)
19 |     # Print the outputs.
20 |     generated_texts = []
21 |     for output in outputs:
22 |         prompt = output.prompt
23 |         generated_text = output.outputs[0].text.strip()
24 |         generated_texts.append(generated_text)
25 |         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
26 |     return generated_texts
27 | 
28 | 
29 | def test_gemma_lora(gemma_lora_files):
30 |     llm = vllm.LLM(MODEL_PATH,
31 |                    max_model_len=1024,
32 |                    enable_lora=True,
33 |                    max_loras=4)
34 | 
35 |     expected_lora_output = [
36 |         "more important than knowledge.\nAuthor: Albert Einstein\n",
37 |         "everyone else is already taken.\nAuthor: Oscar Wilde\n",
38 |         "so little time\nAuthor: Frank Zappa\n",
39 |     ]
40 | 
41 |     output1 = do_sample(llm, gemma_lora_files, lora_id=1)
42 |     for i in range(len(expected_lora_output)):
43 |         assert output1[i].startswith(expected_lora_output[i])
44 |     output2 = do_sample(llm, gemma_lora_files, lora_id=2)
45 |     for i in range(len(expected_lora_output)):
46 |         assert output2[i].startswith(expected_lora_output[i])
47 | 


--------------------------------------------------------------------------------
/tests/lora/test_tokenizer_group.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from transformers import AutoTokenizer, PreTrainedTokenizerBase
 3 | from vllm.lora.request import LoRARequest
 4 | from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
 5 | from vllm.transformers_utils.tokenizer import get_lora_tokenizer
 6 | from ..conftest import get_tokenizer_pool_config
 7 | 
 8 | 
 9 | @pytest.mark.asyncio
10 | @pytest.mark.parametrize("tokenizer_group_type", [None, "ray"])
11 | async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type):
12 |     reference_tokenizer = AutoTokenizer.from_pretrained(sql_lora_files)
13 |     tokenizer_group = get_tokenizer_group(
14 |         get_tokenizer_pool_config(tokenizer_group_type),
15 |         tokenizer_id="gpt2",
16 |         enable_lora=True,
17 |         max_num_seqs=1,
18 |         max_input_length=None,
19 |     )
20 |     lora_request = LoRARequest("1", 1, sql_lora_files)
21 |     assert reference_tokenizer.encode("prompt") == tokenizer_group.encode(
22 |         request_id="request_id", prompt="prompt", lora_request=lora_request)
23 |     assert reference_tokenizer.encode(
24 |         "prompt") == await tokenizer_group.encode_async(
25 |             request_id="request_id",
26 |             prompt="prompt",
27 |             lora_request=lora_request)
28 |     assert isinstance(tokenizer_group.get_lora_tokenizer(None),
29 |                       PreTrainedTokenizerBase)
30 |     assert tokenizer_group.get_lora_tokenizer(
31 |         None) == await tokenizer_group.get_lora_tokenizer_async(None)
32 | 
33 |     assert isinstance(tokenizer_group.get_lora_tokenizer(lora_request),
34 |                       PreTrainedTokenizerBase)
35 |     assert tokenizer_group.get_lora_tokenizer(
36 |         lora_request) != tokenizer_group.get_lora_tokenizer(None)
37 |     assert tokenizer_group.get_lora_tokenizer(
38 |         lora_request) == await tokenizer_group.get_lora_tokenizer_async(
39 |             lora_request)
40 | 
41 | 
42 | def test_get_lora_tokenizer(sql_lora_files, tmpdir):
43 |     lora_request = None
44 |     tokenizer = get_lora_tokenizer(lora_request)
45 |     assert not tokenizer
46 | 
47 |     lora_request = LoRARequest("1", 1, sql_lora_files)
48 |     tokenizer = get_lora_tokenizer(lora_request)
49 |     assert tokenizer.get_added_vocab()
50 | 
51 |     lora_request = LoRARequest("1", 1, str(tmpdir))
52 |     tokenizer = get_lora_tokenizer(lora_request)
53 |     assert not tokenizer
54 | 


--------------------------------------------------------------------------------
/tests/lora/test_worker.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import random
 3 | import tempfile
 4 | from unittest.mock import patch
 5 | 
 6 | from vllm.lora.models import LoRAMapping
 7 | from vllm.lora.request import LoRARequest
 8 | from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig,
 9 |                          DeviceConfig, LoRAConfig)
10 | from vllm.worker.worker import Worker
11 | 
12 | 
13 | @patch.dict(os.environ, {"RANK": "0"})
14 | def test_worker_apply_lora(sql_lora_files):
15 |     worker = Worker(
16 |         model_config=ModelConfig(
17 |             "meta-llama/Llama-2-7b-hf",
18 |             "meta-llama/Llama-2-7b-hf",
19 |             tokenizer_mode="auto",
20 |             trust_remote_code=False,
21 |             download_dir=None,
22 |             load_format="dummy",
23 |             seed=0,
24 |             dtype="float16",
25 |             revision=None,
26 |         ),
27 |         parallel_config=ParallelConfig(1, 1, False),
28 |         scheduler_config=SchedulerConfig(32, 32, 32),
29 |         device_config=DeviceConfig("cuda"),
30 |         local_rank=0,
31 |         rank=0,
32 |         lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32,
33 |                                max_loras=32),
34 |         distributed_init_method=f"file://{tempfile.mkstemp()[1]}",
35 |     )
36 |     worker.init_model()
37 |     worker.load_model()
38 | 
39 |     worker.model_runner.set_active_loras([], LoRAMapping([], []))
40 |     assert worker.list_loras() == set()
41 | 
42 |     n_loras = 32
43 |     lora_requests = [
44 |         LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(n_loras)
45 |     ]
46 | 
47 |     worker.model_runner.set_active_loras(lora_requests, LoRAMapping([], []))
48 |     assert worker.list_loras() == {
49 |         lora_request.lora_int_id
50 |         for lora_request in lora_requests
51 |     }
52 | 
53 |     for i in range(32):
54 |         random.seed(i)
55 |         iter_lora_requests = random.choices(lora_requests,
56 |                                             k=random.randint(1, n_loras))
57 |         random.shuffle(iter_lora_requests)
58 |         iter_lora_requests = iter_lora_requests[:-random.randint(0, n_loras)]
59 |         worker.model_runner.set_active_loras(iter_lora_requests,
60 |                                              LoRAMapping([], []))
61 |         assert worker.list_loras().issuperset(
62 |             {lora_request.lora_int_id
63 |              for lora_request in iter_lora_requests})
64 | 


--------------------------------------------------------------------------------
/tests/metrics/test_metrics.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | MODELS = [
 4 |     "facebook/opt-125m",
 5 | ]
 6 | 
 7 | 
 8 | @pytest.mark.parametrize("model", MODELS)
 9 | @pytest.mark.parametrize("dtype", ["float"])
10 | @pytest.mark.parametrize("max_tokens", [128])
11 | def test_metric_counter_prompt_tokens(
12 |     vllm_runner,
13 |     example_prompts,
14 |     model: str,
15 |     dtype: str,
16 |     max_tokens: int,
17 | ) -> None:
18 |     vllm_model = vllm_runner(model,
19 |                              dtype=dtype,
20 |                              disable_log_stats=False,
21 |                              gpu_memory_utilization=0.4)
22 |     tokenizer = vllm_model.model.get_tokenizer()
23 |     prompt_token_counts = [len(tokenizer.encode(p)) for p in example_prompts]
24 |     # This test needs at least 2 prompts in a batch of different lengths to
25 |     # verify their token count is correct despite padding.
26 |     assert len(example_prompts) > 1, "at least 2 prompts are required"
27 |     assert prompt_token_counts[0] != prompt_token_counts[1], (
28 |         "prompts of different lengths are required")
29 |     vllm_prompt_token_count = sum(prompt_token_counts)
30 | 
31 |     _ = vllm_model.generate_greedy(example_prompts, max_tokens)
32 |     stat_logger = vllm_model.model.llm_engine.stat_logger
33 |     metric_count = stat_logger.metrics.counter_prompt_tokens.labels(
34 |         **stat_logger.labels)._value.get()
35 | 
36 |     assert vllm_prompt_token_count == metric_count, (
37 |         f"prompt token count: {vllm_prompt_token_count!r}\n"
38 |         f"metric: {metric_count!r}")
39 | 
40 | 
41 | @pytest.mark.parametrize("model", MODELS)
42 | @pytest.mark.parametrize("dtype", ["float"])
43 | @pytest.mark.parametrize("max_tokens", [128])
44 | def test_metric_counter_generation_tokens(
45 |     vllm_runner,
46 |     example_prompts,
47 |     model: str,
48 |     dtype: str,
49 |     max_tokens: int,
50 | ) -> None:
51 |     vllm_model = vllm_runner(model,
52 |                              dtype=dtype,
53 |                              disable_log_stats=False,
54 |                              gpu_memory_utilization=0.4)
55 |     vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
56 |     tokenizer = vllm_model.model.get_tokenizer()
57 |     stat_logger = vllm_model.model.llm_engine.stat_logger
58 |     metric_count = stat_logger.metrics.counter_generation_tokens.labels(
59 |         **stat_logger.labels)._value.get()
60 |     vllm_generation_count = 0
61 |     for i in range(len(example_prompts)):
62 |         vllm_output_ids, vllm_output_str = vllm_outputs[i]
63 |         prompt_ids = tokenizer.encode(example_prompts[i])
64 |         # vllm_output_ids contains both prompt tokens and generation tokens.
65 |         # We're interested only in the count of the generation tokens.
66 |         vllm_generation_count += len(vllm_output_ids) - len(prompt_ids)
67 | 
68 |     assert vllm_generation_count == metric_count, (
69 |         f"generation token count: {vllm_generation_count!r}\n"
70 |         f"metric: {metric_count!r}")
71 | 


--------------------------------------------------------------------------------
/tests/models/test_mistral.py:
--------------------------------------------------------------------------------
 1 | """Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
 2 | 
 3 | Run `pytest tests/models/test_mistral.py --forked`.
 4 | """
 5 | import pytest
 6 | 
 7 | MODELS = [
 8 |     "mistralai/Mistral-7B-Instruct-v0.1",
 9 | ]
10 | 
11 | 
12 | @pytest.mark.parametrize("model", MODELS)
13 | @pytest.mark.parametrize("dtype", ["bfloat16"])
14 | @pytest.mark.parametrize("max_tokens", [128])
15 | def test_models(
16 |     hf_runner,
17 |     vllm_runner,
18 |     example_long_prompts,
19 |     model: str,
20 |     dtype: str,
21 |     max_tokens: int,
22 | ) -> None:
23 |     hf_model = hf_runner(model, dtype=dtype)
24 |     hf_outputs = hf_model.generate_greedy(example_long_prompts, max_tokens)
25 |     del hf_model
26 | 
27 |     vllm_model = vllm_runner(model, dtype=dtype)
28 |     vllm_outputs = vllm_model.generate_greedy(example_long_prompts, max_tokens)
29 |     del vllm_model
30 | 
31 |     for i in range(len(example_long_prompts)):
32 |         hf_output_ids, hf_output_str = hf_outputs[i]
33 |         vllm_output_ids, vllm_output_str = vllm_outputs[i]
34 |         assert hf_output_str == vllm_output_str, (
35 |             f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
36 |         assert hf_output_ids == vllm_output_ids, (
37 |             f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
38 | 


--------------------------------------------------------------------------------
/tests/models/test_models.py:
--------------------------------------------------------------------------------
 1 | """Compare the outputs of HF and vLLM when using greedy sampling.
 2 | 
 3 | Run `pytest tests/models/test_models.py --forked`.
 4 | """
 5 | import pytest
 6 | 
 7 | MODELS = [
 8 |     "facebook/opt-125m",
 9 |     "meta-llama/Llama-2-7b-hf",
10 |     "mistralai/Mistral-7B-v0.1",
11 |     "Deci/DeciLM-7b",
12 |     "tiiuae/falcon-7b",
13 |     "gpt2",
14 |     "bigcode/tiny_starcoder_py",
15 |     "EleutherAI/gpt-j-6b",
16 |     "EleutherAI/pythia-70m",
17 |     "bigscience/bloom-560m",
18 |     "mosaicml/mpt-7b",
19 |     "microsoft/phi-2",
20 |     "stabilityai/stablelm-3b-4e1t",
21 |     "allenai/OLMo-1B",
22 |     "bigcode/starcoder2-3b",
23 | ]
24 | 
25 | 
26 | @pytest.mark.parametrize("model", MODELS)
27 | @pytest.mark.parametrize("dtype", ["float"])
28 | @pytest.mark.parametrize("max_tokens", [128])
29 | def test_models(
30 |     hf_runner,
31 |     vllm_runner,
32 |     example_prompts,
33 |     model: str,
34 |     dtype: str,
35 |     max_tokens: int,
36 | ) -> None:
37 |     hf_model = hf_runner(model, dtype=dtype)
38 |     hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
39 |     del hf_model
40 | 
41 |     vllm_model = vllm_runner(model, dtype=dtype)
42 |     vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
43 |     del vllm_model
44 | 
45 |     for i in range(len(example_prompts)):
46 |         hf_output_ids, hf_output_str = hf_outputs[i]
47 |         vllm_output_ids, vllm_output_str = vllm_outputs[i]
48 |         assert hf_output_str == vllm_output_str, (
49 |             f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
50 |         assert hf_output_ids == vllm_output_ids, (
51 |             f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
52 | 


--------------------------------------------------------------------------------
/tests/prefix_caching/test_prefix_caching.py:
--------------------------------------------------------------------------------
 1 | """Compare the with and without prefix caching.
 2 | 
 3 | Run `pytest tests/prefix_caching/test_prefix_caching.py`.
 4 | """
 5 | import pytest
 6 | 
 7 | from vllm.core.block_manager import CachedBlockAllocator
 8 | from vllm.utils import Device
 9 | 
10 | 
11 | @pytest.mark.parametrize("block_size", [16])
12 | @pytest.mark.parametrize("num_blocks", [16])
13 | def test_block_allocator(
14 |     block_size: int,
15 |     num_blocks: int,
16 | ):
17 |     block_hash = 1
18 |     block_allocator = CachedBlockAllocator(Device.CPU, block_size, num_blocks)
19 | 
20 |     # Allocate two PysicalTokenBlocks with the same hash and check
21 |     # that they are the same PhysicalTokenBlock
22 |     first_block = block_allocator.allocate(block_hash, 0)
23 |     second_block = block_allocator.allocate(block_hash, 0)
24 |     assert (first_block == second_block)
25 |     assert (second_block.ref_count == 2)
26 | 
27 |     # Free the first_block and confirm that the ref_count is correctly
28 |     # decremented on the second block
29 |     block_allocator.free(first_block)
30 |     assert (second_block.ref_count == 1)
31 | 
32 |     # Free the second block
33 |     block_allocator.free(second_block)
34 | 
35 |     # Reallocate the first block and confirm that, even after the block
36 |     # had its ref_count go to 0, we still get the same block back
37 |     first_block = block_allocator.allocate(block_hash, 0)
38 |     assert (first_block == second_block)
39 |     assert (first_block.block_hash == block_hash)
40 | 
41 | 
42 | @pytest.mark.parametrize("num_blocks", [16])
43 | def test_eviction(num_blocks: int, ):
44 |     block_size = 16
45 |     block_allocator = CachedBlockAllocator(Device.CPU, block_size, num_blocks)
46 |     blocks = []
47 | 
48 |     for i in range(num_blocks):
49 |         # use i as the block_hash
50 |         blocks.append(block_allocator.allocate(i, 0))
51 | 
52 |     #Free all blocks
53 |     for block in blocks:
54 |         block_allocator.free(block)
55 | 
56 |     # Allocate a new block and confirm that it's the first block freed.
57 |     # I.E The Least Recently Used block
58 |     new_block_hash = block_size
59 |     new_block = block_allocator.allocate(new_block_hash, 0)
60 |     assert (new_block == blocks[0])
61 |     assert (new_block.block_hash == new_block_hash)
62 | 
63 |     # Reallocate the second in blocks to remove it from the free list
64 |     realloc_block_hash = 1
65 |     realloc_block = block_allocator.allocate(realloc_block_hash, 0)
66 |     assert (realloc_block == blocks[realloc_block_hash])
67 |     assert (realloc_block.block_hash == realloc_block_hash)
68 | 
69 |     # Allocate a new block and confirm that it's not the realloc_block,
70 |     # since the realloc_block shouldn't be in the free list
71 |     new_block_hash = block_size + 1
72 |     new_block = block_allocator.allocate(new_block_hash, 0)
73 |     assert (realloc_block != new_block)
74 |     assert (new_block.block_hash == new_block_hash)
75 |     assert (new_block.block_number == 2)
76 | 


--------------------------------------------------------------------------------
/tests/prompts/example.txt:
--------------------------------------------------------------------------------
1 | vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.
2 | Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.
3 | Compare and contrast artificial intelligence with human intelligence in terms of processing information.
4 | Describe the basic components of a neural network and how it can be trained.
5 | Write a short story about a robot that dreams for the first time.
6 | Analyze the impact of the COVID-19 pandemic on global economic structures and future business models.
7 | Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies.
8 | Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.'
9 | 


--------------------------------------------------------------------------------
/tests/samplers/test_beam_search.py:
--------------------------------------------------------------------------------
 1 | """Compare the outputs of HF and vLLM when using beam search.
 2 | 
 3 | Run `pytest tests/samplers/test_beam_search.py --forked`.
 4 | """
 5 | import pytest
 6 | 
 7 | # FIXME(zhuohan): The test can not pass if we:
 8 | #   1. Increase max_tokens to 256.
 9 | #   2. Increase beam_width to 8.
10 | #   3. Use the model "huggyllama/llama-7b".
11 | MAX_TOKENS = [128]
12 | BEAM_WIDTHS = [4]
13 | MODELS = ["facebook/opt-125m"]
14 | 
15 | 
16 | @pytest.mark.parametrize("model", MODELS)
17 | @pytest.mark.parametrize("dtype", ["half"])
18 | @pytest.mark.parametrize("max_tokens", MAX_TOKENS)
19 | @pytest.mark.parametrize("beam_width", BEAM_WIDTHS)
20 | def test_beam_search_single_input(
21 |     hf_runner,
22 |     vllm_runner,
23 |     example_prompts,
24 |     model: str,
25 |     dtype: str,
26 |     max_tokens: int,
27 |     beam_width: int,
28 | ) -> None:
29 |     example_prompts = example_prompts[:1]
30 |     hf_model = hf_runner(model, dtype=dtype)
31 |     hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
32 |                                                max_tokens)
33 |     del hf_model
34 | 
35 |     vllm_model = vllm_runner(model, dtype=dtype)
36 |     vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width,
37 |                                                    max_tokens)
38 |     del vllm_model
39 | 
40 |     for i in range(len(example_prompts)):
41 |         hf_output_ids, _ = hf_outputs[i]
42 |         vllm_output_ids, _ = vllm_outputs[i]
43 |         assert len(hf_output_ids) == len(vllm_output_ids)
44 |         for j in range(len(hf_output_ids)):
45 |             assert hf_output_ids[j] == vllm_output_ids[j], (
46 |                 f"Test{i} output{j}:\nHF: {hf_output_ids}\n"
47 |                 f"vLLM: {vllm_output_ids}")
48 | 


--------------------------------------------------------------------------------
/tests/samplers/test_seeded_generate.py:
--------------------------------------------------------------------------------
 1 | """Verify that seeded random sampling is deterministic.
 2 | 
 3 | Run `pytest tests/samplers/test_seeded_generate.py --forked`.
 4 | """
 5 | import copy
 6 | import random
 7 | from itertools import combinations
 8 | 
 9 | import pytest
10 | 
11 | from vllm.model_executor.utils import set_random_seed
12 | from vllm import SamplingParams
13 | 
14 | MODEL = "facebook/opt-125m"
15 | RANDOM_SEEDS = list(range(5))
16 | 
17 | 
18 | @pytest.fixture
19 | def vllm_model(vllm_runner):
20 |     vllm_model = vllm_runner(MODEL, dtype="half")
21 |     yield vllm_model
22 |     del vllm_model
23 | 
24 | 
25 | @pytest.mark.parametrize("seed", RANDOM_SEEDS)
26 | def test_random_sample_with_seed(
27 |     vllm_model,
28 |     example_prompts,
29 |     seed: int,
30 | ) -> None:
31 |     set_random_seed(seed)
32 | 
33 |     sampling_params = SamplingParams(
34 |         # Parameters to ensure sufficient randomness
35 |         temperature=2.0,
36 |         top_p=min(random.random() + 0.3, 1),
37 |         top_k=random.randint(5, 20),
38 |         n=random.randint(1, 10),
39 |         presence_penalty=random.randint(0, 1),
40 |         max_tokens=8,
41 |         ignore_eos=True,
42 |     )
43 | 
44 |     sampling_params_seed_1 = copy.deepcopy(sampling_params)
45 |     sampling_params_seed_1.seed = 100
46 |     sampling_params_seed_2 = copy.deepcopy(sampling_params)
47 |     sampling_params_seed_2.seed = 200
48 | 
49 |     llm = vllm_model.model
50 | 
51 |     for prompt in example_prompts:
52 |         for params in (
53 |                 sampling_params,
54 |                 sampling_params_seed_1,
55 |                 sampling_params_seed_2,
56 |                 sampling_params,
57 |                 sampling_params_seed_1,
58 |                 sampling_params_seed_2,
59 |         ):
60 |             llm._add_request(
61 |                 prompt=prompt,
62 |                 prompt_token_ids=None,
63 |                 sampling_params=params,
64 |             )
65 | 
66 |     results = llm._run_engine(use_tqdm=False)
67 |     all_outputs = [[out.token_ids for out in output.outputs]
68 |                    for output in results]
69 | 
70 |     for i in range(0, len(example_prompts), 6):
71 |         outputs = all_outputs[i:i + 6]
72 | 
73 |         # verify all non-seeded requests differ
74 |         for output_a, output_b in combinations(
75 |             (outputs[0], outputs[1], outputs[2], outputs[3]),
76 |                 2,
77 |         ):
78 |             assert output_a != output_b
79 | 
80 |         # verify requests with the same seed match
81 |         assert outputs[1] == outputs[4]
82 |         assert outputs[2] == outputs[5]
83 | 


--------------------------------------------------------------------------------
/tests/spec_decode/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/tests/spec_decode/__init__.py


--------------------------------------------------------------------------------
/tests/test_config.py:
--------------------------------------------------------------------------------
 1 | from vllm.config import ModelConfig
 2 | 
 3 | 
 4 | def test_get_sliding_window():
 5 |     TEST_SLIDING_WINDOW = 4096
 6 |     # Test that the sliding window is correctly computed.
 7 |     # For Qwen1.5/Qwen2, get_sliding_window() should be None
 8 |     # when use_sliding_window is False.
 9 |     qwen2_model_config = ModelConfig(
10 |         "Qwen/Qwen1.5-7B",
11 |         "Qwen/Qwen1.5-7B",
12 |         tokenizer_mode="auto",
13 |         trust_remote_code=False,
14 |         download_dir=None,
15 |         load_format="dummy",
16 |         seed=0,
17 |         dtype="float16",
18 |         revision=None,
19 |     )
20 | 
21 |     qwen2_model_config.hf_config.use_sliding_window = False
22 |     qwen2_model_config.hf_config.sliding_window = TEST_SLIDING_WINDOW
23 |     assert qwen2_model_config.get_sliding_window() is None
24 | 
25 |     qwen2_model_config.hf_config.use_sliding_window = True
26 |     assert qwen2_model_config.get_sliding_window() == TEST_SLIDING_WINDOW
27 | 
28 |     mistral_model_config = ModelConfig(
29 |         "mistralai/Mistral-7B-v0.1",
30 |         "mistralai/Mistral-7B-v0.1",
31 |         tokenizer_mode="auto",
32 |         trust_remote_code=False,
33 |         download_dir=None,
34 |         load_format="dummy",
35 |         seed=0,
36 |         dtype="float16",
37 |         revision=None,
38 |     )
39 |     mistral_model_config.hf_config.sliding_window = None
40 |     assert mistral_model_config.get_sliding_window() is None
41 | 
42 |     mistral_model_config.hf_config.sliding_window = TEST_SLIDING_WINDOW
43 |     assert mistral_model_config.get_sliding_window() == TEST_SLIDING_WINDOW


--------------------------------------------------------------------------------
/tests/test_regression.py:
--------------------------------------------------------------------------------
 1 | """Containing tests that check for regressions in vLLM's behavior.
 2 | 
 3 | It should include tests that are reported by users and making sure they
 4 | will never happen again.
 5 | 
 6 | """
 7 | import gc
 8 | 
 9 | import torch
10 | 
11 | from vllm import LLM, SamplingParams
12 | 
13 | 
14 | def test_duplicated_ignored_sequence_group():
15 |     """https://github.com/vllm-project/vllm/issues/1655"""
16 | 
17 |     sampling_params = SamplingParams(temperature=0.01,
18 |                                      top_p=0.1,
19 |                                      max_tokens=256)
20 |     llm = LLM(model="facebook/opt-125m",
21 |               max_num_batched_tokens=4096,
22 |               tensor_parallel_size=1)
23 |     prompts = ["This is a short prompt", "This is a very long prompt " * 1000]
24 |     outputs = llm.generate(prompts, sampling_params=sampling_params)
25 | 
26 |     assert len(prompts) == len(outputs)
27 | 
28 | 
29 | def test_max_tokens_none():
30 |     sampling_params = SamplingParams(temperature=0.01,
31 |                                      top_p=0.1,
32 |                                      max_tokens=None)
33 |     llm = LLM(model="facebook/opt-125m",
34 |               max_num_batched_tokens=4096,
35 |               tensor_parallel_size=1)
36 |     prompts = ["Just say hello!"]
37 |     outputs = llm.generate(prompts, sampling_params=sampling_params)
38 | 
39 |     assert len(prompts) == len(outputs)
40 | 
41 | 
42 | def test_gc():
43 |     llm = LLM("facebook/opt-125m", enforce_eager=True)
44 |     del llm
45 | 
46 |     gc.collect()
47 |     torch.cuda.empty_cache()
48 | 
49 |     # The memory allocated for model and KV cache should be released.
50 |     # The memory allocated for PyTorch and others should be less than 50MB.
51 |     # Usually, it's around 10MB.
52 |     allocated = torch.cuda.memory_allocated()
53 |     assert allocated < 50 * 1024 * 1024
54 | 
55 | 
56 | if __name__ == "__main__":
57 |     import pytest
58 |     pytest.main([__file__])
59 | 


--------------------------------------------------------------------------------
/tests/test_sampling_params.py:
--------------------------------------------------------------------------------
 1 | """Tests for the SamplingParams class.
 2 | """
 3 | from vllm import SamplingParams
 4 | 
 5 | 
 6 | def test_max_tokens_none():
 7 |     """max_tokens=None should be allowed"""
 8 |     SamplingParams(temperature=0.01, top_p=0.1, max_tokens=None)
 9 | 
10 | 
11 | if __name__ == "__main__":
12 |     import pytest
13 |     pytest.main([__file__])
14 | 


--------------------------------------------------------------------------------
/tests/test_sequence.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from vllm.sequence import SequenceGroupOutput, SamplerOutput, SequenceOutput
 4 | 
 5 | 
 6 | @pytest.fixture
 7 | def sample_outputs():
 8 |     return [
 9 |         SequenceGroupOutput(samples=[
10 |             SequenceOutput(parent_seq_id=0, output_token=i, logprobs={})
11 |         ],
12 |                             prompt_logprobs=None) for i in range(5)
13 |     ]
14 | 
15 | 
16 | @pytest.fixture
17 | def sampler_output(sample_outputs):
18 |     return SamplerOutput(outputs=sample_outputs)
19 | 
20 | 
21 | def test_sampler_output_initialization(sampler_output, sample_outputs):
22 |     assert len(sampler_output) == len(sample_outputs)
23 |     assert sampler_output.sampled_token_probs is None
24 |     assert sampler_output.sampled_token_ids is None
25 |     assert sampler_output.spec_decode_worker_metrics is None
26 | 
27 | 
28 | def test_sampler_output_getitem(sampler_output, sample_outputs):
29 |     assert sampler_output[2] == sample_outputs[2]
30 | 
31 | 
32 | def test_sampler_output_setitem(sampler_output):
33 |     new_output = SequenceGroupOutput(samples=[
34 |         SequenceOutput(parent_seq_id=0, output_token=99, logprobs={})
35 |     ],
36 |                                      prompt_logprobs=None)
37 |     sampler_output[2] = new_output
38 |     assert sampler_output[2] == new_output
39 | 
40 | 
41 | def test_sampler_output_len(sampler_output, sample_outputs):
42 |     assert len(sampler_output) == len(sample_outputs)
43 | 
44 | 
45 | def test_sampler_output_eq(sample_outputs):
46 |     sampler_output1 = SamplerOutput(outputs=sample_outputs)
47 |     sampler_output2 = SamplerOutput(outputs=sample_outputs.copy())
48 |     sampler_output3 = SamplerOutput(outputs=sample_outputs[:-1])
49 |     assert sampler_output1 == sampler_output2
50 |     assert sampler_output1 != sampler_output3
51 | 


--------------------------------------------------------------------------------
/tests/tokenization/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/tests/tokenization/__init__.py


--------------------------------------------------------------------------------
/tests/tokenization/test_cached_tokenizer.py:
--------------------------------------------------------------------------------
 1 | from copy import deepcopy
 2 | from vllm.transformers_utils.tokenizer import get_cached_tokenizer
 3 | from transformers import AutoTokenizer
 4 | 
 5 | 
 6 | def test_cached_tokenizer():
 7 |     reference_tokenizer = AutoTokenizer.from_pretrained("gpt2")
 8 |     reference_tokenizer.add_special_tokens({"cls_token": "<CLS>"})
 9 |     reference_tokenizer.add_special_tokens(
10 |         {"additional_special_tokens": ["<SEP>"]})
11 |     cached_tokenizer = get_cached_tokenizer(deepcopy(reference_tokenizer))
12 | 
13 |     assert reference_tokenizer.encode("prompt") == cached_tokenizer.encode(
14 |         "prompt")
15 |     assert set(reference_tokenizer.all_special_ids) == set(
16 |         cached_tokenizer.all_special_ids)
17 |     assert set(reference_tokenizer.all_special_tokens) == set(
18 |         cached_tokenizer.all_special_tokens)
19 |     assert set(reference_tokenizer.all_special_tokens_extended) == set(
20 |         cached_tokenizer.all_special_tokens_extended)
21 | 


--------------------------------------------------------------------------------
/tests/tokenization/test_detokenize.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from transformers import AutoTokenizer
 4 | 
 5 | from vllm.transformers_utils.tokenizer import detokenize_incrementally
 6 | 
 7 | TRUTH = [
 8 |     "Hello here, this is a simple test",  # noqa: E501
 9 |     "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be used in production environments, where inference and serving",  # noqa: E501
10 |     "我很感谢你的热情"  # noqa: E501
11 | ]
12 | TOKENIZERS = [
13 |     "facebook/opt-125m",
14 |     "gpt2",
15 |     "bigcode/tiny_starcoder_py",
16 |     "EleutherAI/gpt-j-6b",
17 |     "EleutherAI/pythia-70m",
18 |     "bigscience/bloom-560m",
19 |     "mosaicml/mpt-7b",
20 |     "tiiuae/falcon-7b",
21 |     "meta-llama/Llama-2-7b-hf",
22 |     "codellama/CodeLlama-7b-hf",
23 | ]
24 | 
25 | 
26 | def _run_incremental_decode(tokenizer, all_input_ids,
27 |                             skip_special_tokens: bool):
28 |     decoded_text = ""
29 |     offset = 0
30 |     token_offset = 0
31 |     prev_tokens = None
32 |     for i in range(len(all_input_ids)):
33 |         new_tokens, text, offset, token_offset = detokenize_incrementally(
34 |             tokenizer,
35 |             all_input_ids[:i + 1],
36 |             prev_tokens,
37 |             offset,
38 |             token_offset,
39 |             skip_special_tokens=skip_special_tokens)
40 |         decoded_text += text
41 |         if prev_tokens is None:
42 |             prev_tokens = new_tokens
43 |         else:
44 |             prev_tokens += new_tokens
45 |     return decoded_text
46 | 
47 | 
48 | @pytest.mark.parametrize("truth", TRUTH)
49 | @pytest.mark.parametrize("tokenizer_id", TOKENIZERS)
50 | @pytest.mark.parametrize("skip_special_tokens", (True, False))
51 | def test_decode_streaming(tokenizer_id, truth, skip_special_tokens):
52 |     tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
53 |     all_input_ids = tokenizer(truth, add_special_tokens=False)["input_ids"]
54 |     if skip_special_tokens:
55 |         all_input_ids = ([tokenizer.bos_token_id]
56 |                          if tokenizer.bos_token_id is not None else
57 |                          []) + all_input_ids + [tokenizer.eos_token_id]
58 | 
59 |     decoded_text = _run_incremental_decode(
60 |         tokenizer, all_input_ids, skip_special_tokens=skip_special_tokens)
61 | 
62 |     assert decoded_text == truth
63 | 


--------------------------------------------------------------------------------
/tests/worker/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/tests/worker/__init__.py


--------------------------------------------------------------------------------
/vllm/__init__.py:
--------------------------------------------------------------------------------
 1 | """vLLM: a high-throughput and memory-efficient inference engine for LLMs"""
 2 | 
 3 | from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 4 | from vllm.engine.async_llm_engine import AsyncLLMEngine
 5 | from vllm.engine.llm_engine import LLMEngine
 6 | from vllm.engine.ray_utils import initialize_ray_cluster
 7 | from vllm.entrypoints.llm import LLM
 8 | from vllm.outputs import CompletionOutput, RequestOutput
 9 | from vllm.sampling_params import SamplingParams
10 | 
11 | __version__ = "0.3.3"
12 | 
13 | __all__ = [
14 |     "LLM",
15 |     "SamplingParams",
16 |     "RequestOutput",
17 |     "CompletionOutput",
18 |     "LLMEngine",
19 |     "EngineArgs",
20 |     "AsyncLLMEngine",
21 |     "AsyncEngineArgs",
22 |     "initialize_ray_cluster",
23 | ]
24 | 


--------------------------------------------------------------------------------
/vllm/block.py:
--------------------------------------------------------------------------------
 1 | """Token blocks."""
 2 | from typing import List
 3 | 
 4 | from vllm.utils import Device
 5 | 
 6 | _BLANK_TOKEN_ID = -1
 7 | 
 8 | DEFAULT_LAST_ACCESSED_TIME = -1
 9 | 
10 | 
11 | class LogicalTokenBlock:
12 |     """A block that stores a contiguous chunk of tokens from left to right.
13 | 
14 |     Logical blocks are used to represent the states of the corresponding
15 |     physical blocks in the KV cache.
16 |     """
17 | 
18 |     def __init__(
19 |         self,
20 |         block_number: int,
21 |         block_size: int,
22 |     ) -> None:
23 |         self.block_number = block_number
24 |         self.block_size = block_size
25 | 
26 |         self.token_ids = [_BLANK_TOKEN_ID] * block_size
27 |         self.num_tokens = 0
28 | 
29 |     def is_empty(self) -> bool:
30 |         return self.num_tokens == 0
31 | 
32 |     def get_num_empty_slots(self) -> int:
33 |         return self.block_size - self.num_tokens
34 | 
35 |     def is_full(self) -> bool:
36 |         return self.num_tokens == self.block_size
37 | 
38 |     def append_tokens(self, token_ids: List[int]) -> None:
39 |         assert len(token_ids) <= self.get_num_empty_slots()
40 |         curr_idx = self.num_tokens
41 |         self.token_ids[curr_idx:curr_idx + len(token_ids)] = token_ids
42 |         self.num_tokens += len(token_ids)
43 | 
44 |     def get_token_ids(self) -> List[int]:
45 |         return self.token_ids[:self.num_tokens]
46 | 
47 |     def get_last_token_id(self) -> int:
48 |         assert self.num_tokens > 0
49 |         return self.token_ids[self.num_tokens - 1]
50 | 
51 | 
52 | class PhysicalTokenBlock:
53 |     """Represents the state of a block in the KV cache."""
54 | 
55 |     def __init__(
56 |         self,
57 |         device: Device,
58 |         block_number: int,
59 |         block_size: int,
60 |         block_hash: int,
61 |         num_hashed_tokens: int,
62 |     ) -> None:
63 |         self.device = device
64 |         self.block_number = block_number
65 |         self.block_size = block_size
66 |         self.block_hash = block_hash
67 |         self.num_hashed_tokens = num_hashed_tokens
68 | 
69 |         self.ref_count = 0
70 |         self.last_accessed = DEFAULT_LAST_ACCESSED_TIME
71 | 
72 |         self.computed = False
73 | 
74 |     def __repr__(self) -> str:
75 |         return (f'PhysicalTokenBlock(device={self.device}, '
76 |                 f'block_number={self.block_number}, '
77 |                 f'num_hashed_tokens={self.num_hashed_tokens}, '
78 |                 f'ref_count={self.ref_count}, '
79 |                 f'last_accessed={self.last_accessed}, '
80 |                 f'computed={self.computed})')
81 | 
82 | 
83 | # Mapping: logical block number -> physical block.
84 | BlockTable = List[PhysicalTokenBlock]
85 | 


--------------------------------------------------------------------------------
/vllm/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/vllm/core/__init__.py


--------------------------------------------------------------------------------
/vllm/core/policy.py:
--------------------------------------------------------------------------------
 1 | from collections import deque
 2 | from typing import Deque
 3 | 
 4 | from vllm.sequence import SequenceGroup
 5 | 
 6 | 
 7 | class Policy:
 8 | 
 9 |     def get_priority(
10 |         self,
11 |         now: float,
12 |         seq_group: SequenceGroup,
13 |     ) -> float:
14 |         raise NotImplementedError
15 | 
16 |     def sort_by_priority(
17 |         self,
18 |         now: float,
19 |         seq_groups: Deque[SequenceGroup],
20 |     ) -> Deque[SequenceGroup]:
21 |         return deque(
22 |             sorted(
23 |                 seq_groups,
24 |                 key=lambda seq_group: self.get_priority(now, seq_group),
25 |                 reverse=True,
26 |             ))
27 | 
28 | 
29 | class FCFS(Policy):
30 | 
31 |     def get_priority(
32 |         self,
33 |         now: float,
34 |         seq_group: SequenceGroup,
35 |     ) -> float:
36 |         return now - seq_group.metrics.arrival_time
37 | 
38 | 
39 | class PolicyFactory:
40 | 
41 |     _POLICY_REGISTRY = {
42 |         'fcfs': FCFS,
43 |     }
44 | 
45 |     @classmethod
46 |     def get_policy(cls, policy_name: str, **kwargs) -> Policy:
47 |         return cls._POLICY_REGISTRY[policy_name](**kwargs)
48 | 


--------------------------------------------------------------------------------
/vllm/engine/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/vllm/engine/__init__.py


--------------------------------------------------------------------------------
/vllm/entrypoints/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/vllm/entrypoints/__init__.py


--------------------------------------------------------------------------------
/vllm/entrypoints/openai/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/vllm/entrypoints/openai/__init__.py


--------------------------------------------------------------------------------
/vllm/executor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/vllm/executor/__init__.py


--------------------------------------------------------------------------------
/vllm/executor/executor_base.py:
--------------------------------------------------------------------------------
 1 | from abc import ABC, abstractmethod
 2 | from typing import Dict, List, Optional
 3 | 
 4 | from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
 5 |                          ParallelConfig, SchedulerConfig, LoRAConfig)
 6 | from vllm.lora.request import LoRARequest
 7 | from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 8 | 
 9 | 
10 | class ExecutorBase(ABC):
11 |     """Base class for all executors.
12 | 
13 |     An executor is responsible for executing the model on a specific device
14 |     type (e.g., CPU, GPU, Neuron, etc.). Or it can be a distributed executor
15 |     that can execute the model on multiple devices.
16 |     """
17 | 
18 |     @abstractmethod
19 |     def __init__(
20 |         self,
21 |         model_config: ModelConfig,
22 |         cache_config: CacheConfig,
23 |         parallel_config: ParallelConfig,
24 |         scheduler_config: SchedulerConfig,
25 |         device_config: DeviceConfig,
26 |         lora_config: Optional[LoRAConfig],
27 |     ) -> None:
28 |         raise NotImplementedError
29 | 
30 |     @abstractmethod
31 |     def execute_model(self,
32 |                       seq_group_metadata_list: List[SequenceGroupMetadata],
33 |                       blocks_to_swap_in: Dict[int, int],
34 |                       blocks_to_swap_out: Dict[int, int],
35 |                       blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput:
36 |         """Executes one model step on the given sequences."""
37 |         raise NotImplementedError
38 | 
39 |     @abstractmethod
40 |     def add_lora(self, lora_request: LoRARequest) -> bool:
41 |         raise NotImplementedError
42 | 
43 |     @abstractmethod
44 |     def remove_lora(self, lora_id: int) -> bool:
45 |         raise NotImplementedError
46 | 
47 |     @abstractmethod
48 |     def list_loras(self) -> List[int]:
49 |         raise NotImplementedError
50 | 
51 |     @abstractmethod
52 |     def check_health(self) -> None:
53 |         """Checks if the executor is healthy. If not, it should raise an
54 |         exception."""
55 |         raise NotImplementedError
56 | 
57 | 
58 | class ExecutorAsyncBase(ExecutorBase):
59 | 
60 |     @abstractmethod
61 |     async def execute_model_async(
62 |         self,
63 |         seq_group_metadata_list: List[SequenceGroupMetadata],
64 |         blocks_to_swap_in: Dict[int, int],
65 |         blocks_to_swap_out: Dict[int, int],
66 |         blocks_to_copy: Dict[int, List[int]],
67 |     ) -> SamplerOutput:
68 |         """Executes one model step on the given sequences."""
69 |         raise NotImplementedError
70 | 
71 |     @abstractmethod
72 |     async def check_health_async(self) -> None:
73 |         """Checks if the executor is healthy. If not, it should raise an
74 |         exception."""
75 |         raise NotImplementedError
76 | 


--------------------------------------------------------------------------------
/vllm/executor/utils.py:
--------------------------------------------------------------------------------
 1 | def check_block_size_valid(num_gpu_blocks, block_size, max_model_len) -> None:
 2 |     if num_gpu_blocks <= 0:
 3 |         raise ValueError("No available memory for the cache blocks. "
 4 |                          "Try increasing `gpu_memory_utilization` when "
 5 |                          "initializing the engine.")
 6 |     max_seq_len = block_size * num_gpu_blocks
 7 |     if max_model_len > max_seq_len:
 8 |         raise ValueError(
 9 |             f"The model's max seq len ({max_model_len}) "
10 |             "is larger than the maximum number of tokens that can be "
11 |             f"stored in KV cache ({max_seq_len}). Try increasing "
12 |             "`gpu_memory_utilization` or decreasing `max_model_len` when "
13 |             "initializing the engine.")
14 | 


--------------------------------------------------------------------------------
/vllm/logger.py:
--------------------------------------------------------------------------------
 1 | # Adapted from
 2 | # https://github.com/skypilot-org/skypilot/blob/86dc0f6283a335e4aa37b3c10716f90999f48ab6/sky/sky_logging.py
 3 | """Logging configuration for vLLM."""
 4 | import logging
 5 | import sys
 6 | import os
 7 | 
 8 | VLLM_CONFIGURE_LOGGING = int(os.getenv("VLLM_CONFIGURE_LOGGING", "1"))
 9 | 
10 | _FORMAT = "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s"
11 | _DATE_FORMAT = "%m-%d %H:%M:%S"
12 | 
13 | 
14 | class NewLineFormatter(logging.Formatter):
15 |     """Adds logging prefix to newlines to align multi-line messages."""
16 | 
17 |     def __init__(self, fmt, datefmt=None):
18 |         logging.Formatter.__init__(self, fmt, datefmt)
19 | 
20 |     def format(self, record):
21 |         msg = logging.Formatter.format(self, record)
22 |         if record.message != "":
23 |             parts = msg.split(record.message)
24 |             msg = msg.replace("\n", "\r\n" + parts[0])
25 |         return msg
26 | 
27 | 
28 | _root_logger = logging.getLogger("vllm")
29 | _default_handler = None
30 | 
31 | 
32 | def _setup_logger():
33 |     _root_logger.setLevel(logging.DEBUG)
34 |     global _default_handler
35 |     if _default_handler is None:
36 |         _default_handler = logging.StreamHandler(sys.stdout)
37 |         _default_handler.flush = sys.stdout.flush  # type: ignore
38 |         _default_handler.setLevel(logging.INFO)
39 |         _root_logger.addHandler(_default_handler)
40 |     fmt = NewLineFormatter(_FORMAT, datefmt=_DATE_FORMAT)
41 |     _default_handler.setFormatter(fmt)
42 |     # Setting this will avoid the message
43 |     # being propagated to the parent logger.
44 |     _root_logger.propagate = False
45 | 
46 | 
47 | # The logger is initialized when the module is imported.
48 | # This is thread-safe as the module is only imported once,
49 | # guaranteed by the Python GIL.
50 | if VLLM_CONFIGURE_LOGGING:
51 |     _setup_logger()
52 | 
53 | 
54 | def init_logger(name: str):
55 |     # Use the same settings as above for root logger
56 |     logger = logging.getLogger(name)
57 |     logger.setLevel(os.getenv("LOG_LEVEL", "DEBUG"))
58 |     if VLLM_CONFIGURE_LOGGING:
59 |         logger.addHandler(_default_handler)
60 |         logger.propagate = False
61 |     return logger
62 | 


--------------------------------------------------------------------------------
/vllm/lora/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/vllm/lora/__init__.py


--------------------------------------------------------------------------------
/vllm/lora/request.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | 
 3 | 
 4 | @dataclass
 5 | class LoRARequest:
 6 |     """
 7 |     Request for a LoRA adapter.
 8 | 
 9 |     Note that this class should be be used internally. For online
10 |     serving, it is recommended to not allow users to use this class but
11 |     instead provide another layer of abstraction to prevent users from
12 |     accessing unauthorized LoRA adapters.
13 | 
14 |     lora_int_id must be globally unique for a given adapter.
15 |     This is currently not enforced in vLLM.
16 |     """
17 | 
18 |     lora_name: str
19 |     lora_int_id: int
20 |     lora_local_path: str
21 | 
22 |     def __post_init__(self):
23 |         if self.lora_int_id < 1:
24 |             raise ValueError(
25 |                 f"lora_int_id must be > 0, got {self.lora_int_id}")
26 | 
27 |     def __eq__(self, value: object) -> bool:
28 |         return isinstance(
29 |             value, LoRARequest) and self.lora_int_id == value.lora_int_id
30 | 
31 |     def __hash__(self) -> int:
32 |         return self.lora_int_id
33 | 


--------------------------------------------------------------------------------
/vllm/lora/utils.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | from typing import Tuple
 3 | 
 4 | from torch import nn
 5 | 
 6 | logger = logging.getLogger(__name__)
 7 | 
 8 | 
 9 | def replace_submodule(model: nn.Module, module_name: str,
10 |                       new_module: nn.Module) -> nn.Module:
11 |     """Replace a submodule in a model with a new module."""
12 |     parent = model.get_submodule(".".join(module_name.split(".")[:-1]))
13 |     target_name = module_name.split(".")[-1]
14 |     setattr(parent, target_name, new_module)
15 |     return new_module
16 | 
17 | 
18 | def parse_fine_tuned_lora_name(name: str) -> Tuple[str, bool]:
19 |     """Parse the name of lora weights.
20 | 
21 |     args:
22 |         name: the name of the fine-tuned LoRA, e.g.
23 |             base_model.model.dense1.weight
24 |     return:
25 |         Tuple(module_name, is_lora_a):
26 |             module_name: the name of the module, e.g. model.dense1,
27 |             is_lora_a whether the tensor is lora_a or lora_b.
28 |     """
29 |     parts = name.split(".")
30 |     assert parts[0] == "base_model"
31 |     assert parts[1] == "model"
32 |     if parts[-1] == "weight":
33 |         assert parts[-2] == "lora_A" or parts[-2] == "lora_B"
34 |         return ".".join(parts[2:-2]), parts[-2] == "lora_A"
35 | 
36 |     if parts[-1] == "lora_embedding_A" or parts[-1] == "lora_embedding_B":
37 |         return ".".join(parts[2:-1]), parts[-1] == "lora_embedding_A"
38 | 
39 |     raise ValueError(f"{name} is unsupported format")
40 | 


--------------------------------------------------------------------------------
/vllm/model_executor/__init__.py:
--------------------------------------------------------------------------------
 1 | from vllm.model_executor.input_metadata import InputMetadata
 2 | from vllm.model_executor.sampling_metadata import SamplingMetadata
 3 | from vllm.model_executor.utils import set_random_seed, get_model
 4 | 
 5 | __all__ = [
 6 |     "InputMetadata",
 7 |     "get_model",
 8 |     "SamplingMetadata",
 9 |     "set_random_seed",
10 | ]
11 | 


--------------------------------------------------------------------------------
/vllm/model_executor/layers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/vllm/model_executor/layers/__init__.py


--------------------------------------------------------------------------------
/vllm/model_executor/layers/attention/__init__.py:
--------------------------------------------------------------------------------
1 | from vllm.model_executor.layers.attention.attention import Attention
2 | 
3 | __all__ = [
4 |     "Attention",
5 | ]
6 | 


--------------------------------------------------------------------------------
/vllm/model_executor/layers/attention/backends/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/vllm/model_executor/layers/attention/backends/__init__.py


--------------------------------------------------------------------------------
/vllm/model_executor/layers/attention/ops/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/vllm/model_executor/layers/attention/ops/__init__.py


--------------------------------------------------------------------------------
/vllm/model_executor/layers/fused_moe/__init__.py:
--------------------------------------------------------------------------------
 1 | from vllm.model_executor.layers.fused_moe.fused_moe import (
 2 |     fused_moe,
 3 |     get_config_file_name,
 4 | )
 5 | 
 6 | __all__ = [
 7 |     "fused_moe",
 8 |     "get_config_file_name",
 9 | ]
10 | 


--------------------------------------------------------------------------------
/vllm/model_executor/layers/fused_moe/configs/README:
--------------------------------------------------------------------------------
 1 | This directory contains tuned configurations for different settings of the fused_moe kernel.
 2 | For different settings of
 3 | - E (number of experts)
 4 | - N (intermediate size)
 5 | - device_name (torch.cuda.get_device_name())
 6 | the JSON file contains a mapping from M (batch size) to the chosen configuration.
 7 | 
 8 | The example configurations provided are for the Mixtral model for TP2 on H100
 9 | and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have
10 | N = 7168 and for TP4 we have N = 3584.
11 | 


--------------------------------------------------------------------------------
/vllm/model_executor/layers/layernorm.py:
--------------------------------------------------------------------------------
 1 | """Custom normalization layers."""
 2 | from typing import Optional, Tuple, Union
 3 | 
 4 | import torch
 5 | import torch.nn as nn
 6 | 
 7 | from vllm._C import ops
 8 | 
 9 | 
10 | class RMSNorm(nn.Module):
11 |     """Root mean square normalization.
12 | 
13 |     Computes x -> w * x / sqrt(E[x^2] + eps) where w is the learned weight.
14 |     Refer to https://arxiv.org/abs/1910.07467
15 |     """
16 | 
17 |     def __init__(
18 |         self,
19 |         hidden_size: int,
20 |         eps: float = 1e-6,
21 |     ) -> None:
22 |         super().__init__()
23 |         self.weight = nn.Parameter(torch.ones(hidden_size))
24 |         self.variance_epsilon = eps
25 | 
26 |     def _forward(
27 |         self,
28 |         x: torch.Tensor,
29 |         residual: Optional[torch.Tensor] = None,
30 |     ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
31 |         """PyTorch-native implementation equivalent to forward()."""
32 |         orig_dtype = x.dtype
33 |         x = x.to(torch.float32)
34 |         if residual is not None:
35 |             x = x + residual.to(torch.float32)
36 |             residual = x.to(orig_dtype)
37 | 
38 |         variance = x.pow(2).mean(dim=-1, keepdim=True)
39 |         x = x * torch.rsqrt(variance + self.variance_epsilon)
40 |         x = x.to(orig_dtype) * self.weight
41 |         if residual is None:
42 |             return x
43 |         else:
44 |             return x, residual
45 | 
46 |     def forward(
47 |         self,
48 |         x: torch.Tensor,
49 |         residual: Optional[torch.Tensor] = None,
50 |     ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
51 |         if residual is not None:
52 |             ops.fused_add_rms_norm(
53 |                 x,
54 |                 residual,
55 |                 self.weight.data,
56 |                 self.variance_epsilon,
57 |             )
58 |             return x, residual
59 |         out = torch.empty_like(x)
60 |         ops.rms_norm(
61 |             out,
62 |             x,
63 |             self.weight.data,
64 |             self.variance_epsilon,
65 |         )
66 |         return out
67 | 


--------------------------------------------------------------------------------
/vllm/model_executor/layers/ops/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/vllm/model_executor/layers/ops/__init__.py


--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/__init__.py:
--------------------------------------------------------------------------------
 1 | from typing import Type
 2 | 
 3 | from vllm.model_executor.layers.quantization.base_config import (
 4 |     QuantizationConfig)
 5 | from vllm.model_executor.layers.quantization.awq import AWQConfig
 6 | from vllm.model_executor.layers.quantization.gptq import GPTQConfig
 7 | from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig
 8 | from vllm.model_executor.layers.quantization.marlin import MarlinConfig
 9 | 
10 | _QUANTIZATION_CONFIG_REGISTRY = {
11 |     "awq": AWQConfig,
12 |     "gptq": GPTQConfig,
13 |     "squeezellm": SqueezeLLMConfig,
14 |     "marlin": MarlinConfig,
15 | }
16 | 
17 | 
18 | def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
19 |     if quantization not in _QUANTIZATION_CONFIG_REGISTRY:
20 |         raise ValueError(f"Invalid quantization method: {quantization}")
21 |     return _QUANTIZATION_CONFIG_REGISTRY[quantization]
22 | 
23 | 
24 | __all__ = [
25 |     "QuantizationConfig",
26 |     "get_quantization_config",
27 | ]
28 | 


--------------------------------------------------------------------------------
/vllm/model_executor/layers/quantization/base_config.py:
--------------------------------------------------------------------------------
 1 | from abc import ABC, abstractmethod
 2 | from typing import Any, Dict, List
 3 | 
 4 | import torch
 5 | 
 6 | from vllm.model_executor.layers.linear import LinearMethodBase
 7 | 
 8 | 
 9 | class QuantizationConfig(ABC):
10 |     """Base class for quantization configs."""
11 | 
12 |     @abstractmethod
13 |     def get_name(self) -> str:
14 |         """Name of the quantization method."""
15 |         raise NotImplementedError
16 | 
17 |     @abstractmethod
18 |     def get_supported_act_dtypes(self) -> List[torch.dtype]:
19 |         """List of supported activation dtypes."""
20 |         raise NotImplementedError
21 | 
22 |     @abstractmethod
23 |     def get_min_capability(self) -> int:
24 |         """Minimum GPU capability to support the quantization method.
25 | 
26 |         E.g., 70 for Volta, 75 for Turing, 80 for Ampere.
27 |         This requirement is due to the custom CUDA kernels used by the
28 |         quantization method.
29 |         """
30 |         raise NotImplementedError
31 | 
32 |     @staticmethod
33 |     @abstractmethod
34 |     def get_config_filenames() -> List[str]:
35 |         """List of filenames to search for in the model directory."""
36 |         raise NotImplementedError
37 | 
38 |     @classmethod
39 |     @abstractmethod
40 |     def from_config(cls, config: Dict[str, Any]) -> "QuantizationConfig":
41 |         """Create a config class from the model's quantization config."""
42 |         raise NotImplementedError
43 | 
44 |     @staticmethod
45 |     def get_from_keys(config: Dict[str, Any], keys: List[str]) -> Any:
46 |         """Get a value from the model's quantization config."""
47 |         for key in keys:
48 |             if key in config:
49 |                 return config[key]
50 |         raise ValueError(f"Cannot find any of {keys} in the model's "
51 |                          "quantization config.")
52 | 
53 |     @abstractmethod
54 |     def get_linear_method(self) -> LinearMethodBase:
55 |         """Get the linear method to use for the quantized linear layer."""
56 |         raise NotImplementedError
57 | 
58 |     @abstractmethod
59 |     def get_scaled_act_names(self) -> List[str]:
60 |         """Returns the activation function names that should be post-scaled.
61 | 
62 |         For now, this is only used by AWQ.
63 |         """
64 |         raise NotImplementedError
65 | 


--------------------------------------------------------------------------------
/vllm/model_executor/neuron_model_loader.py:
--------------------------------------------------------------------------------
 1 | """Utilities for selecting and loading models."""
 2 | from typing import Type
 3 | 
 4 | import torch
 5 | import torch.nn as nn
 6 | from transformers import PretrainedConfig
 7 | 
 8 | from vllm.config import ModelConfig, DeviceConfig
 9 | from vllm.model_executor.models import ModelRegistry
10 | 
11 | TORCH_DTYPE_TO_NEURON_AMP = {
12 |     "auto": "f32",
13 |     "half": "f16",
14 |     "float16": "f16",
15 |     "bfloat16": "bf16",
16 |     "float": "f32",
17 |     "float32": "f32",
18 |     torch.float16: "f16",
19 |     torch.bfloat16: "bf16",
20 |     torch.float32: "f32",
21 | }
22 | 
23 | 
24 | def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]:
25 |     architectures = getattr(config, "architectures", [])
26 |     for arch in architectures:
27 |         model_cls = ModelRegistry.load_model_cls(arch)
28 |         if model_cls is not None:
29 |             return model_cls
30 |     raise ValueError(
31 |         f"Model architectures {architectures} are not supported for now. "
32 |         f"Supported architectures: {ModelRegistry.get_supported_archs()}")
33 | 
34 | 
35 | def get_model(model_config: ModelConfig, device_config: DeviceConfig,
36 |               **kwargs) -> nn.Module:
37 |     from transformers_neuronx.config import (NeuronConfig,
38 |                                              ContinuousBatchingConfig)
39 | 
40 |     parallel_config = kwargs.get("parallel_config")
41 |     scheduler_config = kwargs.get("scheduler_config")
42 | 
43 |     model_class = _get_model_architecture(model_config.hf_config)
44 |     linear_method = None
45 | 
46 |     # Create a model instance.
47 |     model = model_class(model_config.hf_config, linear_method)
48 | 
49 |     continuous_batching_config = ContinuousBatchingConfig(
50 |         batch_size_for_shared_caches=scheduler_config.max_num_seqs)
51 |     neuron_config = NeuronConfig(
52 |         continuous_batching=continuous_batching_config)
53 | 
54 |     # Load the weights from the cached or downloaded files.
55 |     model.load_weights(
56 |         model_config.model,
57 |         model_config.download_dir,
58 |         model_config.load_format,
59 |         model_config.revision,
60 |         tp_degree=parallel_config.neuron_tp_degree,
61 |         amp=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype],
62 |         neuron_config=neuron_config,
63 |         context_length_estimate=[scheduler_config.max_model_len],
64 |         n_positions=[scheduler_config.max_model_len],
65 |         batch_size=scheduler_config.max_num_seqs)
66 | 
67 |     return model.eval()
68 | 


--------------------------------------------------------------------------------
/vllm/model_executor/parallel_utils/README.md:
--------------------------------------------------------------------------------
1 | The files in this folder are ported from [Megatron-LM](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core). We only keep the codes that are used in inference.


--------------------------------------------------------------------------------
/vllm/model_executor/parallel_utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/vllm/model_executor/parallel_utils/__init__.py


--------------------------------------------------------------------------------
/vllm/model_executor/parallel_utils/utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 The vLLM team.
 2 | # Adapted from
 3 | # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py
 4 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 5 | from typing import Sequence
 6 | 
 7 | import torch
 8 | 
 9 | 
10 | def ensure_divisibility(numerator, denominator):
11 |     """Ensure that numerator is divisible by the denominator."""
12 |     assert numerator % denominator == 0, "{} is not divisible by {}".format(
13 |         numerator, denominator)
14 | 
15 | 
16 | def divide(numerator, denominator):
17 |     """Ensure that numerator is divisible by the denominator and return
18 |     the division value."""
19 |     ensure_divisibility(numerator, denominator)
20 |     return numerator // denominator
21 | 
22 | 
23 | def split_tensor_along_last_dim(
24 |     tensor: torch.Tensor,
25 |     num_partitions: int,
26 |     contiguous_split_chunks: bool = False,
27 | ) -> Sequence[torch.Tensor]:
28 |     """ Split a tensor along its last dimension.
29 | 
30 |         Arguments:
31 |             tensor: input tensor.
32 |             num_partitions: number of partitions to split the tensor
33 |             contiguous_split_chunks: If True, make each chunk contiguous
34 |                                      in memory.
35 | 
36 |         Returns:
37 |             A list of Tensors
38 |     """
39 |     # Get the size and dimension.
40 |     last_dim = tensor.dim() - 1
41 |     last_dim_size = divide(tensor.size()[last_dim], num_partitions)
42 |     # Split.
43 |     tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
44 |     # NOTE: torch.split does not create contiguous tensors by default.
45 |     if contiguous_split_chunks:
46 |         return tuple(chunk.contiguous() for chunk in tensor_list)
47 | 
48 |     return tensor_list
49 | 


--------------------------------------------------------------------------------
/vllm/model_executor/utils.py:
--------------------------------------------------------------------------------
 1 | """Utils for model executor."""
 2 | import random
 3 | import importlib
 4 | from typing import Any, Dict, Optional
 5 | 
 6 | import numpy as np
 7 | import torch
 8 | 
 9 | from vllm.config import DeviceConfig, ModelConfig
10 | 
11 | DEVICE_TO_MODEL_LOADER_MAP = {
12 |     "cuda": "model_loader",
13 |     "neuron": "neuron_model_loader",
14 | }
15 | 
16 | 
17 | def set_random_seed(seed: int) -> None:
18 |     random.seed(seed)
19 |     np.random.seed(seed)
20 |     torch.manual_seed(seed)
21 |     if torch.cuda.is_available():
22 |         torch.cuda.manual_seed_all(seed)
23 | 
24 | 
25 | def set_weight_attrs(
26 |     weight: torch.Tensor,
27 |     weight_attrs: Optional[Dict[str, Any]],
28 | ):
29 |     """Set attributes on a weight tensor.
30 | 
31 |     This method is used to set attributes on a weight tensor. This method
32 |     will not overwrite existing attributes.
33 | 
34 |     Args:
35 |         weight: The weight tensor.
36 |         weight_attrs: A dictionary of attributes to set on the weight tensor.
37 |     """
38 |     if weight_attrs is None:
39 |         return
40 |     for key, value in weight_attrs.items():
41 |         assert not hasattr(
42 |             weight, key), (f"Overwriting existing tensor attribute: {key}")
43 |         setattr(weight, key, value)
44 | 
45 | 
46 | def get_model(model_config: ModelConfig, device_config: DeviceConfig,
47 |               **kwargs) -> torch.nn.Module:
48 |     model_loader_module = DEVICE_TO_MODEL_LOADER_MAP[device_config.device_type]
49 |     imported_model_loader = importlib.import_module(
50 |         f"vllm.model_executor.{model_loader_module}")
51 |     get_model_fn = imported_model_loader.get_model
52 |     return get_model_fn(model_config, device_config, **kwargs)
53 | 


--------------------------------------------------------------------------------
/vllm/py.typed:
--------------------------------------------------------------------------------
1 | # Marker file for PEP 561.
2 | # The vllm package uses inline types.
3 | 


--------------------------------------------------------------------------------
/vllm/spec_decode/interfaces.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Tuple, Optional, Dict
 2 | from dataclasses import dataclass
 3 | from abc import ABC, abstractmethod
 4 | 
 5 | import torch
 6 | 
 7 | from vllm.sequence import SequenceGroupMetadata
 8 | 
 9 | 
10 | @dataclass
11 | class SpeculativeProposals:
12 |     """Datastructure used to represent proposal tokens from some proposer. It
13 |     also tracks how many speculative tokens each sequence has.
14 |     """
15 | 
16 |     # Speculative proposal tokens.
17 |     proposal_token_ids: torch.Tensor
18 | 
19 |     # Probabilities of the proposal tokens according to the proposer.
20 |     proposal_probs: torch.Tensor
21 | 
22 |     # The valid length of each proposal; can be zero.
23 |     proposal_lens: torch.Tensor
24 | 
25 |     def __repr__(self):
26 |         return (f"SpeculativeProposals("
27 |                 f"proposal_token_ids={self.proposal_token_ids.shape}, "
28 |                 f"proposal_probs={self.proposal_probs.shape}, "
29 |                 f"proposal_lens={self.proposal_lens.shape})")
30 | 
31 | 
32 | @dataclass
33 | class SpeculativeScores:
34 |     """Datastructure used to represent the scores of speculative tokens
35 |     according to the scoring model.
36 |     """
37 | 
38 |     # Probabilities of the speculative tokens according to the scoring model.
39 |     probs: torch.Tensor
40 | 
41 |     # Token ids sampled from the scoring model. Used for speculative bonus
42 |     # tokens and also non-speculative normal decoding.
43 |     token_ids: torch.Tensor
44 | 
45 |     def __repr__(self):
46 |         return (f"SpeculativeScores("
47 |                 f"probs={self.probs.shape}, "
48 |                 f"token_ids={self.token_ids.shape})")
49 | 
50 | 
51 | class SpeculativeProposer(ABC):
52 | 
53 |     @abstractmethod
54 |     def get_proposals(
55 |         self,
56 |         seq_group_metadata_list: List[SequenceGroupMetadata],
57 |         blocks_to_swap_in: Dict[int, int],
58 |         blocks_to_swap_out: Dict[int, int],
59 |         blocks_to_copy: Dict[int, List[int]],
60 |         max_proposal_len: int,
61 |     ) -> SpeculativeProposals:
62 |         raise NotImplementedError
63 | 
64 | 
65 | class SpeculativeScorer(ABC):
66 | 
67 |     @abstractmethod
68 |     def score_proposals(
69 |         self,
70 |         seq_group_metadata_list: List[SequenceGroupMetadata],
71 |         blocks_to_swap_in: Optional[Dict[int, int]],
72 |         blocks_to_swap_out: Optional[Dict[int, int]],
73 |         blocks_to_copy: Optional[Dict[int, List[int]]],
74 |         k: int,
75 |         proposals: SpeculativeProposals,
76 |     ) -> Tuple[torch.Tensor, torch.Tensor]:
77 |         raise NotImplementedError
78 | 


--------------------------------------------------------------------------------
/vllm/test_utils.py:
--------------------------------------------------------------------------------
 1 | import ray
 2 | 
 3 | from vllm.config import ParallelConfig
 4 | from vllm.utils import get_open_port
 5 | from vllm.worker.worker import init_distributed_environment
 6 | 
 7 | 
 8 | def init_test_distributed_environment(
 9 |     pipeline_parallel_size: int,
10 |     tensor_parallel_size: int,
11 |     rank: int,
12 |     distributed_init_port: str,
13 | ) -> None:
14 |     parallel_config = ParallelConfig(pipeline_parallel_size,
15 |                                      tensor_parallel_size,
16 |                                      worker_use_ray=True)
17 |     distributed_init_method = f"tcp://localhost:{distributed_init_port}"
18 |     init_distributed_environment(
19 |         parallel_config,
20 |         rank,
21 |         cupy_port=None,
22 |         distributed_init_method=distributed_init_method)
23 | 
24 | 
25 | def multi_process_tensor_parallel(
26 |     tensor_parallel_size: int,
27 |     test_target,
28 | ) -> None:
29 |     # Using ray helps debugging the error when it failed
30 |     # as compared to multiprocessing.
31 |     ray.init()
32 | 
33 |     distributed_init_port = get_open_port()
34 |     refs = []
35 |     for rank in range(tensor_parallel_size):
36 |         refs.append(
37 |             test_target.remote(tensor_parallel_size, rank,
38 |                                distributed_init_port))
39 |     ray.get(refs)
40 | 
41 |     ray.shutdown()
42 | 


--------------------------------------------------------------------------------
/vllm/transformers_utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/vllm/transformers_utils/__init__.py


--------------------------------------------------------------------------------
/vllm/transformers_utils/config.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional
 2 | 
 3 | from transformers import AutoConfig, PretrainedConfig
 4 | 
 5 | from vllm.transformers_utils.configs import *
 6 | 
 7 | _CONFIG_REGISTRY = {
 8 |     "chatglm": ChatGLMConfig,
 9 |     "mpt": MPTConfig,
10 |     "RefinedWeb": RWConfig,  # For tiiuae/falcon-40b(-instruct)
11 |     "RefinedWebModel": RWConfig,  # For tiiuae/falcon-7b(-instruct)
12 |     "starcoder2": Starcoder2Config,
13 | }
14 | 
15 | 
16 | def get_config(model: str,
17 |                trust_remote_code: bool,
18 |                revision: Optional[str] = None,
19 |                code_revision: Optional[str] = None) -> PretrainedConfig:
20 |     # FIXME(woosuk): This is a temporary fix for StarCoder2.
21 |     # Remove this when the model is supported by HuggingFace transformers.
22 |     if "bigcode" in model and "starcoder2" in model:
23 |         config_class = _CONFIG_REGISTRY["starcoder2"]
24 |         config = config_class.from_pretrained(model,
25 |                                               revision=revision,
26 |                                               code_revision=code_revision)
27 |         return config
28 | 
29 |     try:
30 |         config = AutoConfig.from_pretrained(
31 |             model,
32 |             trust_remote_code=trust_remote_code,
33 |             revision=revision,
34 |             code_revision=code_revision)
35 |     except ValueError as e:
36 |         if (not trust_remote_code and
37 |                 "requires you to execute the configuration file" in str(e)):
38 |             err_msg = (
39 |                 "Failed to load the model config. If the model is a custom "
40 |                 "model not yet available in the HuggingFace transformers "
41 |                 "library, consider setting `trust_remote_code=True` in LLM "
42 |                 "or using the `--trust-remote-code` flag in the CLI.")
43 |             raise RuntimeError(err_msg) from e
44 |         else:
45 |             raise e
46 |     if config.model_type in _CONFIG_REGISTRY:
47 |         config_class = _CONFIG_REGISTRY[config.model_type]
48 |         config = config_class.from_pretrained(model,
49 |                                               revision=revision,
50 |                                               code_revision=code_revision)
51 |     return config
52 | 


--------------------------------------------------------------------------------
/vllm/transformers_utils/configs/__init__.py:
--------------------------------------------------------------------------------
 1 | from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
 2 | from vllm.transformers_utils.configs.mpt import MPTConfig
 3 | # RWConfig is for the original tiiuae/falcon-40b(-instruct) and
 4 | # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
 5 | # `FalconConfig` class from the official HuggingFace transformers library.
 6 | from vllm.transformers_utils.configs.falcon import RWConfig
 7 | from vllm.transformers_utils.configs.starcoder2 import Starcoder2Config
 8 | 
 9 | __all__ = [
10 |     "ChatGLMConfig",
11 |     "MPTConfig",
12 |     "RWConfig",
13 |     "Starcoder2Config",
14 | ]
15 | 


--------------------------------------------------------------------------------
/vllm/transformers_utils/configs/chatglm.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Adapted from
 3 | # https://github.com/THUDM/ChatGLM2-6B
 4 | from transformers import PretrainedConfig
 5 | 
 6 | 
 7 | class ChatGLMConfig(PretrainedConfig):
 8 |     model_type = "chatglm"
 9 |     attribute_map = {
10 |         "num_hidden_layers": "num_layers",
11 |         "n_head_kv": "multi_query_group_num",
12 |     }
13 | 
14 |     def __init__(self,
15 |                  num_layers=28,
16 |                  padded_vocab_size=65024,
17 |                  hidden_size=4096,
18 |                  ffn_hidden_size=13696,
19 |                  kv_channels=128,
20 |                  num_attention_heads=32,
21 |                  seq_length=2048,
22 |                  hidden_dropout=0.0,
23 |                  attention_dropout=0.0,
24 |                  layernorm_epsilon=1e-5,
25 |                  rmsnorm=True,
26 |                  apply_residual_connection_post_layernorm=False,
27 |                  post_layer_norm=True,
28 |                  add_bias_linear=False,
29 |                  add_qkv_bias=False,
30 |                  interleaved_qkv=False,
31 |                  bias_dropout_fusion=True,
32 |                  multi_query_attention=False,
33 |                  multi_query_group_num=1,
34 |                  apply_query_key_layer_scaling=True,
35 |                  attention_softmax_in_fp32=True,
36 |                  fp32_residual_connection=False,
37 |                  quantization_bit=0,
38 |                  pre_seq_len=None,
39 |                  prefix_projection=False,
40 |                  **kwargs):
41 |         self.num_layers = num_layers
42 |         self.vocab_size = padded_vocab_size
43 |         self.padded_vocab_size = padded_vocab_size
44 |         self.hidden_size = hidden_size
45 |         self.ffn_hidden_size = ffn_hidden_size
46 |         self.kv_channels = kv_channels
47 |         self.num_attention_heads = num_attention_heads
48 |         self.seq_length = seq_length
49 |         self.hidden_dropout = hidden_dropout
50 |         self.attention_dropout = attention_dropout
51 |         self.layernorm_epsilon = layernorm_epsilon
52 |         self.rmsnorm = rmsnorm
53 |         self.apply_residual_connection_post_layernorm = (
54 |             apply_residual_connection_post_layernorm)
55 |         self.post_layer_norm = post_layer_norm
56 |         self.add_bias_linear = add_bias_linear
57 |         self.add_qkv_bias = add_qkv_bias
58 |         self.bias_dropout_fusion = bias_dropout_fusion
59 |         self.multi_query_attention = multi_query_attention
60 |         self.multi_query_group_num = multi_query_group_num
61 |         self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
62 |         self.attention_softmax_in_fp32 = attention_softmax_in_fp32
63 |         self.fp32_residual_connection = fp32_residual_connection
64 |         self.quantization_bit = quantization_bit
65 |         self.pre_seq_len = pre_seq_len
66 |         self.prefix_projection = prefix_projection
67 |         self.interleaved_qkv = interleaved_qkv
68 |         super().__init__(**kwargs)
69 | 


--------------------------------------------------------------------------------
/vllm/transformers_utils/configs/starcoder2.py:
--------------------------------------------------------------------------------
 1 | from transformers import PretrainedConfig
 2 | 
 3 | 
 4 | class Starcoder2Config(PretrainedConfig):
 5 |     model_type = "starcoder2"
 6 |     keys_to_ignore_at_inference = ["past_key_values"]
 7 | 
 8 |     def __init__(
 9 |         self,
10 |         vocab_size=49152,
11 |         hidden_size=3072,
12 |         intermediate_size=12288,
13 |         num_hidden_layers=30,
14 |         num_attention_heads=24,
15 |         num_key_value_heads=2,
16 |         hidden_act="gelu_pytorch_tanh",
17 |         max_position_embeddings=4096,
18 |         initializer_range=0.018042,
19 |         norm_epsilon=1e-5,
20 |         use_cache=True,
21 |         bos_token_id=50256,
22 |         eos_token_id=50256,
23 |         rope_theta=10000.0,
24 |         sliding_window=None,
25 |         attention_dropout=0.0,
26 |         residual_dropout=0.0,
27 |         embedding_dropout=0.0,
28 |         use_bias=True,
29 |         **kwargs,
30 |     ):
31 |         self.vocab_size = vocab_size
32 |         self.max_position_embeddings = max_position_embeddings
33 |         self.hidden_size = hidden_size
34 |         self.intermediate_size = intermediate_size
35 |         self.num_hidden_layers = num_hidden_layers
36 |         self.num_attention_heads = num_attention_heads
37 |         self.sliding_window = sliding_window
38 |         self.use_bias = use_bias
39 |         self.num_key_value_heads = num_key_value_heads
40 |         self.hidden_act = hidden_act
41 |         self.initializer_range = initializer_range
42 |         self.norm_epsilon = norm_epsilon
43 |         self.use_cache = use_cache
44 |         self.rope_theta = rope_theta
45 |         self.attention_dropout = attention_dropout
46 |         self.residual_dropout = residual_dropout
47 |         self.embedding_dropout = embedding_dropout
48 | 
49 |         super().__init__(
50 |             bos_token_id=bos_token_id,
51 |             eos_token_id=eos_token_id,
52 |             **kwargs,
53 |         )
54 |         if self.architectures is None:
55 |             self.architectures = ['Starcoder2ForCausalLM']
56 | 


--------------------------------------------------------------------------------
/vllm/transformers_utils/tokenizer_group/__init__.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional
 2 | from vllm.config import TokenizerPoolConfig
 3 | from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
 4 |     BaseTokenizerGroup)
 5 | from vllm.transformers_utils.tokenizer_group.tokenizer_group import (
 6 |     TokenizerGroup)
 7 | from vllm.engine.ray_utils import ray
 8 | 
 9 | if ray:
10 |     from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import (
11 |         RayTokenizerGroupPool)
12 | else:
13 |     RayTokenizerGroupPool = None
14 | 
15 | 
16 | def get_tokenizer_group(tokenizer_pool_config: Optional[TokenizerPoolConfig],
17 |                         **init_kwargs) -> BaseTokenizerGroup:
18 |     if tokenizer_pool_config is None:
19 |         return TokenizerGroup(**init_kwargs)
20 |     if tokenizer_pool_config.pool_type == "ray":
21 |         if RayTokenizerGroupPool is None:
22 |             raise ImportError(
23 |                 "RayTokenizerGroupPool is not available. Please install "
24 |                 "the ray package to use the Ray tokenizer group pool.")
25 |         return RayTokenizerGroupPool.from_config(tokenizer_pool_config,
26 |                                                  **init_kwargs)
27 |     else:
28 |         raise ValueError(
29 |             f"Unknown pool type: {tokenizer_pool_config.pool_type}")
30 | 
31 | 
32 | __all__ = ["get_tokenizer_group", "BaseTokenizerGroup"]
33 | 


--------------------------------------------------------------------------------
/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py:
--------------------------------------------------------------------------------
 1 | from abc import ABC, abstractmethod
 2 | from typing import List, Optional
 3 | 
 4 | from transformers import PreTrainedTokenizer
 5 | 
 6 | from vllm.lora.request import LoRARequest
 7 | 
 8 | 
 9 | class BaseTokenizerGroup(ABC):
10 |     """A group of tokenizers that can be used for LoRA adapters."""
11 | 
12 |     @abstractmethod
13 |     def ping(self) -> bool:
14 |         """Check if the tokenizer group is alive."""
15 |         pass
16 | 
17 |     @abstractmethod
18 |     def get_max_input_len(self,
19 |                           lora_request: Optional[LoRARequest] = None
20 |                           ) -> Optional[int]:
21 |         """Get the maximum input length for the LoRA request."""
22 |         pass
23 | 
24 |     @abstractmethod
25 |     def encode(self,
26 |                prompt: str,
27 |                request_id: Optional[str] = None,
28 |                lora_request: Optional[LoRARequest] = None) -> List[int]:
29 |         """Encode a prompt using the tokenizer group."""
30 |         pass
31 | 
32 |     @abstractmethod
33 |     async def encode_async(
34 |             self,
35 |             prompt: str,
36 |             request_id: Optional[str] = None,
37 |             lora_request: Optional[LoRARequest] = None) -> List[int]:
38 |         """Encode a prompt using the tokenizer group."""
39 |         pass
40 | 
41 |     @abstractmethod
42 |     def get_lora_tokenizer(
43 |             self,
44 |             lora_request: Optional[LoRARequest] = None
45 |     ) -> "PreTrainedTokenizer":
46 |         """Get a tokenizer for a LoRA request."""
47 |         pass
48 | 
49 |     @abstractmethod
50 |     async def get_lora_tokenizer_async(
51 |             self,
52 |             lora_request: Optional[LoRARequest] = None
53 |     ) -> "PreTrainedTokenizer":
54 |         """Get a tokenizer for a LoRA request."""
55 |         pass
56 | 


--------------------------------------------------------------------------------
/vllm/transformers_utils/tokenizers/__init__.py:
--------------------------------------------------------------------------------
1 | from vllm.transformers_utils.tokenizers.baichuan import BaichuanTokenizer
2 | 
3 | __all__ = [
4 |     "BaichuanTokenizer",
5 | ]
6 | 


--------------------------------------------------------------------------------
/vllm/worker/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SakanaAI/vllm/f1c0fc391909e55fce5f109893f3c483f69a091f/vllm/worker/__init__.py


--------------------------------------------------------------------------------