├── .buildkite ├── check-wheel-size.py ├── download-images.sh ├── run-amd-test.sh ├── run-benchmarks.sh ├── run-cpu-test.sh ├── run-neuron-test.sh ├── test-pipeline.yaml └── test-template.j2 ├── .dockerignore ├── .github ├── ISSUE_TEMPLATE │ ├── 100-documentation.yml │ ├── 200-installation.yml │ ├── 300-usage.yml │ ├── 400-bug report.yml │ ├── 500-feature request.yml │ ├── 600-new model.yml │ ├── 700-performance discussion.yml │ ├── 750-RFC.yml │ ├── 800-misc discussion.yml │ └── config.yml ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── mypy.yaml │ ├── publish.yml │ ├── ruff.yml │ ├── scripts │ ├── build.sh │ ├── create_release.js │ ├── cuda-install.sh │ ├── env.sh │ └── pytorch-install.sh │ └── yapf.yml ├── .gitignore ├── .readthedocs.yaml ├── .yapfignore ├── CMakeLists.txt ├── CONTRIBUTING.md ├── Dockerfile ├── Dockerfile.cpu ├── Dockerfile.neuron ├── Dockerfile.rocm ├── LICENSE ├── MANIFEST.in ├── README.md ├── README_vllm_musa.md ├── benchmarks ├── README.md ├── backend_request_func.py ├── benchmark_latency.py ├── benchmark_prefix_caching.py ├── benchmark_serving.py ├── benchmark_throughput.py ├── kernels │ ├── benchmark_aqlm.py │ ├── benchmark_mixtral_moe.py │ ├── benchmark_paged_attention.py │ └── benchmark_rope.py ├── launch_tgi_server.sh └── sonnet.txt ├── build_musa.sh ├── cmake ├── cpu_extension.cmake ├── hipify.py └── utils.cmake ├── collect_env.py ├── csrc_musa ├── activation_kernels.mu ├── attention │ ├── attention_dtypes.h │ ├── attention_generic.muh │ ├── attention_kernels.mu │ ├── attention_utils.muh │ ├── dtype_bfloat16.muh │ ├── dtype_float16.muh │ ├── dtype_float32.muh │ └── dtype_fp8.muh ├── cache.h ├── cache_kernels.mu ├── cpu │ ├── activation.cpp │ ├── attention.cpp │ ├── cache.cpp │ ├── cpu_types.hpp │ ├── layernorm.cpp │ ├── pos_encoding.cpp │ └── pybind.cpp ├── custom_all_reduce.mu ├── custom_all_reduce.muh ├── custom_all_reduce_test.mu ├── dispatch_utils.h ├── layernorm_kernels.mu ├── moe │ ├── moe_ops.cpp │ ├── moe_ops.h │ └── topk_softmax_kernels.mu ├── moe_align_block_size_kernels.mu ├── musa_compat.h ├── musa_utils.h ├── musa_utils_kernels.mu ├── ops.h ├── pos_encoding_kernels.mu ├── punica │ ├── .LICENSE │ ├── bgmv │ │ ├── bgmv_bf16_bf16_bf16.mu │ │ ├── bgmv_bf16_fp32_bf16.mu │ │ ├── bgmv_config.h │ │ ├── bgmv_fp16_fp16_fp16.mu │ │ ├── bgmv_fp16_fp32_fp16.mu │ │ ├── bgmv_fp32_bf16_bf16.mu │ │ ├── bgmv_fp32_fp16_fp16.mu │ │ ├── bgmv_impl.muh │ │ ├── generator.py │ │ └── vec_dtypes.muh │ └── punica_ops.cc ├── pybind.cpp ├── quantization │ ├── aqlm │ │ └── gemm_kernels.mu │ ├── awq │ │ ├── dequantize.muh │ │ └── gemm_kernels.mu │ ├── fp8 │ │ ├── amd_detail │ │ │ ├── hip_float8.h │ │ │ ├── hip_float8_impl.h │ │ │ └── quant_utils.muh │ │ └── fp8_cuda_kernels.mu │ ├── fp8_e5m2_kvcache │ │ └── quant_utils.muh │ ├── gptq │ │ ├── compat.muh │ │ ├── matrix_view.muh │ │ ├── q_gemm.mu │ │ ├── qdq_2.muh │ │ ├── qdq_3.muh │ │ ├── qdq_4.muh │ │ ├── qdq_8.muh │ │ └── qdq_util.muh │ ├── gptq_marlin │ │ ├── gptq_marlin.mu │ │ ├── gptq_marlin.muh │ │ └── gptq_marlin_repack.mu │ ├── marlin │ │ ├── .LICENSE │ │ └── marlin_cuda_kernel.mu │ └── squeezellm │ │ └── quant_cuda_kernel.mu └── reduction_utils.muh ├── docs ├── Makefile ├── README.md ├── make.bat ├── requirements-docs.txt └── source │ ├── assets │ ├── dev │ │ └── dockerfile-stages-dependency.png │ ├── kernel │ │ ├── k_vecs.png │ │ ├── key.png │ │ ├── logits_vec.png │ │ ├── q_vecs.png │ │ ├── query.png │ │ ├── v_vec.png │ │ └── value.png │ └── logos │ │ ├── vllm-logo-only-light.png │ │ ├── vllm-logo-text-dark.png │ │ └── 
vllm-logo-text-light.png │ ├── conf.py │ ├── dev │ ├── dockerfile │ │ └── dockerfile.rst │ ├── engine │ │ ├── async_llm_engine.rst │ │ ├── engine_index.rst │ │ └── llm_engine.rst │ ├── kernel │ │ └── paged_attention.rst │ └── sampling_params.rst │ ├── generate_examples.py │ ├── getting_started │ ├── amd-installation.rst │ ├── cpu-installation.rst │ ├── examples │ │ └── examples_index.template.rst │ ├── installation.rst │ ├── neuron-installation.rst │ └── quickstart.rst │ ├── index.rst │ ├── models │ ├── adding_model.rst │ ├── engine_args.rst │ ├── lora.rst │ ├── performance.rst │ └── supported_models.rst │ ├── quantization │ ├── auto_awq.rst │ ├── fp8_e4m3_kvcache.rst │ └── fp8_e5m2_kvcache.rst │ └── serving │ ├── deploying_with_bentoml.rst │ ├── deploying_with_docker.rst │ ├── deploying_with_kserve.rst │ ├── deploying_with_triton.rst │ ├── distributed_serving.rst │ ├── env_vars.rst │ ├── integrations.rst │ ├── metrics.rst │ ├── openai_compatible_server.md │ ├── run_on_sky.rst │ ├── serving_with_langchain.rst │ └── usage_stats.md ├── examples ├── api_client.py ├── aqlm_example.py ├── fp8 │ ├── README.md │ ├── extract_scales.py │ └── quantizer │ │ ├── README.md │ │ └── quantize.py ├── gradio_openai_chatbot_webserver.py ├── gradio_webserver.py ├── llava_example.py ├── llm_engine_example.py ├── logging_configuration.md ├── multilora_inference.py ├── offline_inference.py ├── offline_inference_distributed.py ├── offline_inference_neuron.py ├── offline_inference_with_prefix.py ├── openai_chat_completion_client.py ├── openai_completion_client.py ├── production_monitoring │ ├── README.md │ ├── docker-compose.yaml │ ├── grafana.json │ └── prometheus.yaml ├── template_alpaca.jinja ├── template_baichuan.jinja ├── template_chatglm.jinja ├── template_chatglm2.jinja ├── template_chatml.jinja ├── template_falcon.jinja ├── template_falcon_180b.jinja ├── template_inkbot.jinja └── tensorize_vllm_model.py ├── format.sh ├── musa_porting.py ├── pyproject.toml ├── requirements-build.txt ├── requirements-common.txt ├── requirements-cpu.txt ├── requirements-cuda.txt ├── requirements-dev.txt ├── requirements-musa.txt ├── requirements-neuron.txt ├── requirements-rocm.txt ├── rocm_patch └── rocm_bf16.patch ├── setup.py ├── tests ├── __init__.py ├── async_engine │ ├── api_server_async_engine.py │ ├── test_api_server.py │ ├── test_async_llm_engine.py │ ├── test_chat_template.py │ ├── test_merge_async_iterators.py │ ├── test_openapi_server_ray.py │ └── test_request_tracker.py ├── basic_correctness │ ├── test_basic_correctness.py │ ├── test_chunked_prefill.py │ └── test_preemption.py ├── conftest.py ├── core │ ├── __init__.py │ ├── block │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── e2e │ │ │ ├── conftest.py │ │ │ └── test_correctness.py │ │ ├── test_block_manager_v2.py │ │ ├── test_block_table.py │ │ ├── test_common.py │ │ ├── test_cpu_gpu_block_allocator.py │ │ ├── test_naive_block.py │ │ └── test_prefix_caching_block.py │ ├── test_block_manager.py │ ├── test_chunked_prefill_scheduler.py │ ├── test_scheduler.py │ └── utils.py ├── distributed │ ├── test_basic_distributed_correctness.py │ ├── test_chunked_prefill_distributed.py │ ├── test_comm_ops.py │ ├── test_custom_all_reduce.py │ ├── test_pynccl.py │ └── test_pynccl_library.py ├── engine │ ├── output_processor │ │ └── test_multi_step.py │ ├── test_computed_prefix_blocks.py │ ├── test_detokenization.py │ ├── test_multiproc_workers.py │ ├── test_skip_tokenizer_init.py │ ├── test_stop_reason.py │ └── test_stop_strings.py ├── entrypoints │ ├── openai │ │ └── 
test_serving_chat.py │ ├── test_guided_processors.py │ ├── test_llm_generate.py │ ├── test_openai_server.py │ └── test_server_oot_registration.py ├── fp8_kv │ ├── llama2-70b-fp8-kv │ │ └── kv_cache_scales.json │ └── llama2-7b-fp8-kv │ │ └── kv_cache_scales.json ├── kernels │ ├── allclose_default.py │ ├── conftest.py │ ├── test_activation.py │ ├── test_attention.py │ ├── test_cache.py │ ├── test_layernorm.py │ ├── test_moe.py │ ├── test_pos_encoding.py │ ├── test_prefix_prefill.py │ ├── test_rand.py │ └── test_sampler.py ├── lora │ ├── __init__.py │ ├── conftest.py │ ├── test_baichuan.py │ ├── test_chatglm3.py │ ├── test_gemma.py │ ├── test_layer_variation.py │ ├── test_layers.py │ ├── test_llama.py │ ├── test_lora.py │ ├── test_lora_checkpoints.py │ ├── test_lora_manager.py │ ├── test_mixtral.py │ ├── test_punica.py │ ├── test_quant_model.py │ ├── test_tokenizer_group.py │ ├── test_utils.py │ ├── test_worker.py │ └── utils.py ├── metrics │ └── test_metrics.py ├── model_executor │ └── weight_utils.py ├── models │ ├── test_aqlm.py │ ├── test_big_models.py │ ├── test_fp8.py │ ├── test_gptq_marlin.py │ ├── test_llava.py │ ├── test_marlin.py │ ├── test_mistral.py │ ├── test_models.py │ ├── test_oot_registration.py │ └── utils.py ├── prefix_caching │ └── test_prefix_caching.py ├── prompts │ ├── example.txt │ └── summary.txt ├── quantization │ ├── test_configs.py │ └── test_fp8.py ├── samplers │ ├── test_beam_search.py │ ├── test_ignore_eos.py │ ├── test_logits_processor.py │ ├── test_logprobs.py │ ├── test_ranks.py │ ├── test_rejection_sampler.py │ ├── test_sampler.py │ └── test_seeded_generate.py ├── spec_decode │ ├── __init__.py │ ├── e2e │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── test_compatibility.py │ │ ├── test_logprobs.py │ │ ├── test_multistep_correctness.py │ │ └── test_ngram_correctness.py │ ├── test_batch_expansion.py │ ├── test_metrics.py │ ├── test_multi_step_worker.py │ ├── test_ngram_worker.py │ ├── test_spec_decode_worker.py │ ├── test_utils.py │ └── utils.py ├── tensorizer_loader │ ├── __init__.py │ ├── tensorize_vllm_model_for_testing.py │ └── test_tensorizer.py ├── test_cache_block_hashing.py ├── test_config.py ├── test_logger.py ├── test_logits_processor.py ├── test_regression.py ├── test_sampling_params.py ├── test_sequence.py ├── tokenization │ ├── __init__.py │ ├── test_cached_tokenizer.py │ ├── test_detokenize.py │ ├── test_tokenizer.py │ └── test_tokenizer_group.py └── worker │ ├── __init__.py │ ├── test_model_runner.py │ └── test_swap.py └── vllm ├── __init__.py ├── _custom_ops.py ├── attention ├── __init__.py ├── backends │ ├── __init__.py │ ├── abstract.py │ ├── flash_attn.py │ ├── flashinfer.py │ ├── rocm_flash_attn.py │ ├── torch_sdpa.py │ └── xformers.py ├── layer.py ├── ops │ ├── __init__.py │ ├── paged_attn.py │ ├── prefix_prefill.py │ └── triton_flash_attention.py └── selector.py ├── block.py ├── config.py ├── core ├── __init__.py ├── block │ ├── __init__.py │ ├── block_table.py │ ├── common.py │ ├── cpu_gpu_block_allocator.py │ ├── interfaces.py │ ├── naive_block.py │ └── prefix_caching_block.py ├── block_manager_v1.py ├── block_manager_v2.py ├── evictor_v1.py ├── evictor_v2.py ├── interfaces.py ├── policy.py └── scheduler.py ├── distributed ├── __init__.py ├── communication_op.py ├── device_communicators │ ├── __init__.py │ ├── custom_all_reduce.py │ ├── pymccl.py │ ├── pymccl_utils.py │ └── pynccl.py ├── parallel_state.py └── utils.py ├── engine ├── __init__.py ├── arg_utils.py ├── async_llm_engine.py ├── llm_engine.py ├── metrics.py └── output_processor 
│ ├── __init__.py │ ├── interfaces.py │ ├── multi_step.py │ ├── single_step.py │ ├── stop_checker.py │ └── util.py ├── entrypoints ├── __init__.py ├── api_server.py ├── llm.py └── openai │ ├── __init__.py │ ├── api_server.py │ ├── cli_args.py │ ├── protocol.py │ ├── serving_chat.py │ ├── serving_completion.py │ └── serving_engine.py ├── envs.py ├── executor ├── __init__.py ├── cpu_executor.py ├── distributed_gpu_executor.py ├── executor_base.py ├── gpu_executor.py ├── multiproc_worker_utils.py ├── neuron_executor.py ├── ray_gpu_executor.py └── ray_utils.py ├── logger.py ├── logging ├── __init__.py └── formatter.py ├── lora ├── __init__.py ├── fully_sharded_layers.py ├── layers.py ├── lora.py ├── models.py ├── punica.py ├── request.py ├── utils.py └── worker_manager.py ├── model_executor ├── __init__.py ├── guided_decoding │ ├── __init__.py │ ├── lm_format_enforcer_decoding.py │ ├── outlines_decoding.py │ └── outlines_logits_processors.py ├── layers │ ├── __init__.py │ ├── activation.py │ ├── fused_moe │ │ ├── __init__.py │ │ ├── configs │ │ │ ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ └── README │ │ └── fused_moe.py │ ├── layernorm.py │ ├── linear.py │ ├── logits_processor.py │ ├── ops │ │ ├── __init__.py │ │ ├── rand.py │ │ └── sample.py │ ├── quantization │ │ ├── __init__.py │ │ ├── aqlm.py │ │ ├── awq.py │ │ ├── base_config.py │ │ ├── fp8.py │ │ ├── gptq.py │ │ ├── gptq_marlin.py │ │ ├── marlin.py │ │ ├── schema.py │ │ └── squeezellm.py │ ├── rejection_sampler.py │ ├── rotary_embedding.py │ ├── sampler.py │ └── vocab_parallel_embedding.py ├── model_loader │ ├── __init__.py │ ├── loader.py │ ├── neuron.py │ ├── tensorizer.py │ ├── utils.py │ └── weight_utils.py ├── models │ ├── __init__.py │ ├── baichuan.py │ ├── bloom.py │ ├── chatglm.py │ ├── commandr.py │ ├── dbrx.py │ ├── decilm.py │ ├── deepseek.py │ ├── falcon.py │ ├── gemma.py │ ├── gpt2.py │ ├── gpt_bigcode.py │ ├── gpt_j.py │ ├── gpt_neox.py │ ├── internlm2.py │ ├── jais.py │ ├── llama.py │ ├── llava.py │ ├── minicpm.py │ ├── mixtral.py │ ├── mixtral_quant.py │ ├── mpt.py │ ├── olmo.py │ ├── opt.py │ ├── orion.py │ ├── phi.py │ ├── qwen.py │ ├── qwen2.py │ ├── qwen2_moe.py │ ├── stablelm.py │ ├── starcoder2.py │ └── xverse.py ├── sampling_metadata.py └── utils.py ├── outputs.py ├── py.typed ├── sampling_params.py ├── sequence.py ├── spec_decode ├── __init__.py ├── batch_expansion.py ├── interfaces.py ├── metrics.py ├── 
multi_step_worker.py ├── ngram_worker.py ├── spec_decode_worker.py ├── top1_proposer.py └── util.py ├── test_utils.py ├── transformers_utils ├── __init__.py ├── config.py ├── configs │ ├── __init__.py │ ├── chatglm.py │ ├── dbrx.py │ ├── falcon.py │ ├── jais.py │ └── mpt.py ├── detokenizer.py ├── tokenizer.py ├── tokenizer_group │ ├── __init__.py │ ├── base_tokenizer_group.py │ ├── ray_tokenizer_group.py │ └── tokenizer_group.py └── tokenizers │ ├── __init__.py │ └── baichuan.py ├── usage ├── __init__.py └── usage_lib.py ├── utils.py └── worker ├── __init__.py ├── cache_engine.py ├── cpu_model_runner.py ├── cpu_worker.py ├── model_runner.py ├── neuron_model_runner.py ├── neuron_worker.py ├── worker.py └── worker_base.py /.buildkite/check-wheel-size.py: -------------------------------------------------------------------------------- 1 | import os 2 | import zipfile 3 | 4 | MAX_SIZE_MB = 100 5 | 6 | 7 | def print_top_10_largest_files(zip_file): 8 | with zipfile.ZipFile(zip_file, 'r') as z: 9 | file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()] 10 | file_sizes.sort(key=lambda x: x[1], reverse=True) 11 | for f, size in file_sizes[:10]: 12 | print(f"{f}: {size/(1024*1024)} MBs uncompressed.") 13 | 14 | 15 | def check_wheel_size(directory): 16 | for root, _, files in os.walk(directory): 17 | for f in files: 18 | if f.endswith(".whl"): 19 | wheel_path = os.path.join(root, f) 20 | wheel_size = os.path.getsize(wheel_path) 21 | wheel_size_mb = wheel_size / (1024 * 1024) 22 | if wheel_size_mb > MAX_SIZE_MB: 23 | print( 24 | f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) " 25 | f"compare to the allowed size ({MAX_SIZE_MB} MB).") 26 | print_top_10_largest_files(wheel_path) 27 | return 1 28 | else: 29 | print(f"Wheel {wheel_path} is within the allowed size " 30 | f"({wheel_size_mb} MB).") 31 | return 0 32 | 33 | 34 | if __name__ == "__main__": 35 | import sys 36 | sys.exit(check_wheel_size(sys.argv[1])) 37 | -------------------------------------------------------------------------------- /.buildkite/download-images.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | set -o pipefail 5 | 6 | (which wget && which curl) || (apt-get update && apt-get install -y wget curl) 7 | 8 | # aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/ 9 | mkdir -p images 10 | cd images 11 | wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt 12 | wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt 13 | wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt 14 | wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt 15 | wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg 16 | wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg 17 | 18 | cd - 19 | -------------------------------------------------------------------------------- /.buildkite/run-amd-test.sh: -------------------------------------------------------------------------------- 1 | # This script build the ROCm docker image and runs test inside it. 
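# The test command to run inside the container is passed as the first positional
# argument and handed to `/bin/bash -c` at the bottom of this script.
# Example invocation (the test selection is illustrative only):
#   bash .buildkite/run-amd-test.sh "'pytest -v -s tests/basic_correctness'"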
2 | set -ex 3 | 4 | # Print ROCm version 5 | echo "--- ROCm info" 6 | rocminfo 7 | 8 | echo "--- Resetting GPUs" 9 | 10 | echo "reset" > /opt/amdgpu/etc/gpu_state 11 | 12 | while true; do 13 | sleep 3 14 | if grep -q clean /opt/amdgpu/etc/gpu_state; then 15 | echo "GPUs state is \"clean\"" 16 | break 17 | fi 18 | done 19 | 20 | echo "--- Building container" 21 | sha=$(git rev-parse --short HEAD) 22 | container_name=rocm_${sha} 23 | docker build \ 24 | -t ${container_name} \ 25 | -f Dockerfile.rocm \ 26 | --progress plain \ 27 | . 28 | 29 | remove_docker_container() { 30 | docker rm -f ${container_name} || docker image rm -f ${container_name} || true 31 | } 32 | trap remove_docker_container EXIT 33 | 34 | echo "--- Running container" 35 | 36 | docker run \ 37 | --device /dev/kfd --device /dev/dri \ 38 | --network host \ 39 | --rm \ 40 | -e HF_TOKEN \ 41 | --name ${container_name} \ 42 | ${container_name} \ 43 | /bin/bash -c $(echo $1 | sed "s/^'//" | sed "s/'$//") 44 | 45 | -------------------------------------------------------------------------------- /.buildkite/run-cpu-test.sh: -------------------------------------------------------------------------------- 1 | # This script build the CPU docker image and run the offline inference inside the container. 2 | # It serves a sanity check for compilation and basic model usage. 3 | set -ex 4 | 5 | # Try building the docker image 6 | docker build -t cpu-test -f Dockerfile.cpu . 7 | 8 | # Setup cleanup 9 | remove_docker_container() { docker rm -f cpu-test || true; } 10 | trap remove_docker_container EXIT 11 | remove_docker_container 12 | 13 | # Run the image and launch offline inference 14 | docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 examples/offline_inference.py 15 | -------------------------------------------------------------------------------- /.buildkite/run-neuron-test.sh: -------------------------------------------------------------------------------- 1 | # This script build the Neuron docker image and run the API server inside the container. 2 | # It serves a sanity check for compilation and basic model usage. 3 | set -e 4 | 5 | # Try building the docker image 6 | aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com 7 | 8 | # prune old image and containers to save disk space, and only once a day 9 | # by using a timestamp file in tmp. 10 | if [ -f /tmp/neuron-docker-build-timestamp ]; then 11 | last_build=$(cat /tmp/neuron-docker-build-timestamp) 12 | current_time=$(date +%s) 13 | if [ $((current_time - last_build)) -gt 86400 ]; then 14 | docker system prune -f 15 | echo $current_time > /tmp/neuron-docker-build-timestamp 16 | fi 17 | else 18 | echo $(date +%s) > /tmp/neuron-docker-build-timestamp 19 | fi 20 | 21 | docker build -t neuron -f Dockerfile.neuron . 
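# The "neuron" image built above is run below with two Neuron devices exposed
# (/dev/neuron0 and /dev/neuron1), matching --tensor-parallel-size 2 in the server command.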
22 | 23 | # Setup cleanup 24 | remove_docker_container() { docker rm -f neuron || true; } 25 | trap remove_docker_container EXIT 26 | remove_docker_container 27 | 28 | # Run the image 29 | docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \ 30 | --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 & 31 | 32 | # Wait for the server to start 33 | wait_for_server_to_start() { 34 | timeout=300 35 | counter=0 36 | 37 | while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do 38 | sleep 1 39 | counter=$((counter + 1)) 40 | if [ $counter -ge $timeout ]; then 41 | echo "Timeout after $timeout seconds" 42 | break 43 | fi 44 | done 45 | } 46 | wait_for_server_to_start 47 | 48 | # Test a simple prompt 49 | curl -X POST -H "Content-Type: application/json" \ 50 | localhost:8000/generate \ 51 | -d '{"prompt": "San Francisco is a"}' 52 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | vllm/*.so 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/100-documentation.yml: -------------------------------------------------------------------------------- 1 | name: 📚 Documentation 2 | description: Report an issue related to https://docs.vllm.ai/ 3 | title: "[Doc]: " 4 | labels: ["documentation"] 5 | 6 | body: 7 | - type: textarea 8 | attributes: 9 | label: 📚 The doc issue 10 | description: > 11 | A clear and concise description of what content in https://docs.vllm.ai/ is an issue. 12 | validations: 13 | required: true 14 | - type: textarea 15 | attributes: 16 | label: Suggest a potential alternative/fix 17 | description: > 18 | Tell us how we could improve the documentation in this regard. 19 | - type: markdown 20 | attributes: 21 | value: > 22 | Thanks for contributing 🎉! 23 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/200-installation.yml: -------------------------------------------------------------------------------- 1 | name: 🛠️ Installation 2 | description: Report an issue here when you hit errors during installation. 3 | title: "[Installation]: " 4 | labels: ["installation"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | - type: textarea 12 | attributes: 13 | label: Your current environment 14 | description: | 15 | Please run the following and paste the output below. 16 | ```sh 17 | wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py 18 | # For security purposes, please feel free to check the contents of collect_env.py before running it. 19 | python collect_env.py 20 | ``` 21 | It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues. 
22 | value: | 23 | ```text 24 | The output of `python collect_env.py` 25 | ``` 26 | validations: 27 | required: true 28 | - type: textarea 29 | attributes: 30 | label: How you are installing vllm 31 | description: | 32 | Paste the full command you are trying to execute. 33 | value: | 34 | ```sh 35 | pip install -vvv vllm 36 | ``` 37 | - type: markdown 38 | attributes: 39 | value: > 40 | Thanks for contributing 🎉! 41 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/300-usage.yml: -------------------------------------------------------------------------------- 1 | name: 💻 Usage 2 | description: Raise an issue here if you don't know how to use vllm. 3 | title: "[Usage]: " 4 | labels: ["usage"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | - type: textarea 12 | attributes: 13 | label: Your current environment 14 | description: | 15 | Please run the following and paste the output below. 16 | ```sh 17 | wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py 18 | # For security purposes, please feel free to check the contents of collect_env.py before running it. 19 | python collect_env.py 20 | ``` 21 | It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues. 22 | value: | 23 | ```text 24 | The output of `python collect_env.py` 25 | ``` 26 | validations: 27 | required: true 28 | - type: textarea 29 | attributes: 30 | label: How would you like to use vllm 31 | description: | 32 | A detailed description of how you want to use vllm. 33 | value: | 34 | I want to run inference of a [specific model](put link here). I don't know how to integrate it with vllm. 35 | - type: markdown 36 | attributes: 37 | value: > 38 | Thanks for contributing 🎉! 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/500-feature request.yml: -------------------------------------------------------------------------------- 1 | name: 🚀 Feature request 2 | description: Submit a proposal/request for a new vllm feature 3 | title: "[Feature]: " 4 | labels: ["feature request"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | - type: textarea 12 | attributes: 13 | label: 🚀 The feature, motivation and pitch 14 | description: > 15 | A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too. 16 | validations: 17 | required: true 18 | - type: textarea 19 | attributes: 20 | label: Alternatives 21 | description: > 22 | A description of any alternative solutions or features you've considered, if any. 23 | - type: textarea 24 | attributes: 25 | label: Additional context 26 | description: > 27 | Add any other context or screenshots about the feature request. 
28 | - type: markdown 29 | attributes: 30 | value: > 31 | Thanks for contributing 🎉! 32 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/600-new model.yml: -------------------------------------------------------------------------------- 1 | name: 🤗 Support request for a new model from huggingface 2 | description: Submit a proposal/request for a new model from huggingface 3 | title: "[New Model]: " 4 | labels: ["new model"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | 12 | #### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model. 13 | - type: textarea 14 | attributes: 15 | label: The model to consider. 16 | description: > 17 | A huggingface url, pointing to the model, e.g. https://huggingface.co/openai-community/gpt2 . 18 | validations: 19 | required: true 20 | - type: textarea 21 | attributes: 22 | label: The closest model vllm already supports. 23 | description: > 24 | Here is the list of models already supported by vllm: https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models . Which model is the most similar to the model you want to add support for? 25 | - type: textarea 26 | attributes: 27 | label: What's your difficulty of supporting the model you want? 28 | description: > 29 | For example, any new operators or new architecture? 30 | - type: markdown 31 | attributes: 32 | value: > 33 | Thanks for contributing 🎉! 34 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/700-performance discussion.yml: -------------------------------------------------------------------------------- 1 | name: ⚡ Discussion on the performance of vllm 2 | description: Submit a proposal/discussion about the performance of vllm 3 | title: "[Performance]: " 4 | labels: ["performance"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | - type: textarea 12 | attributes: 13 | label: Proposal to improve performance 14 | description: > 15 | How do you plan to improve vllm's performance? 16 | validations: 17 | required: false 18 | - type: textarea 19 | attributes: 20 | label: Report of performance regression 21 | description: > 22 | Please provide detailed description of performance comparison to confirm the regression. You may want to run the benchmark script at https://github.com/vllm-project/vllm/tree/main/benchmarks . 23 | validations: 24 | required: false 25 | - type: textarea 26 | attributes: 27 | label: Misc discussion on performance 28 | description: > 29 | Anything about the performance. 30 | validations: 31 | required: false 32 | - type: textarea 33 | attributes: 34 | label: Your current environment (if you think it is necessary) 35 | description: | 36 | Please run the following and paste the output below. 37 | ```sh 38 | wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py 39 | # For security purposes, please feel free to check the contents of collect_env.py before running it. 
40 | python collect_env.py 41 | ``` 42 | It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues. 43 | value: | 44 | ```text 45 | The output of `python collect_env.py` 46 | ``` 47 | validations: 48 | required: false 49 | - type: markdown 50 | attributes: 51 | value: > 52 | Thanks for contributing 🎉! 53 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/750-RFC.yml: -------------------------------------------------------------------------------- 1 | name: 💬 Request for comments (RFC). 2 | description: Ask for feedback on major architectural changes or design choices. 3 | title: "[RFC]: " 4 | labels: ["RFC"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Please take a look at previous [RFCs](https://github.com/vllm-project/vllm/issues?q=label%3ARFC+sort%3Aupdated-desc) for reference. 11 | - type: textarea 12 | attributes: 13 | label: Motivation. 14 | description: > 15 | The motivation of the RFC. 16 | validations: 17 | required: true 18 | - type: textarea 19 | attributes: 20 | label: Proposed Change. 21 | description: > 22 | The proposed change of the RFC. 23 | validations: 24 | required: true 25 | - type: textarea 26 | attributes: 27 | label: Feedback Period. 28 | description: > 29 | The feedback period of the RFC. Usually at least one week. 30 | validations: 31 | required: false 32 | - type: textarea 33 | attributes: 34 | label: CC List. 35 | description: > 36 | The list of people you want to CC. 37 | validations: 38 | required: false 39 | - type: textarea 40 | attributes: 41 | label: Any Other Things. 42 | description: > 43 | Any other things you would like to mention. 44 | validations: 45 | required: false 46 | - type: markdown 47 | attributes: 48 | value: > 49 | Thanks for contributing 🎉! 50 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/800-misc discussion.yml: -------------------------------------------------------------------------------- 1 | name: 🎲 Misc/random discussions that do not fit into the above categories. 2 | description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues. 3 | title: "[Misc]: " 4 | labels: ["misc"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | - type: textarea 12 | attributes: 13 | label: Anything you want to discuss about vllm. 14 | description: > 15 | Anything you want to discuss about vllm. 16 | validations: 17 | required: true 18 | - type: markdown 19 | attributes: 20 | value: > 21 | Thanks for contributing 🎉! 
22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | -------------------------------------------------------------------------------- /.github/workflows/mypy.yaml: -------------------------------------------------------------------------------- 1 | name: mypy 2 | 3 | on: 4 | # Trigger the workflow on push or pull request, 5 | # but only for the main branch 6 | push: 7 | branches: 8 | - main 9 | pull_request: 10 | branches: 11 | - main 12 | 13 | jobs: 14 | ruff: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: ["3.8", "3.9", "3.10", "3.11"] 19 | steps: 20 | - uses: actions/checkout@v2 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v2 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install mypy==1.9.0 29 | pip install types-setuptools 30 | pip install types-PyYAML 31 | pip install types-requests 32 | pip install types-setuptools 33 | - name: Mypy 34 | run: | 35 | mypy vllm/attention --config-file pyproject.toml 36 | mypy vllm/core --config-file pyproject.toml 37 | mypy vllm/distributed --config-file pyproject.toml 38 | mypy vllm/entrypoints --config-file pyproject.toml 39 | mypy vllm/executor --config-file pyproject.toml 40 | mypy vllm/usage --config-file pyproject.toml 41 | mypy vllm/*.py --config-file pyproject.toml 42 | mypy vllm/transformers_utils --config-file pyproject.toml 43 | mypy vllm/engine --config-file pyproject.toml 44 | mypy vllm/worker --config-file pyproject.toml 45 | mypy vllm/spec_decode --config-file pyproject.toml 46 | mypy vllm/model_executor --config-file pyproject.toml 47 | mypy vllm/lora --config-file pyproject.toml 48 | mypy vllm/logging --config-file pyproject.toml 49 | mypy vllm/model_executor --config-file pyproject.toml 50 | 51 | -------------------------------------------------------------------------------- /.github/workflows/ruff.yml: -------------------------------------------------------------------------------- 1 | name: ruff 2 | 3 | on: 4 | # Trigger the workflow on push or pull request, 5 | # but only for the main branch 6 | push: 7 | branches: 8 | - main 9 | pull_request: 10 | branches: 11 | - main 12 | 13 | jobs: 14 | ruff: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: ["3.8", "3.9", "3.10", "3.11"] 19 | steps: 20 | - uses: actions/checkout@v2 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v2 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2 29 | - name: Analysing the code with ruff 30 | run: | 31 | ruff . 32 | - name: Spelling check with codespell 33 | run: | 34 | codespell --toml pyproject.toml 35 | - name: Run isort 36 | run: | 37 | isort . 
--check-only 38 | -------------------------------------------------------------------------------- /.github/workflows/scripts/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python_executable=python$1 4 | cuda_home=/usr/local/cuda-$2 5 | 6 | # Update paths 7 | PATH=${cuda_home}/bin:$PATH 8 | LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH 9 | 10 | # Install requirements 11 | $python_executable -m pip install wheel packaging 12 | $python_executable -m pip install -r requirements-cuda.txt 13 | 14 | # Limit the number of parallel jobs to avoid OOM 15 | export MAX_JOBS=1 16 | # Make sure punica is built for the release (for LoRA) 17 | export VLLM_INSTALL_PUNICA_KERNELS=1 18 | # Make sure release wheels are built for the following architectures 19 | export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" 20 | # Build 21 | $python_executable setup.py bdist_wheel --dist-dir=dist 22 | -------------------------------------------------------------------------------- /.github/workflows/scripts/create_release.js: -------------------------------------------------------------------------------- 1 | // Uses Github's API to create the release and wait for result. 2 | // We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately. 3 | 4 | module.exports = async (github, context, core) => { 5 | try { 6 | const response = await github.rest.repos.createRelease({ 7 | draft: false, 8 | generate_release_notes: true, 9 | name: process.env.RELEASE_TAG, 10 | owner: context.repo.owner, 11 | prerelease: true, 12 | repo: context.repo.repo, 13 | tag_name: process.env.RELEASE_TAG, 14 | }); 15 | 16 | core.setOutput('upload_url', response.data.upload_url); 17 | } catch (error) { 18 | core.setFailed(error.message); 19 | } 20 | } -------------------------------------------------------------------------------- /.github/workflows/scripts/cuda-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Replace '.' with '-' ex: 11.8 -> 11-8 4 | cuda_version=$(echo $1 | tr "." "-") 5 | # Removes '-' and '.' 
ex: ubuntu-20.04 -> ubuntu2004 6 | OS=$(echo $2 | tr -d ".\-") 7 | 8 | # Installs CUDA 9 | wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb 10 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 11 | rm cuda-keyring_1.1-1_all.deb 12 | sudo apt -qq update 13 | sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version} 14 | sudo apt clean 15 | 16 | # Test nvcc 17 | PATH=/usr/local/cuda-$1/bin:${PATH} 18 | nvcc --version 19 | 20 | # Log gcc, g++, c++ versions 21 | gcc --version 22 | g++ --version 23 | c++ --version 24 | -------------------------------------------------------------------------------- /.github/workflows/scripts/env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This file installs common linux environment tools 4 | 5 | export LANG C.UTF-8 6 | 7 | # python_version=$1 8 | 9 | sudo apt-get update && \ 10 | sudo apt-get install -y --no-install-recommends \ 11 | software-properties-common \ 12 | 13 | sudo apt-get install -y --no-install-recommends \ 14 | build-essential \ 15 | apt-utils \ 16 | ca-certificates \ 17 | wget \ 18 | git \ 19 | vim \ 20 | libssl-dev \ 21 | curl \ 22 | unzip \ 23 | unrar \ 24 | cmake \ 25 | net-tools \ 26 | sudo \ 27 | autotools-dev \ 28 | rsync \ 29 | jq \ 30 | openssh-server \ 31 | tmux \ 32 | screen \ 33 | htop \ 34 | pdsh \ 35 | openssh-client \ 36 | lshw \ 37 | dmidecode \ 38 | util-linux \ 39 | automake \ 40 | autoconf \ 41 | libtool \ 42 | net-tools \ 43 | pciutils \ 44 | libpci-dev \ 45 | libaio-dev \ 46 | libcap2 \ 47 | libtinfo5 \ 48 | fakeroot \ 49 | devscripts \ 50 | debhelper \ 51 | nfs-common 52 | 53 | # Remove github bloat files to free up disk space 54 | sudo rm -rf "/usr/local/share/boost" 55 | sudo rm -rf "$AGENT_TOOLSDIRECTORY" 56 | sudo rm -rf "/usr/share/dotnet" 57 | -------------------------------------------------------------------------------- /.github/workflows/scripts/pytorch-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python_executable=python$1 4 | pytorch_version=$2 5 | cuda_version=$3 6 | 7 | # Install torch 8 | $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya 9 | $python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./} 10 | 11 | # Print version information 12 | $python_executable --version 13 | $python_executable -c "import torch; print('PyTorch:', torch.__version__)" 14 | $python_executable -c "import torch; print('CUDA:', torch.version.cuda)" 15 | $python_executable -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)" 16 | -------------------------------------------------------------------------------- /.github/workflows/yapf.yml: -------------------------------------------------------------------------------- 1 | name: yapf 2 | 3 | on: 4 | # Trigger the workflow on push or pull request, 5 | # but only for the main branch 6 | push: 7 | branches: 8 | - main 9 | pull_request: 10 | branches: 11 | - main 12 | jobs: 13 | yapf: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: ["3.8", "3.9", "3.10", "3.11"] 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v2 22 | with: 
23 | python-version: ${{ matrix.python-version }} 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install yapf==0.32.0 28 | pip install toml==0.10.2 29 | - name: Running yapf 30 | run: | 31 | yapf --diff --recursive . 32 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | version: 2 5 | 6 | build: 7 | os: ubuntu-22.04 8 | tools: 9 | python: "3.8" 10 | 11 | sphinx: 12 | configuration: docs/source/conf.py 13 | 14 | # If using Sphinx, optionally build your docs in additional formats such as PDF 15 | formats: 16 | - pdf 17 | 18 | # Optionally declare the Python requirements required to build your docs 19 | python: 20 | install: 21 | - requirements: docs/requirements-docs.txt 22 | -------------------------------------------------------------------------------- /.yapfignore: -------------------------------------------------------------------------------- 1 | collect_env.py 2 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to vLLM 2 | 3 | Thank you for your interest in contributing to vLLM! 4 | Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. 5 | There are several ways you can contribute to the project: 6 | 7 | - Identify and report any issues or bugs. 8 | - Request or add a new model. 9 | - Suggest or implement new features. 10 | 11 | However, remember that contributions aren't just about code. 12 | We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions. 13 | 14 | Finally, one of the most impactful ways to support us is by raising awareness about vLLM. 15 | Talk about it in your blog posts, highlighting how it's driving your incredible projects. 16 | Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository. 17 | 18 | 19 | ## Setup for development 20 | 21 | ### Build from source 22 | 23 | ```bash 24 | pip install -e . # This may take several minutes. 25 | ``` 26 | 27 | ### Testing 28 | 29 | ```bash 30 | pip install -r requirements-dev.txt 31 | 32 | # linting and formatting 33 | bash format.sh 34 | # Static type checking 35 | mypy 36 | # Unit tests 37 | pytest tests/ 38 | ``` 39 | **Note:** Currently, the repository does not pass the mypy tests. 40 | 41 | 42 | ## Contributing Guidelines 43 | 44 | ### Issue Reporting 45 | 46 | If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it. 47 | If not, please file a new issue, providing as much relevant information as possible. 48 | 49 | ### Pull Requests & Code Reviews 50 | 51 | Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE.md) for detailed guide for contribution. 52 | 53 | ### Thank You 54 | 55 | Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. 56 | Your contributions make vLLM a great tool for everyone! 
57 | -------------------------------------------------------------------------------- /Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | # This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform. 2 | 3 | FROM ubuntu:22.04 4 | 5 | RUN apt-get update -y \ 6 | && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \ 7 | && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 8 | 9 | RUN pip install --upgrade pip \ 10 | && pip install wheel packaging ninja setuptools>=49.4.0 numpy 11 | 12 | COPY ./ /workspace/vllm 13 | 14 | WORKDIR /workspace/vllm 15 | 16 | RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu 17 | 18 | RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install 19 | 20 | CMD ["/bin/bash"] 21 | -------------------------------------------------------------------------------- /Dockerfile.neuron: -------------------------------------------------------------------------------- 1 | # default base image 2 | ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04" 3 | 4 | FROM $BASE_IMAGE 5 | 6 | RUN echo "Base image is $BASE_IMAGE" 7 | 8 | # Install some basic utilities 9 | RUN apt-get update && apt-get install python3 python3-pip -y 10 | 11 | ### Mount Point ### 12 | # When launching the container, mount the code directory to /app 13 | ARG APP_MOUNT=/app 14 | VOLUME [ ${APP_MOUNT} ] 15 | WORKDIR ${APP_MOUNT} 16 | 17 | RUN python3 -m pip install --upgrade pip 18 | RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas 19 | RUN python3 -m pip install sentencepiece transformers==4.36.2 -U 20 | RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U 21 | RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U 22 | 23 | COPY ./vllm /app/vllm/vllm 24 | COPY ./setup.py /app/vllm/setup.py 25 | COPY ./requirements-common.txt /app/vllm/requirements-common.txt 26 | COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt 27 | 28 | RUN cd /app/vllm \ 29 | && python3 -m pip install -U -r requirements-neuron.txt 30 | 31 | ENV VLLM_BUILD_WITH_NEURON 1 32 | RUN cd /app/vllm \ 33 | && pip install -e . \ 34 | && cd .. 
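# Example build/run of this image (mirrors .buildkite/run-neuron-test.sh; the model and flags are illustrative only):
#   docker build -t neuron -f Dockerfile.neuron .
#   docker run --device=/dev/neuron0 --network host neuron python3 -m vllm.entrypoints.api_server --model <model> --device neuron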
35 | 36 | CMD ["/bin/bash"] 37 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include requirements-common.txt 3 | include requirements-cuda.txt 4 | include requirements-rocm.txt 5 | include requirements-neuron.txt 6 | include requirements-cpu.txt 7 | include CMakeLists.txt 8 | 9 | recursive-include cmake * 10 | recursive-include csrc * 11 | -------------------------------------------------------------------------------- /README_vllm_musa.md: -------------------------------------------------------------------------------- 1 | # vllm_musa 2 | 3 | Moore Threads is committed to building a complete and easy-to-use domestic GPU application ecosystem, and has independently developed the MUSA architecture and software platform. vLLM is an inference and serving engine for large language models that is widely used in the industry and relies on CUDA/ROCm for GPU acceleration. To make the vLLM framework easy to use for Moore Threads GPU users, we started the vllm_musa open-source project, which adds MUSA acceleration to vLLM so that users can unleash the full compute power of Moore Threads GPUs. 4 | 5 | The existing vLLM code does not support Moore Threads GPUs as a backend, so we added a new MUSA device backend. The vllm_musa API is identical to the official one: no changes to application code are needed, and it works out of the box. 6 | 7 | A major advantage of MUSA is its CUDA compatibility. With the musify tool we can quickly port the official code to the MUSA software stack, and users can follow the documentation to upgrade to a newer vLLM version and adapt it to the MUSA stack themselves. 8 | 9 | ## Dependencies 10 | 11 | - musa_toolkit >= dev3.0.0 12 | - pytorch >= v2.2.0 13 | - [torch_musa](https://github.com/MooreThreads/torch_musa) >= v1.3.0 14 | - triton >= v2.2.0 15 | - ray >= 2.9 16 | - vllm v0.4.2 17 | 18 | ## Usage 19 | ### Build 20 | Run `bash build_musa.sh` 21 | ### Test example 22 | ``` 23 | from vllm import LLM, SamplingParams 24 | from transformers import AutoTokenizer, LlamaForCausalLM 25 | import transformers 26 | import time 27 | import torch 28 | import torch_musa 29 | 30 | 31 | model_path = 32 | 33 | prompts = [ 34 | "Hello, my name is", 35 | "The president of the United States is", 36 | "The capital of France is", 37 | "The future of AI is", 38 | ] 39 | 40 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 41 | llm = LLM(model=model_path, trust_remote_code=True, device="musa") 42 | 43 | outputs = llm.generate(prompts, sampling_params) 44 | 45 | # Print the outputs. 46 | for output in outputs: 47 | prompt = output.prompt 48 | generated_text = output.outputs[0].text 49 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 50 | 51 | ``` 52 | 53 | ## Porting 54 | 55 | This repository is ported from vLLM v0.4.2. If you want to use a newer version of vLLM, simply run `musa_porting.py` to adapt the native CUDA code to MUSA code. As vLLM evolves, some code may slip through and fail to be ported; in that case you can modify the text-replacement rules in `musa_porting.py` yourself and take full advantage of MUSA's strong CUDA compatibility. 56 | 57 | ### Steps 58 | 1. Run `python musa_porting.py` 59 | 2. In `CMakeLists.txt`, change the suffix of the files to be compiled from `.cu` to `.mu` 60 | 3. Build and run vllm_musa
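For reference, the steps above can be driven from the repository root roughly as follows (a minimal sketch: the `sed` one-liner for step 2 is only an assumed convenience, and the suffix change can just as well be made by hand):

```bash
# Step 1: apply the text-replacement rules to port the CUDA sources to MUSA.
python musa_porting.py
# Step 2: switch the compiled sources in CMakeLists.txt from .cu to .mu
# (illustrative sed; review the result before building).
sed -i 's/\.cu\b/.mu/g' CMakeLists.txt
# Step 3: build and install the MUSA wheel.
bash build_musa.sh
```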
61 | 62 | ## Contributing 63 | 64 | Users and developers are welcome to use vllm_musa and send feedback to help keep improving its functionality and performance. 65 | 66 | We hope to build the MUSA software ecosystem together with the wider developer community, and we will continue to release a series of open-source MUSA acceleration projects. -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking vLLM 2 | 3 | ## Downloading the ShareGPT dataset 4 | 5 | You can download the dataset by running: 6 | ```bash 7 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json 8 | ``` 9 | -------------------------------------------------------------------------------- /benchmarks/launch_tgi_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PORT=8000 4 | MODEL=$1 5 | TOKENS=$2 6 | 7 | docker run --gpus all --shm-size 1g -p $PORT:80 \ 8 | -v $PWD/data:/data \ 9 | ghcr.io/huggingface/text-generation-inference:1.4.0 \ 10 | --model-id $MODEL \ 11 | --sharded false \ 12 | --max-input-length 1024 \ 13 | --max-total-tokens 2048 \ 14 | --max-best-of 5 \ 15 | --max-concurrent-requests 5000 \ 16 | --max-batch-total-tokens $TOKENS 17 | -------------------------------------------------------------------------------- /build_musa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | pip install -r requirements-build.txt 7 | pip install -r requirements-musa.txt 8 | 9 | export VLLM_TARGET_DEVICE=musa 10 | export CMAKE_BUILD_TYPE=Debug 11 | export VERBOSE=1 12 | export VLLM_ATTENTION_BACKEND=FLASH_ATTN 13 | 14 | rm -rf build 15 | rm -rf dist 16 | rm -rf vllm.egg-info 17 | pip uninstall -y vllm 18 | 19 | python setup.py bdist_wheel 20 | pip install dist/* -------------------------------------------------------------------------------- /csrc_musa/attention/attention_dtypes.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "attention_generic.muh" 4 | #include "dtype_float16.muh" 5 | #include "dtype_float32.muh" 6 | #include "dtype_bfloat16.muh" 7 | #include "dtype_fp8.muh" 8 | -------------------------------------------------------------------------------- /csrc_musa/attention/attention_generic.muh: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h 3 | * Copyright (c) 2024 - 2024 Moore Threads Technology Co., Ltd("Moore Threads"). All rights reserved. 4 | * Copyright (c) 2023, The vLLM team. 5 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 6 | * 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | #pragma once 20 | 21 | #include <stdint.h> 22 | 23 | namespace vllm { 24 |
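// Note: this header only declares empty primary templates. The per-dtype
// specializations of Vec/FloatVec and of the arithmetic helpers below live in
// dtype_float16.muh, dtype_bfloat16.muh, dtype_float32.muh and dtype_fp8.muh,
// which are pulled in together through attention_dtypes.h.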
25 | // A vector type to store Q, K, V elements. 26 | template<typename T, int VEC_SIZE> 27 | struct Vec {}; 28 | 29 | // A vector type to store FP32 accumulators. 30 | template<typename T> 31 | struct FloatVec {}; 32 | 33 | // Template vector operations. 34 | template<typename Acc, typename A, typename B> 35 | inline __device__ Acc mul(A a, B b); 36 | 37 | template<typename T> 38 | inline __device__ float sum(T v); 39 | 40 | template<typename T> 41 | inline __device__ float dot(T a, T b) { 42 | return sum(mul<T, T, T>(a, b)); 43 | } 44 | 45 | template<typename A, typename T> 46 | inline __device__ float dot(T a, T b) { 47 | return sum(mul<A, T, T>(a, b)); 48 | } 49 | 50 | template<typename T> 51 | inline __device__ void zero(T& dst) { 52 | constexpr int WORDS = sizeof(T) / 4; 53 | union { 54 | T raw; 55 | uint32_t words[WORDS]; 56 | } tmp; 57 | 58 | #pragma unroll 59 | for (int ii = 0; ii < WORDS; ++ii) { 60 | tmp.words[ii] = 0u; 61 | } 62 | dst = tmp.raw; 63 | } 64 | 65 | } // namespace vllm 66 | -------------------------------------------------------------------------------- /csrc_musa/attention/attention_utils.muh: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp 3 | * Copyright (c) 2024 - 2024 Moore Threads Technology Co., Ltd("Moore Threads"). All rights reserved. 4 | * Copyright (c) 2023, The vLLM team. 5 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 6 | * 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | #pragma once 20 | 21 | #include "../musa_compat.h" 22 | #include "attention_dtypes.h" 23 | 24 | #include <float.h> 25 | #include <type_traits> 26 | 27 | namespace vllm { 28 | 29 | // Q*K^T operation. 30 | template<int THREAD_GROUP_SIZE, typename Vec, int N> 31 | inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) { 32 | using A_vec = typename FloatVec<Vec>::Type; 33 | // Compute the parallel products for Q*K^T (treat vector lanes separately). 34 | A_vec qk_vec = mul<A_vec, Vec, Vec>(q[0], k[0]); 35 | #pragma unroll 36 | for (int ii = 1; ii < N; ++ii) { 37 | qk_vec = fma(q[ii], k[ii], qk_vec); 38 | } 39 | 40 | // Finalize the reduction across lanes.
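// sum() first collapses the lanes of qk_vec into one scalar per thread; the
// XOR shuffle below then adds those scalars across the THREAD_GROUP_SIZE
// threads cooperating on this token (e.g. masks 2 and 1 for a group of 4),
// so every thread in the group ends up holding the complete Q*K^T value.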
41 | float qk = sum(qk_vec); 42 | #pragma unroll 43 | for (int mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) { 44 | qk += VLLM_SHFL_XOR_SYNC(qk, mask); 45 | } 46 | return qk; 47 | } 48 | 49 | template 50 | struct Qk_dot { 51 | template 52 | static inline __device__ float dot(const Vec (&q)[N], const Vec (&k)[N]) { 53 | return qk_dot_(q, k); 54 | } 55 | }; 56 | 57 | } // namespace vllm 58 | -------------------------------------------------------------------------------- /csrc_musa/attention/dtype_fp8.muh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "attention_generic.muh" 4 | 5 | #include 6 | #ifdef ENABLE_FP8_E5M2 7 | #include 8 | #endif 9 | 10 | namespace vllm { 11 | #if defined(ENABLE_FP8_E5M2) || defined(ENABLE_FP8_E4M3) 12 | // fp8 vector types for quantization of kv cache 13 | 14 | template<> 15 | struct Vec { 16 | using Type = uint8_t; 17 | }; 18 | 19 | template<> 20 | struct Vec { 21 | using Type = uint16_t; 22 | }; 23 | 24 | template<> 25 | struct Vec { 26 | using Type = uint32_t; 27 | }; 28 | 29 | template<> 30 | struct Vec { 31 | using Type = uint2; 32 | }; 33 | #endif // ENABLE_FP8_E5M2 34 | 35 | } // namespace vllm 36 | -------------------------------------------------------------------------------- /csrc_musa/cache.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | void swap_blocks( 9 | torch::Tensor& src, 10 | torch::Tensor& dst, 11 | const std::map& block_mapping); 12 | 13 | void copy_blocks( 14 | std::vector& key_caches, 15 | std::vector& value_caches, 16 | const std::map>& block_mapping); 17 | 18 | void reshape_and_cache( 19 | torch::Tensor& key, 20 | torch::Tensor& value, 21 | torch::Tensor& key_cache, 22 | torch::Tensor& value_cache, 23 | torch::Tensor& slot_mapping, 24 | const std::string& kv_cache_dtype, 25 | const float kv_scale); 26 | 27 | void reshape_and_cache_flash( 28 | torch::Tensor& key, 29 | torch::Tensor& value, 30 | torch::Tensor& key_cache, 31 | torch::Tensor& value_cache, 32 | torch::Tensor& slot_mapping, 33 | const std::string& kv_cache_dtype); 34 | 35 | // Just for unittest 36 | void convert_fp8( 37 | torch::Tensor& src_cache, 38 | torch::Tensor& dst_cache); 39 | -------------------------------------------------------------------------------- /csrc_musa/cpu/pybind.cpp: -------------------------------------------------------------------------------- 1 | #include "cache.h" 2 | #include "cuda_utils.h" 3 | #include "ops.h" 4 | #include 5 | 6 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 7 | // vLLM custom ops 8 | pybind11::module ops = m.def_submodule("ops", "vLLM custom operators"); 9 | 10 | // Attention ops 11 | ops.def( 12 | "paged_attention_v1", 13 | &paged_attention_v1, 14 | "Compute the attention between an input query and the cached keys/values using PagedAttention."); 15 | ops.def( 16 | "paged_attention_v2", 17 | &paged_attention_v2, 18 | "PagedAttention V2."); 19 | 20 | // Activation ops 21 | ops.def( 22 | "silu_and_mul", 23 | &silu_and_mul, 24 | "Activation function used in SwiGLU."); 25 | ops.def( 26 | "gelu_and_mul", 27 | &gelu_and_mul, 28 | "Activation function used in GeGLU with `none` approximation."); 29 | ops.def( 30 | "gelu_tanh_and_mul", 31 | &gelu_tanh_and_mul, 32 | "Activation function used in GeGLU with `tanh` approximation."); 33 | ops.def( 34 | "gelu_new", 35 | &gelu_new, 36 | "GELU implementation used in GPT-2."); 37 | ops.def( 38 | "gelu_fast", 39 | 
&gelu_fast, 40 | "Approximate GELU implementation."); 41 | 42 | // Layernorm 43 | ops.def( 44 | "rms_norm", 45 | &rms_norm, 46 | "Apply Root Mean Square (RMS) Normalization to the input tensor."); 47 | 48 | ops.def( 49 | "fused_add_rms_norm", 50 | &fused_add_rms_norm, 51 | "In-place fused Add and RMS Normalization"); 52 | 53 | // Rotary embedding 54 | ops.def( 55 | "rotary_embedding", 56 | &rotary_embedding, 57 | "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); 58 | 59 | // Cache ops 60 | pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops"); 61 | cache_ops.def( 62 | "swap_blocks", 63 | &swap_blocks, 64 | "Swap in (out) the cache blocks from src to dst"); 65 | cache_ops.def( 66 | "copy_blocks", 67 | ©_blocks, 68 | "Copy the cache blocks from src to dst"); 69 | cache_ops.def( 70 | "reshape_and_cache", 71 | &reshape_and_cache, 72 | "Reshape the key and value tensors and cache them"); 73 | } 74 | -------------------------------------------------------------------------------- /csrc_musa/dispatch_utils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from 3 | * https://github.com/pytorch/pytorch/blob/v2.0.1/aten/src/ATen/Dispatch.h 4 | */ 5 | #pragma once 6 | 7 | #include 8 | 9 | #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ 10 | AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ 11 | AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ 12 | AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) 13 | 14 | #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ 15 | AT_DISPATCH_SWITCH( \ 16 | TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) 17 | 18 | #define VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(...) \ 19 | AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ 20 | AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ 21 | AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ 22 | AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) 23 | 24 | #define VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(TYPE, NAME, ...) \ 25 | AT_DISPATCH_SWITCH( \ 26 | TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(__VA_ARGS__)) 27 | 28 | #define VLLM_DISPATCH_CASE_INTEGRAL_TYPES(...) \ 29 | AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) \ 30 | AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \ 31 | AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \ 32 | AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \ 33 | AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) 34 | 35 | #define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) 
\ 36 | AT_DISPATCH_SWITCH( \ 37 | TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__)) 38 | -------------------------------------------------------------------------------- /csrc_musa/moe/moe_ops.cpp: -------------------------------------------------------------------------------- 1 | #include "moe_ops.h" 2 | 3 | #include 4 | 5 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 6 | m.def("topk_softmax", &topk_softmax, "Apply topk softmax to the gating outputs."); 7 | } 8 | -------------------------------------------------------------------------------- /csrc_musa/moe/moe_ops.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | void topk_softmax( 6 | torch::Tensor& topk_weights, 7 | torch::Tensor& topk_indices, 8 | torch::Tensor& token_expert_indices, 9 | torch::Tensor& gating_output); 10 | -------------------------------------------------------------------------------- /csrc_musa/musa_compat.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef USE_ROCM 4 | #include 5 | #endif 6 | 7 | #ifndef USE_ROCM 8 | #define WARP_SIZE 32 9 | #else 10 | #define WARP_SIZE warpSize 11 | #endif 12 | 13 | #ifndef USE_ROCM 14 | #define VLLM_LDG(arg) __ldg(arg) 15 | #else 16 | #define VLLM_LDG(arg) *(arg) 17 | #endif 18 | 19 | #ifndef USE_ROCM 20 | #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor_sync(uint32_t(-1), var, lane_mask) 21 | #else 22 | #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask) 23 | #endif 24 | 25 | #ifndef USE_ROCM 26 | #define VLLM_SHFL_SYNC(var, src_lane) __shfl_sync(uint32_t(-1), var, src_lane) 27 | #else 28 | #define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane) 29 | #endif 30 | 31 | #ifndef USE_ROCM 32 | #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ 33 | musaFuncSetAttribute(FUNC, musaFuncAttributeMaxDynamicSharedMemorySize, VAL) 34 | #else 35 | #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ 36 | hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL) 37 | #endif 38 | 39 | -------------------------------------------------------------------------------- /csrc_musa/musa_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | int get_device_attribute( 6 | int attribute, 7 | int device_id); 8 | 9 | int get_max_shared_memory_per_block_device_attribute( 10 | int device_id); 11 | -------------------------------------------------------------------------------- /csrc_musa/musa_utils_kernels.mu: -------------------------------------------------------------------------------- 1 | #ifdef USE_ROCM 2 | #include 3 | #include 4 | #endif 5 | int get_device_attribute( 6 | int attribute, 7 | int device_id) 8 | { 9 | int device, value; 10 | if (device_id < 0) { 11 | musaGetDevice(&device); 12 | } 13 | else { 14 | device = device_id; 15 | } 16 | musaDeviceGetAttribute(&value, static_cast(attribute), device); 17 | return value; 18 | } 19 | 20 | 21 | int get_max_shared_memory_per_block_device_attribute( 22 | int device_id) 23 | { 24 | int attribute; 25 | // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html 26 | // cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74 27 | 28 | #ifdef USE_ROCM 29 | attribute = hipDeviceAttributeMaxSharedMemoryPerBlock; 30 | #else 31 | attribute = musaDevAttrMaxSharedMemoryPerBlockOptin; 32 | #endif 33 | 34 | return 
get_device_attribute(attribute, device_id); 35 | } 36 | -------------------------------------------------------------------------------- /csrc_musa/punica/bgmv/bgmv_bf16_bf16_bf16.mu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, mt_bfloat16, mt_bfloat16, mt_bfloat16) 5 | FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, mt_bfloat16, mt_bfloat16, mt_bfloat16) 6 | -------------------------------------------------------------------------------- /csrc_musa/punica/bgmv/bgmv_bf16_fp32_bf16.mu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, mt_bfloat16, float, mt_bfloat16) 5 | FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, mt_bfloat16, float, mt_bfloat16) 6 | -------------------------------------------------------------------------------- /csrc_musa/punica/bgmv/bgmv_fp16_fp16_fp16.mu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_half, nv_half) 5 | FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_half, nv_half, nv_half) 6 | -------------------------------------------------------------------------------- /csrc_musa/punica/bgmv/bgmv_fp16_fp32_fp16.mu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, float, nv_half) 5 | FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_half, float, nv_half) 6 | -------------------------------------------------------------------------------- /csrc_musa/punica/bgmv/bgmv_fp32_bf16_bf16.mu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, mt_bfloat16, mt_bfloat16) 5 | FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, float, mt_bfloat16, mt_bfloat16) 6 | -------------------------------------------------------------------------------- /csrc_musa/punica/bgmv/bgmv_fp32_fp16_fp16.mu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_half, nv_half) 5 | FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, float, nv_half, nv_half) 6 | -------------------------------------------------------------------------------- /csrc_musa/punica/bgmv/generator.py: -------------------------------------------------------------------------------- 1 | DTYPES = ["fp16", "bf16", "fp32"] 2 | DTYPE_MAP = { 3 | "fp16": "nv_half", 4 | "bf16": "mt_bfloat16", 5 | "fp32": "float", 6 | } 7 | 8 | TEMPLATE = """ 9 | #include "bgmv_config.h" 10 | #include "bgmv_impl.cuh" 11 | 12 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, {input_dtype}, {output_dtype}, {weight_dtype}) 13 | FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, {input_dtype}, {output_dtype}, {weight_dtype}) 14 | """.lstrip() # noqa: E501 15 | 16 | for input_dtype in DTYPES: 17 | for output_dtype in DTYPES: 18 | for weight_dtype in DTYPES: 19 | if weight_dtype == "fp32": 20 | # FP32 weights are not supported. 21 | continue 22 | if output_dtype == "fp32": 23 | # LoRA A matrix. 
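# An fp32 output dtype marks the first ("shrink") GEMV of the LoRA pair, which
# accumulates the low-rank intermediate in fp32; the fp32-input branch below is the
# matching second ("expand") GEMV that writes results back in the model dtype.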
24 | if input_dtype != weight_dtype: 25 | # NOTE(woosuk): While Punica supports the case where the 26 | # input and weight dtypes are different, we only generate 27 | # the kernels the same dtypes to reduce the binary size. 28 | continue 29 | elif input_dtype == "fp32": 30 | # LoRA B matrix. 31 | if output_dtype != weight_dtype: 32 | # NOTE(woosuk): While Punica supports the case where the 33 | # output and weight dtypes are different, we only generate 34 | # the kernels the same dtypes to reduce the binary size. 35 | continue 36 | elif not (input_dtype == output_dtype == weight_dtype): 37 | # NOTE(woosuk): While Punica supports mixed data types for 38 | # input, output, and weight, we only generate the kernels with 39 | # the same data types to reduce the binary size. 40 | continue 41 | 42 | kernel_definition = TEMPLATE.format( 43 | input_dtype=DTYPE_MAP[input_dtype], 44 | output_dtype=DTYPE_MAP[output_dtype], 45 | weight_dtype=DTYPE_MAP[weight_dtype]) 46 | filename = f"bgmv_{input_dtype}_{output_dtype}_{weight_dtype}.cu" 47 | with open(filename, "w") as f: 48 | f.write(kernel_definition) 49 | -------------------------------------------------------------------------------- /csrc_musa/quantization/gptq/compat.muh: -------------------------------------------------------------------------------- 1 | /* 2 | Copied from https://github.com/turboderp/exllamav2 3 | */ 4 | 5 | #ifndef _compat_cuh 6 | #define _compat_cuh 7 | 8 | namespace vllm { 9 | namespace gptq { 10 | // atomicAdd for half types, to support CC < 7.x 11 | 12 | __device__ __forceinline__ void atomicAdd_half(half* address, half val) 13 | { 14 | unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2)); 15 | unsigned int old = *address_as_ui; 16 | unsigned int assumed; 17 | 18 | do 19 | { 20 | assumed = old; 21 | __half_raw hsum; 22 | hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); 23 | half tmpres = __hadd(hsum, val); 24 | hsum = __half_raw(tmpres); 25 | old = (size_t)address & 2 ? 
(old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; 26 | old = atomicCAS(address_as_ui, assumed, old); 27 | } 28 | while (assumed != old); 29 | } 30 | 31 | // atomicAdd for half2 types 32 | 33 | __device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val) 34 | { 35 | unsigned int* address_as_ui = (unsigned int*)address; 36 | unsigned int old = *address_as_ui; 37 | unsigned int assumed; 38 | do 39 | { 40 | assumed = old; 41 | half2 old_val = *((half2*)&old); 42 | half2 new_val = __hadd2(old_val, val); 43 | old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val)); 44 | } 45 | while (assumed != old); 46 | } 47 | 48 | // 49 | 50 | #if defined(__MUSA_ARCH__) || defined(USE_ROCM) 51 | #if __MUSA_ARCH__ < 700 || defined(USE_ROCM) 52 | 53 | __device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); } 54 | 55 | #if __MUSA_ARCH__ < 600 || defined(USE_ROCM) 56 | __device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); } 57 | #endif 58 | 59 | #endif 60 | #endif 61 | 62 | } // namespace gptq 63 | } // namespace vllm 64 | #endif 65 | -------------------------------------------------------------------------------- /csrc_musa/quantization/gptq/qdq_8.muh: -------------------------------------------------------------------------------- 1 | /* 2 | Copied from https://github.com/turboderp/exllamav2 3 | */ 4 | 5 | #ifndef _qdq_8_cuh 6 | #define _qdq_8_cuh 7 | 8 | #include "qdq_util.cuh" 9 | 10 | namespace vllm { 11 | namespace gptq { 12 | 13 | __forceinline__ __device__ void shuffle_8bit_4 14 | ( 15 | uint32_t* q, 16 | int stride 17 | ) 18 | { 19 | } 20 | 21 | __forceinline__ __device__ void dequant_8bit_8 22 | ( 23 | const uint32_t q_0, 24 | const uint32_t q_1, 25 | half2 (&dq)[4], 26 | int stride, 27 | const uint32_t zero 28 | ) 29 | { 30 | half dqh[8]; 31 | for (int i = 0; i < 4; i++) dqh[i ] = dq_ns(exb(q_0, i * 8, 0xff), zero); 32 | for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), zero); 33 | 34 | for (int i = 0; i < 4; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); 35 | } 36 | 37 | } // namespace gptq 38 | } // namespace vllm 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /csrc_musa/quantization/gptq/qdq_util.muh: -------------------------------------------------------------------------------- 1 | /* 2 | Copied from https://github.com/turboderp/exllamav2 3 | */ 4 | 5 | #ifndef _qdq_util_cuh 6 | #define _qdq_util_cuh 7 | 8 | namespace vllm { 9 | namespace gptq { 10 | 11 | union half2_uint32 12 | { 13 | uint32_t as_uint32; 14 | half2 as_half2; 15 | __device__ half2_uint32(uint32_t val) : as_uint32(val) {} 16 | __device__ half2_uint32(half2 val) : as_half2(val) {} 17 | }; 18 | 19 | union half_uint16 20 | { 21 | uint16_t as_uint16; 22 | half as_half; 23 | __device__ half_uint16(uint16_t val) : as_uint16(val) {} 24 | __device__ half_uint16(half val) : as_half(val) {} 25 | }; 26 | 27 | // Max_scale premultiplied by 1/256 28 | 29 | __forceinline__ __device__ half dq_scale(const int qs, const half max_scale) 30 | { 31 | int qs_i = qs + 1; 32 | half qs_h = __int2half_rn(qs_i * qs_i); 33 | qs_h = __hmul(qs_h, max_scale); 34 | return qs_h; 35 | } 36 | 37 | __forceinline__ __device__ half dq(const int q, const int qzero, const half scale) 38 | { 39 | return __hmul(__int2half_rn(q - qzero), scale); 40 | } 41 | 42 | __forceinline__ __device__ half dq_ns(const int q, const int qzero) 43 | { 44 | //return 
__hsub(__int2half_rn(q), __int2half_rn(qzero)); 45 | return __int2half_rn(q - qzero); 46 | } 47 | 48 | __forceinline__ __device__ int exb(const uint32_t q, const int shift, const int mask) 49 | { 50 | return (int)((q >> shift) & mask); 51 | } 52 | 53 | __forceinline__ __device__ int exb(const uint32_t q1, const uint32_t q0, const int shift, const int mask) 54 | { 55 | return (int)(__funnelshift_rc(q0, q1, shift) & mask); 56 | } 57 | 58 | } // namespace gptq 59 | } // namespace vllm 60 | #endif 61 | -------------------------------------------------------------------------------- /csrc_musa/quantization/gptq_marlin/gptq_marlin.muh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "torch_musa/csrc/aten/musa/MUSAContext.h" 6 | #include "torch_musa/csrc/core/MUSAGuard.h" 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace gptq_marlin { 13 | 14 | // 8 warps are a good choice since every SM has 4 schedulers and having more than 1 warp per 15 | // schedule allows some more latency hiding. At the same time, we want relatively few warps to have 16 | // many registers per warp and small tiles. 17 | static constexpr int default_threads = 256; 18 | 19 | static constexpr int pipe_stages = 4; // 4 pipeline stages fit into shared memory 20 | 21 | static constexpr int min_thread_n = 64; 22 | static constexpr int min_thread_k = 64; 23 | 24 | static constexpr int tile_size = 16; 25 | static constexpr int max_par = 16; 26 | 27 | template 28 | struct Vec { 29 | T elems[n]; 30 | __device__ T& operator[](int i) { return elems[i]; } 31 | }; 32 | 33 | using I4 = Vec; 34 | 35 | constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; } 36 | 37 | #if defined(__MUSA_ARCH__) && __MUSA_ARCH__ < 800 38 | // No support for async 39 | #else 40 | 41 | __device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr, bool pred = true) { 42 | const int BYTES = 16; 43 | uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); 44 | asm volatile("{\n" 45 | " .reg .pred p;\n" 46 | " setp.ne.b32 p, %0, 0;\n" 47 | " @p cp.async.cg.shared.global [%1], [%2], %3;\n" 48 | "}\n" ::"r"((int)pred), 49 | "r"(smem), "l"(glob_ptr), "n"(BYTES)); 50 | } 51 | 52 | __device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) { 53 | const int BYTES = 16; 54 | uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); 55 | asm volatile("{\n" 56 | " cp.async.cg.shared.global [%0], [%1], %2;\n" 57 | "}\n" ::"r"(smem), 58 | "l"(glob_ptr), "n"(BYTES)); 59 | } 60 | 61 | __device__ inline void cp_async_fence() { asm volatile("cp.async.commit_group;\n" ::); } 62 | 63 | template 64 | __device__ inline void cp_async_wait() { 65 | asm volatile("cp.async.wait_group %0;\n" ::"n"(n)); 66 | } 67 | 68 | #endif 69 | 70 | } // namespace gptq_marlin 71 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # vLLM documents 2 | 3 | ## Build the docs 4 | 5 | ```bash 6 | # Install dependencies. 7 | pip install -r requirements-docs.txt 8 | 9 | # Build the docs. 10 | make clean 11 | make html 12 | ``` 13 | 14 | ## Open the docs with your browser 15 | 16 | ```bash 17 | python -m http.server -d build/html/ 18 | ``` 19 | Launch your browser and open localhost:8000. 20 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | sphinx == 6.2.1 2 | sphinx-book-theme == 1.0.1 3 | sphinx-copybutton == 0.5.2 4 | myst-parser == 2.0.0 5 | sphinx-argparse 6 | 7 | # packages to install to build the documentation 8 | pydantic 9 | -f https://download.pytorch.org/whl/cpu 10 | torch 11 | py-cpuinfo 12 | transformers 13 | -------------------------------------------------------------------------------- /docs/source/assets/dev/dockerfile-stages-dependency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/docs/source/assets/dev/dockerfile-stages-dependency.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/k_vecs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/docs/source/assets/kernel/k_vecs.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/key.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/docs/source/assets/kernel/key.png 
-------------------------------------------------------------------------------- /docs/source/assets/kernel/logits_vec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/docs/source/assets/kernel/logits_vec.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/q_vecs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/docs/source/assets/kernel/q_vecs.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/query.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/docs/source/assets/kernel/query.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/v_vec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/docs/source/assets/kernel/v_vec.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/value.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/docs/source/assets/kernel/value.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-only-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/docs/source/assets/logos/vllm-logo-only-light.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-text-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/docs/source/assets/logos/vllm-logo-text-dark.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-text-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/docs/source/assets/logos/vllm-logo-text-light.png -------------------------------------------------------------------------------- /docs/source/dev/dockerfile/dockerfile.rst: -------------------------------------------------------------------------------- 1 | Dockerfile 2 | ==================== 3 | 4 | See `here `_ for the main Dockerfile to construct 5 | the image for running an OpenAI compatible server with vLLM. 6 | 7 | - Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes: 8 | 9 | - All build stages 10 | - The default build target (highlighted in grey) 11 | - External images (with dashed borders) 12 | 13 | The edges of the build graph represent: 14 | 15 | - FROM ... dependencies (with a solid line and a full arrow head) 16 | - COPY --from=... 
dependencies (with a dashed line and an empty arrow head) 17 | - RUN --mount=(.*)from=... dependencies (with a dotted line and an empty diamond arrow head) 18 | 19 | .. figure:: ../../assets/dev/dockerfile-stages-dependency.png 20 | :alt: query 21 | :width: 100% 22 | :align: center 23 | 24 | Made using: https://github.com/patrickhoefler/dockerfilegraph 25 | 26 | Commands to regenerate the build graph (make sure to run it **from the `root` directory of the vLLM repository** where the dockerfile is present): 27 | 28 | .. code:: bash 29 | 30 | dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename Dockerfile 31 | 32 | or in case you want to run it directly with the docker image: 33 | 34 | .. code:: bash 35 | 36 | docker run \ 37 | --rm \ 38 | --user "$(id -u):$(id -g)" \ 39 | --workdir /workspace \ 40 | --volume "$(pwd)":/workspace \ 41 | ghcr.io/patrickhoefler/dockerfilegraph:alpine \ 42 | --output png \ 43 | --dpi 200 \ 44 | --max-label-length 50 \ 45 | --filename Dockerfile \ 46 | --legend 47 | 48 | (To run it for a different file, you can pass in a different argument to the flag `--filename`.) 49 | 50 | -------------------------------------------------------------------------------- /docs/source/dev/engine/async_llm_engine.rst: -------------------------------------------------------------------------------- 1 | AsyncLLMEngine 2 | ================================= 3 | 4 | .. autoclass:: vllm.AsyncLLMEngine 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/source/dev/engine/engine_index.rst: -------------------------------------------------------------------------------- 1 | vLLM Engine 2 | ================================= 3 | 4 | .. automodule:: vllm.engine 5 | .. currentmodule:: vllm.engine 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | :caption: Engines 10 | 11 | llm_engine 12 | async_llm_engine 13 | 14 | -------------------------------------------------------------------------------- /docs/source/dev/engine/llm_engine.rst: -------------------------------------------------------------------------------- 1 | LLMEngine 2 | ================================= 3 | 4 | .. autoclass:: vllm.LLMEngine 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/source/dev/sampling_params.rst: -------------------------------------------------------------------------------- 1 | Sampling Params 2 | =============== 3 | 4 | .. 
autoclass:: vllm.SamplingParams 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/generate_examples.py: -------------------------------------------------------------------------------- 1 | import re 2 | from pathlib import Path 3 | 4 | 5 | def fix_case(text: str) -> str: 6 | subs = [ 7 | ("api", "API"), 8 | ("llm", "LLM"), 9 | ("vllm", "vLLM"), 10 | ("openai", "OpenAI"), 11 | ("multilora", "MultiLoRA"), 12 | ] 13 | for sub in subs: 14 | text = re.sub(*sub, text, flags=re.IGNORECASE) 15 | return text 16 | 17 | 18 | def underline(title: str, character: str = "=") -> str: 19 | return f"{title}\n{character * len(title)}" 20 | 21 | 22 | def generate_title(filename: str) -> str: 23 | # Turn filename into a title 24 | title = filename.replace("_", " ").title() 25 | # Handle acronyms and names 26 | title = fix_case(title) 27 | # Underline title 28 | title = underline(title) 29 | return title 30 | 31 | 32 | def generate_examples(): 33 | root_dir = Path(__file__).parent.parent.parent.resolve() 34 | 35 | # Source paths 36 | script_dir = root_dir / "examples" 37 | script_paths = sorted(script_dir.glob("*.py")) 38 | 39 | # Destination paths 40 | doc_dir = root_dir / "docs/source/getting_started/examples" 41 | doc_paths = [doc_dir / f"{path.stem}.rst" for path in script_paths] 42 | 43 | # Generate the example docs for each example script 44 | for script_path, doc_path in zip(script_paths, doc_paths): 45 | script_url = f"https://github.com/vllm-project/vllm/blob/main/examples/{script_path.name}" 46 | # Make script_path relative to doc_path and call it include_path 47 | include_path = '../../../..' / script_path.relative_to(root_dir) 48 | content = (f"{generate_title(doc_path.stem)}\n\n" 49 | f"Source {script_url}.\n\n" 50 | f".. literalinclude:: {include_path}\n" 51 | " :language: python\n" 52 | " :linenos:\n") 53 | with open(doc_path, "w+") as f: 54 | f.write(content) 55 | 56 | # Generate the toctree for the example scripts 57 | with open(doc_dir / "examples_index.template.rst") as f: 58 | examples_index = f.read() 59 | with open(doc_dir / "examples_index.rst", "w+") as f: 60 | example_docs = "\n ".join(path.stem for path in script_paths) 61 | f.write(examples_index.replace(r"%EXAMPLE_DOCS%", example_docs)) 62 | -------------------------------------------------------------------------------- /docs/source/getting_started/examples/examples_index.template.rst: -------------------------------------------------------------------------------- 1 | Examples 2 | ================================= 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | :caption: Scripts 7 | 8 | %EXAMPLE_DOCS% 9 | -------------------------------------------------------------------------------- /docs/source/models/engine_args.rst: -------------------------------------------------------------------------------- 1 | .. _engine_args: 2 | 3 | Engine Arguments 4 | ================ 5 | 6 | Below, you can find an explanation of every engine argument for vLLM: 7 | 8 | .. argparse:: 9 | :module: vllm.engine.arg_utils 10 | :func: _engine_args_parser 11 | :prog: -m vllm.entrypoints.openai.api_server 12 | :nodefaultconst: 13 | 14 | Async Engine Arguments 15 | ---------------------- 16 | 17 | Below are the additional arguments related to the asynchronous engine: 18 | 19 | .. 
argparse:: 20 | :module: vllm.engine.arg_utils 21 | :func: _async_engine_args_parser 22 | :prog: -m vllm.entrypoints.openai.api_server 23 | :nodefaultconst: -------------------------------------------------------------------------------- /docs/source/models/performance.rst: -------------------------------------------------------------------------------- 1 | .. _performance: 2 | 3 | Performance and Tuning 4 | ====================== 5 | 6 | Chunked Prefill 7 | --------------- 8 | vLLM supports an experimental feature chunked prefill. Chunked prefill allows to chunk large prefills into smaller chunks and batch them together with decode requests. 9 | 10 | You can enable the feature by specifying 11 | 12 | .. code-block:: python 13 | 14 | llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True) 15 | # Set max_num_batched_tokens to tune performance. 16 | # NOTE: 512 is the default max_num_batched_tokens for chunked prefill. 17 | # llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True, max_num_batched_tokens=512) 18 | 19 | By default, vLLM scheduler prioritizes prefills and doesn't batch prefill and decode to the same batch. This policy optimizes the TTFT (time to thefirst token), but incurs slower ITL (inter token latency) and inefficient GPU utilization. 20 | 21 | Once chunked prefill is enabled, the policy is changed to 22 | 23 | - prioritize decode requests. It batches all pending decode requests to the batch before scheduling any prefill. 24 | - When there are available token_budget (`max_num_batched_tokens`), it schedules pending prefills. If a last pending prefill request cannot fit into `max_num_batched_tokens`, it chunks it. 25 | 26 | This policy has two benefits. 27 | 28 | - It improves ITL (inter token latency) and generation decode because decode requests are prioritized. 29 | - It helps achieve better GPU utilization by locating compute-bound (prefill) and memory-bound (decode) requests to the same batch. 30 | 31 | You can tune the performance by changing `max_num_batched_tokens`. 32 | By default, it is set to 512, which has the best ITL on A100 in the initial benchmark. 33 | Smaller batch size achieves better ITL because there are fewer prefills interrupting decodes. 34 | Higher batch size achieves better TTFT as you can put more prefill to the batch. 35 | If `max_num_batched_tokens` is the same as `max_model_len`, that's almost the equivalent to the default scheduling policy (except that it still prioritizes decodes). 36 | Note that the default batch size (512) is optimized for ITL, and it may have lower throughput than the default scheduler. We recommend you set `max_num_batched_tokens > 2048` for throughput. 37 | 38 | See related papers for more details (https://arxiv.org/pdf/2401.08671 or https://arxiv.org/pdf/2308.16369). 39 | -------------------------------------------------------------------------------- /docs/source/quantization/fp8_e5m2_kvcache.rst: -------------------------------------------------------------------------------- 1 | .. _fp8_kv_cache: 2 | 3 | FP8 E5M2 KV Cache 4 | ================== 5 | 6 | The int8/int4 quantization scheme requires additional scale GPU memory storage, which reduces the expected GPU memory benefits. 7 | The FP8 data format retains 2~3 mantissa bits and can convert float/fp16/bflaot16 and fp8 to each other. 8 | 9 | Here is an example of how to enable this feature: 10 | 11 | .. code-block:: python 12 | 13 | from vllm import LLM, SamplingParams 14 | # Sample prompts. 
15 | prompts = [ 16 | "Hello, my name is", 17 | "The president of the United States is", 18 | "The capital of France is", 19 | "The future of AI is", 20 | ] 21 | # Create a sampling params object. 22 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 23 | # Create an LLM. 24 | llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8") 25 | # Generate texts from the prompts. The output is a list of RequestOutput objects 26 | # that contain the prompt, generated text, and other information. 27 | outputs = llm.generate(prompts, sampling_params) 28 | # Print the outputs. 29 | for output in outputs: 30 | prompt = output.prompt 31 | generated_text = output.outputs[0].text 32 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 33 | 34 | 35 | Note, current prefix caching doesn't work with FP8 KV cache enabled, forward_prefix kernel should handle different KV and cache type. 36 | 37 | -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_bentoml.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_bentoml: 2 | 3 | Deploying with BentoML 4 | ====================== 5 | 6 | `BentoML `_ allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-complicant image and deploy it on Kubernetes. 7 | 8 | For details, see the tutorial `vLLM inference in the BentoML documentation `_. -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_kserve.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_kserve: 2 | 3 | Deploying with KServe 4 | ============================ 5 | 6 | vLLM can be deployed with `KServe `_ on Kubernetes for highly scalable distributed model serving. 7 | 8 | Please see `this guide `_ for more details on using vLLM with KServe. 9 | -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_triton.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_triton: 2 | 3 | Deploying with NVIDIA Triton 4 | ============================ 5 | 6 | The `Triton Inference Server `_ hosts a tutorial demonstrating how to quickly deploy a simple `facebook/opt-125m `_ model using vLLM. Please see `Deploying a vLLM model in Triton `_ for more details. 7 | -------------------------------------------------------------------------------- /docs/source/serving/distributed_serving.rst: -------------------------------------------------------------------------------- 1 | .. _distributed_serving: 2 | 3 | Distributed Inference and Serving 4 | ================================= 5 | 6 | vLLM supports distributed tensor-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm `_. We manage the distributed runtime with `Ray `_. To run distributed inference, install Ray with: 7 | 8 | .. code-block:: console 9 | 10 | $ pip install ray 11 | 12 | To run multi-GPU inference with the :code:`LLM` class, set the :code:`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs: 13 | 14 | .. 
code-block:: python 15 | 16 | from vllm import LLM 17 | llm = LLM("facebook/opt-13b", tensor_parallel_size=4) 18 | output = llm.generate("San Franciso is a") 19 | 20 | To run multi-GPU serving, pass in the :code:`--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs: 21 | 22 | .. code-block:: console 23 | 24 | $ python -m vllm.entrypoints.api_server \ 25 | $ --model facebook/opt-13b \ 26 | $ --tensor-parallel-size 4 27 | 28 | To scale vLLM beyond a single machine, start a `Ray runtime `_ via CLI before running vLLM: 29 | 30 | .. code-block:: console 31 | 32 | $ # On head node 33 | $ ray start --head 34 | 35 | $ # On worker nodes 36 | $ ray start --address= 37 | 38 | After that, you can run inference and serving on multiple machines by launching the vLLM process on the head node by setting :code:`tensor_parallel_size` to the number of GPUs to be the total number of GPUs across all machines. -------------------------------------------------------------------------------- /docs/source/serving/env_vars.rst: -------------------------------------------------------------------------------- 1 | Environment Variables 2 | ======================== 3 | 4 | vLLM uses the following environment variables to configure the system: 5 | 6 | .. literalinclude:: ../../../vllm/envs.py 7 | :language: python 8 | :start-after: begin-env-vars-definition 9 | :end-before: end-env-vars-definition 10 | -------------------------------------------------------------------------------- /docs/source/serving/integrations.rst: -------------------------------------------------------------------------------- 1 | Integrations 2 | ------------ 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | 7 | run_on_sky 8 | deploying_with_kserve 9 | deploying_with_triton 10 | deploying_with_bentoml 11 | serving_with_langchain 12 | -------------------------------------------------------------------------------- /docs/source/serving/metrics.rst: -------------------------------------------------------------------------------- 1 | Production Metrics 2 | ================== 3 | 4 | vLLM exposes a number of metrics that can be used to monitor the health of the 5 | system. These metrics are exposed via the `/metrics` endpoint on the vLLM 6 | OpenAI compatible API server. 7 | 8 | The following metrics are exposed: 9 | 10 | .. literalinclude:: ../../../vllm/engine/metrics.py 11 | :language: python 12 | :start-after: begin-metrics-definitions 13 | :end-before: end-metrics-definitions 14 | -------------------------------------------------------------------------------- /docs/source/serving/serving_with_langchain.rst: -------------------------------------------------------------------------------- 1 | .. _run_on_langchain: 2 | 3 | Serving with Langchain 4 | ============================ 5 | 6 | vLLM is also available via `Langchain `_ . 7 | 8 | To install langchain, run 9 | 10 | .. code-block:: console 11 | 12 | $ pip install langchain langchain_community -q 13 | 14 | To run inference on a single or multiple GPUs, use ``VLLM`` class from ``langchain``. 15 | 16 | .. code-block:: python 17 | 18 | from langchain_community.llms import VLLM 19 | 20 | llm = VLLM(model="mosaicml/mpt-7b", 21 | trust_remote_code=True, # mandatory for hf models 22 | max_new_tokens=128, 23 | top_k=10, 24 | top_p=0.95, 25 | temperature=0.8, 26 | # tensor_parallel_size=... # for distributed inference 27 | ) 28 | 29 | print(llm("What is the capital of France ?")) 30 | 31 | Please refer to this `Tutorial `_ for more details. 
32 | -------------------------------------------------------------------------------- /docs/source/serving/usage_stats.md: -------------------------------------------------------------------------------- 1 | # Usage Stats Collection 2 | 3 | vLLM collects anonymous usage data by default to help the engineering team better understand which hardware and model configurations are widely used. This data allows them to prioritize their efforts on the most common workloads. The collected data is transparent, does not contain any sensitive information, and will be publicly released for the community's benefit. 4 | 5 | ## What data is collected? 6 | 7 | You can see the up to date list of data collected by vLLM in the [usage_lib.py](https://github.com/vllm-project/vllm/blob/main/vllm/usage/usage_lib.py). 8 | 9 | Here is an example as of v0.4.0: 10 | 11 | ```json 12 | { 13 | "uuid": "fbe880e9-084d-4cab-a395-8984c50f1109", 14 | "provider": "GCP", 15 | "num_cpu": 24, 16 | "cpu_type": "Intel(R) Xeon(R) CPU @ 2.20GHz", 17 | "cpu_family_model_stepping": "6,85,7", 18 | "total_memory": 101261135872, 19 | "architecture": "x86_64", 20 | "platform": "Linux-5.10.0-28-cloud-amd64-x86_64-with-glibc2.31", 21 | "gpu_count": 2, 22 | "gpu_type": "NVIDIA L4", 23 | "gpu_memory_per_device": 23580639232, 24 | "model_architecture": "OPTForCausalLM", 25 | "vllm_version": "0.3.2+cu123", 26 | "context": "LLM_CLASS", 27 | "log_time": 1711663373492490000, 28 | "source": "production", 29 | "dtype": "torch.float16", 30 | "tensor_parallel_size": 1, 31 | "block_size": 16, 32 | "gpu_memory_utilization": 0.9, 33 | "quantization": null, 34 | "kv_cache_dtype": "auto", 35 | "enable_lora": false, 36 | "enable_prefix_caching": false, 37 | "enforce_eager": false, 38 | "disable_custom_all_reduce": true 39 | } 40 | ``` 41 | 42 | You can preview the collected data by running the following command: 43 | 44 | ```bash 45 | tail ~/.config/vllm/usage_stats.json 46 | ``` 47 | 48 | ## Opt-out of Usage Stats Collection 49 | 50 | You can opt-out of usage stats collection by setting the VLLM_NO_USAGE_STATS or DO_NOT_TRACK environment variable, or by creating a ~/.config/vllm/do_not_track file: 51 | 52 | ```bash 53 | # Any of the following methods can disable usage stats collection 54 | export VLLM_NO_USAGE_STATS=1 55 | export DO_NOT_TRACK=1 56 | mkdir -p ~/.config/vllm && touch ~/.config/vllm/do_not_track 57 | ``` 58 | -------------------------------------------------------------------------------- /examples/aqlm_example.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from vllm import LLM, SamplingParams 4 | 5 | 6 | def main(): 7 | 8 | parser = argparse.ArgumentParser(description='AQLM examples') 9 | 10 | parser.add_argument('--model', 11 | '-m', 12 | type=str, 13 | default=None, 14 | help='model path, as for HF') 15 | parser.add_argument('--choice', 16 | '-c', 17 | type=int, 18 | default=0, 19 | help='known good models by index, [0-4]') 20 | parser.add_argument('--tensor_parallel_size', 21 | '-t', 22 | type=int, 23 | default=1, 24 | help='tensor parallel size') 25 | 26 | args = parser.parse_args() 27 | 28 | models = [ 29 | "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", 30 | "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf", 31 | "ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf", 32 | "ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf", 33 | "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", 34 | ] 35 | 36 | model = LLM(args.model if args.model is not None else models[args.choice], 37 | 
tensor_parallel_size=args.tensor_parallel_size) 38 | 39 | sampling_params = SamplingParams(max_tokens=100, temperature=0) 40 | outputs = model.generate("Hello my name is", 41 | sampling_params=sampling_params) 42 | print(outputs[0].outputs[0].text) 43 | 44 | 45 | if __name__ == '__main__': 46 | main() 47 | -------------------------------------------------------------------------------- /examples/fp8/quantizer/README.md: -------------------------------------------------------------------------------- 1 | ### Quantizer Utilities 2 | `quantize.py`: NVIDIA Quantization utilities using AMMO, ported from TensorRT-LLM: 3 | `https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/quantize.py` 4 | 5 | ### Prerequisite 6 | 7 | #### AMMO (AlgorithMic Model Optimization) Installation: nvidia-ammo 0.7.1 or later 8 | `pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo` 9 | 10 | #### AMMO Download (code and docs) 11 | `https://developer.nvidia.com/downloads/assets/cuda/files/nvidia-ammo/nvidia_ammo-0.5.0.tar.gz` 12 | `https://developer.nvidia.com/downloads/assets/cuda/files/nvidia-ammo/nvidia_ammo-0.7.1.tar.gz` 13 | 14 | ### Usage 15 | 16 | #### Run on H100 system for speed if FP8; number of GPUs depends on the model size 17 | 18 | #### Example: quantize Llama2-7b model from HF to FP8 with FP8 KV Cache: 19 | `python quantize.py --model_dir ./ll2-7b --dtype float16 --qformat fp8 --kv_cache_dtype fp8 --output_dir ./ll2_7b_fp8 --calib_size 512 --tp_size 1` 20 | 21 | Outputs: model structure, quantized model & parameters (with scaling factors) are in JSON and Safetensors (npz is generated only for the reference) 22 | ``` 23 | # ll ./ll2_7b_fp8/ 24 | total 19998244 25 | drwxr-xr-x 2 root root 4096 Feb 7 01:08 ./ 26 | drwxrwxr-x 8 1060 1061 4096 Feb 7 01:08 ../ 27 | -rw-r--r-- 1 root root 176411 Feb 7 01:08 llama_tp1.json 28 | -rw-r--r-- 1 root root 13477087480 Feb 7 01:09 llama_tp1_rank0.npz 29 | -rw-r--r-- 1 root root 7000893272 Feb 7 01:08 rank0.safetensors 30 | # 31 | ``` 32 | 33 | -------------------------------------------------------------------------------- /examples/gradio_webserver.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import gradio as gr 5 | import requests 6 | 7 | 8 | def http_bot(prompt): 9 | headers = {"User-Agent": "vLLM Client"} 10 | pload = { 11 | "prompt": prompt, 12 | "stream": True, 13 | "max_tokens": 128, 14 | } 15 | response = requests.post(args.model_url, 16 | headers=headers, 17 | json=pload, 18 | stream=True) 19 | 20 | for chunk in response.iter_lines(chunk_size=8192, 21 | decode_unicode=False, 22 | delimiter=b"\0"): 23 | if chunk: 24 | data = json.loads(chunk.decode("utf-8")) 25 | output = data["text"][0] 26 | yield output 27 | 28 | 29 | def build_demo(): 30 | with gr.Blocks() as demo: 31 | gr.Markdown("# vLLM text completion demo\n") 32 | inputbox = gr.Textbox(label="Input", 33 | placeholder="Enter text and press ENTER") 34 | outputbox = gr.Textbox(label="Output", 35 | placeholder="Generated result from the model") 36 | inputbox.submit(http_bot, [inputbox], [outputbox]) 37 | return demo 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--host", type=str, default=None) 43 | parser.add_argument("--port", type=int, default=8001) 44 | parser.add_argument("--model-url", 45 | type=str, 46 | default="http://localhost:8000/generate") 47 | args = parser.parse_args() 48 | 49 | demo = build_demo() 50 | 
demo.queue().launch(server_name=args.host, 51 | server_port=args.port, 52 | share=True) 53 | -------------------------------------------------------------------------------- /examples/llm_engine_example.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import List, Tuple 3 | 4 | from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams 5 | 6 | 7 | def create_test_prompts() -> List[Tuple[str, SamplingParams]]: 8 | """Create a list of test prompts with their sampling parameters.""" 9 | return [ 10 | ("A robot may not injure a human being", 11 | SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1)), 12 | ("To be or not to be,", 13 | SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)), 14 | ("What is the meaning of life?", 15 | SamplingParams(n=2, 16 | best_of=5, 17 | temperature=0.8, 18 | top_p=0.95, 19 | frequency_penalty=0.1)), 20 | ("It is only with the heart that one can see rightly", 21 | SamplingParams(n=3, best_of=3, use_beam_search=True, 22 | temperature=0.0)), 23 | ] 24 | 25 | 26 | def process_requests(engine: LLMEngine, 27 | test_prompts: List[Tuple[str, SamplingParams]]): 28 | """Continuously process a list of prompts and handle the outputs.""" 29 | request_id = 0 30 | 31 | while test_prompts or engine.has_unfinished_requests(): 32 | if test_prompts: 33 | prompt, sampling_params = test_prompts.pop(0) 34 | engine.add_request(str(request_id), prompt, sampling_params) 35 | request_id += 1 36 | 37 | request_outputs: List[RequestOutput] = engine.step() 38 | 39 | for request_output in request_outputs: 40 | if request_output.finished: 41 | print(request_output) 42 | 43 | 44 | def initialize_engine(args: argparse.Namespace) -> LLMEngine: 45 | """Initialize the LLMEngine from the command line arguments.""" 46 | engine_args = EngineArgs.from_cli_args(args) 47 | return LLMEngine.from_engine_args(engine_args) 48 | 49 | 50 | def main(args: argparse.Namespace): 51 | """Main function that sets up and runs the prompt processing.""" 52 | engine = initialize_engine(args) 53 | test_prompts = create_test_prompts() 54 | process_requests(engine, test_prompts) 55 | 56 | 57 | if __name__ == '__main__': 58 | parser = argparse.ArgumentParser( 59 | description='Demo on using the LLMEngine class directly') 60 | parser = EngineArgs.add_cli_args(parser) 61 | args = parser.parse_args() 62 | main(args) 63 | -------------------------------------------------------------------------------- /examples/offline_inference.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | 3 | # Sample prompts. 4 | prompts = [ 5 | "Hello, my name is", 6 | "The president of the United States is", 7 | "The capital of France is", 8 | "The future of AI is", 9 | ] 10 | # Create a sampling params object. 11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 12 | 13 | # Create an LLM. 14 | llm = LLM(model="facebook/opt-125m") 15 | # Generate texts from the prompts. The output is a list of RequestOutput objects 16 | # that contain the prompt, generated text, and other information. 17 | outputs = llm.generate(prompts, sampling_params) 18 | # Print the outputs. 
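# Each element is a RequestOutput pairing the original prompt with a list of
# CompletionOutput objects (one per sampled sequence; index 0 suffices here since
# SamplingParams defaults to n=1).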
19 | for output in outputs: 20 | prompt = output.prompt 21 | generated_text = output.outputs[0].text 22 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 23 | -------------------------------------------------------------------------------- /examples/offline_inference_neuron.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | 3 | # Sample prompts. 4 | prompts = [ 5 | "Hello, my name is", 6 | "The president of the United States is", 7 | "The capital of France is", 8 | "The future of AI is", 9 | ] 10 | # Create a sampling params object. 11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 12 | 13 | # Create an LLM. 14 | llm = LLM( 15 | model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", 16 | max_num_seqs=8, 17 | # The max_model_len and block_size arguments are required to be same as 18 | # max sequence length when targeting neuron device. 19 | # Currently, this is a known limitation in continuous batching support 20 | # in transformers-neuronx. 21 | # TODO(liangfu): Support paged-attention in transformers-neuronx. 22 | max_model_len=128, 23 | block_size=128, 24 | # The device can be automatically detected when AWS Neuron SDK is installed. 25 | # The device argument can be either unspecified for automated detection, 26 | # or explicitly assigned. 27 | device="neuron", 28 | tensor_parallel_size=2) 29 | # Generate texts from the prompts. The output is a list of RequestOutput objects 30 | # that contain the prompt, generated text, and other information. 31 | outputs = llm.generate(prompts, sampling_params) 32 | # Print the outputs. 33 | for output in outputs: 34 | prompt = output.prompt 35 | generated_text = output.outputs[0].text 36 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 37 | -------------------------------------------------------------------------------- /examples/offline_inference_with_prefix.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | 3 | prefix = ( 4 | "You are an expert school principal, skilled in effectively managing " 5 | "faculty and staff. Draft 10-15 questions for a potential first grade " 6 | "Head Teacher for my K-12, all-girls', independent school that emphasizes " 7 | "community, joyful discovery, and life-long learning. The candidate is " 8 | "coming in for a first-round panel interview for a 8th grade Math " 9 | "teaching role. They have 5 years of previous teaching experience " 10 | "as an assistant teacher at a co-ed, public school with experience " 11 | "in middle school math teaching. Based on these information, fulfill " 12 | "the following paragraph: ") 13 | 14 | # Sample prompts. 15 | prompts = [ 16 | "Hello, my name is", 17 | "The president of the United States is", 18 | "The capital of France is", 19 | "The future of AI is", 20 | ] 21 | # Create a sampling params object. 22 | sampling_params = SamplingParams(temperature=0.0) 23 | 24 | # Create an LLM. 25 | llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True) 26 | 27 | generating_prompts = [prefix + prompt for prompt in prompts] 28 | 29 | # Generate texts from the prompts. The output is a list of RequestOutput objects 30 | # that contain the prompt, generated text, and other information. 31 | outputs = llm.generate(generating_prompts, sampling_params) 32 | # Print the outputs. 
33 | for output in outputs: 34 | prompt = output.prompt 35 | generated_text = output.outputs[0].text 36 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 37 | 38 | print("-" * 80) 39 | 40 | # The llm.generate call will batch all prompts and send the batch at once 41 | # if resources allow. The prefix will only be cached after the first batch 42 | # is processed, so we need to call generate once to calculate the prefix 43 | # and cache it. 44 | outputs = llm.generate(generating_prompts[0], sampling_params) 45 | 46 | # Subsequent batches can leverage the cached prefix 47 | outputs = llm.generate(generating_prompts, sampling_params) 48 | 49 | # Print the outputs. You should see the same outputs as before 50 | for output in outputs: 51 | prompt = output.prompt 52 | generated_text = output.outputs[0].text 53 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 54 | -------------------------------------------------------------------------------- /examples/openai_chat_completion_client.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server. 4 | openai_api_key = "EMPTY" 5 | openai_api_base = "http://localhost:8000/v1" 6 | 7 | client = OpenAI( 8 | # defaults to os.environ.get("OPENAI_API_KEY") 9 | api_key=openai_api_key, 10 | base_url=openai_api_base, 11 | ) 12 | 13 | models = client.models.list() 14 | model = models.data[0].id 15 | 16 | chat_completion = client.chat.completions.create( 17 | messages=[{ 18 | "role": "system", 19 | "content": "You are a helpful assistant." 20 | }, { 21 | "role": "user", 22 | "content": "Who won the world series in 2020?" 23 | }, { 24 | "role": 25 | "assistant", 26 | "content": 27 | "The Los Angeles Dodgers won the World Series in 2020." 28 | }, { 29 | "role": "user", 30 | "content": "Where was it played?" 31 | }], 32 | model=model, 33 | ) 34 | 35 | print("Chat completion results:") 36 | print(chat_completion) 37 | -------------------------------------------------------------------------------- /examples/openai_completion_client.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server. 
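# This client assumes a vLLM OpenAI-compatible server is already listening at the base URL below,
# e.g. started with: python -m vllm.entrypoints.openai.api_server --model facebook/opt-125m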
4 | openai_api_key = "EMPTY" 5 | openai_api_base = "http://localhost:8000/v1" 6 | 7 | client = OpenAI( 8 | # defaults to os.environ.get("OPENAI_API_KEY") 9 | api_key=openai_api_key, 10 | base_url=openai_api_base, 11 | ) 12 | 13 | models = client.models.list() 14 | model = models.data[0].id 15 | 16 | # Completion API 17 | stream = False 18 | completion = client.completions.create( 19 | model=model, 20 | prompt="A robot may not injure a human being", 21 | echo=False, 22 | n=2, 23 | stream=stream, 24 | logprobs=3) 25 | 26 | print("Completion results:") 27 | if stream: 28 | for c in completion: 29 | print(c) 30 | else: 31 | print(completion) 32 | -------------------------------------------------------------------------------- /examples/production_monitoring/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | # docker-compose.yaml 2 | version: "3" 3 | 4 | services: 5 | prometheus: 6 | image: prom/prometheus:latest 7 | extra_hosts: 8 | - "host.docker.internal:host-gateway" # allow a direct connection from container to the local machine 9 | ports: 10 | - "9090:9090" # the default port used by Prometheus 11 | volumes: 12 | - ${PWD}/prometheus.yaml:/etc/prometheus/prometheus.yml # mount Prometheus config file 13 | 14 | grafana: 15 | image: grafana/grafana:latest 16 | depends_on: 17 | - prometheus 18 | ports: 19 | - "3000:3000" # the default port used by Grafana 20 | -------------------------------------------------------------------------------- /examples/production_monitoring/prometheus.yaml: -------------------------------------------------------------------------------- 1 | # prometheus.yaml 2 | global: 3 | scrape_interval: 5s 4 | evaluation_interval: 30s 5 | 6 | scrape_configs: 7 | - job_name: vllm 8 | static_configs: 9 | - targets: 10 | - 'host.docker.internal:8000' 11 | -------------------------------------------------------------------------------- /examples/template_alpaca.jinja: -------------------------------------------------------------------------------- 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 2 | 3 | {% for message in messages %} 4 | {% if message['role'] == 'user' %} 5 | ### Instruction: 6 | {{ message['content']|trim -}} 7 | {% if not loop.last %} 8 | 9 | 10 | {% endif %} 11 | {% elif message['role'] == 'assistant' %} 12 | ### Response: 13 | {{ message['content']|trim -}} 14 | {% if not loop.last %} 15 | 16 | 17 | {% endif %} 18 | {% elif message['role'] == 'user_context' %} 19 | ### Input: 20 | {{ message['content']|trim -}} 21 | {% if not loop.last %} 22 | 23 | 24 | {% endif %} 25 | {% endif %} 26 | {% endfor %} 27 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} 28 | ### Response: 29 | {% endif %} -------------------------------------------------------------------------------- /examples/template_baichuan.jinja: -------------------------------------------------------------------------------- 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 2 | 3 | {%- for message in messages -%} 4 | {%- if message['role'] == 'user' -%} 5 | {{- '<reserved_106>' + message['content'] -}} 6 | {%- elif message['role'] == 'assistant' -%} 7 | {{- '<reserved_107>' + message['content'] -}} 8 | {%- endif -%} 9 | {%- endfor -%} 10 | 11 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 12 | {{- '<reserved_107>' -}} 13 | {% endif %}
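{#- <reserved_106> and <reserved_107> are assumed here to be the Baichuan2 chat markers for user and assistant turns, respectively. -#}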
-------------------------------------------------------------------------------- /examples/template_chatglm.jinja: -------------------------------------------------------------------------------- 1 | {%- set counter = namespace(index=0) -%} 2 | {%- for message in messages -%} 3 | {%- if message['role'] == 'user' -%} 4 | {{- '[Round ' + counter.index|string + ']\n问:' + message['content'] -}} 5 | {%- set counter.index = counter.index + 1 -%} 6 | {%- endif -%} 7 | {%- if message['role'] == 'assistant' -%} 8 | {{- '\n答:' + message['content'] -}} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | {{- '\n' -}} 11 | {%- endif -%} 12 | {%- endif -%} 13 | {%- endfor -%} 14 | 15 | 16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 17 | {{- '\n答:' -}} 18 | {%- endif -%} -------------------------------------------------------------------------------- /examples/template_chatglm2.jinja: -------------------------------------------------------------------------------- 1 | {%- set counter = namespace(index=1) -%} 2 | {%- for message in messages -%} 3 | {%- if message['role'] == 'user' -%} 4 | {{- '[Round ' + counter.index|string + ']\n\n问:' + message['content'] -}} 5 | {%- set counter.index = counter.index + 1 -%} 6 | {%- endif -%} 7 | {%- if message['role'] == 'assistant' -%} 8 | {{- '\n\n答:' + message['content'] -}} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | {{- '\n\n' -}} 11 | {%- endif -%} 12 | {%- endif -%} 13 | {%- endfor -%} 14 | 15 | 16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 17 | {{- '\n\n答:' -}} 18 | {%- endif -%} -------------------------------------------------------------------------------- /examples/template_chatml.jinja: -------------------------------------------------------------------------------- 1 | {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} 2 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} -------------------------------------------------------------------------------- /examples/template_falcon.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {%- if message['role'] == 'user' -%} 3 | {{- 'User: ' + message['content'] -}} 4 | {%- elif message['role'] == 'assistant' -%} 5 | {{- 'Assistant: ' + message['content'] -}} 6 | {%- endif -%} 7 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 8 | {{- '\n' -}} 9 | {%- endif -%} 10 | {%- endfor -%} 11 | 12 | 13 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 14 | {{- 'Assistant:' -}} 15 | {% endif %} -------------------------------------------------------------------------------- /examples/template_falcon_180b.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {%- if message['role'] == 'system' -%} 3 | {{- 'System: ' + message['content'] -}} 4 | {%- elif message['role'] == 'user' -%} 5 | {{- 'User: ' + message['content'] -}} 6 | {%- elif message['role'] == 'assistant' -%} 7 | {{- 'Falcon: ' + message['content'] -}} 8 | {%- endif -%} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | {{- '\n' -}} 11 | {%- endif -%} 12 | {%- endfor -%} 13 | 14 | 15 | {%- if add_generation_prompt and 
messages[-1]['role'] != 'assistant' -%} 16 | {{- 'Falcon:' -}} 17 | {% endif %} -------------------------------------------------------------------------------- /examples/template_inkbot.jinja: -------------------------------------------------------------------------------- 1 | <#meta#> 2 | - Date: {{ (messages|selectattr('role', 'equalto', 'meta-current_date')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-current_date')|list) else '' }} 3 | - Task: {{ (messages|selectattr('role', 'equalto', 'meta-task_name')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-task_name')|list) else '' }} 4 | <#system#> 5 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 6 | <#chat#> 7 | {% for message in messages %} 8 | {% if message['role'] == 'user' %} 9 | <#user#> 10 | {{ message['content']|trim -}} 11 | {% if not loop.last %} 12 | 13 | {% endif %} 14 | {% elif message['role'] == 'assistant' %} 15 | <#bot#> 16 | {{ message['content']|trim -}} 17 | {% if not loop.last %} 18 | 19 | {% endif %} 20 | {% elif message['role'] == 'user_context' %} 21 | <#user_context#> 22 | {{ message['content']|trim -}} 23 | {% if not loop.last %} 24 | 25 | {% endif %} 26 | {% endif %} 27 | {% endfor %} 28 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} 29 | <#bot#> 30 | {% endif %} -------------------------------------------------------------------------------- /musa_porting.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup, find_packages 3 | from torch_musa.utils.simple_porting import SimplePorting 4 | from torch_musa.utils.musa_extension import MUSAExtension 5 | 6 | SimplePorting(cuda_dir_path="./csrc", mapping_rule={ 7 | "x.device().is_cuda()": "true", 8 | "#include <ATen/cuda/CUDAContext.h>": "#include \"torch_musa/csrc/aten/musa/MUSAContext.h\"", 9 | "#include <c10/cuda/CUDAGuard.h>": "#include \"torch_musa/csrc/core/MUSAGuard.h\"", 10 | "#include <c10/cuda/CUDAException.h>": "#include \"torch_musa/csrc/core/MUSAException.h\"", 11 | "#include <c10/cuda/CUDAStream.h>": "#include \"torch_musa/csrc/core/MUSAStream.h\"", 12 | "at::kCUDA": "at::musa::kMUSA", 13 | "at::cuda::getCurrentCUDAStream()": "at::musa::getCurrentMUSAStream()", 14 | "__nv_bfloat16": "__mt_bfloat16", 15 | "at::cuda::OptionalCUDAGuard": "at::musa::OptionalMUSAGuard", 16 | "at::cuda::getCurrentCUDABlasHandle()": "at::musa::getCurrentMUSABlasHandle()", 17 | "ATen/cuda/CUDATensorMethods.cuh": "ATen/musa/MUSA_PORT_TensorMethods.muh", 18 | "#include \"attention_generic.cuh\"": "#include \"attention_generic.muh\"", 19 | "#include \"reduction_utils.cuh\"": "#include \"reduction_utils.muh\"", 20 | "#include ": "#include ", 21 | "#include \"dtype_float16.cuh\"": "#include \"dtype_float16.muh\"", 22 | "#include \"dtype_float32.cuh\"": "#include \"dtype_float32.muh\"", 23 | "#include \"custom_all_reduce.cuh\"": "#include \"custom_all_reduce.muh\"", 24 | "#include \"dtype_bfloat16.cuh\"": "#include \"dtype_bfloat16.muh\"", 25 | "#include \"dtype_fp8.cuh\"": "#include \"dtype_fp8.muh\"", 26 | "#include \"attention_utils.cuh\"": "#include \"attention_utils.muh\"", 27 | "cuPointerGetAttribute": "muPointerGetAttribute", 28 | "CUdeviceptr": "MUdeviceptr", 29 | "CUDA_SUCCESS": "MUSA_SUCCESS", 30 | "CU_POINTER_ATTRIBUTE_RANGE_START_ADDR": "MU_POINTER_ATTRIBUTE_RANGE_START_ADDR", 31 | "c10::cuda": "c10::musa", 32 | "cudaStreamIsCapturing": "at::musa::musaStreamIsCapturing", 33 | "AT_CUDA_CHECK": "C10_MUSA_CHECK", 34 |
"nv_bfloat16": "mt_bfloat16", 35 | "struct __align__(16) RankData { const void *__restrict__ ptrs[8]; };":"struct __align__(16) RankData { const void *__restrict__ ptrs[8]; RankData& operator=(const RankData& ){return *this;} };" 36 | }).run() -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | # Should be mirrored in requirements-build.txt 3 | requires = [ 4 | "cmake>=3.21", 5 | "ninja", 6 | "packaging", 7 | "setuptools >= 49.4.0", 8 | "torch == 2.2.0", 9 | "wheel", 10 | ] 11 | build-backend = "setuptools.build_meta" 12 | 13 | [tool.ruff] 14 | # Allow lines to be as long as 80. 15 | line-length = 80 16 | exclude = [ 17 | # External file, leaving license intact 18 | "examples/fp8/quantizer/quantize.py" 19 | ] 20 | 21 | [tool.ruff.lint] 22 | select = [ 23 | # pycodestyle 24 | "E", 25 | # Pyflakes 26 | "F", 27 | # pyupgrade 28 | # "UP", 29 | # flake8-bugbear 30 | "B", 31 | # flake8-simplify 32 | "SIM", 33 | # isort 34 | # "I", 35 | "G", 36 | ] 37 | ignore = [ 38 | # star imports 39 | "F405", "F403", 40 | # lambda expression assignment 41 | "E731", 42 | # Loop control variable not used within loop body 43 | "B007", 44 | ] 45 | 46 | [tool.mypy] 47 | python_version = "3.9" 48 | 49 | ignore_missing_imports = true 50 | check_untyped_defs = true 51 | follow_imports = "skip" 52 | 53 | files = "vllm" 54 | # TODO(woosuk): Include the code from Megatron and HuggingFace. 55 | exclude = [ 56 | "vllm/model_executor/parallel_utils/|vllm/model_executor/models/", 57 | # Ignore triton kernels in ops. 58 | 'vllm/attention/ops/.*\.py$' 59 | ] 60 | 61 | [tool.codespell] 62 | ignore-words-list = "dout, te, indicies" 63 | skip = "./tests/prompts,./benchmarks/sonnet.txt" 64 | 65 | [tool.isort] 66 | use_parentheses = true 67 | skip_gitignore = true 68 | -------------------------------------------------------------------------------- /requirements-build.txt: -------------------------------------------------------------------------------- 1 | # Should be mirrored in pyproject.toml 2 | cmake>=3.21 3 | ninja 4 | packaging 5 | setuptools>=49.4.0 6 | torch==2.2.0 7 | wheel 8 | triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error. -------------------------------------------------------------------------------- /requirements-common.txt: -------------------------------------------------------------------------------- 1 | cmake >= 3.21 2 | ninja # For faster builds. 3 | psutil 4 | sentencepiece # Required for LLaMA tokenizer. 5 | numpy 6 | requests 7 | py-cpuinfo 8 | transformers >= 4.40.0 # Required for StarCoder2 & Llava, Llama 3. 9 | tokenizers >= 0.19.1 # Required for Llama 3. 10 | fastapi 11 | openai 12 | uvicorn[standard] 13 | pydantic >= 2.0 # Required for OpenAI server. 
14 | prometheus_client >= 0.18.0 15 | prometheus-fastapi-instrumentator >= 7.0.0 16 | tiktoken == 0.6.0 # Required for DBRX tokenizer 17 | lm-format-enforcer == 0.9.8 18 | outlines == 0.0.34 # Requires torch >= 2.1.0 19 | typing_extensions 20 | filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 21 | -------------------------------------------------------------------------------- /requirements-cpu.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for x86_64 CPUs 5 | torch == 2.3.0+cpu 6 | triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error. -------------------------------------------------------------------------------- /requirements-cuda.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for NVIDIA GPUs 5 | ray >= 2.9 6 | nvidia-ml-py # for pynvml package 7 | vllm-nccl-cu12>=2.18,<2.19 # for downloading nccl library 8 | torch == 2.3.0 9 | xformers == 0.0.26.post1 # Requires PyTorch 2.3.0 10 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # formatting 2 | yapf==0.32.0 3 | toml==0.10.2 4 | tomli==2.0.1 5 | ruff==0.1.5 6 | codespell==2.2.6 7 | isort==5.13.2 8 | 9 | # type checking 10 | mypy==1.9.0 11 | types-PyYAML 12 | types-requests 13 | types-setuptools 14 | 15 | # testing 16 | pytest 17 | tensorizer==2.9.0 18 | pytest-forked 19 | pytest-asyncio 20 | pytest-rerunfailures 21 | pytest-shard 22 | httpx 23 | einops # required for MPT 24 | requests 25 | ray 26 | peft 27 | awscli 28 | 29 | # Benchmarking 30 | aiohttp 31 | 32 | # Multimodal 33 | pillow 34 | -------------------------------------------------------------------------------- /requirements-musa.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for MTHREADS GPUs 5 | ray >= 2.9 6 | torch == 2.2.0 7 | triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error. 
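# Note: torch==2.2.0 above is assumed to match the torch_musa build used for MTHREADS GPUs
# (musa_porting.py imports torch_musa); torch_musa itself is not pinned here and is expected to be
# installed separately.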
8 | -------------------------------------------------------------------------------- /requirements-neuron.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for Neuron devices 5 | transformers-neuronx >= 0.9.0 6 | torch-neuronx >= 2.1.0 7 | neuronx-cc 8 | -------------------------------------------------------------------------------- /requirements-rocm.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for AMD GPUs 5 | ray == 2.9.3 6 | -------------------------------------------------------------------------------- /rocm_patch/rocm_bf16.patch: -------------------------------------------------------------------------------- 1 | --- amd_hip_bf16.h 2024-02-06 18:28:58.268699142 +0000 2 | +++ amd_hip_bf16.h.new 2024-02-06 18:28:31.988647133 +0000 3 | @@ -90,10 +90,10 @@ 4 | #include "math_fwd.h" // ocml device functions 5 | 6 | #if defined(__HIPCC_RTC__) 7 | -#define __HOST_DEVICE__ __device__ 8 | +#define __HOST_DEVICE__ __device__ static 9 | #else 10 | #include 11 | -#define __HOST_DEVICE__ __host__ __device__ 12 | +#define __HOST_DEVICE__ __host__ __device__ static inline 13 | #endif 14 | 15 | // Since we are using unsigned short to represent data in bfloat16, it can be of different sizes on 16 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/tests/__init__.py -------------------------------------------------------------------------------- /tests/async_engine/api_server_async_engine.py: -------------------------------------------------------------------------------- 1 | """vllm.entrypoints.api_server with some extra logging for testing.""" 2 | import argparse 3 | from typing import Any, Dict 4 | 5 | import uvicorn 6 | from fastapi.responses import JSONResponse, Response 7 | 8 | import vllm.entrypoints.api_server 9 | from vllm.engine.arg_utils import AsyncEngineArgs 10 | from vllm.engine.async_llm_engine import AsyncLLMEngine 11 | 12 | app = vllm.entrypoints.api_server.app 13 | 14 | 15 | class AsyncLLMEngineWithStats(AsyncLLMEngine): 16 | 17 | def __init__(self, *args, **kwargs): 18 | super().__init__(*args, **kwargs) 19 | self._num_aborts = 0 20 | 21 | async def abort(self, request_id: str) -> None: 22 | await super().abort(request_id) 23 | self._num_aborts += 1 24 | 25 | def testing_stats(self) -> Dict[str, Any]: 26 | return {"num_aborted_requests": self._num_aborts} 27 | 28 | 29 | @app.get("/stats") 30 | def stats() -> Response: 31 | """Get the statistics of the engine.""" 32 | return JSONResponse(engine.testing_stats()) 33 | 34 | 35 | if __name__ == "__main__": 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument("--host", type=str, default="localhost") 38 | parser.add_argument("--port", type=int, default=8000) 39 | parser = AsyncEngineArgs.add_cli_args(parser) 40 | args = parser.parse_args() 41 | 42 | engine_args = AsyncEngineArgs.from_cli_args(args) 43 | engine = AsyncLLMEngineWithStats.from_engine_args(engine_args) 44 | vllm.entrypoints.api_server.engine = engine 45 | uvicorn.run( 46 | app, 47 | host=args.host, 48 | port=args.port, 49 | log_level="debug", 50 | 
timeout_keep_alive=vllm.entrypoints.api_server.TIMEOUT_KEEP_ALIVE) 51 | -------------------------------------------------------------------------------- /tests/async_engine/test_merge_async_iterators.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import AsyncIterator, Tuple 3 | 4 | import pytest 5 | 6 | from vllm.utils import merge_async_iterators 7 | 8 | 9 | @pytest.mark.asyncio 10 | async def test_merge_async_iterators(): 11 | 12 | async def mock_async_iterator(idx: int) -> AsyncIterator[str]: 13 | try: 14 | while True: 15 | yield f"item from iterator {idx}" 16 | await asyncio.sleep(0.1) 17 | except asyncio.CancelledError: 18 | pass 19 | 20 | iterators = [mock_async_iterator(i) for i in range(3)] 21 | merged_iterator: AsyncIterator[Tuple[int, str]] = merge_async_iterators( 22 | *iterators) 23 | 24 | async def stream_output(generator: AsyncIterator[Tuple[int, str]]): 25 | async for idx, output in generator: 26 | print(f"idx: {idx}, output: {output}") 27 | 28 | task = asyncio.create_task(stream_output(merged_iterator)) 29 | await asyncio.sleep(0.5) 30 | task.cancel() 31 | with pytest.raises(asyncio.CancelledError): 32 | await task 33 | 34 | for iterator in iterators: 35 | try: 36 | await asyncio.wait_for(anext(iterator), 1) 37 | except StopAsyncIteration: 38 | # All iterators should be cancelled and print this message. 39 | print("Iterator was cancelled normally") 40 | except (Exception, asyncio.CancelledError) as e: 41 | raise AssertionError() from e 42 | -------------------------------------------------------------------------------- /tests/basic_correctness/test_basic_correctness.py: -------------------------------------------------------------------------------- 1 | """Compare the short outputs of HF and vLLM when using greedy sampling. 2 | 3 | Run `pytest tests/basic_correctness/test_basic_correctness.py`. 
4 | """ 5 | import os 6 | 7 | import pytest 8 | 9 | MODELS = [ 10 | "facebook/opt-125m", 11 | "meta-llama/Llama-2-7b-hf", 12 | ] 13 | VLLM_ATTENTION_BACKEND = "VLLM_ATTENTION_BACKEND" 14 | 15 | 16 | @pytest.mark.parametrize("model", MODELS) 17 | @pytest.mark.parametrize("dtype", ["half"]) 18 | @pytest.mark.parametrize("max_tokens", [5]) 19 | @pytest.mark.parametrize("enforce_eager", [False, True]) 20 | def test_models( 21 | hf_runner, 22 | vllm_runner, 23 | example_prompts, 24 | model: str, 25 | dtype: str, 26 | max_tokens: int, 27 | enforce_eager: bool, 28 | ) -> None: 29 | backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND) 30 | if backend_by_env_var == "FLASHINFER" and enforce_eager is False: 31 | pytest.skip("Skipping non-eager test for FlashInferBackend.") 32 | 33 | hf_model = hf_runner(model, dtype=dtype) 34 | hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) 35 | del hf_model 36 | 37 | vllm_model = vllm_runner(model, 38 | dtype=dtype, 39 | enforce_eager=enforce_eager, 40 | gpu_memory_utilization=0.7) 41 | vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) 42 | del vllm_model 43 | 44 | for i in range(len(example_prompts)): 45 | hf_output_ids, hf_output_str = hf_outputs[i] 46 | vllm_output_ids, vllm_output_str = vllm_outputs[i] 47 | assert hf_output_str == vllm_output_str, ( 48 | f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") 49 | assert hf_output_ids == vllm_output_ids, ( 50 | f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") 51 | -------------------------------------------------------------------------------- /tests/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/tests/core/__init__.py -------------------------------------------------------------------------------- /tests/core/block/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/tests/core/block/__init__.py -------------------------------------------------------------------------------- /tests/core/block/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.fixture() 5 | def should_do_global_cleanup_after_test() -> bool: 6 | """Disable the global cleanup fixture for tests in this directory. This 7 | provides a ~10x speedup for unit tests that don't load a model to GPU. 8 | 9 | This requires that tests in this directory clean up after themselves if they 10 | use the GPU. 
11 | """ 12 | return False 13 | -------------------------------------------------------------------------------- /tests/core/block/e2e/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from tests.conftest import cleanup 4 | from vllm import LLM 5 | from vllm.model_executor.utils import set_random_seed 6 | 7 | 8 | @pytest.fixture 9 | def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, 10 | baseline_llm_kwargs, seed): 11 | return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, 12 | baseline_llm_kwargs, seed) 13 | 14 | 15 | @pytest.fixture 16 | def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, 17 | test_llm_kwargs, seed): 18 | return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, 19 | test_llm_kwargs, seed) 20 | 21 | 22 | def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, 23 | distinct_llm_kwargs, seed): 24 | kwargs = { 25 | **common_llm_kwargs, 26 | **per_test_common_llm_kwargs, 27 | **distinct_llm_kwargs, 28 | } 29 | 30 | def generator_inner(): 31 | llm = LLM(**kwargs) 32 | 33 | set_random_seed(seed) 34 | 35 | yield llm 36 | del llm 37 | cleanup() 38 | 39 | for llm in generator_inner(): 40 | yield llm 41 | del llm 42 | -------------------------------------------------------------------------------- /tests/core/block/test_common.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import pytest 4 | 5 | from vllm.core.block.common import RefCounter 6 | 7 | 8 | @pytest.mark.parametrize("seed", list(range(20))) 9 | @pytest.mark.parametrize("num_incrs", [1, 100]) 10 | @pytest.mark.parametrize("num_blocks", [1024]) 11 | def test_incr(seed: int, num_incrs: int, num_blocks: int): 12 | random.seed(seed) 13 | 14 | all_block_indices = list(range(num_blocks)) 15 | counter = RefCounter(all_block_indices=all_block_indices) 16 | 17 | block_id = random.randint(0, num_blocks - 1) 18 | for i in range(num_incrs): 19 | value = counter.incr(block_id) 20 | assert value == i + 1 21 | 22 | 23 | @pytest.mark.parametrize("seed", list(range(20))) 24 | @pytest.mark.parametrize("num_incrs", [1, 100]) 25 | @pytest.mark.parametrize("num_blocks", [1024]) 26 | def test_incr_decr(seed: int, num_incrs: int, num_blocks: int): 27 | random.seed(seed) 28 | 29 | all_block_indices = list(range(num_blocks)) 30 | counter = RefCounter(all_block_indices=all_block_indices) 31 | 32 | block_id = random.randint(0, num_blocks - 1) 33 | for i in range(num_incrs): 34 | value = counter.incr(block_id) 35 | assert value == i + 1 36 | 37 | for i in range(num_incrs): 38 | value = counter.decr(block_id) 39 | assert value == num_incrs - (i + 1) 40 | 41 | with pytest.raises(AssertionError): 42 | counter.decr(block_id) 43 | -------------------------------------------------------------------------------- /tests/core/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Iterable, Optional, Tuple 3 | 4 | from vllm import SamplingParams 5 | from vllm.lora.request import LoRARequest 6 | from vllm.sequence import Logprob, Sequence, SequenceGroup 7 | 8 | 9 | def create_dummy_prompt( 10 | request_id: str, 11 | prompt_length: int, 12 | block_size: Optional[int] = None, 13 | lora_request: Optional[LoRARequest] = None, 14 | use_beam_search: bool = False, 15 | best_of: int = 1, 16 | ) -> Tuple[Sequence, SequenceGroup]: 17 | if not block_size: 18 | block_size = 
prompt_length 19 | 20 | # Create dummy prompt sequence with tokens 0...block_size-1 21 | # and prompt "0 ... block_size". 22 | prompt_tokens = list(range(prompt_length)) 23 | prompt_str = " ".join([str(t) for t in prompt_tokens]) 24 | prompt = Sequence(int(request_id), prompt_str, prompt_tokens, block_size) 25 | seq_group = SequenceGroup( 26 | request_id, [prompt], 27 | SamplingParams(use_beam_search=use_beam_search, best_of=best_of), 28 | time.time(), lora_request) 29 | 30 | return prompt, seq_group 31 | 32 | 33 | def create_seq_group( 34 | seq_prompt_len: int = 1024, 35 | seq_output_lens: Iterable[int] = (128, ), 36 | request_id: str = '0', 37 | seq_id_start: int = 0, 38 | sampling_params: Optional[SamplingParams] = None) -> SequenceGroup: 39 | 40 | assert len(seq_output_lens) > 0 41 | 42 | if sampling_params is None: 43 | sampling_params = SamplingParams() 44 | 45 | prompt_token_ids = [0] * seq_prompt_len 46 | 47 | seqs = [] 48 | for seq_id_offset, output_len in enumerate(seq_output_lens): 49 | seq = Sequence( 50 | seq_id=seq_id_start + seq_id_offset, 51 | prompt="", 52 | prompt_token_ids=prompt_token_ids, 53 | block_size=16, 54 | ) 55 | 56 | for i in range(output_len): 57 | seq.append_token_id( 58 | token_id=i, 59 | logprobs={i: Logprob(0.0)}, 60 | ) 61 | seqs.append(seq) 62 | 63 | seq_group = SequenceGroup( 64 | request_id=request_id, 65 | seqs=seqs, 66 | sampling_params=sampling_params, 67 | arrival_time=time.time(), 68 | ) 69 | 70 | return seq_group 71 | 72 | 73 | def round_up_to_next_block(seq_len: int, block_size: int) -> int: 74 | return (seq_len + block_size - 1) // block_size 75 | -------------------------------------------------------------------------------- /tests/distributed/test_basic_distributed_correctness.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and distributed vLLM when using greedy sampling. 2 | vLLM will allocate all the available memory, so we need to run the tests one 3 | by one. The solution is to pass arguments (model name) by environment 4 | variables. 
5 | Run: 6 | ```sh 7 | TEST_DIST_MODEL=facebook/opt-125m pytest \ 8 | test_basic_distributed_correctness.py 9 | TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \ 10 | test_basic_distributed_correctness.py 11 | ``` 12 | """ 13 | import os 14 | 15 | import pytest 16 | import torch 17 | 18 | MODELS = [ 19 | os.environ["TEST_DIST_MODEL"], 20 | ] 21 | VLLM_ATTENTION_BACKEND = "VLLM_ATTENTION_BACKEND" 22 | 23 | 24 | @pytest.mark.skipif(torch.cuda.device_count() < 2, 25 | reason="Need at least 2 GPUs to run the test.") 26 | @pytest.mark.parametrize("model", MODELS) 27 | @pytest.mark.parametrize("dtype", ["half"]) 28 | @pytest.mark.parametrize("max_tokens", [5]) 29 | def test_models( 30 | hf_runner, 31 | vllm_runner, 32 | example_prompts, 33 | model: str, 34 | dtype: str, 35 | max_tokens: int, 36 | ) -> None: 37 | enforce_eager = False 38 | backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND) 39 | if backend_by_env_var == "FLASHINFER": 40 | enforce_eager = True 41 | 42 | hf_model = hf_runner(model, dtype=dtype) 43 | hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) 44 | del hf_model 45 | 46 | vllm_model = vllm_runner(model, 47 | dtype=dtype, 48 | tensor_parallel_size=2, 49 | enforce_eager=enforce_eager) 50 | vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) 51 | del vllm_model 52 | 53 | for i in range(len(example_prompts)): 54 | hf_output_ids, hf_output_str = hf_outputs[i] 55 | vllm_output_ids, vllm_output_str = vllm_outputs[i] 56 | assert hf_output_str == vllm_output_str, ( 57 | f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") 58 | assert hf_output_ids == vllm_output_ids, ( 59 | f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") 60 | -------------------------------------------------------------------------------- /tests/distributed/test_chunked_prefill_distributed.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and distributed vLLM when using greedy sampling. 2 | vLLM will allocate all the available memory, so we need to run the tests one 3 | by one. The solution is to pass arguments (model name) by environment 4 | variables. 5 | 6 | Run: 7 | ```sh 8 | TEST_DIST_MODEL=facebook/opt-125m pytest \ 9 | test_chunked_prefill_distributed.py 10 | TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \ 11 | test_chunked_prefill_distributed.py 12 | ``` 13 | """ 14 | import os 15 | 16 | import pytest 17 | import torch 18 | 19 | MODELS = [ 20 | os.environ["TEST_DIST_MODEL"], 21 | ] 22 | 23 | 24 | @pytest.mark.skipif(torch.cuda.device_count() < 2, 25 | reason="Need at least 2 GPUs to run the test.") 26 | @pytest.mark.parametrize("model", MODELS) 27 | @pytest.mark.parametrize("dtype", ["half"]) 28 | @pytest.mark.parametrize("max_tokens", [5]) 29 | @pytest.mark.parametrize("chunked_prefill_token_size", [16]) 30 | def test_models( 31 | hf_runner, 32 | vllm_runner, 33 | example_prompts, 34 | model: str, 35 | dtype: str, 36 | max_tokens: int, 37 | chunked_prefill_token_size: int, 38 | ) -> None: 39 | # Add a chunked prefill config. 
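# With chunked prefill enabled, max_num_batched_tokens acts as the per-step token budget, so prompts
# longer than chunked_prefill_token_size are prefilled across several scheduler steps; capping
# max_num_seqs at the same value below keeps the number of scheduled sequences within that budget.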
40 | max_num_seqs = min(chunked_prefill_token_size, 256) 41 | assert chunked_prefill_token_size != -1 42 | enable_chunked_prefill = True 43 | max_num_batched_tokens = chunked_prefill_token_size 44 | 45 | hf_model = hf_runner(model, dtype=dtype) 46 | hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) 47 | del hf_model 48 | 49 | vllm_model = vllm_runner( 50 | model, 51 | dtype=dtype, 52 | tensor_parallel_size=2, 53 | max_num_seqs=max_num_seqs, 54 | enable_chunked_prefill=enable_chunked_prefill, 55 | max_num_batched_tokens=max_num_batched_tokens, 56 | ) 57 | vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) 58 | del vllm_model 59 | 60 | for i in range(len(example_prompts)): 61 | hf_output_ids, hf_output_str = hf_outputs[i] 62 | vllm_output_ids, vllm_output_str = vllm_outputs[i] 63 | assert hf_output_str == vllm_output_str, ( 64 | f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") 65 | assert hf_output_ids == vllm_output_ids, ( 66 | f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") 67 | -------------------------------------------------------------------------------- /tests/distributed/test_pynccl_library.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import tempfile 3 | 4 | 5 | def target_fn(env, filepath): 6 | from vllm.utils import update_environment_variables 7 | update_environment_variables(env) 8 | from vllm.utils import nccl_integrity_check 9 | nccl_integrity_check(filepath) 10 | 11 | 12 | def test_library_file(): 13 | # note: don't import vllm.distributed.device_communicators.pynccl 14 | # before running this test, otherwise the library file will be loaded 15 | # and it might interfere with the test 16 | from vllm.utils import find_nccl_library 17 | so_file = find_nccl_library() 18 | with open(so_file, 'rb') as f: 19 | content = f.read() 20 | try: 21 | # corrupt the library file, should raise an exception 22 | with open(so_file, 'wb') as f: 23 | f.write(content[:len(content) // 2]) 24 | p = multiprocessing.Process(target=target_fn, args=({}, so_file)) 25 | p.start() 26 | p.join() 27 | assert p.exitcode != 0 28 | 29 | # move the library file to a tmp path 30 | # test VLLM_NCCL_SO_PATH 31 | fd, path = tempfile.mkstemp() 32 | with open(path, 'wb') as f: 33 | f.write(content) 34 | p = multiprocessing.Process(target=target_fn, 35 | args=({ 36 | "VLLM_NCCL_SO_PATH": path 37 | }, path)) 38 | p.start() 39 | p.join() 40 | assert p.exitcode == 0 41 | finally: 42 | with open(so_file, 'wb') as f: 43 | f.write(content) 44 | -------------------------------------------------------------------------------- /tests/engine/test_computed_prefix_blocks.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.engine.arg_utils import EngineArgs 4 | from vllm.engine.llm_engine import LLMEngine 5 | from vllm.sampling_params import SamplingParams 6 | 7 | 8 | @pytest.mark.parametrize("model", ["facebook/opt-125m"]) 9 | @pytest.mark.parametrize("block_size", [16]) 10 | def test_computed_prefix_blocks(model: str, block_size: int): 11 | # This test checks if we are able to run the engine to completion 12 | # without triggering asserts. 13 | # We are in a scenario where all blocks from the second request's prompt 14 | # are full and already computed when the second request arrives. 15 | prompt = ( 16 | "You are a helpful assistant. How do I build a car from cardboard and " 17 | "paper clips? 
Is there an easy to follow video tutorial available " 18 | "online for free?") 19 | prompt2 = ( 20 | " Please recommend to me some resources where I can learn not only to " 21 | "handle technical difficulties of building a car, but also " 22 | "decoration.") 23 | 24 | engine_args = EngineArgs(model=model, 25 | block_size=block_size, 26 | enable_prefix_caching=True) 27 | 28 | engine = LLMEngine.from_engine_args(engine_args) 29 | sampling_params = SamplingParams() 30 | 31 | engine.add_request("0", prompt + prompt2, sampling_params) 32 | engine.step() 33 | engine.add_request("1", prompt, sampling_params) 34 | engine.step() 35 | -------------------------------------------------------------------------------- /tests/engine/test_detokenization.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.entrypoints.llm import LLM 4 | from vllm.sampling_params import SamplingParams 5 | 6 | 7 | @pytest.mark.parametrize("model", ["facebook/opt-125m"]) 8 | def test_computed_prefix_blocks(model: str): 9 | # This test checks if the engine generates completions both with and 10 | # without optional detokenization, that detokenization includes text 11 | # and no-detokenization doesn't, and that both completions have the same 12 | # token_ids. 13 | prompt = ( 14 | "You are a helpful assistant. How do I build a car from cardboard and " 15 | "paper clips? Is there an easy to follow video tutorial available " 16 | "online for free?") 17 | 18 | llm = LLM(model=model) 19 | sampling_params = SamplingParams(max_tokens=10, 20 | temperature=0.0, 21 | detokenize=False) 22 | 23 | outputs_no_detokenization = llm.generate(prompt, 24 | sampling_params)[0].outputs[0] 25 | sampling_params.detokenize = True 26 | outputs_with_detokenization = llm.generate(prompt, 27 | sampling_params)[0].outputs[0] 28 | 29 | assert outputs_no_detokenization.text == '' 30 | assert outputs_with_detokenization.text != '' 31 | assert outputs_no_detokenization.token_ids == \ 32 | outputs_with_detokenization.token_ids 33 | -------------------------------------------------------------------------------- /tests/engine/test_skip_tokenizer_init.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.entrypoints.llm import LLM 4 | from vllm.sampling_params import SamplingParams 5 | 6 | 7 | @pytest.mark.parametrize("model", ["facebook/opt-125m"]) 8 | def test_skip_tokenizer_initialization(model: str): 9 | # This test checks if the flag skip_tokenizer_init skips the initialization 10 | # of tokenizer and detokenizer. The generated output is expected to contain 11 | # token ids. 12 | llm = LLM(model=model, skip_tokenizer_init=True) 13 | sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True) 14 | with pytest.raises(ValueError) as err: 15 | llm.generate("abc", sampling_params) 16 | assert "prompts must be None if" in str(err.value) 17 | outputs = llm.generate(prompt_token_ids=[[1, 2, 3]], 18 | sampling_params=sampling_params) 19 | assert len(outputs) > 0 20 | completions = outputs[0].outputs 21 | assert len(completions) > 0 22 | assert completions[0].text == "" 23 | assert completions[0].token_ids 24 | -------------------------------------------------------------------------------- /tests/engine/test_stop_reason.py: -------------------------------------------------------------------------------- 1 | """Test the different finish_reason="stop" situations during generation: 2 | 1. 
One of the provided stop strings 3 | 2. One of the provided stop tokens 4 | 3. The EOS token 5 | 6 | Run `pytest tests/engine/test_stop_reason.py`. 7 | """ 8 | 9 | import pytest 10 | import transformers 11 | 12 | from vllm import SamplingParams 13 | 14 | MODEL = "facebook/opt-350m" 15 | STOP_STR = "." 16 | SEED = 42 17 | MAX_TOKENS = 1024 18 | 19 | 20 | @pytest.fixture 21 | def vllm_model(vllm_runner): 22 | vllm_model = vllm_runner(MODEL) 23 | yield vllm_model 24 | del vllm_model 25 | 26 | 27 | def test_stop_reason(vllm_model, example_prompts): 28 | tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL) 29 | stop_token_id = tokenizer.convert_tokens_to_ids(STOP_STR) 30 | llm = vllm_model.model 31 | 32 | # test stop token 33 | outputs = llm.generate(example_prompts, 34 | sampling_params=SamplingParams( 35 | seed=SEED, 36 | max_tokens=MAX_TOKENS, 37 | stop_token_ids=[stop_token_id])) 38 | for output in outputs: 39 | output = output.outputs[0] 40 | assert output.finish_reason == "stop" 41 | assert output.stop_reason == stop_token_id 42 | 43 | # test stop string 44 | outputs = llm.generate(example_prompts, 45 | sampling_params=SamplingParams( 46 | seed=SEED, max_tokens=MAX_TOKENS, stop=".")) 47 | for output in outputs: 48 | output = output.outputs[0] 49 | assert output.finish_reason == "stop" 50 | assert output.stop_reason == STOP_STR 51 | 52 | # test EOS token 53 | outputs = llm.generate(example_prompts, 54 | sampling_params=SamplingParams( 55 | seed=SEED, max_tokens=MAX_TOKENS)) 56 | for output in outputs: 57 | output = output.outputs[0] 58 | assert output.finish_reason == "length" or ( 59 | output.finish_reason == "stop" and output.stop_reason is None) 60 | -------------------------------------------------------------------------------- /tests/entrypoints/openai/test_serving_chat.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from dataclasses import dataclass 3 | 4 | from vllm.entrypoints.openai.serving_chat import OpenAIServingChat 5 | 6 | MODEL_NAME = "openai-community/gpt2" 7 | CHAT_TEMPLATE = "Dummy chat template for testing {}" 8 | 9 | 10 | @dataclass 11 | class MockModelConfig: 12 | tokenizer = MODEL_NAME 13 | trust_remote_code = False 14 | tokenizer_mode = "auto" 15 | max_model_len = 100 16 | tokenizer_revision = None 17 | 18 | 19 | @dataclass 20 | class MockEngine: 21 | 22 | async def get_model_config(self): 23 | return MockModelConfig 24 | 25 | 26 | async def _async_serving_chat_init(): 27 | serving_completion = OpenAIServingChat(MockEngine(), 28 | served_model_names=[MODEL_NAME], 29 | response_role="assistant", 30 | chat_template=CHAT_TEMPLATE) 31 | return serving_completion 32 | 33 | 34 | def test_async_serving_chat_init(): 35 | serving_completion = asyncio.run(_async_serving_chat_init()) 36 | assert serving_completion.tokenizer is not None 37 | assert serving_completion.tokenizer.chat_template == CHAT_TEMPLATE 38 | -------------------------------------------------------------------------------- /tests/entrypoints/test_llm_generate.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm import LLM, SamplingParams 4 | 5 | 6 | def test_multiple_sampling_params(): 7 | 8 | llm = LLM(model="facebook/opt-125m", 9 | max_num_batched_tokens=4096, 10 | tensor_parallel_size=1) 11 | 12 | prompts = [ 13 | "Hello, my name is", 14 | "The president of the United States is", 15 | "The capital of France is", 16 | "The future of AI is", 17 | ] 18 | 19 | sampling_params 
= [ 20 | SamplingParams(temperature=0.01, top_p=0.95), 21 | SamplingParams(temperature=0.3, top_p=0.95), 22 | SamplingParams(temperature=0.7, top_p=0.95), 23 | SamplingParams(temperature=0.99, top_p=0.95), 24 | ] 25 | 26 | # Multiple SamplingParams should be matched with each prompt 27 | outputs = llm.generate(prompts, sampling_params=sampling_params) 28 | assert len(prompts) == len(outputs) 29 | 30 | # Exception raised, if the size of params does not match the size of prompts 31 | with pytest.raises(ValueError): 32 | outputs = llm.generate(prompts, sampling_params=sampling_params[:3]) 33 | 34 | # Single SamplingParams should be applied to every prompt 35 | single_sampling_params = SamplingParams(temperature=0.3, top_p=0.95) 36 | outputs = llm.generate(prompts, sampling_params=single_sampling_params) 37 | assert len(prompts) == len(outputs) 38 | 39 | # sampling_params is None, default params should be applied 40 | outputs = llm.generate(prompts, sampling_params=None) 41 | assert len(prompts) == len(outputs) -------------------------------------------------------------------------------- /tests/entrypoints/test_server_oot_registration.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import sys 3 | import time 4 | 5 | import torch 6 | from openai import OpenAI, OpenAIError 7 | 8 | from vllm import ModelRegistry 9 | from vllm.model_executor.models.opt import OPTForCausalLM 10 | from vllm.model_executor.sampling_metadata import SamplingMetadata 11 | from vllm.utils import get_open_port 12 | 13 | 14 | class MyOPTForCausalLM(OPTForCausalLM): 15 | 16 | def compute_logits(self, hidden_states: torch.Tensor, 17 | sampling_metadata: SamplingMetadata) -> torch.Tensor: 18 | # this dummy model always predicts the first token 19 | logits = super().compute_logits(hidden_states, sampling_metadata) 20 | logits.zero_() 21 | logits[:, 0] += 1.0 22 | return logits 23 | 24 | 25 | def server_function(port): 26 | # register our dummy model 27 | ModelRegistry.register_model("OPTForCausalLM", MyOPTForCausalLM) 28 | sys.argv = ["placeholder.py"] + \ 29 | ("--model facebook/opt-125m --dtype" 30 | f" float32 --api-key token-abc123 --port {port}").split() 31 | import runpy 32 | runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__') 33 | 34 | 35 | def test_oot_registration_for_api_server(): 36 | port = get_open_port() 37 | server = multiprocessing.Process(target=server_function, args=(port, )) 38 | server.start() 39 | client = OpenAI( 40 | base_url=f"http://localhost:{port}/v1", 41 | api_key="token-abc123", 42 | ) 43 | while True: 44 | try: 45 | completion = client.chat.completions.create( 46 | model="facebook/opt-125m", 47 | messages=[{ 48 | "role": "system", 49 | "content": "You are a helpful assistant." 50 | }, { 51 | "role": "user", 52 | "content": "Hello!" 
53 | }], 54 | temperature=0, 55 | ) 56 | break 57 | except OpenAIError as e: 58 | if "Connection error" in str(e): 59 | time.sleep(3) 60 | else: 61 | raise e 62 | server.kill() 63 | generated_text = completion.choices[0].message.content 64 | # make sure only the first token is generated 65 | rest = generated_text.replace("", "") 66 | assert rest == "" 67 | -------------------------------------------------------------------------------- /tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "llama", 3 | "kv_cache": { 4 | "dtype": "float8_e4m3fn", 5 | "scaling_factor": { 6 | "0": { 7 | "0": 0.0152239128947258, 8 | "1": 0.0188860222697258, 9 | "2": 0.0354178324341774, 10 | "3": 0.0376674123108387, 11 | "4": 0.0418526791036129, 12 | "5": 0.0433175228536129, 13 | "6": 0.0397600457072258, 14 | "7": 0.0424455925822258, 15 | "8": 0.0415387861430645, 16 | "9": 0.0408412404358387, 17 | "10": 0.0395856611430645, 18 | "11": 0.0377371683716774, 19 | "12": 0.0400739423930645, 20 | "13": 0.040771484375, 21 | "14": 0.0393415205180645, 22 | "15": 0.0369001142680645, 23 | "16": 0.03857421875, 24 | "17": 0.0387486070394516, 25 | "18": 0.0403180830180645, 26 | "19": 0.0396205373108387, 27 | "20": 0.0375627800822258, 28 | "21": 0.0407366082072258, 29 | "22": 0.0432477705180645, 30 | "23": 0.0377022884786129, 31 | "24": 0.0399693101644516, 32 | "25": 0.0374581478536129, 33 | "26": 0.0413295216858387, 34 | "27": 0.0442243330180645, 35 | "28": 0.0424804724752903, 36 | "29": 0.0456891767680645, 37 | "30": 0.0409109964966774, 38 | "31": 0.0482352152466774 39 | } 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /tests/kernels/allclose_default.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | # Reference default values of atol and rtol are from 4 | # https://github.com/pytorch/pytorch/blob/6d96beb6bec24d73ee3f080bac54d2104068f675/test/test_transformers.py#L67 5 | default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5} 6 | default_rtol = { 7 | torch.float16: 1e-3, 8 | torch.bfloat16: 1.6e-2, 9 | torch.float: 1.3e-6 10 | } 11 | 12 | 13 | def get_default_atol(output) -> float: 14 | return default_atol[output.dtype] 15 | 16 | 17 | def get_default_rtol(output) -> float: 18 | return default_rtol[output.dtype] 19 | -------------------------------------------------------------------------------- /tests/kernels/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.utils import (create_kv_caches_with_random, 4 | create_kv_caches_with_random_flash) 5 | 6 | 7 | @pytest.fixture() 8 | def kv_cache_factory(): 9 | return create_kv_caches_with_random 10 | 11 | 12 | @pytest.fixture() 13 | def kv_cache_factory_flashinfer(): 14 | return create_kv_caches_with_random_flash 15 | -------------------------------------------------------------------------------- /tests/kernels/test_layernorm.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from vllm.model_executor.layers.layernorm import RMSNorm 5 | 6 | DTYPES = [torch.half, torch.bfloat16, torch.float] 7 | NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing 8 | HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, 9 | 8199] # Arbitrary values for testing 10 | ADD_RESIDUAL = [False, 
True] 11 | SEEDS = [0] 12 | CUDA_DEVICES = [ 13 | f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) 14 | ] 15 | 16 | 17 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 18 | @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) 19 | @pytest.mark.parametrize("add_residual", ADD_RESIDUAL) 20 | @pytest.mark.parametrize("dtype", DTYPES) 21 | @pytest.mark.parametrize("seed", SEEDS) 22 | @pytest.mark.parametrize("device", CUDA_DEVICES) 23 | @torch.inference_mode() 24 | def test_rms_norm( 25 | num_tokens: int, 26 | hidden_size: int, 27 | add_residual: bool, 28 | dtype: torch.dtype, 29 | seed: int, 30 | device: str, 31 | ) -> None: 32 | torch.random.manual_seed(seed) 33 | if torch.cuda.is_available(): 34 | torch.cuda.manual_seed(seed) 35 | torch.set_default_device(device) 36 | layer = RMSNorm(hidden_size).to(dtype=dtype) 37 | layer.weight.data.normal_(mean=1.0, std=0.1) 38 | scale = 1 / (2 * hidden_size) 39 | x = torch.randn(num_tokens, hidden_size, dtype=dtype) 40 | x *= scale 41 | residual = torch.randn_like(x) * scale if add_residual else None 42 | 43 | # NOTE(woosuk): The reference implementation should be executed first 44 | # because the custom kernel is in-place. 45 | ref_out = layer._forward(x, residual) 46 | out = layer(x, residual) 47 | # NOTE(woosuk): LayerNorm operators (including RMS) typically have larger 48 | # numerical errors than other operators because they involve reductions. 49 | # Therefore, we use a larger tolerance. 50 | if add_residual: 51 | assert torch.allclose(out[0], ref_out[0], atol=1e-2, rtol=1e-2) 52 | assert torch.allclose(out[1], ref_out[1], atol=1e-2, rtol=1e-2) 53 | else: 54 | assert torch.allclose(out, ref_out, atol=1e-2, rtol=1e-2) 55 | -------------------------------------------------------------------------------- /tests/kernels/test_rand.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import pytest 4 | import torch 5 | 6 | from vllm.model_executor.layers.ops.rand import seeded_uniform 7 | from vllm.model_executor.utils import set_random_seed 8 | 9 | 10 | @pytest.mark.parametrize("dtype", 11 | [torch.float32, torch.float16, torch.bfloat16]) 12 | @pytest.mark.parametrize("use_3d", [True, False]) 13 | def test_seeded_uniform(dtype: torch.dtype, use_3d: bool): 14 | device = "cuda" 15 | for seed in range(512): 16 | set_random_seed(seed) 17 | rows = random.randint(1, 512) 18 | cols = random.randint(1, 64000) 19 | if use_3d: 20 | third_dim = random.randint(2, 10) 21 | dims = [rows, third_dim, cols] 22 | else: 23 | dims = [rows, cols] 24 | seeds = torch.randint(torch.iinfo(torch.long).min, 25 | torch.iinfo(torch.long).max, (rows, ), 26 | device=device) 27 | 28 | # Test that the same seed produces the same output 29 | out = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device) 30 | out2 = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device) 31 | torch.testing.assert_close(out, out2) 32 | # del to save memory 33 | del out2 34 | 35 | out3 = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device) 36 | torch.testing.assert_close(out, out3) 37 | # del to save memory 38 | del out3 39 | 40 | # Initialize out tensor with garbage to ensure that it is overwritten 41 | out_with_tensor = seeded_uniform( 42 | *dims, 43 | out=torch.full( 44 | (*dims, ), 45 | -1, 46 | dtype=dtype, 47 | device=device, 48 | ), 49 | seeds=seeds, 50 | dtype=dtype, 51 | ) 52 | torch.testing.assert_close(out, out_with_tensor) 53 | 
-------------------------------------------------------------------------------- /tests/lora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/tests/lora/__init__.py -------------------------------------------------------------------------------- /tests/lora/test_gemma.py: -------------------------------------------------------------------------------- 1 | import vllm 2 | from vllm.lora.request import LoRARequest 3 | 4 | MODEL_PATH = "google/gemma-7b" 5 | 6 | 7 | def do_sample(llm, lora_path: str, lora_id: int) -> str: 8 | prompts = [ 9 | "Quote: Imagination is", 10 | "Quote: Be yourself;", 11 | "Quote: So many books,", 12 | ] 13 | sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32) 14 | outputs = llm.generate( 15 | prompts, 16 | sampling_params, 17 | lora_request=LoRARequest(str(lora_id), lora_id, lora_path) 18 | if lora_id else None) 19 | # Print the outputs. 20 | generated_texts = [] 21 | for output in outputs: 22 | prompt = output.prompt 23 | generated_text = output.outputs[0].text.strip() 24 | generated_texts.append(generated_text) 25 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 26 | return generated_texts 27 | 28 | 29 | def test_gemma_lora(gemma_lora_files): 30 | llm = vllm.LLM(MODEL_PATH, 31 | max_model_len=1024, 32 | enable_lora=True, 33 | max_loras=4) 34 | 35 | expected_lora_output = [ 36 | "more important than knowledge.\nAuthor: Albert Einstein\n", 37 | "everyone else is already taken.\nAuthor: Oscar Wilde\n", 38 | "so little time\nAuthor: Frank Zappa\n", 39 | ] 40 | 41 | output1 = do_sample(llm, gemma_lora_files, lora_id=1) 42 | for i in range(len(expected_lora_output)): 43 | assert output1[i].startswith(expected_lora_output[i]) 44 | output2 = do_sample(llm, gemma_lora_files, lora_id=2) 45 | for i in range(len(expected_lora_output)): 46 | assert output2[i].startswith(expected_lora_output[i]) 47 | -------------------------------------------------------------------------------- /tests/lora/test_tokenizer_group.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from transformers import AutoTokenizer, PreTrainedTokenizerBase 3 | 4 | from vllm.lora.request import LoRARequest 5 | from vllm.transformers_utils.tokenizer import get_lora_tokenizer 6 | from vllm.transformers_utils.tokenizer_group import get_tokenizer_group 7 | 8 | from ..conftest import get_tokenizer_pool_config 9 | 10 | 11 | @pytest.mark.asyncio 12 | @pytest.mark.parametrize("tokenizer_group_type", [None, "ray"]) 13 | async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type): 14 | reference_tokenizer = AutoTokenizer.from_pretrained(sql_lora_files) 15 | tokenizer_group = get_tokenizer_group( 16 | get_tokenizer_pool_config(tokenizer_group_type), 17 | tokenizer_id="gpt2", 18 | enable_lora=True, 19 | max_num_seqs=1, 20 | max_input_length=None, 21 | ) 22 | lora_request = LoRARequest("1", 1, sql_lora_files) 23 | assert reference_tokenizer.encode("prompt") == tokenizer_group.encode( 24 | request_id="request_id", prompt="prompt", lora_request=lora_request) 25 | assert reference_tokenizer.encode( 26 | "prompt") == await tokenizer_group.encode_async( 27 | request_id="request_id", 28 | prompt="prompt", 29 | lora_request=lora_request) 30 | assert isinstance(tokenizer_group.get_lora_tokenizer(None), 31 | PreTrainedTokenizerBase) 32 | assert tokenizer_group.get_lora_tokenizer( 33 | None) 
== await tokenizer_group.get_lora_tokenizer_async(None) 34 | 35 | assert isinstance(tokenizer_group.get_lora_tokenizer(lora_request), 36 | PreTrainedTokenizerBase) 37 | assert tokenizer_group.get_lora_tokenizer( 38 | lora_request) != tokenizer_group.get_lora_tokenizer(None) 39 | assert tokenizer_group.get_lora_tokenizer( 40 | lora_request) == await tokenizer_group.get_lora_tokenizer_async( 41 | lora_request) 42 | 43 | 44 | def test_get_lora_tokenizer(sql_lora_files, tmpdir): 45 | lora_request = None 46 | tokenizer = get_lora_tokenizer(lora_request) 47 | assert not tokenizer 48 | 49 | lora_request = LoRARequest("1", 1, sql_lora_files) 50 | tokenizer = get_lora_tokenizer(lora_request) 51 | assert tokenizer.get_added_vocab() 52 | 53 | lora_request = LoRARequest("1", 1, str(tmpdir)) 54 | tokenizer = get_lora_tokenizer(lora_request) 55 | assert not tokenizer 56 | -------------------------------------------------------------------------------- /tests/model_executor/weight_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | import huggingface_hub.constants 5 | import pytest 6 | from huggingface_hub.utils import LocalEntryNotFoundError 7 | 8 | from vllm.model_executor.model_loader.weight_utils import ( 9 | download_weights_from_hf, enable_hf_transfer) 10 | 11 | 12 | def test_hf_transfer_auto_activation(): 13 | if "HF_HUB_ENABLE_HF_TRANSFER" in os.environ: 14 | # in case it is already set, we can't test the auto activation 15 | pytest.skip( 16 | "HF_HUB_ENABLE_HF_TRANSFER is set, can't test auto activation") 17 | enable_hf_transfer() 18 | try: 19 | # enable hf hub transfer if available 20 | import hf_transfer # type: ignore # noqa 21 | HF_TRANSFER_ACTIVE = True 22 | except ImportError: 23 | HF_TRANSFER_ACTIVE = False 24 | assert (huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER == 25 | HF_TRANSFER_ACTIVE) 26 | 27 | 28 | def test_download_weights_from_hf(): 29 | with tempfile.TemporaryDirectory() as tmpdir: 30 | # assert LocalEntryNotFoundError is thrown 31 | # if offline is set and model is not cached 32 | huggingface_hub.constants.HF_HUB_OFFLINE = True 33 | with pytest.raises(LocalEntryNotFoundError): 34 | download_weights_from_hf("facebook/opt-125m", 35 | allow_patterns=["*.safetensors", "*.bin"], 36 | cache_dir=tmpdir) 37 | 38 | # download the model 39 | huggingface_hub.constants.HF_HUB_OFFLINE = False 40 | download_weights_from_hf("facebook/opt-125m", 41 | allow_patterns=["*.safetensors", "*.bin"], 42 | cache_dir=tmpdir) 43 | 44 | # now it should work offline 45 | huggingface_hub.constants.HF_HUB_OFFLINE = True 46 | assert download_weights_from_hf( 47 | "facebook/opt-125m", 48 | allow_patterns=["*.safetensors", "*.bin"], 49 | cache_dir=tmpdir) is not None 50 | 51 | 52 | if __name__ == "__main__": 53 | test_hf_transfer_auto_activation() 54 | test_download_weights_from_hf() 55 | -------------------------------------------------------------------------------- /tests/models/test_big_models.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and vLLM when using greedy sampling. 2 | 3 | This tests bigger models and uses half precision. 4 | 5 | Run `pytest tests/models/test_big_models.py`.
6 | """ 7 | import pytest 8 | 9 | MODELS = [ 10 | "meta-llama/Llama-2-7b-hf", 11 | # "mistralai/Mistral-7B-v0.1", # Broken 12 | # "Deci/DeciLM-7b", # Broken 13 | # "tiiuae/falcon-7b", # Broken 14 | "EleutherAI/gpt-j-6b", 15 | "mosaicml/mpt-7b", 16 | # "Qwen/Qwen1.5-0.5B" # Broken, 17 | ] 18 | 19 | 20 | @pytest.mark.parametrize("model", MODELS) 21 | @pytest.mark.parametrize("dtype", ["half"]) 22 | @pytest.mark.parametrize("max_tokens", [32]) 23 | def test_models( 24 | hf_runner, 25 | vllm_runner, 26 | example_prompts, 27 | model: str, 28 | dtype: str, 29 | max_tokens: int, 30 | ) -> None: 31 | hf_model = hf_runner(model, dtype=dtype) 32 | hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) 33 | del hf_model 34 | 35 | vllm_model = vllm_runner(model, dtype=dtype) 36 | vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) 37 | del vllm_model 38 | 39 | for i in range(len(example_prompts)): 40 | hf_output_ids, hf_output_str = hf_outputs[i] 41 | vllm_output_ids, vllm_output_str = vllm_outputs[i] 42 | assert hf_output_str == vllm_output_str, ( 43 | f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") 44 | assert hf_output_ids == vllm_output_ids, ( 45 | f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") 46 | 47 | 48 | @pytest.mark.parametrize("model", MODELS) 49 | @pytest.mark.parametrize("dtype", ["half"]) 50 | def test_model_print( 51 | vllm_runner, 52 | model: str, 53 | dtype: str, 54 | ) -> None: 55 | vllm_model = vllm_runner(model, dtype=dtype) 56 | # This test is for verifying whether the model's extra_repr 57 | # can be printed correctly. 58 | print(vllm_model.model.llm_engine.model_executor.driver_worker. 59 | model_runner.model) 60 | del vllm_model 61 | -------------------------------------------------------------------------------- /tests/models/test_mistral.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and vLLM for Mistral models using greedy sampling. 2 | 3 | Run `pytest tests/models/test_mistral.py`. 4 | """ 5 | import pytest 6 | 7 | MODELS = [ 8 | "mistralai/Mistral-7B-Instruct-v0.1", 9 | ] 10 | 11 | 12 | @pytest.mark.parametrize("model", MODELS) 13 | @pytest.mark.parametrize("dtype", ["bfloat16"]) 14 | @pytest.mark.parametrize("max_tokens", [128]) 15 | @pytest.mark.skip( 16 | "Two problems: 1. Failing correctness tests. 2. 
RuntimeError: expected " 17 | "scalar type BFloat16 but found Half (only in CI).") 18 | def test_models( 19 | hf_runner, 20 | vllm_runner, 21 | example_long_prompts, 22 | model: str, 23 | dtype: str, 24 | max_tokens: int, 25 | ) -> None: 26 | hf_model = hf_runner(model, dtype=dtype) 27 | hf_outputs = hf_model.generate_greedy(example_long_prompts, max_tokens) 28 | del hf_model 29 | 30 | vllm_model = vllm_runner(model, dtype=dtype) 31 | vllm_outputs = vllm_model.generate_greedy(example_long_prompts, max_tokens) 32 | del vllm_model 33 | 34 | for i in range(len(example_long_prompts)): 35 | hf_output_ids, hf_output_str = hf_outputs[i] 36 | vllm_output_ids, vllm_output_str = vllm_outputs[i] 37 | assert hf_output_str == vllm_output_str, ( 38 | f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") 39 | assert hf_output_ids == vllm_output_ids, ( 40 | f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") 41 | -------------------------------------------------------------------------------- /tests/models/test_models.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and vLLM when using greedy sampling. 2 | 3 | This test only tests small models. Big models such as 7B should be tested from 4 | test_big_models.py because it could use a larger instance to run tests. 5 | 6 | Run `pytest tests/models/test_models.py`. 7 | """ 8 | import pytest 9 | 10 | MODELS = [ 11 | "facebook/opt-125m", 12 | "gpt2", 13 | "bigcode/tiny_starcoder_py", 14 | "EleutherAI/pythia-70m", 15 | "bigscience/bloom-560m", # Testing alibi slopes. 16 | "microsoft/phi-2", 17 | "stabilityai/stablelm-3b-4e1t", 18 | # "allenai/OLMo-1B", # Broken 19 | "bigcode/starcoder2-3b", 20 | ] 21 | 22 | 23 | @pytest.mark.parametrize("model", MODELS) 24 | @pytest.mark.parametrize("dtype", ["float"]) 25 | @pytest.mark.parametrize("max_tokens", [96]) 26 | def test_models( 27 | hf_runner, 28 | vllm_runner, 29 | example_prompts, 30 | model: str, 31 | dtype: str, 32 | max_tokens: int, 33 | ) -> None: 34 | # To pass the small model tests, we need full precision. 35 | assert dtype == "float" 36 | 37 | hf_model = hf_runner(model, dtype=dtype) 38 | hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) 39 | del hf_model 40 | 41 | vllm_model = vllm_runner(model, dtype=dtype) 42 | vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) 43 | del vllm_model 44 | 45 | for i in range(len(example_prompts)): 46 | hf_output_ids, hf_output_str = hf_outputs[i] 47 | vllm_output_ids, vllm_output_str = vllm_outputs[i] 48 | assert hf_output_str == vllm_output_str, ( 49 | f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") 50 | assert hf_output_ids == vllm_output_ids, ( 51 | f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") 52 | 53 | 54 | @pytest.mark.parametrize("model", MODELS) 55 | @pytest.mark.parametrize("dtype", ["float"]) 56 | def test_model_print( 57 | vllm_runner, 58 | model: str, 59 | dtype: str, 60 | ) -> None: 61 | vllm_model = vllm_runner(model, dtype=dtype) 62 | # This test is for verifying whether the model's extra_repr 63 | # can be printed correctly. 64 | print(vllm_model.model.llm_engine.model_executor.driver_worker. 
65 | model_runner.model) 66 | del vllm_model 67 | -------------------------------------------------------------------------------- /tests/models/test_oot_registration.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from vllm import LLM, ModelRegistry, SamplingParams 4 | from vllm.model_executor.models.opt import OPTForCausalLM 5 | from vllm.model_executor.sampling_metadata import SamplingMetadata 6 | 7 | 8 | class MyOPTForCausalLM(OPTForCausalLM): 9 | 10 | def compute_logits(self, hidden_states: torch.Tensor, 11 | sampling_metadata: SamplingMetadata) -> torch.Tensor: 12 | # this dummy model always predicts the first token 13 | logits = super().compute_logits(hidden_states, sampling_metadata) 14 | logits.zero_() 15 | logits[:, 0] += 1.0 16 | return logits 17 | 18 | 19 | def test_oot_registration(): 20 | # register our dummy model 21 | ModelRegistry.register_model("OPTForCausalLM", MyOPTForCausalLM) 22 | prompts = ["Hello, my name is", "The text does not matter"] 23 | sampling_params = SamplingParams(temperature=0) 24 | llm = LLM(model="facebook/opt-125m") 25 | first_token = llm.get_tokenizer().decode(0) 26 | outputs = llm.generate(prompts, sampling_params) 27 | 28 | for output in outputs: 29 | generated_text = output.outputs[0].text 30 | # make sure only the first token is generated 31 | rest = generated_text.replace(first_token, "") 32 | assert rest == "" 33 | -------------------------------------------------------------------------------- /tests/models/utils.py: -------------------------------------------------------------------------------- 1 | def check_logprobs_close(outputs_0_lst, outputs_1_lst, name_0, name_1): 2 | """Compare the logprobs of two sequences generated by different models, 3 | which should be similar but not necessarily equal. 4 | """ 5 | # Loop through responses to each prompt. 6 | for prompt_idx, (outputs_0, 7 | outputs_1) in enumerate(zip(outputs_0_lst, 8 | outputs_1_lst)): 9 | output_ids_0, output_str_0, logprobs_0 = outputs_0 10 | output_ids_1, output_str_1, logprobs_1 = outputs_1 11 | 12 | # Loop through generated tokens. 13 | for idx, (output_id_0, 14 | output_id_1) in enumerate(zip(output_ids_0, output_ids_1)): 15 | 16 | # If generated tokens don't match, then 17 | if output_id_0 != output_id_1: 18 | # Each predicted token must be in top N logprobs of the other 19 | assert output_id_0 in logprobs_1[idx], ( 20 | f"Test{prompt_idx}:" 21 | f"\n{name_0}:\t{output_str_0!r}" 22 | f"\n{name_1}:\t{output_str_1!r}") 23 | assert output_id_1 in logprobs_0[idx], ( 24 | f"Test{prompt_idx}:" 25 | f"\n{name_0}:\t{output_str_0!r}" 26 | f"\n{name_1}:\t{output_str_1!r}") 27 | 28 | # Break out since sequences will now diverge. 29 | break 30 | -------------------------------------------------------------------------------- /tests/prompts/example.txt: -------------------------------------------------------------------------------- 1 | vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. 2 | Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020. 3 | Compare and contrast artificial intelligence with human intelligence in terms of processing information. 4 | Describe the basic components of a neural network and how it can be trained. 5 | Write a short story about a robot that dreams for the first time. 6 | Analyze the impact of the COVID-19 pandemic on global economic structures and future business models. 
7 | Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies. 8 | Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.' 9 | -------------------------------------------------------------------------------- /tests/quantization/test_fp8.py: -------------------------------------------------------------------------------- 1 | """Tests whether FP8 computation is enabled correctly. 2 | 3 | Run `pytest tests/quantization/test_fp8.py --forked`. 4 | """ 5 | import pytest 6 | import torch 7 | 8 | from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS 9 | from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod 10 | 11 | capability = torch.cuda.get_device_capability() 12 | capability = capability[0] * 10 + capability[1] 13 | 14 | 15 | @pytest.mark.skipif( 16 | capability < QUANTIZATION_METHODS["fp8"].get_min_capability(), 17 | reason="FP8 is not supported on this GPU type.") 18 | def test_load_fp16_model(vllm_runner) -> None: 19 | llm = vllm_runner("facebook/opt-125m", quantization="fp8") 20 | 21 | model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model 22 | fc1 = model.model.decoder.layers[0].fc1 23 | assert isinstance(fc1.quant_method, Fp8LinearMethod) 24 | assert fc1.weight.dtype == torch.float8_e4m3fn 25 | -------------------------------------------------------------------------------- /tests/samplers/test_beam_search.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and vLLM when using beam search. 2 | 3 | Run `pytest tests/samplers/test_beam_search.py`. 4 | """ 5 | import gc 6 | 7 | import pytest 8 | import torch 9 | 10 | # FIXME(zhuohan): The test can not pass if we: 11 | # 1. Increase max_tokens to 256. 12 | # 2. Increase beam_width to 8. 13 | # 3. Use the model "huggyllama/llama-7b". 14 | MAX_TOKENS = [128] 15 | BEAM_WIDTHS = [4] 16 | MODELS = ["facebook/opt-125m"] 17 | 18 | 19 | @pytest.mark.parametrize("model", MODELS) 20 | @pytest.mark.parametrize("dtype", ["half"]) 21 | @pytest.mark.parametrize("max_tokens", MAX_TOKENS) 22 | @pytest.mark.parametrize("beam_width", BEAM_WIDTHS) 23 | def test_beam_search_single_input( 24 | hf_runner, 25 | vllm_runner, 26 | example_prompts, 27 | model: str, 28 | dtype: str, 29 | max_tokens: int, 30 | beam_width: int, 31 | ) -> None: 32 | example_prompts = example_prompts[:1] 33 | hf_model = hf_runner(model, dtype=dtype) 34 | hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width, 35 | max_tokens) 36 | del hf_model 37 | 38 | vllm_model = vllm_runner(model, dtype=dtype) 39 | vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width, 40 | max_tokens) 41 | del vllm_model 42 | # NOTE(woosuk): For some reason, the following GC is required to avoid 43 | # GPU OOM errors in the following tests using `vllm_runner`. 
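    # Deleting the runners only drops the Python references; gc.collect()
    # breaks any remaining reference cycles so the model tensors are actually
    # freed, and torch.cuda.empty_cache() then returns the freed blocks from
    # PyTorch's caching allocator to the device.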
44 | gc.collect() 45 | torch.cuda.empty_cache() 46 | 47 | for i in range(len(example_prompts)): 48 | hf_output_ids, _ = hf_outputs[i] 49 | vllm_output_ids, _ = vllm_outputs[i] 50 | assert len(hf_output_ids) == len(vllm_output_ids) 51 | for j in range(len(hf_output_ids)): 52 | assert hf_output_ids[j] == vllm_output_ids[j], ( 53 | f"Test{i} output{j}:\nHF: {hf_output_ids}\n" 54 | f"vLLM: {vllm_output_ids}") 55 | -------------------------------------------------------------------------------- /tests/samplers/test_ignore_eos.py: -------------------------------------------------------------------------------- 1 | """Make sure ignore_eos works. 2 | 3 | Run `pytest tests/samplers/test_ignore_eos.py`. 4 | """ 5 | 6 | import pytest 7 | 8 | from vllm import SamplingParams 9 | 10 | MODELS = ["facebook/opt-125m"] 11 | 12 | 13 | @pytest.mark.parametrize("model", MODELS) 14 | @pytest.mark.parametrize("dtype", ["half"]) 15 | @pytest.mark.parametrize("max_tokens", [1024]) 16 | def test_beam_search_single_input( 17 | vllm_runner, 18 | example_prompts, 19 | model: str, 20 | dtype: str, 21 | max_tokens: int, 22 | ) -> None: 23 | example_prompts = "1 + 1 is" 24 | 25 | vllm_model = vllm_runner(model, dtype=dtype) 26 | sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True) 27 | ignore_eos_output = vllm_model.model.generate( 28 | example_prompts, sampling_params=sampling_params) 29 | print(len(ignore_eos_output[0].outputs[0].token_ids)) 30 | assert max_tokens - len(ignore_eos_output[0].outputs[0].token_ids) < 10 31 | assert max_tokens - len(ignore_eos_output[0].outputs[0].token_ids) >= 0 32 | -------------------------------------------------------------------------------- /tests/samplers/test_logits_processor.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from vllm import SamplingParams 5 | 6 | MODELS = ["facebook/opt-125m"] 7 | 8 | 9 | @pytest.mark.parametrize("model", MODELS) 10 | @pytest.mark.parametrize("dtype", ["half"]) 11 | def test_logits_processor_force_generate( 12 | vllm_runner, 13 | example_prompts, 14 | model: str, 15 | dtype: str, 16 | ) -> None: 17 | vllm_model = vllm_runner(model, dtype=dtype) 18 | tokenizer = vllm_model.model.get_tokenizer() 19 | repeat_times = 2 20 | enforced_answers = " vLLM" 21 | vllm_token_ids = tokenizer.encode(enforced_answers, 22 | add_special_tokens=False) 23 | max_tokens = len(vllm_token_ids) * repeat_times 24 | 25 | def pick_vllm(token_ids, logits): 26 | token_id = vllm_token_ids[len(token_ids) % len(vllm_token_ids)] 27 | logits[token_id] = torch.finfo(logits.dtype).max 28 | return logits 29 | 30 | params_with_logprobs = SamplingParams( 31 | logits_processors=[pick_vllm], 32 | prompt_logprobs=3, 33 | max_tokens=max_tokens, 34 | ) 35 | 36 | # test logits_processors when prompt_logprobs is not None 37 | vllm_model.model._add_request( 38 | prompt=example_prompts[0], 39 | sampling_params=params_with_logprobs, 40 | prompt_token_ids=None, 41 | ) 42 | 43 | # test prompt_logprobs is not None 44 | vllm_model.model._add_request( 45 | prompt=example_prompts[1], 46 | sampling_params=SamplingParams( 47 | prompt_logprobs=3, 48 | max_tokens=max_tokens, 49 | ), 50 | prompt_token_ids=None, 51 | ) 52 | 53 | # test grouped requests 54 | vllm_model.model._add_request( 55 | prompt=example_prompts[2], 56 | sampling_params=SamplingParams(max_tokens=max_tokens), 57 | prompt_token_ids=None, 58 | ) 59 | 60 | outputs = vllm_model.model._run_engine(False) 61 | 62 | assert outputs[0].outputs[0].text 
== enforced_answers * repeat_times 63 | -------------------------------------------------------------------------------- /tests/samplers/test_ranks.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm import SamplingParams 4 | 5 | MODELS = ["facebook/opt-125m"] 6 | 7 | 8 | @pytest.mark.parametrize("model", MODELS) 9 | @pytest.mark.parametrize("dtype", ["half"]) 10 | def test_ranks( 11 | vllm_runner, 12 | model, 13 | dtype, 14 | example_prompts, 15 | ): 16 | max_tokens = 5 17 | num_top_logprobs = 5 18 | num_prompt_logprobs = 5 19 | 20 | vllm_model = vllm_runner(model, dtype=dtype, max_logprobs=num_top_logprobs) 21 | 22 | ## Test greedy logprobs ranks 23 | vllm_sampling_params = SamplingParams(temperature=0.0, 24 | top_p=1.0, 25 | max_tokens=max_tokens, 26 | logprobs=num_top_logprobs, 27 | prompt_logprobs=num_prompt_logprobs) 28 | vllm_results = vllm_model.generate_w_logprobs(example_prompts, 29 | vllm_sampling_params) 30 | for result in vllm_results: 31 | assert result[2] is not None 32 | assert len(result[2]) == len(result[0]) 33 | # check whether all chosen tokens have ranks = 1 34 | for token, logprobs in zip(result[0], result[2]): 35 | assert token in logprobs 36 | assert logprobs[token].rank == 1 37 | 38 | ## Test non-greedy logprobs ranks 39 | sampling_params = SamplingParams(temperature=1.0, 40 | top_p=1.0, 41 | max_tokens=max_tokens, 42 | logprobs=num_top_logprobs, 43 | prompt_logprobs=num_prompt_logprobs) 44 | res = vllm_model.generate_w_logprobs(example_prompts, sampling_params) 45 | for result in res: 46 | assert result[2] is not None 47 | assert len(result[2]) == len(result[0]) 48 | # check whether all chosen tokens have ranks 49 | for token, logprobs in zip(result[0], result[2]): 50 | assert logprobs[token].rank >= 1 51 | -------------------------------------------------------------------------------- /tests/spec_decode/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/tests/spec_decode/__init__.py -------------------------------------------------------------------------------- /tests/spec_decode/e2e/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/tests/spec_decode/e2e/__init__.py -------------------------------------------------------------------------------- /tests/tensorizer_loader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/tests/tensorizer_loader/__init__.py -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | from vllm.config import ModelConfig 2 | 3 | 4 | def test_get_sliding_window(): 5 | TEST_SLIDING_WINDOW = 4096 6 | # Test that the sliding window is correctly computed. 7 | # For Qwen1.5/Qwen2, get_sliding_window() should be None 8 | # when use_sliding_window is False. 
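    # Build a ModelConfig from the Qwen1.5-7B HF config, then toggle the
    # hf_config sliding-window fields directly to exercise both branches of
    # get_sliding_window().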
9 | qwen2_model_config = ModelConfig( 10 | "Qwen/Qwen1.5-7B", 11 | "Qwen/Qwen1.5-7B", 12 | tokenizer_mode="auto", 13 | trust_remote_code=False, 14 | seed=0, 15 | dtype="float16", 16 | revision=None, 17 | ) 18 | 19 | qwen2_model_config.hf_config.use_sliding_window = False 20 | qwen2_model_config.hf_config.sliding_window = TEST_SLIDING_WINDOW 21 | assert qwen2_model_config.get_sliding_window() is None 22 | 23 | qwen2_model_config.hf_config.use_sliding_window = True 24 | assert qwen2_model_config.get_sliding_window() == TEST_SLIDING_WINDOW 25 | 26 | mistral_model_config = ModelConfig( 27 | "mistralai/Mistral-7B-v0.1", 28 | "mistralai/Mistral-7B-v0.1", 29 | tokenizer_mode="auto", 30 | trust_remote_code=False, 31 | seed=0, 32 | dtype="float16", 33 | revision=None, 34 | ) 35 | mistral_model_config.hf_config.sliding_window = None 36 | assert mistral_model_config.get_sliding_window() is None 37 | 38 | mistral_model_config.hf_config.sliding_window = TEST_SLIDING_WINDOW 39 | assert mistral_model_config.get_sliding_window() == TEST_SLIDING_WINDOW -------------------------------------------------------------------------------- /tests/test_regression.py: -------------------------------------------------------------------------------- 1 | """Containing tests that check for regressions in vLLM's behavior. 2 | 3 | It should include tests that are reported by users and making sure they 4 | will never happen again. 5 | 6 | """ 7 | import gc 8 | 9 | import torch 10 | 11 | from vllm import LLM, SamplingParams 12 | 13 | 14 | def test_duplicated_ignored_sequence_group(): 15 | """https://github.com/vllm-project/vllm/issues/1655""" 16 | 17 | sampling_params = SamplingParams(temperature=0.01, 18 | top_p=0.1, 19 | max_tokens=256) 20 | llm = LLM(model="facebook/opt-125m", 21 | max_num_batched_tokens=4096, 22 | tensor_parallel_size=1) 23 | prompts = ["This is a short prompt", "This is a very long prompt " * 1000] 24 | outputs = llm.generate(prompts, sampling_params=sampling_params) 25 | 26 | assert len(prompts) == len(outputs) 27 | 28 | 29 | def test_max_tokens_none(): 30 | sampling_params = SamplingParams(temperature=0.01, 31 | top_p=0.1, 32 | max_tokens=None) 33 | llm = LLM(model="facebook/opt-125m", 34 | max_num_batched_tokens=4096, 35 | tensor_parallel_size=1) 36 | prompts = ["Just say hello!"] 37 | outputs = llm.generate(prompts, sampling_params=sampling_params) 38 | 39 | assert len(prompts) == len(outputs) 40 | 41 | 42 | def test_gc(): 43 | llm = LLM("facebook/opt-125m", enforce_eager=True) 44 | del llm 45 | 46 | gc.collect() 47 | torch.cuda.empty_cache() 48 | 49 | # The memory allocated for model and KV cache should be released. 50 | # The memory allocated for PyTorch and others should be less than 50MB. 51 | # Usually, it's around 10MB. 52 | allocated = torch.cuda.memory_allocated() 53 | assert allocated < 50 * 1024 * 1024 54 | 55 | 56 | if __name__ == "__main__": 57 | import pytest 58 | pytest.main([__file__]) 59 | -------------------------------------------------------------------------------- /tests/test_sampling_params.py: -------------------------------------------------------------------------------- 1 | """Tests for the SamplingParams class. 
2 | """ 3 | from vllm import SamplingParams 4 | 5 | 6 | def test_max_tokens_none(): 7 | """max_tokens=None should be allowed""" 8 | SamplingParams(temperature=0.01, top_p=0.1, max_tokens=None) 9 | 10 | 11 | if __name__ == "__main__": 12 | import pytest 13 | pytest.main([__file__]) 14 | -------------------------------------------------------------------------------- /tests/tokenization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/tests/tokenization/__init__.py -------------------------------------------------------------------------------- /tests/tokenization/test_cached_tokenizer.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | from transformers import AutoTokenizer 4 | 5 | from vllm.transformers_utils.tokenizer import get_cached_tokenizer 6 | 7 | 8 | def test_cached_tokenizer(): 9 | reference_tokenizer = AutoTokenizer.from_pretrained("gpt2") 10 | reference_tokenizer.add_special_tokens({"cls_token": ""}) 11 | reference_tokenizer.add_special_tokens( 12 | {"additional_special_tokens": [""]}) 13 | cached_tokenizer = get_cached_tokenizer(deepcopy(reference_tokenizer)) 14 | 15 | assert reference_tokenizer.encode("prompt") == cached_tokenizer.encode( 16 | "prompt") 17 | assert set(reference_tokenizer.all_special_ids) == set( 18 | cached_tokenizer.all_special_ids) 19 | assert set(reference_tokenizer.all_special_tokens) == set( 20 | cached_tokenizer.all_special_tokens) 21 | assert set(reference_tokenizer.all_special_tokens_extended) == set( 22 | cached_tokenizer.all_special_tokens_extended) 23 | -------------------------------------------------------------------------------- /tests/tokenization/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from transformers import PreTrainedTokenizerBase 3 | 4 | from vllm.transformers_utils.tokenizer import get_tokenizer 5 | 6 | TOKENIZER_NAMES = [ 7 | "facebook/opt-125m", 8 | "gpt2", 9 | ] 10 | 11 | 12 | @pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES) 13 | def test_tokenizer_revision(tokenizer_name: str): 14 | # Assume that "main" branch always exists 15 | tokenizer = get_tokenizer(tokenizer_name, revision="main") 16 | assert isinstance(tokenizer, PreTrainedTokenizerBase) 17 | 18 | # Assume that "never" branch always does not exist 19 | with pytest.raises(OSError, match='not a valid git identifier'): 20 | get_tokenizer(tokenizer_name, revision="never") 21 | -------------------------------------------------------------------------------- /tests/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/tests/worker/__init__.py -------------------------------------------------------------------------------- /vllm/__init__.py: -------------------------------------------------------------------------------- 1 | """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" 2 | 3 | from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs 4 | from vllm.engine.async_llm_engine import AsyncLLMEngine 5 | from vllm.engine.llm_engine import LLMEngine 6 | from vllm.entrypoints.llm import LLM 7 | from vllm.executor.ray_utils import initialize_ray_cluster 8 | from vllm.model_executor.models import ModelRegistry 9 | 
from vllm.outputs import CompletionOutput, RequestOutput 10 | from vllm.sampling_params import SamplingParams 11 | 12 | __version__ = "0.4.2" 13 | 14 | __all__ = [ 15 | "LLM", 16 | "ModelRegistry", 17 | "SamplingParams", 18 | "RequestOutput", 19 | "CompletionOutput", 20 | "LLMEngine", 21 | "EngineArgs", 22 | "AsyncLLMEngine", 23 | "AsyncEngineArgs", 24 | "initialize_ray_cluster", 25 | ] 26 | -------------------------------------------------------------------------------- /vllm/attention/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.attention.backends.abstract import (AttentionBackend, 2 | AttentionMetadata, 3 | AttentionMetadataPerStage) 4 | from vllm.attention.layer import Attention 5 | from vllm.attention.selector import get_attn_backend 6 | 7 | __all__ = [ 8 | "AttentionBackend", 9 | "AttentionMetadata", 10 | "Attention", 11 | "get_attn_backend", 12 | "AttentionMetadataPerStage", 13 | ] 14 | -------------------------------------------------------------------------------- /vllm/attention/backends/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/attention/backends/__init__.py -------------------------------------------------------------------------------- /vllm/attention/layer.py: -------------------------------------------------------------------------------- 1 | """Attention layer.""" 2 | from typing import List, Optional 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from vllm.attention.backends.abstract import (AttentionMetadata, 8 | AttentionMetadataPerStage) 9 | from vllm.attention.selector import get_attn_backend 10 | 11 | 12 | class Attention(nn.Module): 13 | """Attention layer. 14 | 15 | This class takes query, key, and value tensors as input. The input tensors 16 | can either contain prompt tokens or generation tokens. 17 | The class does the following: 18 | 19 | 1. Store the input key and value tensors in the KV cache. 20 | 2. Perform (multi-head/multi-query/grouped-query) attention. 21 | 3. Return the output tensor. 
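
    Illustrative construction (the head count and head size below are
    placeholders; real models derive them from their config):

        attn = Attention(num_heads=8, head_size=64, scale=64**-0.5)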
22 | """ 23 | 24 | def __init__( 25 | self, 26 | num_heads: int, 27 | head_size: int, 28 | scale: float, 29 | num_kv_heads: Optional[int] = None, 30 | alibi_slopes: Optional[List[float]] = None, 31 | sliding_window: Optional[int] = None, 32 | ) -> None: 33 | super().__init__() 34 | self.backend = get_attn_backend(torch.get_default_dtype()) 35 | impl_cls = self.backend.get_impl_cls() 36 | self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads, 37 | alibi_slopes, sliding_window) 38 | 39 | def forward( 40 | self, 41 | query: torch.Tensor, 42 | key: torch.Tensor, 43 | value: torch.Tensor, 44 | kv_cache: Optional[torch.Tensor], 45 | attn_metadata: AttentionMetadata[AttentionMetadataPerStage], 46 | kv_scale: float = 1.0, 47 | ) -> torch.Tensor: 48 | return self.impl.forward(query, key, value, kv_cache, attn_metadata, 49 | kv_scale) 50 | 51 | def extra_repr(self) -> str: 52 | s = f"head_size={self.impl.head_size}" # type: ignore 53 | s += f", num_heads={self.impl.num_heads}" # type: ignore 54 | s += f", num_kv_heads={self.impl.num_kv_heads}" # type: ignore 55 | s += f", scale={self.impl.scale}" # type: ignore 56 | return s 57 | -------------------------------------------------------------------------------- /vllm/attention/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/attention/ops/__init__.py -------------------------------------------------------------------------------- /vllm/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/core/__init__.py -------------------------------------------------------------------------------- /vllm/core/block/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/core/block/__init__.py -------------------------------------------------------------------------------- /vllm/core/policy.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | from typing import Deque 3 | 4 | from vllm.sequence import SequenceGroup 5 | 6 | 7 | class Policy: 8 | 9 | def get_priority( 10 | self, 11 | now: float, 12 | seq_group: SequenceGroup, 13 | ) -> float: 14 | raise NotImplementedError 15 | 16 | def sort_by_priority( 17 | self, 18 | now: float, 19 | seq_groups: Deque[SequenceGroup], 20 | ) -> Deque[SequenceGroup]: 21 | return deque( 22 | sorted( 23 | seq_groups, 24 | key=lambda seq_group: self.get_priority(now, seq_group), 25 | reverse=True, 26 | )) 27 | 28 | 29 | class FCFS(Policy): 30 | 31 | def get_priority( 32 | self, 33 | now: float, 34 | seq_group: SequenceGroup, 35 | ) -> float: 36 | return now - seq_group.metrics.arrival_time 37 | 38 | 39 | class PolicyFactory: 40 | 41 | _POLICY_REGISTRY = {'fcfs': FCFS} 42 | 43 | @classmethod 44 | def get_policy(cls, policy_name: str, **kwargs) -> Policy: 45 | return cls._POLICY_REGISTRY[policy_name](**kwargs) 46 | -------------------------------------------------------------------------------- /vllm/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | from .communication_op import * 2 | from .parallel_state import * 3 | from .utils import * 4 | 
-------------------------------------------------------------------------------- /vllm/distributed/device_communicators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/distributed/device_communicators/__init__.py -------------------------------------------------------------------------------- /vllm/distributed/device_communicators/pymccl_utils.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | from typing import Optional 3 | 4 | import torch 5 | from torch.distributed import ProcessGroup, ReduceOp 6 | 7 | from vllm.logger import init_logger 8 | 9 | logger = init_logger(__name__) 10 | 11 | try: 12 | from vllm.distributed.device_communicators.pymccl import (MCCLCommunicator, 13 | mcclGetVersion) 14 | except Exception as e: 15 | # in non-MTHREADS environments, we can't import the mccl module 16 | # e.g. when running on machines with AMD GPUs 17 | logger.info("Failed to import MCCL library: %s", e) 18 | logger.info("It is expected if you are not running on Mthreads GPUs.") 19 | pass 20 | 21 | comm: Optional["MCCLCommunicator"] = None 22 | 23 | 24 | def is_initialized() -> bool: 25 | """Returns whether the NCCL backend is initialized.""" 26 | return comm is not None 27 | 28 | 29 | @contextlib.contextmanager 30 | def set_pymccl_stream(stream: torch.cuda.Stream): 31 | """Set the cuda stream for communication""" 32 | try: 33 | assert comm is not None 34 | comm.stream = stream 35 | yield 36 | finally: 37 | pass 38 | 39 | 40 | def init_process_group(group: Optional[ProcessGroup] = None) -> None: 41 | assert not is_initialized() 42 | global comm 43 | logger.info("vLLM is using nccl==%s", mcclGetVersion()) 44 | comm = MCCLCommunicator(group=group) 45 | 46 | 47 | def all_reduce(input_: torch.Tensor, op=ReduceOp.SUM) -> None: 48 | """All-reduces the input tensor across the process group.""" 49 | assert input_.is_musa, f"{input_} should be a musa tensor" 50 | assert comm is not None 51 | comm.all_reduce(input_, op) 52 | 53 | 54 | def destroy_process_group() -> None: 55 | global comm 56 | comm = None 57 | 58 | 59 | def get_world_size() -> int: 60 | """Returns the world size.""" 61 | assert comm is not None 62 | return comm.world_size 63 | 64 | 65 | def get_nccl_backend() -> Optional["MCCLCommunicator"]: 66 | return comm 67 | -------------------------------------------------------------------------------- /vllm/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/engine/__init__.py -------------------------------------------------------------------------------- /vllm/engine/output_processor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/engine/output_processor/__init__.py -------------------------------------------------------------------------------- /vllm/engine/output_processor/util.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from vllm.sequence import SamplerOutput, SequenceGroupOutput 4 | 5 | 6 | def create_output_by_sequence_group( 7 | sampler_outputs: List[SamplerOutput], 8 | num_seq_groups: int) -> 
List[List[SequenceGroupOutput]]: 9 | """Helper method which transforms a 2d list organized by 10 | [step][sequence group] into [sequence group][step]. 11 | """ 12 | output_by_sequence_group: List[List[SamplerOutput]] = [ 13 | [] for _ in range(num_seq_groups) 14 | ] 15 | for step in sampler_outputs: 16 | for i, sequence_group_output in enumerate(step): 17 | output_by_sequence_group[i].append(sequence_group_output) 18 | 19 | return output_by_sequence_group 20 | -------------------------------------------------------------------------------- /vllm/entrypoints/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/entrypoints/__init__.py -------------------------------------------------------------------------------- /vllm/entrypoints/openai/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/entrypoints/openai/__init__.py -------------------------------------------------------------------------------- /vllm/executor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/executor/__init__.py -------------------------------------------------------------------------------- /vllm/logging/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.logging.formatter import NewLineFormatter 2 | 3 | __all__ = [ 4 | "NewLineFormatter", 5 | ] 6 | -------------------------------------------------------------------------------- /vllm/logging/formatter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | class NewLineFormatter(logging.Formatter): 5 | """Adds logging prefix to newlines to align multi-line messages.""" 6 | 7 | def __init__(self, fmt, datefmt=None, style="%"): 8 | logging.Formatter.__init__(self, fmt, datefmt, style) 9 | 10 | def format(self, record): 11 | msg = logging.Formatter.format(self, record) 12 | if record.message != "": 13 | parts = msg.split(record.message) 14 | msg = msg.replace("\n", "\n" + parts[0]) 15 | return msg 16 | -------------------------------------------------------------------------------- /vllm/lora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/lora/__init__.py -------------------------------------------------------------------------------- /vllm/lora/request.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class LoRARequest: 6 | """ 7 | Request for a LoRA adapter. 8 | 9 | Note that this class should be used internally. For online 10 | serving, it is recommended to not allow users to use this class but 11 | instead provide another layer of abstraction to prevent users from 12 | accessing unauthorized LoRA adapters. 13 | 14 | lora_int_id must be globally unique for a given adapter. 15 | This is currently not enforced in vLLM.
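
    Illustrative usage, mirroring tests/lora/test_gemma.py earlier in this
    tree (the model name and adapter path are placeholders):

        from vllm import LLM, SamplingParams
        from vllm.lora.request import LoRARequest

        llm = LLM("meta-llama/Llama-2-7b-hf", enable_lora=True)
        outputs = llm.generate(
            ["Hello, my name is"],
            SamplingParams(temperature=0, max_tokens=32),
            lora_request=LoRARequest("my-adapter", 1, "/path/to/adapter"))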
16 | """ 17 | 18 | lora_name: str 19 | lora_int_id: int 20 | lora_local_path: str 21 | 22 | def __post_init__(self): 23 | if self.lora_int_id < 1: 24 | raise ValueError( 25 | f"lora_int_id must be > 0, got {self.lora_int_id}") 26 | 27 | def __eq__(self, value: object) -> bool: 28 | return isinstance( 29 | value, LoRARequest) and self.lora_int_id == value.lora_int_id 30 | 31 | def __hash__(self) -> int: 32 | return self.lora_int_id 33 | -------------------------------------------------------------------------------- /vllm/model_executor/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.sampling_metadata import SamplingMetadata 2 | from vllm.model_executor.utils import set_random_seed 3 | 4 | __all__ = [ 5 | "SamplingMetadata", 6 | "set_random_seed", 7 | ] 8 | -------------------------------------------------------------------------------- /vllm/model_executor/guided_decoding/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Union 2 | 3 | from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, 4 | CompletionRequest) 5 | from vllm.model_executor.guided_decoding.lm_format_enforcer_decoding import ( 6 | get_lm_format_enforcer_guided_decoding_logits_processor) 7 | from vllm.model_executor.guided_decoding.outlines_decoding import ( 8 | get_outlines_guided_decoding_logits_processor) 9 | from vllm.sampling_params import LogitsProcessor 10 | 11 | 12 | async def get_guided_decoding_logits_processor( 13 | guided_decoding_backend: str, request: Union[CompletionRequest, 14 | ChatCompletionRequest], 15 | tokenizer) -> Optional[LogitsProcessor]: 16 | if guided_decoding_backend == 'outlines': 17 | return await get_outlines_guided_decoding_logits_processor( 18 | request, tokenizer) 19 | if guided_decoding_backend == 'lm-format-enforcer': 20 | return await get_lm_format_enforcer_guided_decoding_logits_processor( 21 | request, tokenizer) 22 | 23 | raise ValueError( 24 | f"Unknown guided decoding backend '{guided_decoding_backend}'. " 25 | "Must be one of 'outlines, 'lm-format-enforcer'") 26 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/model_executor/layers/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/fused_moe/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.layers.fused_moe.fused_moe import ( 2 | fused_moe, get_config_file_name) 3 | 4 | __all__ = [ 5 | "fused_moe", 6 | "get_config_file_name", 7 | ] 8 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/fused_moe/configs/README: -------------------------------------------------------------------------------- 1 | This directory contains tuned configurations for different settings of the fused_moe kernel. 2 | For different settings of 3 | - E (number of experts) 4 | - N (intermediate size) 5 | - device_name (torch.cuda.get_device_name()) 6 | the JSON file contains a mapping from M (batch size) to the chosen configuration. 7 | 8 | The example configurations provided are for the Mixtral model for TP2 on H100 9 | and TP4 on A100. 
Mixtral has intermediate size N = 14336, i.e. for TP2 we have 10 | N = 7168 and for TP4 we have N = 3584. 11 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/layernorm.py: -------------------------------------------------------------------------------- 1 | """Custom normalization layers.""" 2 | from typing import Optional, Tuple, Union 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from vllm import _custom_ops as ops 8 | 9 | 10 | class RMSNorm(nn.Module): 11 | """Root mean square normalization. 12 | 13 | Computes x -> w * x / sqrt(E[x^2] + eps) where w is the learned weight. 14 | Refer to https://arxiv.org/abs/1910.07467 15 | """ 16 | 17 | def __init__( 18 | self, 19 | hidden_size: int, 20 | eps: float = 1e-6, 21 | ) -> None: 22 | super().__init__() 23 | self.weight = nn.Parameter(torch.ones(hidden_size)) 24 | self.variance_epsilon = eps 25 | 26 | def _forward( 27 | self, 28 | x: torch.Tensor, 29 | residual: Optional[torch.Tensor] = None, 30 | ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: 31 | """PyTorch-native implementation equivalent to forward().""" 32 | orig_dtype = x.dtype 33 | x = x.to(torch.float32) 34 | if residual is not None: 35 | x = x + residual.to(torch.float32) 36 | residual = x.to(orig_dtype) 37 | 38 | variance = x.pow(2).mean(dim=-1, keepdim=True) 39 | x = x * torch.rsqrt(variance + self.variance_epsilon) 40 | x = x.to(orig_dtype) * self.weight 41 | if residual is None: 42 | return x 43 | else: 44 | return x, residual 45 | 46 | def forward( 47 | self, 48 | x: torch.Tensor, 49 | residual: Optional[torch.Tensor] = None, 50 | ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: 51 | if residual is not None: 52 | ops.fused_add_rms_norm( 53 | x, 54 | residual, 55 | self.weight.data, 56 | self.variance_epsilon, 57 | ) 58 | return x, residual 59 | out = torch.empty_like(x) 60 | ops.rms_norm( 61 | out, 62 | x, 63 | self.weight.data, 64 | self.variance_epsilon, 65 | ) 66 | return out 67 | 68 | def extra_repr(self) -> str: 69 | s = f"hidden_size={self.weight.data.size(0)}" 70 | s += f", eps={self.variance_epsilon}" 71 | return s 72 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/model_executor/layers/ops/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Type 2 | 3 | from vllm.model_executor.layers.quantization.aqlm import AQLMConfig 4 | from vllm.model_executor.layers.quantization.awq import AWQConfig 5 | from vllm.model_executor.layers.quantization.base_config import ( 6 | QuantizationConfig) 7 | from vllm.model_executor.layers.quantization.fp8 import Fp8Config 8 | from vllm.model_executor.layers.quantization.gptq import GPTQConfig 9 | from vllm.model_executor.layers.quantization.gptq_marlin import ( 10 | GPTQMarlinConfig) 11 | from vllm.model_executor.layers.quantization.marlin import MarlinConfig 12 | from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig 13 | 14 | QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = { 15 | "aqlm": AQLMConfig, 16 | "awq": AWQConfig, 17 | "fp8": Fp8Config, 18 | 
"gptq": GPTQConfig, 19 | "squeezellm": SqueezeLLMConfig, 20 | "gptq_marlin": GPTQMarlinConfig, 21 | "marlin": MarlinConfig, 22 | } 23 | 24 | 25 | def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: 26 | if quantization not in QUANTIZATION_METHODS: 27 | raise ValueError(f"Invalid quantization method: {quantization}") 28 | return QUANTIZATION_METHODS[quantization] 29 | 30 | 31 | __all__ = [ 32 | "QuantizationConfig", 33 | "get_quantization_config", 34 | "QUANTIZATION_METHODS", 35 | ] 36 | -------------------------------------------------------------------------------- /vllm/model_executor/model_loader/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from torch import nn 4 | 5 | from vllm.config import (DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, 6 | ParallelConfig, SchedulerConfig, VisionLanguageConfig) 7 | from vllm.model_executor.model_loader.loader import (BaseModelLoader, 8 | get_model_loader) 9 | from vllm.model_executor.model_loader.utils import ( 10 | get_architecture_class_name, get_model_architecture) 11 | 12 | 13 | def get_model( 14 | *, model_config: ModelConfig, load_config: LoadConfig, 15 | device_config: DeviceConfig, parallel_config: ParallelConfig, 16 | scheduler_config: SchedulerConfig, lora_config: Optional[LoRAConfig], 17 | vision_language_config: Optional[VisionLanguageConfig]) -> nn.Module: 18 | loader = get_model_loader(load_config) 19 | return loader.load_model(model_config=model_config, 20 | device_config=device_config, 21 | lora_config=lora_config, 22 | vision_language_config=vision_language_config, 23 | parallel_config=parallel_config, 24 | scheduler_config=scheduler_config) 25 | 26 | 27 | __all__ = [ 28 | "get_model", "get_model_loader", "BaseModelLoader", 29 | "get_architecture_class_name", "get_model_architecture" 30 | ] 31 | -------------------------------------------------------------------------------- /vllm/model_executor/model_loader/utils.py: -------------------------------------------------------------------------------- 1 | """Utilities for selecting and loading models.""" 2 | import contextlib 3 | from typing import Tuple, Type 4 | 5 | import torch 6 | from torch import nn 7 | 8 | from vllm.config import ModelConfig 9 | from vllm.model_executor.models import ModelRegistry 10 | 11 | 12 | @contextlib.contextmanager 13 | def set_default_torch_dtype(dtype: torch.dtype): 14 | """Sets the default torch dtype to the given dtype.""" 15 | old_dtype = torch.get_default_dtype() 16 | torch.set_default_dtype(dtype) 17 | yield 18 | torch.set_default_dtype(old_dtype) 19 | 20 | 21 | def get_model_architecture( 22 | model_config: ModelConfig) -> Tuple[Type[nn.Module], str]: 23 | architectures = getattr(model_config.hf_config, "architectures", []) 24 | # Special handling for quantized Mixtral. 25 | # FIXME(woosuk): This is a temporary hack. 26 | if (model_config.quantization is not None 27 | and model_config.quantization != "fp8" 28 | and "MixtralForCausalLM" in architectures): 29 | architectures = ["QuantMixtralForCausalLM"] 30 | 31 | for arch in architectures: 32 | model_cls = ModelRegistry.load_model_cls(arch) 33 | if model_cls is not None: 34 | return (model_cls, arch) 35 | raise ValueError( 36 | f"Model architectures {architectures} are not supported for now. 
" 37 | f"Supported architectures: {ModelRegistry.get_supported_archs()}") 38 | 39 | 40 | def get_architecture_class_name(model_config: ModelConfig) -> str: 41 | return get_model_architecture(model_config)[1] 42 | -------------------------------------------------------------------------------- /vllm/model_executor/utils.py: -------------------------------------------------------------------------------- 1 | """Utils for model executor.""" 2 | import random 3 | from typing import Any, Dict, Optional 4 | 5 | import numpy as np 6 | import torch 7 | 8 | 9 | def set_random_seed(seed: int) -> None: 10 | random.seed(seed) 11 | np.random.seed(seed) 12 | torch.manual_seed(seed) 13 | if torch.cuda.is_available(): 14 | torch.cuda.manual_seed_all(seed) 15 | elif torch.musa.is_available(): 16 | torch.musa.manual_seed_all(seed) 17 | 18 | 19 | def set_weight_attrs( 20 | weight: torch.Tensor, 21 | weight_attrs: Optional[Dict[str, Any]], 22 | ): 23 | """Set attributes on a weight tensor. 24 | 25 | This method is used to set attributes on a weight tensor. This method 26 | will not overwrite existing attributes. 27 | 28 | Args: 29 | weight: The weight tensor. 30 | weight_attrs: A dictionary of attributes to set on the weight tensor. 31 | """ 32 | if weight_attrs is None: 33 | return 34 | for key, value in weight_attrs.items(): 35 | assert not hasattr( 36 | weight, key), (f"Overwriting existing tensor attribute: {key}") 37 | setattr(weight, key, value) 38 | -------------------------------------------------------------------------------- /vllm/py.typed: -------------------------------------------------------------------------------- 1 | # Marker file for PEP 561. 2 | # The vllm package uses inline types. 3 | -------------------------------------------------------------------------------- /vllm/spec_decode/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/spec_decode/__init__.py -------------------------------------------------------------------------------- /vllm/spec_decode/interfaces.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from dataclasses import dataclass 3 | 4 | import torch 5 | 6 | from vllm.sequence import ExecuteModelRequest 7 | 8 | 9 | @dataclass 10 | class SpeculativeProposals: 11 | """Datastructure used to represent proposal tokens from some proposer. It 12 | also tracks how many speculative tokens each sequence has. 13 | """ 14 | 15 | # Speculative proposal tokens. 16 | proposal_token_ids: torch.Tensor 17 | 18 | # Probabilities of the proposal tokens according to the proposer. 19 | proposal_probs: torch.Tensor 20 | 21 | # The valid length of each proposal; can be zero. 22 | proposal_lens: torch.Tensor 23 | 24 | def __repr__(self): 25 | return (f"SpeculativeProposals(" 26 | f"proposal_token_ids={self.proposal_token_ids}, " 27 | f"proposal_probs={self.proposal_probs.shape}, " 28 | f"proposal_lens={self.proposal_lens})") 29 | 30 | 31 | @dataclass 32 | class SpeculativeScores: 33 | """Datastructure used to represent the scores of speculative tokens 34 | according to the scoring model. 35 | """ 36 | 37 | # Probabilities of the speculative tokens according to the scoring model. 38 | probs: torch.Tensor 39 | 40 | # Log-probabilities of the speculative tokens according to the scoring 41 | # model. 
These values can be used to generate Logprob objects that are 42 | # returned to the user. 43 | logprobs: torch.Tensor 44 | 45 | # Token ids sampled from the scoring model. Used for speculative bonus 46 | # tokens and also non-speculative normal decoding. 47 | token_ids: torch.Tensor 48 | 49 | def __repr__(self): 50 | return (f"SpeculativeScores(" 51 | f"probs={self.probs.shape}, " 52 | f"token_ids={self.token_ids.shape})") 53 | 54 | 55 | class SpeculativeProposer(ABC): 56 | 57 | @abstractmethod 58 | def get_proposals( 59 | self, 60 | execute_model_req: ExecuteModelRequest, 61 | ) -> SpeculativeProposals: 62 | raise NotImplementedError 63 | 64 | 65 | class SpeculativeScorer(ABC): 66 | 67 | @abstractmethod 68 | def score_proposals( 69 | self, 70 | execute_model_req: ExecuteModelRequest, 71 | proposals: SpeculativeProposals, 72 | ) -> SpeculativeScores: 73 | raise NotImplementedError 74 | -------------------------------------------------------------------------------- /vllm/test_utils.py: -------------------------------------------------------------------------------- 1 | import ray 2 | 3 | from vllm.distributed import (ensure_model_parallel_initialized, 4 | init_distributed_environment) 5 | from vllm.utils import get_open_port 6 | 7 | 8 | def init_test_distributed_environment( 9 | pipeline_parallel_size: int, 10 | tensor_parallel_size: int, 11 | rank: int, 12 | distributed_init_port: str, 13 | local_rank: int = -1, 14 | ) -> None: 15 | distributed_init_method = f"tcp://localhost:{distributed_init_port}" 16 | init_distributed_environment( 17 | world_size=pipeline_parallel_size * tensor_parallel_size, 18 | rank=rank, 19 | distributed_init_method=distributed_init_method, 20 | local_rank=local_rank) 21 | ensure_model_parallel_initialized(tensor_parallel_size, 22 | pipeline_parallel_size) 23 | 24 | 25 | def multi_process_tensor_parallel( 26 | tensor_parallel_size: int, 27 | test_target, 28 | ) -> None: 29 | # Using ray helps debugging the error when it failed 30 | # as compared to multiprocessing. 31 | ray.init() 32 | 33 | distributed_init_port = get_open_port() 34 | refs = [] 35 | for rank in range(tensor_parallel_size): 36 | refs.append( 37 | test_target.remote(tensor_parallel_size, rank, 38 | distributed_init_port)) 39 | ray.get(refs) 40 | 41 | ray.shutdown() 42 | -------------------------------------------------------------------------------- /vllm/transformers_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/transformers_utils/__init__.py -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.transformers_utils.configs.chatglm import ChatGLMConfig 2 | from vllm.transformers_utils.configs.dbrx import DbrxConfig 3 | # RWConfig is for the original tiiuae/falcon-40b(-instruct) and 4 | # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the 5 | # `FalconConfig` class from the official HuggingFace transformers library. 
--------------------------------------------------------------------------------
/vllm/transformers_utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/transformers_utils/__init__.py

--------------------------------------------------------------------------------
/vllm/transformers_utils/configs/__init__.py:
--------------------------------------------------------------------------------
from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
from vllm.transformers_utils.configs.dbrx import DbrxConfig
# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
# `FalconConfig` class from the official HuggingFace transformers library.
from vllm.transformers_utils.configs.falcon import RWConfig
from vllm.transformers_utils.configs.jais import JAISConfig
from vllm.transformers_utils.configs.mpt import MPTConfig

__all__ = [
    "ChatGLMConfig",
    "DbrxConfig",
    "MPTConfig",
    "RWConfig",
    "JAISConfig",
]

--------------------------------------------------------------------------------
/vllm/transformers_utils/tokenizer_group/__init__.py:
--------------------------------------------------------------------------------
from typing import Optional

from vllm.config import TokenizerPoolConfig
from vllm.executor.ray_utils import ray
from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
    BaseTokenizerGroup)
from vllm.transformers_utils.tokenizer_group.tokenizer_group import (
    TokenizerGroup)

if ray:
    from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import (
        RayTokenizerGroupPool)
else:
    RayTokenizerGroupPool = None  # type: ignore


def get_tokenizer_group(tokenizer_pool_config: Optional[TokenizerPoolConfig],
                        **init_kwargs) -> BaseTokenizerGroup:
    if tokenizer_pool_config is None:
        return TokenizerGroup(**init_kwargs)
    if tokenizer_pool_config.pool_type == "ray":
        if RayTokenizerGroupPool is None:
            raise ImportError(
                "RayTokenizerGroupPool is not available. Please install "
                "the ray package to use the Ray tokenizer group pool.")
        return RayTokenizerGroupPool.from_config(tokenizer_pool_config,
                                                 **init_kwargs)
    else:
        raise ValueError(
            f"Unknown pool type: {tokenizer_pool_config.pool_type}")


__all__ = ["get_tokenizer_group", "BaseTokenizerGroup"]

--------------------------------------------------------------------------------
/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py:
--------------------------------------------------------------------------------
from abc import ABC, abstractmethod
from typing import List, Optional

from transformers import PreTrainedTokenizer

from vllm.lora.request import LoRARequest


class BaseTokenizerGroup(ABC):
    """A group of tokenizers that can be used for LoRA adapters."""

    @abstractmethod
    def ping(self) -> bool:
        """Check if the tokenizer group is alive."""
        pass

    @abstractmethod
    def get_max_input_len(self,
                          lora_request: Optional[LoRARequest] = None
                          ) -> Optional[int]:
        """Get the maximum input length for the LoRA request."""
        pass

    @abstractmethod
    def encode(self,
               prompt: str,
               request_id: Optional[str] = None,
               lora_request: Optional[LoRARequest] = None) -> List[int]:
        """Encode a prompt using the tokenizer group."""
        pass

    @abstractmethod
    async def encode_async(
            self,
            prompt: str,
            request_id: Optional[str] = None,
            lora_request: Optional[LoRARequest] = None) -> List[int]:
        """Encode a prompt using the tokenizer group."""
        pass

    @abstractmethod
    def get_lora_tokenizer(
            self,
            lora_request: Optional[LoRARequest] = None
    ) -> "PreTrainedTokenizer":
        """Get a tokenizer for a LoRA request."""
        pass

    @abstractmethod
    async def get_lora_tokenizer_async(
            self,
            lora_request: Optional[LoRARequest] = None
    ) -> "PreTrainedTokenizer":
        """Get a tokenizer for a LoRA request."""
        pass

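As an editor-added usage sketch for get_tokenizer_group in /vllm/transformers_utils/tokenizer_group/__init__.py above: passing tokenizer_pool_config=None returns a plain in-process TokenizerGroup, while a pool config with pool_type="ray" selects RayTokenizerGroupPool (and raises ImportError when Ray is not installed). The keyword arguments below are assumptions about TokenizerGroup's constructor, which lives in tokenizer_group.py and is not shown here; adjust them to the real signature.

# Illustrative sketch (not part of the repository). The init kwargs are
# assumptions about TokenizerGroup's constructor; verify against
# tokenizer_group.py before relying on them.
from vllm.transformers_utils.tokenizer_group import get_tokenizer_group

tokenizer_group = get_tokenizer_group(
    tokenizer_pool_config=None,         # None -> plain in-process TokenizerGroup
    tokenizer_id="facebook/opt-125m",   # assumed kwarg: HF tokenizer name
    enable_lora=False,                  # assumed kwarg
    max_num_seqs=8,                     # assumed kwarg
    max_input_length=None,              # assumed kwarg
)

token_ids = tokenizer_group.encode("Hello from the MUSA port!")
print(len(token_ids), tokenizer_group.get_max_input_len())
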
--------------------------------------------------------------------------------
/vllm/transformers_utils/tokenizers/__init__.py:
--------------------------------------------------------------------------------
from vllm.transformers_utils.tokenizers.baichuan import BaichuanTokenizer

__all__ = [
    "BaichuanTokenizer",
]

--------------------------------------------------------------------------------
/vllm/usage/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/usage/__init__.py

--------------------------------------------------------------------------------
/vllm/worker/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/worker/__init__.py
--------------------------------------------------------------------------------