├── .buildkite ├── check-wheel-size.py ├── download-images.sh ├── run-amd-test.sh ├── run-benchmarks.sh ├── run-cpu-test.sh ├── run-neuron-test.sh ├── test-pipeline.yaml └── test-template.j2 ├── .dockerignore ├── .github ├── ISSUE_TEMPLATE │ ├── 100-documentation.yml │ ├── 200-installation.yml │ ├── 300-usage.yml │ ├── 400-bug report.yml │ ├── 500-feature request.yml │ ├── 600-new model.yml │ ├── 700-performance discussion.yml │ ├── 750-RFC.yml │ ├── 800-misc discussion.yml │ └── config.yml ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── mypy.yaml │ ├── publish.yml │ ├── ruff.yml │ ├── scripts │ ├── build.sh │ ├── create_release.js │ ├── cuda-install.sh │ ├── env.sh │ └── pytorch-install.sh │ └── yapf.yml ├── .gitignore ├── .readthedocs.yaml ├── .yapfignore ├── CMakeLists.txt ├── CONTRIBUTING.md ├── Dockerfile ├── Dockerfile.cpu ├── Dockerfile.neuron ├── Dockerfile.rocm ├── LICENSE ├── MANIFEST.in ├── README.md ├── README_vllm_musa.md ├── benchmarks ├── README.md ├── backend_request_func.py ├── benchmark_latency.py ├── benchmark_prefix_caching.py ├── benchmark_serving.py ├── benchmark_throughput.py ├── kernels │ ├── benchmark_aqlm.py │ ├── benchmark_mixtral_moe.py │ ├── benchmark_paged_attention.py │ └── benchmark_rope.py ├── launch_tgi_server.sh └── sonnet.txt ├── build_musa.sh ├── cmake ├── cpu_extension.cmake ├── hipify.py └── utils.cmake ├── collect_env.py ├── csrc_musa ├── activation_kernels.mu ├── attention │ ├── attention_dtypes.h │ ├── attention_generic.muh │ ├── attention_kernels.mu │ ├── attention_utils.muh │ ├── dtype_bfloat16.muh │ ├── dtype_float16.muh │ ├── dtype_float32.muh │ └── dtype_fp8.muh ├── cache.h ├── cache_kernels.mu ├── cpu │ ├── activation.cpp │ ├── attention.cpp │ ├── cache.cpp │ ├── cpu_types.hpp │ ├── layernorm.cpp │ ├── pos_encoding.cpp │ └── pybind.cpp ├── custom_all_reduce.mu ├── custom_all_reduce.muh ├── custom_all_reduce_test.mu ├── dispatch_utils.h ├── layernorm_kernels.mu ├── moe │ ├── moe_ops.cpp │ ├── moe_ops.h │ └── topk_softmax_kernels.mu ├── moe_align_block_size_kernels.mu ├── musa_compat.h ├── musa_utils.h ├── musa_utils_kernels.mu ├── ops.h ├── pos_encoding_kernels.mu ├── punica │ ├── .LICENSE │ ├── bgmv │ │ ├── bgmv_bf16_bf16_bf16.mu │ │ ├── bgmv_bf16_fp32_bf16.mu │ │ ├── bgmv_config.h │ │ ├── bgmv_fp16_fp16_fp16.mu │ │ ├── bgmv_fp16_fp32_fp16.mu │ │ ├── bgmv_fp32_bf16_bf16.mu │ │ ├── bgmv_fp32_fp16_fp16.mu │ │ ├── bgmv_impl.muh │ │ ├── generator.py │ │ └── vec_dtypes.muh │ └── punica_ops.cc ├── pybind.cpp ├── quantization │ ├── aqlm │ │ └── gemm_kernels.mu │ ├── awq │ │ ├── dequantize.muh │ │ └── gemm_kernels.mu │ ├── fp8 │ │ ├── amd_detail │ │ │ ├── hip_float8.h │ │ │ ├── hip_float8_impl.h │ │ │ └── quant_utils.muh │ │ └── fp8_cuda_kernels.mu │ ├── fp8_e5m2_kvcache │ │ └── quant_utils.muh │ ├── gptq │ │ ├── compat.muh │ │ ├── matrix_view.muh │ │ ├── q_gemm.mu │ │ ├── qdq_2.muh │ │ ├── qdq_3.muh │ │ ├── qdq_4.muh │ │ ├── qdq_8.muh │ │ └── qdq_util.muh │ ├── gptq_marlin │ │ ├── gptq_marlin.mu │ │ ├── gptq_marlin.muh │ │ └── gptq_marlin_repack.mu │ ├── marlin │ │ ├── .LICENSE │ │ └── marlin_cuda_kernel.mu │ └── squeezellm │ │ └── quant_cuda_kernel.mu └── reduction_utils.muh ├── docs ├── Makefile ├── README.md ├── make.bat ├── requirements-docs.txt └── source │ ├── assets │ ├── dev │ │ └── dockerfile-stages-dependency.png │ ├── kernel │ │ ├── k_vecs.png │ │ ├── key.png │ │ ├── logits_vec.png │ │ ├── q_vecs.png │ │ ├── query.png │ │ ├── v_vec.png │ │ └── value.png │ └── logos │ │ ├── vllm-logo-only-light.png │ │ ├── vllm-logo-text-dark.png │ │ └── 
vllm-logo-text-light.png │ ├── conf.py │ ├── dev │ ├── dockerfile │ │ └── dockerfile.rst │ ├── engine │ │ ├── async_llm_engine.rst │ │ ├── engine_index.rst │ │ └── llm_engine.rst │ ├── kernel │ │ └── paged_attention.rst │ └── sampling_params.rst │ ├── generate_examples.py │ ├── getting_started │ ├── amd-installation.rst │ ├── cpu-installation.rst │ ├── examples │ │ └── examples_index.template.rst │ ├── installation.rst │ ├── neuron-installation.rst │ └── quickstart.rst │ ├── index.rst │ ├── models │ ├── adding_model.rst │ ├── engine_args.rst │ ├── lora.rst │ ├── performance.rst │ └── supported_models.rst │ ├── quantization │ ├── auto_awq.rst │ ├── fp8_e4m3_kvcache.rst │ └── fp8_e5m2_kvcache.rst │ └── serving │ ├── deploying_with_bentoml.rst │ ├── deploying_with_docker.rst │ ├── deploying_with_kserve.rst │ ├── deploying_with_triton.rst │ ├── distributed_serving.rst │ ├── env_vars.rst │ ├── integrations.rst │ ├── metrics.rst │ ├── openai_compatible_server.md │ ├── run_on_sky.rst │ ├── serving_with_langchain.rst │ └── usage_stats.md ├── examples ├── api_client.py ├── aqlm_example.py ├── fp8 │ ├── README.md │ ├── extract_scales.py │ └── quantizer │ │ ├── README.md │ │ └── quantize.py ├── gradio_openai_chatbot_webserver.py ├── gradio_webserver.py ├── llava_example.py ├── llm_engine_example.py ├── logging_configuration.md ├── multilora_inference.py ├── offline_inference.py ├── offline_inference_distributed.py ├── offline_inference_neuron.py ├── offline_inference_with_prefix.py ├── openai_chat_completion_client.py ├── openai_completion_client.py ├── production_monitoring │ ├── README.md │ ├── docker-compose.yaml │ ├── grafana.json │ └── prometheus.yaml ├── template_alpaca.jinja ├── template_baichuan.jinja ├── template_chatglm.jinja ├── template_chatglm2.jinja ├── template_chatml.jinja ├── template_falcon.jinja ├── template_falcon_180b.jinja ├── template_inkbot.jinja └── tensorize_vllm_model.py ├── format.sh ├── musa_porting.py ├── pyproject.toml ├── requirements-build.txt ├── requirements-common.txt ├── requirements-cpu.txt ├── requirements-cuda.txt ├── requirements-dev.txt ├── requirements-musa.txt ├── requirements-neuron.txt ├── requirements-rocm.txt ├── rocm_patch └── rocm_bf16.patch ├── setup.py ├── tests ├── __init__.py ├── async_engine │ ├── api_server_async_engine.py │ ├── test_api_server.py │ ├── test_async_llm_engine.py │ ├── test_chat_template.py │ ├── test_merge_async_iterators.py │ ├── test_openapi_server_ray.py │ └── test_request_tracker.py ├── basic_correctness │ ├── test_basic_correctness.py │ ├── test_chunked_prefill.py │ └── test_preemption.py ├── conftest.py ├── core │ ├── __init__.py │ ├── block │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── e2e │ │ │ ├── conftest.py │ │ │ └── test_correctness.py │ │ ├── test_block_manager_v2.py │ │ ├── test_block_table.py │ │ ├── test_common.py │ │ ├── test_cpu_gpu_block_allocator.py │ │ ├── test_naive_block.py │ │ └── test_prefix_caching_block.py │ ├── test_block_manager.py │ ├── test_chunked_prefill_scheduler.py │ ├── test_scheduler.py │ └── utils.py ├── distributed │ ├── test_basic_distributed_correctness.py │ ├── test_chunked_prefill_distributed.py │ ├── test_comm_ops.py │ ├── test_custom_all_reduce.py │ ├── test_pynccl.py │ └── test_pynccl_library.py ├── engine │ ├── output_processor │ │ └── test_multi_step.py │ ├── test_computed_prefix_blocks.py │ ├── test_detokenization.py │ ├── test_multiproc_workers.py │ ├── test_skip_tokenizer_init.py │ ├── test_stop_reason.py │ └── test_stop_strings.py ├── entrypoints │ ├── openai │ │ └── 
test_serving_chat.py │ ├── test_guided_processors.py │ ├── test_llm_generate.py │ ├── test_openai_server.py │ └── test_server_oot_registration.py ├── fp8_kv │ ├── llama2-70b-fp8-kv │ │ └── kv_cache_scales.json │ └── llama2-7b-fp8-kv │ │ └── kv_cache_scales.json ├── kernels │ ├── allclose_default.py │ ├── conftest.py │ ├── test_activation.py │ ├── test_attention.py │ ├── test_cache.py │ ├── test_layernorm.py │ ├── test_moe.py │ ├── test_pos_encoding.py │ ├── test_prefix_prefill.py │ ├── test_rand.py │ └── test_sampler.py ├── lora │ ├── __init__.py │ ├── conftest.py │ ├── test_baichuan.py │ ├── test_chatglm3.py │ ├── test_gemma.py │ ├── test_layer_variation.py │ ├── test_layers.py │ ├── test_llama.py │ ├── test_lora.py │ ├── test_lora_checkpoints.py │ ├── test_lora_manager.py │ ├── test_mixtral.py │ ├── test_punica.py │ ├── test_quant_model.py │ ├── test_tokenizer_group.py │ ├── test_utils.py │ ├── test_worker.py │ └── utils.py ├── metrics │ └── test_metrics.py ├── model_executor │ └── weight_utils.py ├── models │ ├── test_aqlm.py │ ├── test_big_models.py │ ├── test_fp8.py │ ├── test_gptq_marlin.py │ ├── test_llava.py │ ├── test_marlin.py │ ├── test_mistral.py │ ├── test_models.py │ ├── test_oot_registration.py │ └── utils.py ├── prefix_caching │ └── test_prefix_caching.py ├── prompts │ ├── example.txt │ └── summary.txt ├── quantization │ ├── test_configs.py │ └── test_fp8.py ├── samplers │ ├── test_beam_search.py │ ├── test_ignore_eos.py │ ├── test_logits_processor.py │ ├── test_logprobs.py │ ├── test_ranks.py │ ├── test_rejection_sampler.py │ ├── test_sampler.py │ └── test_seeded_generate.py ├── spec_decode │ ├── __init__.py │ ├── e2e │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── test_compatibility.py │ │ ├── test_logprobs.py │ │ ├── test_multistep_correctness.py │ │ └── test_ngram_correctness.py │ ├── test_batch_expansion.py │ ├── test_metrics.py │ ├── test_multi_step_worker.py │ ├── test_ngram_worker.py │ ├── test_spec_decode_worker.py │ ├── test_utils.py │ └── utils.py ├── tensorizer_loader │ ├── __init__.py │ ├── tensorize_vllm_model_for_testing.py │ └── test_tensorizer.py ├── test_cache_block_hashing.py ├── test_config.py ├── test_logger.py ├── test_logits_processor.py ├── test_regression.py ├── test_sampling_params.py ├── test_sequence.py ├── tokenization │ ├── __init__.py │ ├── test_cached_tokenizer.py │ ├── test_detokenize.py │ ├── test_tokenizer.py │ └── test_tokenizer_group.py └── worker │ ├── __init__.py │ ├── test_model_runner.py │ └── test_swap.py └── vllm ├── __init__.py ├── _custom_ops.py ├── attention ├── __init__.py ├── backends │ ├── __init__.py │ ├── abstract.py │ ├── flash_attn.py │ ├── flashinfer.py │ ├── rocm_flash_attn.py │ ├── torch_sdpa.py │ └── xformers.py ├── layer.py ├── ops │ ├── __init__.py │ ├── paged_attn.py │ ├── prefix_prefill.py │ └── triton_flash_attention.py └── selector.py ├── block.py ├── config.py ├── core ├── __init__.py ├── block │ ├── __init__.py │ ├── block_table.py │ ├── common.py │ ├── cpu_gpu_block_allocator.py │ ├── interfaces.py │ ├── naive_block.py │ └── prefix_caching_block.py ├── block_manager_v1.py ├── block_manager_v2.py ├── evictor_v1.py ├── evictor_v2.py ├── interfaces.py ├── policy.py └── scheduler.py ├── distributed ├── __init__.py ├── communication_op.py ├── device_communicators │ ├── __init__.py │ ├── custom_all_reduce.py │ ├── pymccl.py │ ├── pymccl_utils.py │ └── pynccl.py ├── parallel_state.py └── utils.py ├── engine ├── __init__.py ├── arg_utils.py ├── async_llm_engine.py ├── llm_engine.py ├── metrics.py └── output_processor 
│ ├── __init__.py │ ├── interfaces.py │ ├── multi_step.py │ ├── single_step.py │ ├── stop_checker.py │ └── util.py ├── entrypoints ├── __init__.py ├── api_server.py ├── llm.py └── openai │ ├── __init__.py │ ├── api_server.py │ ├── cli_args.py │ ├── protocol.py │ ├── serving_chat.py │ ├── serving_completion.py │ └── serving_engine.py ├── envs.py ├── executor ├── __init__.py ├── cpu_executor.py ├── distributed_gpu_executor.py ├── executor_base.py ├── gpu_executor.py ├── multiproc_worker_utils.py ├── neuron_executor.py ├── ray_gpu_executor.py └── ray_utils.py ├── logger.py ├── logging ├── __init__.py └── formatter.py ├── lora ├── __init__.py ├── fully_sharded_layers.py ├── layers.py ├── lora.py ├── models.py ├── punica.py ├── request.py ├── utils.py └── worker_manager.py ├── model_executor ├── __init__.py ├── guided_decoding │ ├── __init__.py │ ├── lm_format_enforcer_decoding.py │ ├── outlines_decoding.py │ └── outlines_logits_processors.py ├── layers │ ├── __init__.py │ ├── activation.py │ ├── fused_moe │ │ ├── __init__.py │ │ ├── configs │ │ │ ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ └── README │ │ └── fused_moe.py │ ├── layernorm.py │ ├── linear.py │ ├── logits_processor.py │ ├── ops │ │ ├── __init__.py │ │ ├── rand.py │ │ └── sample.py │ ├── quantization │ │ ├── __init__.py │ │ ├── aqlm.py │ │ ├── awq.py │ │ ├── base_config.py │ │ ├── fp8.py │ │ ├── gptq.py │ │ ├── gptq_marlin.py │ │ ├── marlin.py │ │ ├── schema.py │ │ └── squeezellm.py │ ├── rejection_sampler.py │ ├── rotary_embedding.py │ ├── sampler.py │ └── vocab_parallel_embedding.py ├── model_loader │ ├── __init__.py │ ├── loader.py │ ├── neuron.py │ ├── tensorizer.py │ ├── utils.py │ └── weight_utils.py ├── models │ ├── __init__.py │ ├── baichuan.py │ ├── bloom.py │ ├── chatglm.py │ ├── commandr.py │ ├── dbrx.py │ ├── decilm.py │ ├── deepseek.py │ ├── falcon.py │ ├── gemma.py │ ├── gpt2.py │ ├── gpt_bigcode.py │ ├── gpt_j.py │ ├── gpt_neox.py │ ├── internlm2.py │ ├── jais.py │ ├── llama.py │ ├── llava.py │ ├── minicpm.py │ ├── mixtral.py │ ├── mixtral_quant.py │ ├── mpt.py │ ├── olmo.py │ ├── opt.py │ ├── orion.py │ ├── phi.py │ ├── qwen.py │ ├── qwen2.py │ ├── qwen2_moe.py │ ├── stablelm.py │ ├── starcoder2.py │ └── xverse.py ├── sampling_metadata.py └── utils.py ├── outputs.py ├── py.typed ├── sampling_params.py ├── sequence.py ├── spec_decode ├── __init__.py ├── batch_expansion.py ├── interfaces.py ├── metrics.py ├── 
multi_step_worker.py ├── ngram_worker.py ├── spec_decode_worker.py ├── top1_proposer.py └── util.py ├── test_utils.py ├── transformers_utils ├── __init__.py ├── config.py ├── configs │ ├── __init__.py │ ├── chatglm.py │ ├── dbrx.py │ ├── falcon.py │ ├── jais.py │ └── mpt.py ├── detokenizer.py ├── tokenizer.py ├── tokenizer_group │ ├── __init__.py │ ├── base_tokenizer_group.py │ ├── ray_tokenizer_group.py │ └── tokenizer_group.py └── tokenizers │ ├── __init__.py │ └── baichuan.py ├── usage ├── __init__.py └── usage_lib.py ├── utils.py └── worker ├── __init__.py ├── cache_engine.py ├── cpu_model_runner.py ├── cpu_worker.py ├── model_runner.py ├── neuron_model_runner.py ├── neuron_worker.py ├── worker.py └── worker_base.py /.buildkite/check-wheel-size.py: -------------------------------------------------------------------------------- 1 | import os 2 | import zipfile 3 | 4 | MAX_SIZE_MB = 100 5 | 6 | 7 | def print_top_10_largest_files(zip_file): 8 | with zipfile.ZipFile(zip_file, 'r') as z: 9 | file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()] 10 | file_sizes.sort(key=lambda x: x[1], reverse=True) 11 | for f, size in file_sizes[:10]: 12 | print(f"{f}: {size/(1024*1024)} MBs uncompressed.") 13 | 14 | 15 | def check_wheel_size(directory): 16 | for root, _, files in os.walk(directory): 17 | for f in files: 18 | if f.endswith(".whl"): 19 | wheel_path = os.path.join(root, f) 20 | wheel_size = os.path.getsize(wheel_path) 21 | wheel_size_mb = wheel_size / (1024 * 1024) 22 | if wheel_size_mb > MAX_SIZE_MB: 23 | print( 24 | f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) " 25 | f"compare to the allowed size ({MAX_SIZE_MB} MB).") 26 | print_top_10_largest_files(wheel_path) 27 | return 1 28 | else: 29 | print(f"Wheel {wheel_path} is within the allowed size " 30 | f"({wheel_size_mb} MB).") 31 | return 0 32 | 33 | 34 | if __name__ == "__main__": 35 | import sys 36 | sys.exit(check_wheel_size(sys.argv[1])) 37 | -------------------------------------------------------------------------------- /.buildkite/download-images.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | set -o pipefail 5 | 6 | (which wget && which curl) || (apt-get update && apt-get install -y wget curl) 7 | 8 | # aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/ 9 | mkdir -p images 10 | cd images 11 | wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt 12 | wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt 13 | wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt 14 | wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt 15 | wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg 16 | wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg 17 | 18 | cd - 19 | -------------------------------------------------------------------------------- /.buildkite/run-amd-test.sh: -------------------------------------------------------------------------------- 1 | # This script build the ROCm docker image and runs test inside it. 
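# The test command to run inside the container is passed as the first positional
# argument and handed to `/bin/bash -c` at the bottom of this script.
# Example invocation (the test selection is illustrative only):
#   bash .buildkite/run-amd-test.sh "'pytest -v -s tests/basic_correctness'"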
2 | set -ex 3 | 4 | # Print ROCm version 5 | echo "--- ROCm info" 6 | rocminfo 7 | 8 | echo "--- Resetting GPUs" 9 | 10 | echo "reset" > /opt/amdgpu/etc/gpu_state 11 | 12 | while true; do 13 | sleep 3 14 | if grep -q clean /opt/amdgpu/etc/gpu_state; then 15 | echo "GPUs state is \"clean\"" 16 | break 17 | fi 18 | done 19 | 20 | echo "--- Building container" 21 | sha=$(git rev-parse --short HEAD) 22 | container_name=rocm_${sha} 23 | docker build \ 24 | -t ${container_name} \ 25 | -f Dockerfile.rocm \ 26 | --progress plain \ 27 | . 28 | 29 | remove_docker_container() { 30 | docker rm -f ${container_name} || docker image rm -f ${container_name} || true 31 | } 32 | trap remove_docker_container EXIT 33 | 34 | echo "--- Running container" 35 | 36 | docker run \ 37 | --device /dev/kfd --device /dev/dri \ 38 | --network host \ 39 | --rm \ 40 | -e HF_TOKEN \ 41 | --name ${container_name} \ 42 | ${container_name} \ 43 | /bin/bash -c $(echo $1 | sed "s/^'//" | sed "s/'$//") 44 | 45 | -------------------------------------------------------------------------------- /.buildkite/run-cpu-test.sh: -------------------------------------------------------------------------------- 1 | # This script build the CPU docker image and run the offline inference inside the container. 2 | # It serves a sanity check for compilation and basic model usage. 3 | set -ex 4 | 5 | # Try building the docker image 6 | docker build -t cpu-test -f Dockerfile.cpu . 7 | 8 | # Setup cleanup 9 | remove_docker_container() { docker rm -f cpu-test || true; } 10 | trap remove_docker_container EXIT 11 | remove_docker_container 12 | 13 | # Run the image and launch offline inference 14 | docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 examples/offline_inference.py 15 | -------------------------------------------------------------------------------- /.buildkite/run-neuron-test.sh: -------------------------------------------------------------------------------- 1 | # This script build the Neuron docker image and run the API server inside the container. 2 | # It serves a sanity check for compilation and basic model usage. 3 | set -e 4 | 5 | # Try building the docker image 6 | aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com 7 | 8 | # prune old image and containers to save disk space, and only once a day 9 | # by using a timestamp file in tmp. 10 | if [ -f /tmp/neuron-docker-build-timestamp ]; then 11 | last_build=$(cat /tmp/neuron-docker-build-timestamp) 12 | current_time=$(date +%s) 13 | if [ $((current_time - last_build)) -gt 86400 ]; then 14 | docker system prune -f 15 | echo $current_time > /tmp/neuron-docker-build-timestamp 16 | fi 17 | else 18 | echo $(date +%s) > /tmp/neuron-docker-build-timestamp 19 | fi 20 | 21 | docker build -t neuron -f Dockerfile.neuron . 
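# The "neuron" image built above is run below with two Neuron devices exposed
# (/dev/neuron0 and /dev/neuron1), matching --tensor-parallel-size 2 in the server command.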
22 | 23 | # Setup cleanup 24 | remove_docker_container() { docker rm -f neuron || true; } 25 | trap remove_docker_container EXIT 26 | remove_docker_container 27 | 28 | # Run the image 29 | docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \ 30 | --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 & 31 | 32 | # Wait for the server to start 33 | wait_for_server_to_start() { 34 | timeout=300 35 | counter=0 36 | 37 | while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do 38 | sleep 1 39 | counter=$((counter + 1)) 40 | if [ $counter -ge $timeout ]; then 41 | echo "Timeout after $timeout seconds" 42 | break 43 | fi 44 | done 45 | } 46 | wait_for_server_to_start 47 | 48 | # Test a simple prompt 49 | curl -X POST -H "Content-Type: application/json" \ 50 | localhost:8000/generate \ 51 | -d '{"prompt": "San Francisco is a"}' 52 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | vllm/*.so 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/100-documentation.yml: -------------------------------------------------------------------------------- 1 | name: 📚 Documentation 2 | description: Report an issue related to https://docs.vllm.ai/ 3 | title: "[Doc]: " 4 | labels: ["documentation"] 5 | 6 | body: 7 | - type: textarea 8 | attributes: 9 | label: 📚 The doc issue 10 | description: > 11 | A clear and concise description of what content in https://docs.vllm.ai/ is an issue. 12 | validations: 13 | required: true 14 | - type: textarea 15 | attributes: 16 | label: Suggest a potential alternative/fix 17 | description: > 18 | Tell us how we could improve the documentation in this regard. 19 | - type: markdown 20 | attributes: 21 | value: > 22 | Thanks for contributing 🎉! 23 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/200-installation.yml: -------------------------------------------------------------------------------- 1 | name: 🛠️ Installation 2 | description: Report an issue here when you hit errors during installation. 3 | title: "[Installation]: " 4 | labels: ["installation"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | - type: textarea 12 | attributes: 13 | label: Your current environment 14 | description: | 15 | Please run the following and paste the output below. 16 | ```sh 17 | wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py 18 | # For security purposes, please feel free to check the contents of collect_env.py before running it. 19 | python collect_env.py 20 | ``` 21 | It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues. 
22 | value: | 23 | ```text 24 | The output of `python collect_env.py` 25 | ``` 26 | validations: 27 | required: true 28 | - type: textarea 29 | attributes: 30 | label: How you are installing vllm 31 | description: | 32 | Paste the full command you are trying to execute. 33 | value: | 34 | ```sh 35 | pip install -vvv vllm 36 | ``` 37 | - type: markdown 38 | attributes: 39 | value: > 40 | Thanks for contributing 🎉! 41 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/300-usage.yml: -------------------------------------------------------------------------------- 1 | name: 💻 Usage 2 | description: Raise an issue here if you don't know how to use vllm. 3 | title: "[Usage]: " 4 | labels: ["usage"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | - type: textarea 12 | attributes: 13 | label: Your current environment 14 | description: | 15 | Please run the following and paste the output below. 16 | ```sh 17 | wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py 18 | # For security purposes, please feel free to check the contents of collect_env.py before running it. 19 | python collect_env.py 20 | ``` 21 | It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues. 22 | value: | 23 | ```text 24 | The output of `python collect_env.py` 25 | ``` 26 | validations: 27 | required: true 28 | - type: textarea 29 | attributes: 30 | label: How would you like to use vllm 31 | description: | 32 | A detailed description of how you want to use vllm. 33 | value: | 34 | I want to run inference of a [specific model](put link here). I don't know how to integrate it with vllm. 35 | - type: markdown 36 | attributes: 37 | value: > 38 | Thanks for contributing 🎉! 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/500-feature request.yml: -------------------------------------------------------------------------------- 1 | name: 🚀 Feature request 2 | description: Submit a proposal/request for a new vllm feature 3 | title: "[Feature]: " 4 | labels: ["feature request"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | - type: textarea 12 | attributes: 13 | label: 🚀 The feature, motivation and pitch 14 | description: > 15 | A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too. 16 | validations: 17 | required: true 18 | - type: textarea 19 | attributes: 20 | label: Alternatives 21 | description: > 22 | A description of any alternative solutions or features you've considered, if any. 23 | - type: textarea 24 | attributes: 25 | label: Additional context 26 | description: > 27 | Add any other context or screenshots about the feature request. 
28 | - type: markdown 29 | attributes: 30 | value: > 31 | Thanks for contributing 🎉! 32 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/600-new model.yml: -------------------------------------------------------------------------------- 1 | name: 🤗 Support request for a new model from huggingface 2 | description: Submit a proposal/request for a new model from huggingface 3 | title: "[New Model]: " 4 | labels: ["new model"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | 12 | #### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model. 13 | - type: textarea 14 | attributes: 15 | label: The model to consider. 16 | description: > 17 | A huggingface url, pointing to the model, e.g. https://huggingface.co/openai-community/gpt2 . 18 | validations: 19 | required: true 20 | - type: textarea 21 | attributes: 22 | label: The closest model vllm already supports. 23 | description: > 24 | Here is the list of models already supported by vllm: https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models . Which model is the most similar to the model you want to add support for? 25 | - type: textarea 26 | attributes: 27 | label: What's your difficulty of supporting the model you want? 28 | description: > 29 | For example, any new operators or new architecture? 30 | - type: markdown 31 | attributes: 32 | value: > 33 | Thanks for contributing 🎉! 34 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/700-performance discussion.yml: -------------------------------------------------------------------------------- 1 | name: ⚡ Discussion on the performance of vllm 2 | description: Submit a proposal/discussion about the performance of vllm 3 | title: "[Performance]: " 4 | labels: ["performance"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | - type: textarea 12 | attributes: 13 | label: Proposal to improve performance 14 | description: > 15 | How do you plan to improve vllm's performance? 16 | validations: 17 | required: false 18 | - type: textarea 19 | attributes: 20 | label: Report of performance regression 21 | description: > 22 | Please provide detailed description of performance comparison to confirm the regression. You may want to run the benchmark script at https://github.com/vllm-project/vllm/tree/main/benchmarks . 23 | validations: 24 | required: false 25 | - type: textarea 26 | attributes: 27 | label: Misc discussion on performance 28 | description: > 29 | Anything about the performance. 30 | validations: 31 | required: false 32 | - type: textarea 33 | attributes: 34 | label: Your current environment (if you think it is necessary) 35 | description: | 36 | Please run the following and paste the output below. 37 | ```sh 38 | wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py 39 | # For security purposes, please feel free to check the contents of collect_env.py before running it. 
40 | python collect_env.py 41 | ``` 42 | It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues. 43 | value: | 44 | ```text 45 | The output of `python collect_env.py` 46 | ``` 47 | validations: 48 | required: false 49 | - type: markdown 50 | attributes: 51 | value: > 52 | Thanks for contributing 🎉! 53 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/750-RFC.yml: -------------------------------------------------------------------------------- 1 | name: 💬 Request for comments (RFC). 2 | description: Ask for feedback on major architectural changes or design choices. 3 | title: "[RFC]: " 4 | labels: ["RFC"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Please take a look at previous [RFCs](https://github.com/vllm-project/vllm/issues?q=label%3ARFC+sort%3Aupdated-desc) for reference. 11 | - type: textarea 12 | attributes: 13 | label: Motivation. 14 | description: > 15 | The motivation of the RFC. 16 | validations: 17 | required: true 18 | - type: textarea 19 | attributes: 20 | label: Proposed Change. 21 | description: > 22 | The proposed change of the RFC. 23 | validations: 24 | required: true 25 | - type: textarea 26 | attributes: 27 | label: Feedback Period. 28 | description: > 29 | The feedback period of the RFC. Usually at least one week. 30 | validations: 31 | required: false 32 | - type: textarea 33 | attributes: 34 | label: CC List. 35 | description: > 36 | The list of people you want to CC. 37 | validations: 38 | required: false 39 | - type: textarea 40 | attributes: 41 | label: Any Other Things. 42 | description: > 43 | Any other things you would like to mention. 44 | validations: 45 | required: false 46 | - type: markdown 47 | attributes: 48 | value: > 49 | Thanks for contributing 🎉! 50 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/800-misc discussion.yml: -------------------------------------------------------------------------------- 1 | name: 🎲 Misc/random discussions that do not fit into the above categories. 2 | description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues. 3 | title: "[Misc]: " 4 | labels: ["misc"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). 11 | - type: textarea 12 | attributes: 13 | label: Anything you want to discuss about vllm. 14 | description: > 15 | Anything you want to discuss about vllm. 16 | validations: 17 | required: true 18 | - type: markdown 19 | attributes: 20 | value: > 21 | Thanks for contributing 🎉! 
22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | -------------------------------------------------------------------------------- /.github/workflows/mypy.yaml: -------------------------------------------------------------------------------- 1 | name: mypy 2 | 3 | on: 4 | # Trigger the workflow on push or pull request, 5 | # but only for the main branch 6 | push: 7 | branches: 8 | - main 9 | pull_request: 10 | branches: 11 | - main 12 | 13 | jobs: 14 | ruff: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: ["3.8", "3.9", "3.10", "3.11"] 19 | steps: 20 | - uses: actions/checkout@v2 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v2 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install mypy==1.9.0 29 | pip install types-setuptools 30 | pip install types-PyYAML 31 | pip install types-requests 32 | pip install types-setuptools 33 | - name: Mypy 34 | run: | 35 | mypy vllm/attention --config-file pyproject.toml 36 | mypy vllm/core --config-file pyproject.toml 37 | mypy vllm/distributed --config-file pyproject.toml 38 | mypy vllm/entrypoints --config-file pyproject.toml 39 | mypy vllm/executor --config-file pyproject.toml 40 | mypy vllm/usage --config-file pyproject.toml 41 | mypy vllm/*.py --config-file pyproject.toml 42 | mypy vllm/transformers_utils --config-file pyproject.toml 43 | mypy vllm/engine --config-file pyproject.toml 44 | mypy vllm/worker --config-file pyproject.toml 45 | mypy vllm/spec_decode --config-file pyproject.toml 46 | mypy vllm/model_executor --config-file pyproject.toml 47 | mypy vllm/lora --config-file pyproject.toml 48 | mypy vllm/logging --config-file pyproject.toml 49 | mypy vllm/model_executor --config-file pyproject.toml 50 | 51 | -------------------------------------------------------------------------------- /.github/workflows/ruff.yml: -------------------------------------------------------------------------------- 1 | name: ruff 2 | 3 | on: 4 | # Trigger the workflow on push or pull request, 5 | # but only for the main branch 6 | push: 7 | branches: 8 | - main 9 | pull_request: 10 | branches: 11 | - main 12 | 13 | jobs: 14 | ruff: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: ["3.8", "3.9", "3.10", "3.11"] 19 | steps: 20 | - uses: actions/checkout@v2 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v2 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2 29 | - name: Analysing the code with ruff 30 | run: | 31 | ruff . 32 | - name: Spelling check with codespell 33 | run: | 34 | codespell --toml pyproject.toml 35 | - name: Run isort 36 | run: | 37 | isort . 
--check-only 38 | -------------------------------------------------------------------------------- /.github/workflows/scripts/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python_executable=python$1 4 | cuda_home=/usr/local/cuda-$2 5 | 6 | # Update paths 7 | PATH=${cuda_home}/bin:$PATH 8 | LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH 9 | 10 | # Install requirements 11 | $python_executable -m pip install wheel packaging 12 | $python_executable -m pip install -r requirements-cuda.txt 13 | 14 | # Limit the number of parallel jobs to avoid OOM 15 | export MAX_JOBS=1 16 | # Make sure punica is built for the release (for LoRA) 17 | export VLLM_INSTALL_PUNICA_KERNELS=1 18 | # Make sure release wheels are built for the following architectures 19 | export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" 20 | # Build 21 | $python_executable setup.py bdist_wheel --dist-dir=dist 22 | -------------------------------------------------------------------------------- /.github/workflows/scripts/create_release.js: -------------------------------------------------------------------------------- 1 | // Uses Github's API to create the release and wait for result. 2 | // We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately. 3 | 4 | module.exports = async (github, context, core) => { 5 | try { 6 | const response = await github.rest.repos.createRelease({ 7 | draft: false, 8 | generate_release_notes: true, 9 | name: process.env.RELEASE_TAG, 10 | owner: context.repo.owner, 11 | prerelease: true, 12 | repo: context.repo.repo, 13 | tag_name: process.env.RELEASE_TAG, 14 | }); 15 | 16 | core.setOutput('upload_url', response.data.upload_url); 17 | } catch (error) { 18 | core.setFailed(error.message); 19 | } 20 | } -------------------------------------------------------------------------------- /.github/workflows/scripts/cuda-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Replace '.' with '-' ex: 11.8 -> 11-8 4 | cuda_version=$(echo $1 | tr "." "-") 5 | # Removes '-' and '.' 
ex: ubuntu-20.04 -> ubuntu2004 6 | OS=$(echo $2 | tr -d ".\-") 7 | 8 | # Installs CUDA 9 | wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb 10 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 11 | rm cuda-keyring_1.1-1_all.deb 12 | sudo apt -qq update 13 | sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version} 14 | sudo apt clean 15 | 16 | # Test nvcc 17 | PATH=/usr/local/cuda-$1/bin:${PATH} 18 | nvcc --version 19 | 20 | # Log gcc, g++, c++ versions 21 | gcc --version 22 | g++ --version 23 | c++ --version 24 | -------------------------------------------------------------------------------- /.github/workflows/scripts/env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This file installs common linux environment tools 4 | 5 | export LANG C.UTF-8 6 | 7 | # python_version=$1 8 | 9 | sudo apt-get update && \ 10 | sudo apt-get install -y --no-install-recommends \ 11 | software-properties-common \ 12 | 13 | sudo apt-get install -y --no-install-recommends \ 14 | build-essential \ 15 | apt-utils \ 16 | ca-certificates \ 17 | wget \ 18 | git \ 19 | vim \ 20 | libssl-dev \ 21 | curl \ 22 | unzip \ 23 | unrar \ 24 | cmake \ 25 | net-tools \ 26 | sudo \ 27 | autotools-dev \ 28 | rsync \ 29 | jq \ 30 | openssh-server \ 31 | tmux \ 32 | screen \ 33 | htop \ 34 | pdsh \ 35 | openssh-client \ 36 | lshw \ 37 | dmidecode \ 38 | util-linux \ 39 | automake \ 40 | autoconf \ 41 | libtool \ 42 | net-tools \ 43 | pciutils \ 44 | libpci-dev \ 45 | libaio-dev \ 46 | libcap2 \ 47 | libtinfo5 \ 48 | fakeroot \ 49 | devscripts \ 50 | debhelper \ 51 | nfs-common 52 | 53 | # Remove github bloat files to free up disk space 54 | sudo rm -rf "/usr/local/share/boost" 55 | sudo rm -rf "$AGENT_TOOLSDIRECTORY" 56 | sudo rm -rf "/usr/share/dotnet" 57 | -------------------------------------------------------------------------------- /.github/workflows/scripts/pytorch-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python_executable=python$1 4 | pytorch_version=$2 5 | cuda_version=$3 6 | 7 | # Install torch 8 | $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya 9 | $python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./} 10 | 11 | # Print version information 12 | $python_executable --version 13 | $python_executable -c "import torch; print('PyTorch:', torch.__version__)" 14 | $python_executable -c "import torch; print('CUDA:', torch.version.cuda)" 15 | $python_executable -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)" 16 | -------------------------------------------------------------------------------- /.github/workflows/yapf.yml: -------------------------------------------------------------------------------- 1 | name: yapf 2 | 3 | on: 4 | # Trigger the workflow on push or pull request, 5 | # but only for the main branch 6 | push: 7 | branches: 8 | - main 9 | pull_request: 10 | branches: 11 | - main 12 | jobs: 13 | yapf: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: ["3.8", "3.9", "3.10", "3.11"] 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v2 22 | with: 
23 | python-version: ${{ matrix.python-version }} 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install yapf==0.32.0 28 | pip install toml==0.10.2 29 | - name: Running yapf 30 | run: | 31 | yapf --diff --recursive . 32 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | version: 2 5 | 6 | build: 7 | os: ubuntu-22.04 8 | tools: 9 | python: "3.8" 10 | 11 | sphinx: 12 | configuration: docs/source/conf.py 13 | 14 | # If using Sphinx, optionally build your docs in additional formats such as PDF 15 | formats: 16 | - pdf 17 | 18 | # Optionally declare the Python requirements required to build your docs 19 | python: 20 | install: 21 | - requirements: docs/requirements-docs.txt 22 | -------------------------------------------------------------------------------- /.yapfignore: -------------------------------------------------------------------------------- 1 | collect_env.py 2 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to vLLM 2 | 3 | Thank you for your interest in contributing to vLLM! 4 | Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. 5 | There are several ways you can contribute to the project: 6 | 7 | - Identify and report any issues or bugs. 8 | - Request or add a new model. 9 | - Suggest or implement new features. 10 | 11 | However, remember that contributions aren't just about code. 12 | We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions. 13 | 14 | Finally, one of the most impactful ways to support us is by raising awareness about vLLM. 15 | Talk about it in your blog posts, highlighting how it's driving your incredible projects. 16 | Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository. 17 | 18 | 19 | ## Setup for development 20 | 21 | ### Build from source 22 | 23 | ```bash 24 | pip install -e . # This may take several minutes. 25 | ``` 26 | 27 | ### Testing 28 | 29 | ```bash 30 | pip install -r requirements-dev.txt 31 | 32 | # linting and formatting 33 | bash format.sh 34 | # Static type checking 35 | mypy 36 | # Unit tests 37 | pytest tests/ 38 | ``` 39 | **Note:** Currently, the repository does not pass the mypy tests. 40 | 41 | 42 | ## Contributing Guidelines 43 | 44 | ### Issue Reporting 45 | 46 | If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it. 47 | If not, please file a new issue, providing as much relevant information as possible. 48 | 49 | ### Pull Requests & Code Reviews 50 | 51 | Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE.md) for detailed guide for contribution. 52 | 53 | ### Thank You 54 | 55 | Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. 56 | Your contributions make vLLM a great tool for everyone! 
57 | -------------------------------------------------------------------------------- /Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | # This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform. 2 | 3 | FROM ubuntu:22.04 4 | 5 | RUN apt-get update -y \ 6 | && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \ 7 | && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 8 | 9 | RUN pip install --upgrade pip \ 10 | && pip install wheel packaging ninja setuptools>=49.4.0 numpy 11 | 12 | COPY ./ /workspace/vllm 13 | 14 | WORKDIR /workspace/vllm 15 | 16 | RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu 17 | 18 | RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install 19 | 20 | CMD ["/bin/bash"] 21 | -------------------------------------------------------------------------------- /Dockerfile.neuron: -------------------------------------------------------------------------------- 1 | # default base image 2 | ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04" 3 | 4 | FROM $BASE_IMAGE 5 | 6 | RUN echo "Base image is $BASE_IMAGE" 7 | 8 | # Install some basic utilities 9 | RUN apt-get update && apt-get install python3 python3-pip -y 10 | 11 | ### Mount Point ### 12 | # When launching the container, mount the code directory to /app 13 | ARG APP_MOUNT=/app 14 | VOLUME [ ${APP_MOUNT} ] 15 | WORKDIR ${APP_MOUNT} 16 | 17 | RUN python3 -m pip install --upgrade pip 18 | RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas 19 | RUN python3 -m pip install sentencepiece transformers==4.36.2 -U 20 | RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U 21 | RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U 22 | 23 | COPY ./vllm /app/vllm/vllm 24 | COPY ./setup.py /app/vllm/setup.py 25 | COPY ./requirements-common.txt /app/vllm/requirements-common.txt 26 | COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt 27 | 28 | RUN cd /app/vllm \ 29 | && python3 -m pip install -U -r requirements-neuron.txt 30 | 31 | ENV VLLM_BUILD_WITH_NEURON 1 32 | RUN cd /app/vllm \ 33 | && pip install -e . \ 34 | && cd .. 
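# Example build/run of this image (mirrors .buildkite/run-neuron-test.sh; the model and flags are illustrative only):
#   docker build -t neuron -f Dockerfile.neuron .
#   docker run --device=/dev/neuron0 --network host neuron python3 -m vllm.entrypoints.api_server --model <model> --device neuron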
35 | 36 | CMD ["/bin/bash"] 37 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include requirements-common.txt 3 | include requirements-cuda.txt 4 | include requirements-rocm.txt 5 | include requirements-neuron.txt 6 | include requirements-cpu.txt 7 | include CMakeLists.txt 8 | 9 | recursive-include cmake * 10 | recursive-include csrc * 11 | -------------------------------------------------------------------------------- /README_vllm_musa.md: -------------------------------------------------------------------------------- 1 | # vllm_musa 2 | 3 | Moore Threads is committed to building a complete and easy-to-use domestic GPU application ecosystem, and has independently developed the MUSA architecture and software platform. vLLM is an inference and serving engine for large language models that is widely used in the industry and relies on CUDA/ROCm for GPU acceleration. To make the vLLM framework easy to use for Moore Threads GPU users, we started the vllm_musa open-source project, which adds MUSA acceleration to vLLM so that users can unleash the full compute power of Moore Threads GPUs. 4 | 5 | The existing vLLM code does not support Moore Threads GPUs as a backend, so we added a new MUSA device backend. The vllm_musa API is identical to the official one: no changes to application code are needed, and it works out of the box. 6 | 7 | A major advantage of MUSA is its CUDA compatibility. With the musify tool we can quickly port the official code to the MUSA software stack, and users can follow the documentation to upgrade to a newer vLLM version and adapt it to the MUSA stack themselves. 8 | 9 | ## Dependencies 10 | 11 | - musa_toolkit >= dev3.0.0 12 | - pytorch >= v2.2.0 13 | - [torch_musa](https://github.com/MooreThreads/torch_musa) >= v1.3.0 14 | - triton >= v2.2.0 15 | - ray >= 2.9 16 | - vllm v0.4.2 17 | 18 | ## Usage 19 | ### Build 20 | Run `bash build_musa.sh` 21 | ### Test example 22 | ``` 23 | from vllm import LLM, SamplingParams 24 | from transformers import AutoTokenizer, LlamaForCausalLM 25 | import transformers 26 | import time 27 | import torch 28 | import torch_musa 29 | 30 | 31 | model_path = 32 | 33 | prompts = [ 34 | "Hello, my name is", 35 | "The president of the United States is", 36 | "The capital of France is", 37 | "The future of AI is", 38 | ] 39 | 40 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 41 | llm = LLM(model=model_path, trust_remote_code=True, device="musa") 42 | 43 | outputs = llm.generate(prompts, sampling_params) 44 | 45 | # Print the outputs. 46 | for output in outputs: 47 | prompt = output.prompt 48 | generated_text = output.outputs[0].text 49 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 50 | 51 | ``` 52 | 53 | ## Porting 54 | 55 | This repository is ported from vLLM v0.4.2. If you want to use a newer version of vLLM, simply run `musa_porting.py` to adapt the native CUDA code to MUSA code. As vLLM evolves, some code may slip through and fail to be ported; in that case you can modify the text-replacement rules in `musa_porting.py` yourself and take full advantage of MUSA's strong CUDA compatibility. 56 | 57 | ### Steps 58 | 1. Run `python musa_porting.py` 59 | 2. In `CMakeLists.txt`, change the suffix of the files to be compiled from `.cu` to `.mu` 60 | 3. Build and run vllm_musa
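For reference, the steps above can be driven from the repository root roughly as follows (a minimal sketch: the `sed` one-liner for step 2 is only an assumed convenience, and the suffix change can just as well be made by hand):

```bash
# Step 1: apply the text-replacement rules to port the CUDA sources to MUSA.
python musa_porting.py
# Step 2: switch the compiled sources in CMakeLists.txt from .cu to .mu
# (illustrative sed; review the result before building).
sed -i 's/\.cu\b/.mu/g' CMakeLists.txt
# Step 3: build and install the MUSA wheel.
bash build_musa.sh
```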
61 | 62 | ## Contributing 63 | 64 | Users and developers are welcome to use vllm_musa and send feedback to help keep improving its functionality and performance. 65 | 66 | We hope to build the MUSA software ecosystem together with the wider developer community, and we will continue to release a series of open-source MUSA acceleration projects. -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking vLLM 2 | 3 | ## Downloading the ShareGPT dataset 4 | 5 | You can download the dataset by running: 6 | ```bash 7 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json 8 | ``` 9 | -------------------------------------------------------------------------------- /benchmarks/launch_tgi_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PORT=8000 4 | MODEL=$1 5 | TOKENS=$2 6 | 7 | docker run --gpus all --shm-size 1g -p $PORT:80 \ 8 | -v $PWD/data:/data \ 9 | ghcr.io/huggingface/text-generation-inference:1.4.0 \ 10 | --model-id $MODEL \ 11 | --sharded false \ 12 | --max-input-length 1024 \ 13 | --max-total-tokens 2048 \ 14 | --max-best-of 5 \ 15 | --max-concurrent-requests 5000 \ 16 | --max-batch-total-tokens $TOKENS 17 | -------------------------------------------------------------------------------- /build_musa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | pip install -r requirements-build.txt 7 | pip install -r requirements-musa.txt 8 | 9 | export VLLM_TARGET_DEVICE=musa 10 | export CMAKE_BUILD_TYPE=Debug 11 | export VERBOSE=1 12 | export VLLM_ATTENTION_BACKEND=FLASH_ATTN 13 | 14 | rm -rf build 15 | rm -rf dist 16 | rm -rf vllm.egg-info 17 | pip uninstall -y vllm 18 | 19 | python setup.py bdist_wheel 20 | pip install dist/* -------------------------------------------------------------------------------- /csrc_musa/attention/attention_dtypes.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "attention_generic.muh" 4 | #include "dtype_float16.muh" 5 | #include "dtype_float32.muh" 6 | #include "dtype_bfloat16.muh" 7 | #include "dtype_fp8.muh" 8 | -------------------------------------------------------------------------------- /csrc_musa/attention/attention_generic.muh: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h 3 | * Copyright (c) 2024 - 2024 Moore Threads Technology Co., Ltd("Moore Threads"). All rights reserved. 4 | * Copyright (c) 2023, The vLLM team. 5 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 6 | * 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | #pragma once 20 | 21 | #include <stdint.h> 22 | 23 | namespace vllm { 24 |
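// Note: this header only declares empty primary templates. The per-dtype
// specializations of Vec/FloatVec and of the arithmetic helpers below live in
// dtype_float16.muh, dtype_bfloat16.muh, dtype_float32.muh and dtype_fp8.muh,
// which are pulled in together through attention_dtypes.h.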
25 | // A vector type to store Q, K, V elements. 26 | template<typename T, int VEC_SIZE> 27 | struct Vec {}; 28 | 29 | // A vector type to store FP32 accumulators. 30 | template<typename T> 31 | struct FloatVec {}; 32 | 33 | // Template vector operations. 34 | template<typename Acc, typename A, typename B> 35 | inline __device__ Acc mul(A a, B b); 36 | 37 | template<typename T> 38 | inline __device__ float sum(T v); 39 | 40 | template<typename T> 41 | inline __device__ float dot(T a, T b) { 42 | return sum(mul<T, T, T>(a, b)); 43 | } 44 | 45 | template<typename A, typename T> 46 | inline __device__ float dot(T a, T b) { 47 | return sum(mul<A, T, T>(a, b)); 48 | } 49 | 50 | template<typename T> 51 | inline __device__ void zero(T& dst) { 52 | constexpr int WORDS = sizeof(T) / 4; 53 | union { 54 | T raw; 55 | uint32_t words[WORDS]; 56 | } tmp; 57 | 58 | #pragma unroll 59 | for (int ii = 0; ii < WORDS; ++ii) { 60 | tmp.words[ii] = 0u; 61 | } 62 | dst = tmp.raw; 63 | } 64 | 65 | } // namespace vllm 66 | -------------------------------------------------------------------------------- /csrc_musa/attention/attention_utils.muh: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp 3 | * Copyright (c) 2024 - 2024 Moore Threads Technology Co., Ltd("Moore Threads"). All rights reserved. 4 | * Copyright (c) 2023, The vLLM team. 5 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 6 | * 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | #pragma once 20 | 21 | #include "../musa_compat.h" 22 | #include "attention_dtypes.h" 23 | 24 | #include <float.h> 25 | #include <type_traits> 26 | 27 | namespace vllm { 28 | 29 | // Q*K^T operation. 30 | template<int THREAD_GROUP_SIZE, typename Vec, int N> 31 | inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) { 32 | using A_vec = typename FloatVec<Vec>::Type; 33 | // Compute the parallel products for Q*K^T (treat vector lanes separately). 34 | A_vec qk_vec = mul<A_vec, Vec, Vec>(q[0], k[0]); 35 | #pragma unroll 36 | for (int ii = 1; ii < N; ++ii) { 37 | qk_vec = fma(q[ii], k[ii], qk_vec); 38 | } 39 | 40 | // Finalize the reduction across lanes.
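// sum() first collapses the lanes of qk_vec into one scalar per thread; the
// XOR shuffle below then adds those scalars across the THREAD_GROUP_SIZE
// threads cooperating on this token (e.g. masks 2 and 1 for a group of 4),
// so every thread in the group ends up holding the complete Q*K^T value.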
41 | float qk = sum(qk_vec); 42 | #pragma unroll 43 | for (int mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) { 44 | qk += VLLM_SHFL_XOR_SYNC(qk, mask); 45 | } 46 | return qk; 47 | } 48 | 49 | template 50 | struct Qk_dot { 51 | template 52 | static inline __device__ float dot(const Vec (&q)[N], const Vec (&k)[N]) { 53 | return qk_dot_(q, k); 54 | } 55 | }; 56 | 57 | } // namespace vllm 58 | -------------------------------------------------------------------------------- /csrc_musa/attention/dtype_fp8.muh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "attention_generic.muh" 4 | 5 | #include 6 | #ifdef ENABLE_FP8_E5M2 7 | #include 8 | #endif 9 | 10 | namespace vllm { 11 | #if defined(ENABLE_FP8_E5M2) || defined(ENABLE_FP8_E4M3) 12 | // fp8 vector types for quantization of kv cache 13 | 14 | template<> 15 | struct Vec { 16 | using Type = uint8_t; 17 | }; 18 | 19 | template<> 20 | struct Vec { 21 | using Type = uint16_t; 22 | }; 23 | 24 | template<> 25 | struct Vec { 26 | using Type = uint32_t; 27 | }; 28 | 29 | template<> 30 | struct Vec { 31 | using Type = uint2; 32 | }; 33 | #endif // ENABLE_FP8_E5M2 34 | 35 | } // namespace vllm 36 | -------------------------------------------------------------------------------- /csrc_musa/cache.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | void swap_blocks( 9 | torch::Tensor& src, 10 | torch::Tensor& dst, 11 | const std::map& block_mapping); 12 | 13 | void copy_blocks( 14 | std::vector& key_caches, 15 | std::vector& value_caches, 16 | const std::map>& block_mapping); 17 | 18 | void reshape_and_cache( 19 | torch::Tensor& key, 20 | torch::Tensor& value, 21 | torch::Tensor& key_cache, 22 | torch::Tensor& value_cache, 23 | torch::Tensor& slot_mapping, 24 | const std::string& kv_cache_dtype, 25 | const float kv_scale); 26 | 27 | void reshape_and_cache_flash( 28 | torch::Tensor& key, 29 | torch::Tensor& value, 30 | torch::Tensor& key_cache, 31 | torch::Tensor& value_cache, 32 | torch::Tensor& slot_mapping, 33 | const std::string& kv_cache_dtype); 34 | 35 | // Just for unittest 36 | void convert_fp8( 37 | torch::Tensor& src_cache, 38 | torch::Tensor& dst_cache); 39 | -------------------------------------------------------------------------------- /csrc_musa/cpu/pybind.cpp: -------------------------------------------------------------------------------- 1 | #include "cache.h" 2 | #include "cuda_utils.h" 3 | #include "ops.h" 4 | #include 5 | 6 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 7 | // vLLM custom ops 8 | pybind11::module ops = m.def_submodule("ops", "vLLM custom operators"); 9 | 10 | // Attention ops 11 | ops.def( 12 | "paged_attention_v1", 13 | &paged_attention_v1, 14 | "Compute the attention between an input query and the cached keys/values using PagedAttention."); 15 | ops.def( 16 | "paged_attention_v2", 17 | &paged_attention_v2, 18 | "PagedAttention V2."); 19 | 20 | // Activation ops 21 | ops.def( 22 | "silu_and_mul", 23 | &silu_and_mul, 24 | "Activation function used in SwiGLU."); 25 | ops.def( 26 | "gelu_and_mul", 27 | &gelu_and_mul, 28 | "Activation function used in GeGLU with `none` approximation."); 29 | ops.def( 30 | "gelu_tanh_and_mul", 31 | &gelu_tanh_and_mul, 32 | "Activation function used in GeGLU with `tanh` approximation."); 33 | ops.def( 34 | "gelu_new", 35 | &gelu_new, 36 | "GELU implementation used in GPT-2."); 37 | ops.def( 38 | "gelu_fast", 39 | 
&gelu_fast, 40 | "Approximate GELU implementation."); 41 | 42 | // Layernorm 43 | ops.def( 44 | "rms_norm", 45 | &rms_norm, 46 | "Apply Root Mean Square (RMS) Normalization to the input tensor."); 47 | 48 | ops.def( 49 | "fused_add_rms_norm", 50 | &fused_add_rms_norm, 51 | "In-place fused Add and RMS Normalization"); 52 | 53 | // Rotary embedding 54 | ops.def( 55 | "rotary_embedding", 56 | &rotary_embedding, 57 | "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); 58 | 59 | // Cache ops 60 | pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops"); 61 | cache_ops.def( 62 | "swap_blocks", 63 | &swap_blocks, 64 | "Swap in (out) the cache blocks from src to dst"); 65 | cache_ops.def( 66 | "copy_blocks", 67 | ©_blocks, 68 | "Copy the cache blocks from src to dst"); 69 | cache_ops.def( 70 | "reshape_and_cache", 71 | &reshape_and_cache, 72 | "Reshape the key and value tensors and cache them"); 73 | } 74 | -------------------------------------------------------------------------------- /csrc_musa/dispatch_utils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from 3 | * https://github.com/pytorch/pytorch/blob/v2.0.1/aten/src/ATen/Dispatch.h 4 | */ 5 | #pragma once 6 | 7 | #include 8 | 9 | #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ 10 | AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ 11 | AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ 12 | AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) 13 | 14 | #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ 15 | AT_DISPATCH_SWITCH( \ 16 | TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) 17 | 18 | #define VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(...) \ 19 | AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ 20 | AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ 21 | AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ 22 | AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) 23 | 24 | #define VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(TYPE, NAME, ...) \ 25 | AT_DISPATCH_SWITCH( \ 26 | TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(__VA_ARGS__)) 27 | 28 | #define VLLM_DISPATCH_CASE_INTEGRAL_TYPES(...) \ 29 | AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) \ 30 | AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \ 31 | AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \ 32 | AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \ 33 | AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) 34 | 35 | #define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) 
\ 36 | AT_DISPATCH_SWITCH( \ 37 | TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__)) 38 | -------------------------------------------------------------------------------- /csrc_musa/moe/moe_ops.cpp: -------------------------------------------------------------------------------- 1 | #include "moe_ops.h" 2 | 3 | #include 4 | 5 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 6 | m.def("topk_softmax", &topk_softmax, "Apply topk softmax to the gating outputs."); 7 | } 8 | -------------------------------------------------------------------------------- /csrc_musa/moe/moe_ops.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | void topk_softmax( 6 | torch::Tensor& topk_weights, 7 | torch::Tensor& topk_indices, 8 | torch::Tensor& token_expert_indices, 9 | torch::Tensor& gating_output); 10 | -------------------------------------------------------------------------------- /csrc_musa/musa_compat.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef USE_ROCM 4 | #include 5 | #endif 6 | 7 | #ifndef USE_ROCM 8 | #define WARP_SIZE 32 9 | #else 10 | #define WARP_SIZE warpSize 11 | #endif 12 | 13 | #ifndef USE_ROCM 14 | #define VLLM_LDG(arg) __ldg(arg) 15 | #else 16 | #define VLLM_LDG(arg) *(arg) 17 | #endif 18 | 19 | #ifndef USE_ROCM 20 | #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor_sync(uint32_t(-1), var, lane_mask) 21 | #else 22 | #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask) 23 | #endif 24 | 25 | #ifndef USE_ROCM 26 | #define VLLM_SHFL_SYNC(var, src_lane) __shfl_sync(uint32_t(-1), var, src_lane) 27 | #else 28 | #define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane) 29 | #endif 30 | 31 | #ifndef USE_ROCM 32 | #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ 33 | musaFuncSetAttribute(FUNC, musaFuncAttributeMaxDynamicSharedMemorySize, VAL) 34 | #else 35 | #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ 36 | hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL) 37 | #endif 38 | 39 | -------------------------------------------------------------------------------- /csrc_musa/musa_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | int get_device_attribute( 6 | int attribute, 7 | int device_id); 8 | 9 | int get_max_shared_memory_per_block_device_attribute( 10 | int device_id); 11 | -------------------------------------------------------------------------------- /csrc_musa/musa_utils_kernels.mu: -------------------------------------------------------------------------------- 1 | #ifdef USE_ROCM 2 | #include 3 | #include 4 | #endif 5 | int get_device_attribute( 6 | int attribute, 7 | int device_id) 8 | { 9 | int device, value; 10 | if (device_id < 0) { 11 | musaGetDevice(&device); 12 | } 13 | else { 14 | device = device_id; 15 | } 16 | musaDeviceGetAttribute(&value, static_cast(attribute), device); 17 | return value; 18 | } 19 | 20 | 21 | int get_max_shared_memory_per_block_device_attribute( 22 | int device_id) 23 | { 24 | int attribute; 25 | // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html 26 | // cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74 27 | 28 | #ifdef USE_ROCM 29 | attribute = hipDeviceAttributeMaxSharedMemoryPerBlock; 30 | #else 31 | attribute = musaDevAttrMaxSharedMemoryPerBlockOptin; 32 | #endif 33 | 34 | return 
get_device_attribute(attribute, device_id); 35 | } 36 | -------------------------------------------------------------------------------- /csrc_musa/punica/bgmv/bgmv_bf16_bf16_bf16.mu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, mt_bfloat16, mt_bfloat16, mt_bfloat16) 5 | FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, mt_bfloat16, mt_bfloat16, mt_bfloat16) 6 | -------------------------------------------------------------------------------- /csrc_musa/punica/bgmv/bgmv_bf16_fp32_bf16.mu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, mt_bfloat16, float, mt_bfloat16) 5 | FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, mt_bfloat16, float, mt_bfloat16) 6 | -------------------------------------------------------------------------------- /csrc_musa/punica/bgmv/bgmv_fp16_fp16_fp16.mu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_half, nv_half) 5 | FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_half, nv_half, nv_half) 6 | -------------------------------------------------------------------------------- /csrc_musa/punica/bgmv/bgmv_fp16_fp32_fp16.mu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, float, nv_half) 5 | FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_half, float, nv_half) 6 | -------------------------------------------------------------------------------- /csrc_musa/punica/bgmv/bgmv_fp32_bf16_bf16.mu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, mt_bfloat16, mt_bfloat16) 5 | FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, float, mt_bfloat16, mt_bfloat16) 6 | -------------------------------------------------------------------------------- /csrc_musa/punica/bgmv/bgmv_fp32_fp16_fp16.mu: -------------------------------------------------------------------------------- 1 | #include "bgmv_config.h" 2 | #include "bgmv_impl.cuh" 3 | 4 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_half, nv_half) 5 | FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, float, nv_half, nv_half) 6 | -------------------------------------------------------------------------------- /csrc_musa/punica/bgmv/generator.py: -------------------------------------------------------------------------------- 1 | DTYPES = ["fp16", "bf16", "fp32"] 2 | DTYPE_MAP = { 3 | "fp16": "nv_half", 4 | "bf16": "mt_bfloat16", 5 | "fp32": "float", 6 | } 7 | 8 | TEMPLATE = """ 9 | #include "bgmv_config.h" 10 | #include "bgmv_impl.cuh" 11 | 12 | FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, {input_dtype}, {output_dtype}, {weight_dtype}) 13 | FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, {input_dtype}, {output_dtype}, {weight_dtype}) 14 | """.lstrip() # noqa: E501 15 | 16 | for input_dtype in DTYPES: 17 | for output_dtype in DTYPES: 18 | for weight_dtype in DTYPES: 19 | if weight_dtype == "fp32": 20 | # FP32 weights are not supported. 21 | continue 22 | if output_dtype == "fp32": 23 | # LoRA A matrix. 
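# An fp32 output dtype marks the first ("shrink") GEMV of the LoRA pair, which
# accumulates the low-rank intermediate in fp32; the fp32-input branch below is the
# matching second ("expand") GEMV that writes results back in the model dtype.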
24 | if input_dtype != weight_dtype: 25 | # NOTE(woosuk): While Punica supports the case where the 26 | # input and weight dtypes are different, we only generate 27 | # the kernels the same dtypes to reduce the binary size. 28 | continue 29 | elif input_dtype == "fp32": 30 | # LoRA B matrix. 31 | if output_dtype != weight_dtype: 32 | # NOTE(woosuk): While Punica supports the case where the 33 | # output and weight dtypes are different, we only generate 34 | # the kernels the same dtypes to reduce the binary size. 35 | continue 36 | elif not (input_dtype == output_dtype == weight_dtype): 37 | # NOTE(woosuk): While Punica supports mixed data types for 38 | # input, output, and weight, we only generate the kernels with 39 | # the same data types to reduce the binary size. 40 | continue 41 | 42 | kernel_definition = TEMPLATE.format( 43 | input_dtype=DTYPE_MAP[input_dtype], 44 | output_dtype=DTYPE_MAP[output_dtype], 45 | weight_dtype=DTYPE_MAP[weight_dtype]) 46 | filename = f"bgmv_{input_dtype}_{output_dtype}_{weight_dtype}.cu" 47 | with open(filename, "w") as f: 48 | f.write(kernel_definition) 49 | -------------------------------------------------------------------------------- /csrc_musa/quantization/gptq/compat.muh: -------------------------------------------------------------------------------- 1 | /* 2 | Copied from https://github.com/turboderp/exllamav2 3 | */ 4 | 5 | #ifndef _compat_cuh 6 | #define _compat_cuh 7 | 8 | namespace vllm { 9 | namespace gptq { 10 | // atomicAdd for half types, to support CC < 7.x 11 | 12 | __device__ __forceinline__ void atomicAdd_half(half* address, half val) 13 | { 14 | unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2)); 15 | unsigned int old = *address_as_ui; 16 | unsigned int assumed; 17 | 18 | do 19 | { 20 | assumed = old; 21 | __half_raw hsum; 22 | hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); 23 | half tmpres = __hadd(hsum, val); 24 | hsum = __half_raw(tmpres); 25 | old = (size_t)address & 2 ? 
(old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; 26 | old = atomicCAS(address_as_ui, assumed, old); 27 | } 28 | while (assumed != old); 29 | } 30 | 31 | // atomicAdd for half2 types 32 | 33 | __device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val) 34 | { 35 | unsigned int* address_as_ui = (unsigned int*)address; 36 | unsigned int old = *address_as_ui; 37 | unsigned int assumed; 38 | do 39 | { 40 | assumed = old; 41 | half2 old_val = *((half2*)&old); 42 | half2 new_val = __hadd2(old_val, val); 43 | old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val)); 44 | } 45 | while (assumed != old); 46 | } 47 | 48 | // 49 | 50 | #if defined(__MUSA_ARCH__) || defined(USE_ROCM) 51 | #if __MUSA_ARCH__ < 700 || defined(USE_ROCM) 52 | 53 | __device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); } 54 | 55 | #if __MUSA_ARCH__ < 600 || defined(USE_ROCM) 56 | __device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); } 57 | #endif 58 | 59 | #endif 60 | #endif 61 | 62 | } // namespace gptq 63 | } // namespace vllm 64 | #endif 65 | -------------------------------------------------------------------------------- /csrc_musa/quantization/gptq/qdq_8.muh: -------------------------------------------------------------------------------- 1 | /* 2 | Copied from https://github.com/turboderp/exllamav2 3 | */ 4 | 5 | #ifndef _qdq_8_cuh 6 | #define _qdq_8_cuh 7 | 8 | #include "qdq_util.cuh" 9 | 10 | namespace vllm { 11 | namespace gptq { 12 | 13 | __forceinline__ __device__ void shuffle_8bit_4 14 | ( 15 | uint32_t* q, 16 | int stride 17 | ) 18 | { 19 | } 20 | 21 | __forceinline__ __device__ void dequant_8bit_8 22 | ( 23 | const uint32_t q_0, 24 | const uint32_t q_1, 25 | half2 (&dq)[4], 26 | int stride, 27 | const uint32_t zero 28 | ) 29 | { 30 | half dqh[8]; 31 | for (int i = 0; i < 4; i++) dqh[i ] = dq_ns(exb(q_0, i * 8, 0xff), zero); 32 | for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), zero); 33 | 34 | for (int i = 0; i < 4; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); 35 | } 36 | 37 | } // namespace gptq 38 | } // namespace vllm 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /csrc_musa/quantization/gptq/qdq_util.muh: -------------------------------------------------------------------------------- 1 | /* 2 | Copied from https://github.com/turboderp/exllamav2 3 | */ 4 | 5 | #ifndef _qdq_util_cuh 6 | #define _qdq_util_cuh 7 | 8 | namespace vllm { 9 | namespace gptq { 10 | 11 | union half2_uint32 12 | { 13 | uint32_t as_uint32; 14 | half2 as_half2; 15 | __device__ half2_uint32(uint32_t val) : as_uint32(val) {} 16 | __device__ half2_uint32(half2 val) : as_half2(val) {} 17 | }; 18 | 19 | union half_uint16 20 | { 21 | uint16_t as_uint16; 22 | half as_half; 23 | __device__ half_uint16(uint16_t val) : as_uint16(val) {} 24 | __device__ half_uint16(half val) : as_half(val) {} 25 | }; 26 | 27 | // Max_scale premultiplied by 1/256 28 | 29 | __forceinline__ __device__ half dq_scale(const int qs, const half max_scale) 30 | { 31 | int qs_i = qs + 1; 32 | half qs_h = __int2half_rn(qs_i * qs_i); 33 | qs_h = __hmul(qs_h, max_scale); 34 | return qs_h; 35 | } 36 | 37 | __forceinline__ __device__ half dq(const int q, const int qzero, const half scale) 38 | { 39 | return __hmul(__int2half_rn(q - qzero), scale); 40 | } 41 | 42 | __forceinline__ __device__ half dq_ns(const int q, const int qzero) 43 | { 44 | //return 
__hsub(__int2half_rn(q), __int2half_rn(qzero)); 45 | return __int2half_rn(q - qzero); 46 | } 47 | 48 | __forceinline__ __device__ int exb(const uint32_t q, const int shift, const int mask) 49 | { 50 | return (int)((q >> shift) & mask); 51 | } 52 | 53 | __forceinline__ __device__ int exb(const uint32_t q1, const uint32_t q0, const int shift, const int mask) 54 | { 55 | return (int)(__funnelshift_rc(q0, q1, shift) & mask); 56 | } 57 | 58 | } // namespace gptq 59 | } // namespace vllm 60 | #endif 61 | -------------------------------------------------------------------------------- /csrc_musa/quantization/gptq_marlin/gptq_marlin.muh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "torch_musa/csrc/aten/musa/MUSAContext.h" 6 | #include "torch_musa/csrc/core/MUSAGuard.h" 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace gptq_marlin { 13 | 14 | // 8 warps are a good choice since every SM has 4 schedulers and having more than 1 warp per 15 | // schedule allows some more latency hiding. At the same time, we want relatively few warps to have 16 | // many registers per warp and small tiles. 17 | static constexpr int default_threads = 256; 18 | 19 | static constexpr int pipe_stages = 4; // 4 pipeline stages fit into shared memory 20 | 21 | static constexpr int min_thread_n = 64; 22 | static constexpr int min_thread_k = 64; 23 | 24 | static constexpr int tile_size = 16; 25 | static constexpr int max_par = 16; 26 | 27 | template 28 | struct Vec { 29 | T elems[n]; 30 | __device__ T& operator[](int i) { return elems[i]; } 31 | }; 32 | 33 | using I4 = Vec; 34 | 35 | constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; } 36 | 37 | #if defined(__MUSA_ARCH__) && __MUSA_ARCH__ < 800 38 | // No support for async 39 | #else 40 | 41 | __device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr, bool pred = true) { 42 | const int BYTES = 16; 43 | uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); 44 | asm volatile("{\n" 45 | " .reg .pred p;\n" 46 | " setp.ne.b32 p, %0, 0;\n" 47 | " @p cp.async.cg.shared.global [%1], [%2], %3;\n" 48 | "}\n" ::"r"((int)pred), 49 | "r"(smem), "l"(glob_ptr), "n"(BYTES)); 50 | } 51 | 52 | __device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) { 53 | const int BYTES = 16; 54 | uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); 55 | asm volatile("{\n" 56 | " cp.async.cg.shared.global [%0], [%1], %2;\n" 57 | "}\n" ::"r"(smem), 58 | "l"(glob_ptr), "n"(BYTES)); 59 | } 60 | 61 | __device__ inline void cp_async_fence() { asm volatile("cp.async.commit_group;\n" ::); } 62 | 63 | template 64 | __device__ inline void cp_async_wait() { 65 | asm volatile("cp.async.wait_group %0;\n" ::"n"(n)); 66 | } 67 | 68 | #endif 69 | 70 | } // namespace gptq_marlin 71 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # vLLM documents 2 | 3 | ## Build the docs 4 | 5 | ```bash 6 | # Install dependencies. 7 | pip install -r requirements-docs.txt 8 | 9 | # Build the docs. 10 | make clean 11 | make html 12 | ``` 13 | 14 | ## Open the docs with your browser 15 | 16 | ```bash 17 | python -m http.server -d build/html/ 18 | ``` 19 | Launch your browser and open localhost:8000. 20 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | sphinx == 6.2.1 2 | sphinx-book-theme == 1.0.1 3 | sphinx-copybutton == 0.5.2 4 | myst-parser == 2.0.0 5 | sphinx-argparse 6 | 7 | # packages to install to build the documentation 8 | pydantic 9 | -f https://download.pytorch.org/whl/cpu 10 | torch 11 | py-cpuinfo 12 | transformers 13 | -------------------------------------------------------------------------------- /docs/source/assets/dev/dockerfile-stages-dependency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/docs/source/assets/dev/dockerfile-stages-dependency.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/k_vecs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/docs/source/assets/kernel/k_vecs.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/key.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/docs/source/assets/kernel/key.png 
-------------------------------------------------------------------------------- /docs/source/assets/kernel/logits_vec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/docs/source/assets/kernel/logits_vec.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/q_vecs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/docs/source/assets/kernel/q_vecs.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/query.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/docs/source/assets/kernel/query.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/v_vec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/docs/source/assets/kernel/v_vec.png -------------------------------------------------------------------------------- /docs/source/assets/kernel/value.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/docs/source/assets/kernel/value.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-only-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/docs/source/assets/logos/vllm-logo-only-light.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-text-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/docs/source/assets/logos/vllm-logo-text-dark.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-text-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/docs/source/assets/logos/vllm-logo-text-light.png -------------------------------------------------------------------------------- /docs/source/dev/dockerfile/dockerfile.rst: -------------------------------------------------------------------------------- 1 | Dockerfile 2 | ==================== 3 | 4 | See `here `_ for the main Dockerfile to construct 5 | the image for running an OpenAI compatible server with vLLM. 6 | 7 | - Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes: 8 | 9 | - All build stages 10 | - The default build target (highlighted in grey) 11 | - External images (with dashed borders) 12 | 13 | The edges of the build graph represent: 14 | 15 | - FROM ... dependencies (with a solid line and a full arrow head) 16 | - COPY --from=... 
dependencies (with a dashed line and an empty arrow head) 17 | - RUN --mount=(.*)from=... dependencies (with a dotted line and an empty diamond arrow head) 18 | 19 | .. figure:: ../../assets/dev/dockerfile-stages-dependency.png 20 | :alt: query 21 | :width: 100% 22 | :align: center 23 | 24 | Made using: https://github.com/patrickhoefler/dockerfilegraph 25 | 26 | Commands to regenerate the build graph (make sure to run it **from the `root` directory of the vLLM repository** where the dockerfile is present): 27 | 28 | .. code:: bash 29 | 30 | dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename Dockerfile 31 | 32 | or in case you want to run it directly with the docker image: 33 | 34 | .. code:: bash 35 | 36 | docker run \ 37 | --rm \ 38 | --user "$(id -u):$(id -g)" \ 39 | --workdir /workspace \ 40 | --volume "$(pwd)":/workspace \ 41 | ghcr.io/patrickhoefler/dockerfilegraph:alpine \ 42 | --output png \ 43 | --dpi 200 \ 44 | --max-label-length 50 \ 45 | --filename Dockerfile \ 46 | --legend 47 | 48 | (To run it for a different file, you can pass in a different argument to the flag `--filename`.) 49 | 50 | -------------------------------------------------------------------------------- /docs/source/dev/engine/async_llm_engine.rst: -------------------------------------------------------------------------------- 1 | AsyncLLMEngine 2 | ================================= 3 | 4 | .. autoclass:: vllm.AsyncLLMEngine 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/source/dev/engine/engine_index.rst: -------------------------------------------------------------------------------- 1 | vLLM Engine 2 | ================================= 3 | 4 | .. automodule:: vllm.engine 5 | .. currentmodule:: vllm.engine 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | :caption: Engines 10 | 11 | llm_engine 12 | async_llm_engine 13 | 14 | -------------------------------------------------------------------------------- /docs/source/dev/engine/llm_engine.rst: -------------------------------------------------------------------------------- 1 | LLMEngine 2 | ================================= 3 | 4 | .. autoclass:: vllm.LLMEngine 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/source/dev/sampling_params.rst: -------------------------------------------------------------------------------- 1 | Sampling Params 2 | =============== 3 | 4 | .. 
autoclass:: vllm.SamplingParams 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/generate_examples.py: -------------------------------------------------------------------------------- 1 | import re 2 | from pathlib import Path 3 | 4 | 5 | def fix_case(text: str) -> str: 6 | subs = [ 7 | ("api", "API"), 8 | ("llm", "LLM"), 9 | ("vllm", "vLLM"), 10 | ("openai", "OpenAI"), 11 | ("multilora", "MultiLoRA"), 12 | ] 13 | for sub in subs: 14 | text = re.sub(*sub, text, flags=re.IGNORECASE) 15 | return text 16 | 17 | 18 | def underline(title: str, character: str = "=") -> str: 19 | return f"{title}\n{character * len(title)}" 20 | 21 | 22 | def generate_title(filename: str) -> str: 23 | # Turn filename into a title 24 | title = filename.replace("_", " ").title() 25 | # Handle acronyms and names 26 | title = fix_case(title) 27 | # Underline title 28 | title = underline(title) 29 | return title 30 | 31 | 32 | def generate_examples(): 33 | root_dir = Path(__file__).parent.parent.parent.resolve() 34 | 35 | # Source paths 36 | script_dir = root_dir / "examples" 37 | script_paths = sorted(script_dir.glob("*.py")) 38 | 39 | # Destination paths 40 | doc_dir = root_dir / "docs/source/getting_started/examples" 41 | doc_paths = [doc_dir / f"{path.stem}.rst" for path in script_paths] 42 | 43 | # Generate the example docs for each example script 44 | for script_path, doc_path in zip(script_paths, doc_paths): 45 | script_url = f"https://github.com/vllm-project/vllm/blob/main/examples/{script_path.name}" 46 | # Make script_path relative to doc_path and call it include_path 47 | include_path = '../../../..' / script_path.relative_to(root_dir) 48 | content = (f"{generate_title(doc_path.stem)}\n\n" 49 | f"Source {script_url}.\n\n" 50 | f".. literalinclude:: {include_path}\n" 51 | " :language: python\n" 52 | " :linenos:\n") 53 | with open(doc_path, "w+") as f: 54 | f.write(content) 55 | 56 | # Generate the toctree for the example scripts 57 | with open(doc_dir / "examples_index.template.rst") as f: 58 | examples_index = f.read() 59 | with open(doc_dir / "examples_index.rst", "w+") as f: 60 | example_docs = "\n ".join(path.stem for path in script_paths) 61 | f.write(examples_index.replace(r"%EXAMPLE_DOCS%", example_docs)) 62 | -------------------------------------------------------------------------------- /docs/source/getting_started/examples/examples_index.template.rst: -------------------------------------------------------------------------------- 1 | Examples 2 | ================================= 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | :caption: Scripts 7 | 8 | %EXAMPLE_DOCS% 9 | -------------------------------------------------------------------------------- /docs/source/models/engine_args.rst: -------------------------------------------------------------------------------- 1 | .. _engine_args: 2 | 3 | Engine Arguments 4 | ================ 5 | 6 | Below, you can find an explanation of every engine argument for vLLM: 7 | 8 | .. argparse:: 9 | :module: vllm.engine.arg_utils 10 | :func: _engine_args_parser 11 | :prog: -m vllm.entrypoints.openai.api_server 12 | :nodefaultconst: 13 | 14 | Async Engine Arguments 15 | ---------------------- 16 | 17 | Below are the additional arguments related to the asynchronous engine: 18 | 19 | .. 
argparse:: 20 | :module: vllm.engine.arg_utils 21 | :func: _async_engine_args_parser 22 | :prog: -m vllm.entrypoints.openai.api_server 23 | :nodefaultconst: -------------------------------------------------------------------------------- /docs/source/models/performance.rst: -------------------------------------------------------------------------------- 1 | .. _performance: 2 | 3 | Performance and Tuning 4 | ====================== 5 | 6 | Chunked Prefill 7 | --------------- 8 | vLLM supports an experimental feature chunked prefill. Chunked prefill allows to chunk large prefills into smaller chunks and batch them together with decode requests. 9 | 10 | You can enable the feature by specifying 11 | 12 | .. code-block:: python 13 | 14 | llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True) 15 | # Set max_num_batched_tokens to tune performance. 16 | # NOTE: 512 is the default max_num_batched_tokens for chunked prefill. 17 | # llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True, max_num_batched_tokens=512) 18 | 19 | By default, vLLM scheduler prioritizes prefills and doesn't batch prefill and decode to the same batch. This policy optimizes the TTFT (time to thefirst token), but incurs slower ITL (inter token latency) and inefficient GPU utilization. 20 | 21 | Once chunked prefill is enabled, the policy is changed to 22 | 23 | - prioritize decode requests. It batches all pending decode requests to the batch before scheduling any prefill. 24 | - When there are available token_budget (`max_num_batched_tokens`), it schedules pending prefills. If a last pending prefill request cannot fit into `max_num_batched_tokens`, it chunks it. 25 | 26 | This policy has two benefits. 27 | 28 | - It improves ITL (inter token latency) and generation decode because decode requests are prioritized. 29 | - It helps achieve better GPU utilization by locating compute-bound (prefill) and memory-bound (decode) requests to the same batch. 30 | 31 | You can tune the performance by changing `max_num_batched_tokens`. 32 | By default, it is set to 512, which has the best ITL on A100 in the initial benchmark. 33 | Smaller batch size achieves better ITL because there are fewer prefills interrupting decodes. 34 | Higher batch size achieves better TTFT as you can put more prefill to the batch. 35 | If `max_num_batched_tokens` is the same as `max_model_len`, that's almost the equivalent to the default scheduling policy (except that it still prioritizes decodes). 36 | Note that the default batch size (512) is optimized for ITL, and it may have lower throughput than the default scheduler. We recommend you set `max_num_batched_tokens > 2048` for throughput. 37 | 38 | See related papers for more details (https://arxiv.org/pdf/2401.08671 or https://arxiv.org/pdf/2308.16369). 39 | -------------------------------------------------------------------------------- /docs/source/quantization/fp8_e5m2_kvcache.rst: -------------------------------------------------------------------------------- 1 | .. _fp8_kv_cache: 2 | 3 | FP8 E5M2 KV Cache 4 | ================== 5 | 6 | The int8/int4 quantization scheme requires additional scale GPU memory storage, which reduces the expected GPU memory benefits. 7 | The FP8 data format retains 2~3 mantissa bits and can convert float/fp16/bflaot16 and fp8 to each other. 8 | 9 | Here is an example of how to enable this feature: 10 | 11 | .. code-block:: python 12 | 13 | from vllm import LLM, SamplingParams 14 | # Sample prompts. 
15 | prompts = [ 16 | "Hello, my name is", 17 | "The president of the United States is", 18 | "The capital of France is", 19 | "The future of AI is", 20 | ] 21 | # Create a sampling params object. 22 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 23 | # Create an LLM. 24 | llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8") 25 | # Generate texts from the prompts. The output is a list of RequestOutput objects 26 | # that contain the prompt, generated text, and other information. 27 | outputs = llm.generate(prompts, sampling_params) 28 | # Print the outputs. 29 | for output in outputs: 30 | prompt = output.prompt 31 | generated_text = output.outputs[0].text 32 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 33 | 34 | 35 | Note, current prefix caching doesn't work with FP8 KV cache enabled, forward_prefix kernel should handle different KV and cache type. 36 | 37 | -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_bentoml.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_bentoml: 2 | 3 | Deploying with BentoML 4 | ====================== 5 | 6 | `BentoML `_ allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-complicant image and deploy it on Kubernetes. 7 | 8 | For details, see the tutorial `vLLM inference in the BentoML documentation `_. -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_kserve.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_kserve: 2 | 3 | Deploying with KServe 4 | ============================ 5 | 6 | vLLM can be deployed with `KServe `_ on Kubernetes for highly scalable distributed model serving. 7 | 8 | Please see `this guide `_ for more details on using vLLM with KServe. 9 | -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_triton.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_triton: 2 | 3 | Deploying with NVIDIA Triton 4 | ============================ 5 | 6 | The `Triton Inference Server `_ hosts a tutorial demonstrating how to quickly deploy a simple `facebook/opt-125m `_ model using vLLM. Please see `Deploying a vLLM model in Triton `_ for more details. 7 | -------------------------------------------------------------------------------- /docs/source/serving/distributed_serving.rst: -------------------------------------------------------------------------------- 1 | .. _distributed_serving: 2 | 3 | Distributed Inference and Serving 4 | ================================= 5 | 6 | vLLM supports distributed tensor-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm `_. We manage the distributed runtime with `Ray `_. To run distributed inference, install Ray with: 7 | 8 | .. code-block:: console 9 | 10 | $ pip install ray 11 | 12 | To run multi-GPU inference with the :code:`LLM` class, set the :code:`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs: 13 | 14 | .. 
code-block:: python 15 | 16 | from vllm import LLM 17 | llm = LLM("facebook/opt-13b", tensor_parallel_size=4) 18 | output = llm.generate("San Franciso is a") 19 | 20 | To run multi-GPU serving, pass in the :code:`--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs: 21 | 22 | .. code-block:: console 23 | 24 | $ python -m vllm.entrypoints.api_server \ 25 | $ --model facebook/opt-13b \ 26 | $ --tensor-parallel-size 4 27 | 28 | To scale vLLM beyond a single machine, start a `Ray runtime `_ via CLI before running vLLM: 29 | 30 | .. code-block:: console 31 | 32 | $ # On head node 33 | $ ray start --head 34 | 35 | $ # On worker nodes 36 | $ ray start --address= 37 | 38 | After that, you can run inference and serving on multiple machines by launching the vLLM process on the head node by setting :code:`tensor_parallel_size` to the number of GPUs to be the total number of GPUs across all machines. -------------------------------------------------------------------------------- /docs/source/serving/env_vars.rst: -------------------------------------------------------------------------------- 1 | Environment Variables 2 | ======================== 3 | 4 | vLLM uses the following environment variables to configure the system: 5 | 6 | .. literalinclude:: ../../../vllm/envs.py 7 | :language: python 8 | :start-after: begin-env-vars-definition 9 | :end-before: end-env-vars-definition 10 | -------------------------------------------------------------------------------- /docs/source/serving/integrations.rst: -------------------------------------------------------------------------------- 1 | Integrations 2 | ------------ 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | 7 | run_on_sky 8 | deploying_with_kserve 9 | deploying_with_triton 10 | deploying_with_bentoml 11 | serving_with_langchain 12 | -------------------------------------------------------------------------------- /docs/source/serving/metrics.rst: -------------------------------------------------------------------------------- 1 | Production Metrics 2 | ================== 3 | 4 | vLLM exposes a number of metrics that can be used to monitor the health of the 5 | system. These metrics are exposed via the `/metrics` endpoint on the vLLM 6 | OpenAI compatible API server. 7 | 8 | The following metrics are exposed: 9 | 10 | .. literalinclude:: ../../../vllm/engine/metrics.py 11 | :language: python 12 | :start-after: begin-metrics-definitions 13 | :end-before: end-metrics-definitions 14 | -------------------------------------------------------------------------------- /docs/source/serving/serving_with_langchain.rst: -------------------------------------------------------------------------------- 1 | .. _run_on_langchain: 2 | 3 | Serving with Langchain 4 | ============================ 5 | 6 | vLLM is also available via `Langchain `_ . 7 | 8 | To install langchain, run 9 | 10 | .. code-block:: console 11 | 12 | $ pip install langchain langchain_community -q 13 | 14 | To run inference on a single or multiple GPUs, use ``VLLM`` class from ``langchain``. 15 | 16 | .. code-block:: python 17 | 18 | from langchain_community.llms import VLLM 19 | 20 | llm = VLLM(model="mosaicml/mpt-7b", 21 | trust_remote_code=True, # mandatory for hf models 22 | max_new_tokens=128, 23 | top_k=10, 24 | top_p=0.95, 25 | temperature=0.8, 26 | # tensor_parallel_size=... # for distributed inference 27 | ) 28 | 29 | print(llm("What is the capital of France ?")) 30 | 31 | Please refer to this `Tutorial `_ for more details. 
32 | -------------------------------------------------------------------------------- /docs/source/serving/usage_stats.md: -------------------------------------------------------------------------------- 1 | # Usage Stats Collection 2 | 3 | vLLM collects anonymous usage data by default to help the engineering team better understand which hardware and model configurations are widely used. This data allows them to prioritize their efforts on the most common workloads. The collected data is transparent, does not contain any sensitive information, and will be publicly released for the community's benefit. 4 | 5 | ## What data is collected? 6 | 7 | You can see the up to date list of data collected by vLLM in the [usage_lib.py](https://github.com/vllm-project/vllm/blob/main/vllm/usage/usage_lib.py). 8 | 9 | Here is an example as of v0.4.0: 10 | 11 | ```json 12 | { 13 | "uuid": "fbe880e9-084d-4cab-a395-8984c50f1109", 14 | "provider": "GCP", 15 | "num_cpu": 24, 16 | "cpu_type": "Intel(R) Xeon(R) CPU @ 2.20GHz", 17 | "cpu_family_model_stepping": "6,85,7", 18 | "total_memory": 101261135872, 19 | "architecture": "x86_64", 20 | "platform": "Linux-5.10.0-28-cloud-amd64-x86_64-with-glibc2.31", 21 | "gpu_count": 2, 22 | "gpu_type": "NVIDIA L4", 23 | "gpu_memory_per_device": 23580639232, 24 | "model_architecture": "OPTForCausalLM", 25 | "vllm_version": "0.3.2+cu123", 26 | "context": "LLM_CLASS", 27 | "log_time": 1711663373492490000, 28 | "source": "production", 29 | "dtype": "torch.float16", 30 | "tensor_parallel_size": 1, 31 | "block_size": 16, 32 | "gpu_memory_utilization": 0.9, 33 | "quantization": null, 34 | "kv_cache_dtype": "auto", 35 | "enable_lora": false, 36 | "enable_prefix_caching": false, 37 | "enforce_eager": false, 38 | "disable_custom_all_reduce": true 39 | } 40 | ``` 41 | 42 | You can preview the collected data by running the following command: 43 | 44 | ```bash 45 | tail ~/.config/vllm/usage_stats.json 46 | ``` 47 | 48 | ## Opt-out of Usage Stats Collection 49 | 50 | You can opt-out of usage stats collection by setting the VLLM_NO_USAGE_STATS or DO_NOT_TRACK environment variable, or by creating a ~/.config/vllm/do_not_track file: 51 | 52 | ```bash 53 | # Any of the following methods can disable usage stats collection 54 | export VLLM_NO_USAGE_STATS=1 55 | export DO_NOT_TRACK=1 56 | mkdir -p ~/.config/vllm && touch ~/.config/vllm/do_not_track 57 | ``` 58 | -------------------------------------------------------------------------------- /examples/aqlm_example.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from vllm import LLM, SamplingParams 4 | 5 | 6 | def main(): 7 | 8 | parser = argparse.ArgumentParser(description='AQLM examples') 9 | 10 | parser.add_argument('--model', 11 | '-m', 12 | type=str, 13 | default=None, 14 | help='model path, as for HF') 15 | parser.add_argument('--choice', 16 | '-c', 17 | type=int, 18 | default=0, 19 | help='known good models by index, [0-4]') 20 | parser.add_argument('--tensor_parallel_size', 21 | '-t', 22 | type=int, 23 | default=1, 24 | help='tensor parallel size') 25 | 26 | args = parser.parse_args() 27 | 28 | models = [ 29 | "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", 30 | "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf", 31 | "ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf", 32 | "ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf", 33 | "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", 34 | ] 35 | 36 | model = LLM(args.model if args.model is not None else models[args.choice], 37 | 
tensor_parallel_size=args.tensor_parallel_size) 38 | 39 | sampling_params = SamplingParams(max_tokens=100, temperature=0) 40 | outputs = model.generate("Hello my name is", 41 | sampling_params=sampling_params) 42 | print(outputs[0].outputs[0].text) 43 | 44 | 45 | if __name__ == '__main__': 46 | main() 47 | -------------------------------------------------------------------------------- /examples/fp8/quantizer/README.md: -------------------------------------------------------------------------------- 1 | ### Quantizer Utilities 2 | `quantize.py`: NVIDIA Quantization utilities using AMMO, ported from TensorRT-LLM: 3 | `https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/quantize.py` 4 | 5 | ### Prerequisite 6 | 7 | #### AMMO (AlgorithMic Model Optimization) Installation: nvidia-ammo 0.7.1 or later 8 | `pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo` 9 | 10 | #### AMMO Download (code and docs) 11 | `https://developer.nvidia.com/downloads/assets/cuda/files/nvidia-ammo/nvidia_ammo-0.5.0.tar.gz` 12 | `https://developer.nvidia.com/downloads/assets/cuda/files/nvidia-ammo/nvidia_ammo-0.7.1.tar.gz` 13 | 14 | ### Usage 15 | 16 | #### Run on H100 system for speed if FP8; number of GPUs depends on the model size 17 | 18 | #### Example: quantize Llama2-7b model from HF to FP8 with FP8 KV Cache: 19 | `python quantize.py --model_dir ./ll2-7b --dtype float16 --qformat fp8 --kv_cache_dtype fp8 --output_dir ./ll2_7b_fp8 --calib_size 512 --tp_size 1` 20 | 21 | Outputs: model structure, quantized model & parameters (with scaling factors) are in JSON and Safetensors (npz is generated only for the reference) 22 | ``` 23 | # ll ./ll2_7b_fp8/ 24 | total 19998244 25 | drwxr-xr-x 2 root root 4096 Feb 7 01:08 ./ 26 | drwxrwxr-x 8 1060 1061 4096 Feb 7 01:08 ../ 27 | -rw-r--r-- 1 root root 176411 Feb 7 01:08 llama_tp1.json 28 | -rw-r--r-- 1 root root 13477087480 Feb 7 01:09 llama_tp1_rank0.npz 29 | -rw-r--r-- 1 root root 7000893272 Feb 7 01:08 rank0.safetensors 30 | # 31 | ``` 32 | 33 | -------------------------------------------------------------------------------- /examples/gradio_webserver.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import gradio as gr 5 | import requests 6 | 7 | 8 | def http_bot(prompt): 9 | headers = {"User-Agent": "vLLM Client"} 10 | pload = { 11 | "prompt": prompt, 12 | "stream": True, 13 | "max_tokens": 128, 14 | } 15 | response = requests.post(args.model_url, 16 | headers=headers, 17 | json=pload, 18 | stream=True) 19 | 20 | for chunk in response.iter_lines(chunk_size=8192, 21 | decode_unicode=False, 22 | delimiter=b"\0"): 23 | if chunk: 24 | data = json.loads(chunk.decode("utf-8")) 25 | output = data["text"][0] 26 | yield output 27 | 28 | 29 | def build_demo(): 30 | with gr.Blocks() as demo: 31 | gr.Markdown("# vLLM text completion demo\n") 32 | inputbox = gr.Textbox(label="Input", 33 | placeholder="Enter text and press ENTER") 34 | outputbox = gr.Textbox(label="Output", 35 | placeholder="Generated result from the model") 36 | inputbox.submit(http_bot, [inputbox], [outputbox]) 37 | return demo 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--host", type=str, default=None) 43 | parser.add_argument("--port", type=int, default=8001) 44 | parser.add_argument("--model-url", 45 | type=str, 46 | default="http://localhost:8000/generate") 47 | args = parser.parse_args() 48 | 49 | demo = build_demo() 50 | 
demo.queue().launch(server_name=args.host, 51 | server_port=args.port, 52 | share=True) 53 | -------------------------------------------------------------------------------- /examples/llm_engine_example.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import List, Tuple 3 | 4 | from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams 5 | 6 | 7 | def create_test_prompts() -> List[Tuple[str, SamplingParams]]: 8 | """Create a list of test prompts with their sampling parameters.""" 9 | return [ 10 | ("A robot may not injure a human being", 11 | SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1)), 12 | ("To be or not to be,", 13 | SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)), 14 | ("What is the meaning of life?", 15 | SamplingParams(n=2, 16 | best_of=5, 17 | temperature=0.8, 18 | top_p=0.95, 19 | frequency_penalty=0.1)), 20 | ("It is only with the heart that one can see rightly", 21 | SamplingParams(n=3, best_of=3, use_beam_search=True, 22 | temperature=0.0)), 23 | ] 24 | 25 | 26 | def process_requests(engine: LLMEngine, 27 | test_prompts: List[Tuple[str, SamplingParams]]): 28 | """Continuously process a list of prompts and handle the outputs.""" 29 | request_id = 0 30 | 31 | while test_prompts or engine.has_unfinished_requests(): 32 | if test_prompts: 33 | prompt, sampling_params = test_prompts.pop(0) 34 | engine.add_request(str(request_id), prompt, sampling_params) 35 | request_id += 1 36 | 37 | request_outputs: List[RequestOutput] = engine.step() 38 | 39 | for request_output in request_outputs: 40 | if request_output.finished: 41 | print(request_output) 42 | 43 | 44 | def initialize_engine(args: argparse.Namespace) -> LLMEngine: 45 | """Initialize the LLMEngine from the command line arguments.""" 46 | engine_args = EngineArgs.from_cli_args(args) 47 | return LLMEngine.from_engine_args(engine_args) 48 | 49 | 50 | def main(args: argparse.Namespace): 51 | """Main function that sets up and runs the prompt processing.""" 52 | engine = initialize_engine(args) 53 | test_prompts = create_test_prompts() 54 | process_requests(engine, test_prompts) 55 | 56 | 57 | if __name__ == '__main__': 58 | parser = argparse.ArgumentParser( 59 | description='Demo on using the LLMEngine class directly') 60 | parser = EngineArgs.add_cli_args(parser) 61 | args = parser.parse_args() 62 | main(args) 63 | -------------------------------------------------------------------------------- /examples/offline_inference.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | 3 | # Sample prompts. 4 | prompts = [ 5 | "Hello, my name is", 6 | "The president of the United States is", 7 | "The capital of France is", 8 | "The future of AI is", 9 | ] 10 | # Create a sampling params object. 11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 12 | 13 | # Create an LLM. 14 | llm = LLM(model="facebook/opt-125m") 15 | # Generate texts from the prompts. The output is a list of RequestOutput objects 16 | # that contain the prompt, generated text, and other information. 17 | outputs = llm.generate(prompts, sampling_params) 18 | # Print the outputs. 
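# Each element is a RequestOutput pairing the original prompt with a list of
# CompletionOutput objects (one per sampled sequence; index 0 suffices here since
# SamplingParams defaults to n=1).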
19 | for output in outputs: 20 | prompt = output.prompt 21 | generated_text = output.outputs[0].text 22 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 23 | -------------------------------------------------------------------------------- /examples/offline_inference_neuron.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | 3 | # Sample prompts. 4 | prompts = [ 5 | "Hello, my name is", 6 | "The president of the United States is", 7 | "The capital of France is", 8 | "The future of AI is", 9 | ] 10 | # Create a sampling params object. 11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 12 | 13 | # Create an LLM. 14 | llm = LLM( 15 | model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", 16 | max_num_seqs=8, 17 | # The max_model_len and block_size arguments are required to be same as 18 | # max sequence length when targeting neuron device. 19 | # Currently, this is a known limitation in continuous batching support 20 | # in transformers-neuronx. 21 | # TODO(liangfu): Support paged-attention in transformers-neuronx. 22 | max_model_len=128, 23 | block_size=128, 24 | # The device can be automatically detected when AWS Neuron SDK is installed. 25 | # The device argument can be either unspecified for automated detection, 26 | # or explicitly assigned. 27 | device="neuron", 28 | tensor_parallel_size=2) 29 | # Generate texts from the prompts. The output is a list of RequestOutput objects 30 | # that contain the prompt, generated text, and other information. 31 | outputs = llm.generate(prompts, sampling_params) 32 | # Print the outputs. 33 | for output in outputs: 34 | prompt = output.prompt 35 | generated_text = output.outputs[0].text 36 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 37 | -------------------------------------------------------------------------------- /examples/offline_inference_with_prefix.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | 3 | prefix = ( 4 | "You are an expert school principal, skilled in effectively managing " 5 | "faculty and staff. Draft 10-15 questions for a potential first grade " 6 | "Head Teacher for my K-12, all-girls', independent school that emphasizes " 7 | "community, joyful discovery, and life-long learning. The candidate is " 8 | "coming in for a first-round panel interview for a 8th grade Math " 9 | "teaching role. They have 5 years of previous teaching experience " 10 | "as an assistant teacher at a co-ed, public school with experience " 11 | "in middle school math teaching. Based on these information, fulfill " 12 | "the following paragraph: ") 13 | 14 | # Sample prompts. 15 | prompts = [ 16 | "Hello, my name is", 17 | "The president of the United States is", 18 | "The capital of France is", 19 | "The future of AI is", 20 | ] 21 | # Create a sampling params object. 22 | sampling_params = SamplingParams(temperature=0.0) 23 | 24 | # Create an LLM. 25 | llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True) 26 | 27 | generating_prompts = [prefix + prompt for prompt in prompts] 28 | 29 | # Generate texts from the prompts. The output is a list of RequestOutput objects 30 | # that contain the prompt, generated text, and other information. 31 | outputs = llm.generate(generating_prompts, sampling_params) 32 | # Print the outputs. 
33 | for output in outputs: 34 | prompt = output.prompt 35 | generated_text = output.outputs[0].text 36 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 37 | 38 | print("-" * 80) 39 | 40 | # The llm.generate call will batch all prompts and send the batch at once 41 | # if resources allow. The prefix will only be cached after the first batch 42 | # is processed, so we need to call generate once to calculate the prefix 43 | # and cache it. 44 | outputs = llm.generate(generating_prompts[0], sampling_params) 45 | 46 | # Subsequent batches can leverage the cached prefix 47 | outputs = llm.generate(generating_prompts, sampling_params) 48 | 49 | # Print the outputs. You should see the same outputs as before 50 | for output in outputs: 51 | prompt = output.prompt 52 | generated_text = output.outputs[0].text 53 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 54 | -------------------------------------------------------------------------------- /examples/openai_chat_completion_client.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server. 4 | openai_api_key = "EMPTY" 5 | openai_api_base = "http://localhost:8000/v1" 6 | 7 | client = OpenAI( 8 | # defaults to os.environ.get("OPENAI_API_KEY") 9 | api_key=openai_api_key, 10 | base_url=openai_api_base, 11 | ) 12 | 13 | models = client.models.list() 14 | model = models.data[0].id 15 | 16 | chat_completion = client.chat.completions.create( 17 | messages=[{ 18 | "role": "system", 19 | "content": "You are a helpful assistant." 20 | }, { 21 | "role": "user", 22 | "content": "Who won the world series in 2020?" 23 | }, { 24 | "role": 25 | "assistant", 26 | "content": 27 | "The Los Angeles Dodgers won the World Series in 2020." 28 | }, { 29 | "role": "user", 30 | "content": "Where was it played?" 31 | }], 32 | model=model, 33 | ) 34 | 35 | print("Chat completion results:") 36 | print(chat_completion) 37 | -------------------------------------------------------------------------------- /examples/openai_completion_client.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server. 
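# This client assumes a vLLM OpenAI-compatible server is already listening at the base URL below,
# e.g. started with: python -m vllm.entrypoints.openai.api_server --model facebook/opt-125m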
4 | openai_api_key = "EMPTY" 5 | openai_api_base = "http://localhost:8000/v1" 6 | 7 | client = OpenAI( 8 | # defaults to os.environ.get("OPENAI_API_KEY") 9 | api_key=openai_api_key, 10 | base_url=openai_api_base, 11 | ) 12 | 13 | models = client.models.list() 14 | model = models.data[0].id 15 | 16 | # Completion API 17 | stream = False 18 | completion = client.completions.create( 19 | model=model, 20 | prompt="A robot may not injure a human being", 21 | echo=False, 22 | n=2, 23 | stream=stream, 24 | logprobs=3) 25 | 26 | print("Completion results:") 27 | if stream: 28 | for c in completion: 29 | print(c) 30 | else: 31 | print(completion) 32 | -------------------------------------------------------------------------------- /examples/production_monitoring/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | # docker-compose.yaml 2 | version: "3" 3 | 4 | services: 5 | prometheus: 6 | image: prom/prometheus:latest 7 | extra_hosts: 8 | - "host.docker.internal:host-gateway" # allow a direct connection from container to the local machine 9 | ports: 10 | - "9090:9090" # the default port used by Prometheus 11 | volumes: 12 | - ${PWD}/prometheus.yaml:/etc/prometheus/prometheus.yml # mount Prometheus config file 13 | 14 | grafana: 15 | image: grafana/grafana:latest 16 | depends_on: 17 | - prometheus 18 | ports: 19 | - "3000:3000" # the default port used by Grafana 20 | -------------------------------------------------------------------------------- /examples/production_monitoring/prometheus.yaml: -------------------------------------------------------------------------------- 1 | # prometheus.yaml 2 | global: 3 | scrape_interval: 5s 4 | evaluation_interval: 30s 5 | 6 | scrape_configs: 7 | - job_name: vllm 8 | static_configs: 9 | - targets: 10 | - 'host.docker.internal:8000' 11 | -------------------------------------------------------------------------------- /examples/template_alpaca.jinja: -------------------------------------------------------------------------------- 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 2 | 3 | {% for message in messages %} 4 | {% if message['role'] == 'user' %} 5 | ### Instruction: 6 | {{ message['content']|trim -}} 7 | {% if not loop.last %} 8 | 9 | 10 | {% endif %} 11 | {% elif message['role'] == 'assistant' %} 12 | ### Response: 13 | {{ message['content']|trim -}} 14 | {% if not loop.last %} 15 | 16 | 17 | {% endif %} 18 | {% elif message['role'] == 'user_context' %} 19 | ### Input: 20 | {{ message['content']|trim -}} 21 | {% if not loop.last %} 22 | 23 | 24 | {% endif %} 25 | {% endif %} 26 | {% endfor %} 27 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} 28 | ### Response: 29 | {% endif %} -------------------------------------------------------------------------------- /examples/template_baichuan.jinja: -------------------------------------------------------------------------------- 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 2 | 3 | {%- for message in messages -%} 4 | {%- if message['role'] == 'user' -%} 5 | {{- '<reserved_106>' + message['content'] -}} 6 | {%- elif message['role'] == 'assistant' -%} 7 | {{- '<reserved_107>' + message['content'] -}} 8 | {%- endif -%} 9 | {%- endfor -%} 10 | 11 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 12 | {{- '<reserved_107>' -}} 13 | {% endif %}
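{#- <reserved_106> and <reserved_107> are assumed here to be the Baichuan2 chat markers for user and assistant turns, respectively. -#}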
-------------------------------------------------------------------------------- /examples/template_chatglm.jinja: -------------------------------------------------------------------------------- 1 | {%- set counter = namespace(index=0) -%} 2 | {%- for message in messages -%} 3 | {%- if message['role'] == 'user' -%} 4 | {{- '[Round ' + counter.index|string + ']\n问:' + message['content'] -}} 5 | {%- set counter.index = counter.index + 1 -%} 6 | {%- endif -%} 7 | {%- if message['role'] == 'assistant' -%} 8 | {{- '\n答:' + message['content'] -}} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | {{- '\n' -}} 11 | {%- endif -%} 12 | {%- endif -%} 13 | {%- endfor -%} 14 | 15 | 16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 17 | {{- '\n答:' -}} 18 | {%- endif -%} -------------------------------------------------------------------------------- /examples/template_chatglm2.jinja: -------------------------------------------------------------------------------- 1 | {%- set counter = namespace(index=1) -%} 2 | {%- for message in messages -%} 3 | {%- if message['role'] == 'user' -%} 4 | {{- '[Round ' + counter.index|string + ']\n\n问:' + message['content'] -}} 5 | {%- set counter.index = counter.index + 1 -%} 6 | {%- endif -%} 7 | {%- if message['role'] == 'assistant' -%} 8 | {{- '\n\n答:' + message['content'] -}} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | {{- '\n\n' -}} 11 | {%- endif -%} 12 | {%- endif -%} 13 | {%- endfor -%} 14 | 15 | 16 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 17 | {{- '\n\n答:' -}} 18 | {%- endif -%} -------------------------------------------------------------------------------- /examples/template_chatml.jinja: -------------------------------------------------------------------------------- 1 | {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} 2 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} -------------------------------------------------------------------------------- /examples/template_falcon.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {%- if message['role'] == 'user' -%} 3 | {{- 'User: ' + message['content'] -}} 4 | {%- elif message['role'] == 'assistant' -%} 5 | {{- 'Assistant: ' + message['content'] -}} 6 | {%- endif -%} 7 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 8 | {{- '\n' -}} 9 | {%- endif -%} 10 | {%- endfor -%} 11 | 12 | 13 | {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} 14 | {{- 'Assistant:' -}} 15 | {% endif %} -------------------------------------------------------------------------------- /examples/template_falcon_180b.jinja: -------------------------------------------------------------------------------- 1 | {%- for message in messages -%} 2 | {%- if message['role'] == 'system' -%} 3 | {{- 'System: ' + message['content'] -}} 4 | {%- elif message['role'] == 'user' -%} 5 | {{- 'User: ' + message['content'] -}} 6 | {%- elif message['role'] == 'assistant' -%} 7 | {{- 'Falcon: ' + message['content'] -}} 8 | {%- endif -%} 9 | {%- if (loop.last and add_generation_prompt) or not loop.last -%} 10 | {{- '\n' -}} 11 | {%- endif -%} 12 | {%- endfor -%} 13 | 14 | 15 | {%- if add_generation_prompt and 
messages[-1]['role'] != 'assistant' -%} 16 | {{- 'Falcon:' -}} 17 | {% endif %} -------------------------------------------------------------------------------- /examples/template_inkbot.jinja: -------------------------------------------------------------------------------- 1 | <#meta#> 2 | - Date: {{ (messages|selectattr('role', 'equalto', 'meta-current_date')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-current_date')|list) else '' }} 3 | - Task: {{ (messages|selectattr('role', 'equalto', 'meta-task_name')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-task_name')|list) else '' }} 4 | <#system#> 5 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 6 | <#chat#> 7 | {% for message in messages %} 8 | {% if message['role'] == 'user' %} 9 | <#user#> 10 | {{ message['content']|trim -}} 11 | {% if not loop.last %} 12 | 13 | {% endif %} 14 | {% elif message['role'] == 'assistant' %} 15 | <#bot#> 16 | {{ message['content']|trim -}} 17 | {% if not loop.last %} 18 | 19 | {% endif %} 20 | {% elif message['role'] == 'user_context' %} 21 | <#user_context#> 22 | {{ message['content']|trim -}} 23 | {% if not loop.last %} 24 | 25 | {% endif %} 26 | {% endif %} 27 | {% endfor %} 28 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} 29 | <#bot#> 30 | {% endif %} -------------------------------------------------------------------------------- /musa_porting.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup, find_packages 3 | from torch_musa.utils.simple_porting import SimplePorting 4 | from torch_musa.utils.musa_extension import MUSAExtension 5 | 6 | SimplePorting(cuda_dir_path="./csrc", mapping_rule={ 7 | "x.device().is_cuda()": "true", 8 | "#include <ATen/cuda/CUDAContext.h>": "#include \"torch_musa/csrc/aten/musa/MUSAContext.h\"", 9 | "#include <c10/cuda/CUDAGuard.h>": "#include \"torch_musa/csrc/core/MUSAGuard.h\"", 10 | "#include <c10/cuda/CUDAException.h>": "#include \"torch_musa/csrc/core/MUSAException.h\"", 11 | "#include <c10/cuda/CUDAStream.h>": "#include \"torch_musa/csrc/core/MUSAStream.h\"", 12 | "at::kCUDA": "at::musa::kMUSA", 13 | "at::cuda::getCurrentCUDAStream()": "at::musa::getCurrentMUSAStream()", 14 | "__nv_bfloat16": "__mt_bfloat16", 15 | "at::cuda::OptionalCUDAGuard": "at::musa::OptionalMUSAGuard", 16 | "at::cuda::getCurrentCUDABlasHandle()": "at::musa::getCurrentMUSABlasHandle()", 17 | "ATen/cuda/CUDATensorMethods.cuh": "ATen/musa/MUSA_PORT_TensorMethods.muh", 18 | "#include \"attention_generic.cuh\"": "#include \"attention_generic.muh\"", 19 | "#include \"reduction_utils.cuh\"": "#include \"reduction_utils.muh\"", 20 | "#include ": "#include ", 21 | "#include \"dtype_float16.cuh\"": "#include \"dtype_float16.muh\"", 22 | "#include \"dtype_float32.cuh\"": "#include \"dtype_float32.muh\"", 23 | "#include \"custom_all_reduce.cuh\"": "#include \"custom_all_reduce.muh\"", 24 | "#include \"dtype_bfloat16.cuh\"": "#include \"dtype_bfloat16.muh\"", 25 | "#include \"dtype_fp8.cuh\"": "#include \"dtype_fp8.muh\"", 26 | "#include \"attention_utils.cuh\"": "#include \"attention_utils.muh\"", 27 | "cuPointerGetAttribute": "muPointerGetAttribute", 28 | "CUdeviceptr": "MUdeviceptr", 29 | "CUDA_SUCCESS": "MUSA_SUCCESS", 30 | "CU_POINTER_ATTRIBUTE_RANGE_START_ADDR": "MU_POINTER_ATTRIBUTE_RANGE_START_ADDR", 31 | "c10::cuda": "c10::musa", 32 | "cudaStreamIsCapturing": "at::musa::musaStreamIsCapturing", 33 | "AT_CUDA_CHECK": "C10_MUSA_CHECK", 34 |
"nv_bfloat16": "mt_bfloat16", 35 | "struct __align__(16) RankData { const void *__restrict__ ptrs[8]; };":"struct __align__(16) RankData { const void *__restrict__ ptrs[8]; RankData& operator=(const RankData& ){return *this;} };" 36 | }).run() -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | # Should be mirrored in requirements-build.txt 3 | requires = [ 4 | "cmake>=3.21", 5 | "ninja", 6 | "packaging", 7 | "setuptools >= 49.4.0", 8 | "torch == 2.2.0", 9 | "wheel", 10 | ] 11 | build-backend = "setuptools.build_meta" 12 | 13 | [tool.ruff] 14 | # Allow lines to be as long as 80. 15 | line-length = 80 16 | exclude = [ 17 | # External file, leaving license intact 18 | "examples/fp8/quantizer/quantize.py" 19 | ] 20 | 21 | [tool.ruff.lint] 22 | select = [ 23 | # pycodestyle 24 | "E", 25 | # Pyflakes 26 | "F", 27 | # pyupgrade 28 | # "UP", 29 | # flake8-bugbear 30 | "B", 31 | # flake8-simplify 32 | "SIM", 33 | # isort 34 | # "I", 35 | "G", 36 | ] 37 | ignore = [ 38 | # star imports 39 | "F405", "F403", 40 | # lambda expression assignment 41 | "E731", 42 | # Loop control variable not used within loop body 43 | "B007", 44 | ] 45 | 46 | [tool.mypy] 47 | python_version = "3.9" 48 | 49 | ignore_missing_imports = true 50 | check_untyped_defs = true 51 | follow_imports = "skip" 52 | 53 | files = "vllm" 54 | # TODO(woosuk): Include the code from Megatron and HuggingFace. 55 | exclude = [ 56 | "vllm/model_executor/parallel_utils/|vllm/model_executor/models/", 57 | # Ignore triton kernels in ops. 58 | 'vllm/attention/ops/.*\.py$' 59 | ] 60 | 61 | [tool.codespell] 62 | ignore-words-list = "dout, te, indicies" 63 | skip = "./tests/prompts,./benchmarks/sonnet.txt" 64 | 65 | [tool.isort] 66 | use_parentheses = true 67 | skip_gitignore = true 68 | -------------------------------------------------------------------------------- /requirements-build.txt: -------------------------------------------------------------------------------- 1 | # Should be mirrored in pyproject.toml 2 | cmake>=3.21 3 | ninja 4 | packaging 5 | setuptools>=49.4.0 6 | torch==2.2.0 7 | wheel 8 | triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error. -------------------------------------------------------------------------------- /requirements-common.txt: -------------------------------------------------------------------------------- 1 | cmake >= 3.21 2 | ninja # For faster builds. 3 | psutil 4 | sentencepiece # Required for LLaMA tokenizer. 5 | numpy 6 | requests 7 | py-cpuinfo 8 | transformers >= 4.40.0 # Required for StarCoder2 & Llava, Llama 3. 9 | tokenizers >= 0.19.1 # Required for Llama 3. 10 | fastapi 11 | openai 12 | uvicorn[standard] 13 | pydantic >= 2.0 # Required for OpenAI server. 
14 | prometheus_client >= 0.18.0 15 | prometheus-fastapi-instrumentator >= 7.0.0 16 | tiktoken == 0.6.0 # Required for DBRX tokenizer 17 | lm-format-enforcer == 0.9.8 18 | outlines == 0.0.34 # Requires torch >= 2.1.0 19 | typing_extensions 20 | filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 21 | -------------------------------------------------------------------------------- /requirements-cpu.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for x86_64 CPUs 5 | torch == 2.3.0+cpu 6 | triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error. -------------------------------------------------------------------------------- /requirements-cuda.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for NVIDIA GPUs 5 | ray >= 2.9 6 | nvidia-ml-py # for pynvml package 7 | vllm-nccl-cu12>=2.18,<2.19 # for downloading nccl library 8 | torch == 2.3.0 9 | xformers == 0.0.26.post1 # Requires PyTorch 2.3.0 10 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # formatting 2 | yapf==0.32.0 3 | toml==0.10.2 4 | tomli==2.0.1 5 | ruff==0.1.5 6 | codespell==2.2.6 7 | isort==5.13.2 8 | 9 | # type checking 10 | mypy==1.9.0 11 | types-PyYAML 12 | types-requests 13 | types-setuptools 14 | 15 | # testing 16 | pytest 17 | tensorizer==2.9.0 18 | pytest-forked 19 | pytest-asyncio 20 | pytest-rerunfailures 21 | pytest-shard 22 | httpx 23 | einops # required for MPT 24 | requests 25 | ray 26 | peft 27 | awscli 28 | 29 | # Benchmarking 30 | aiohttp 31 | 32 | # Multimodal 33 | pillow 34 | -------------------------------------------------------------------------------- /requirements-musa.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for MTHREADS GPUs 5 | ray >= 2.9 6 | torch == 2.2.0 7 | triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error. 
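# Note: torch==2.2.0 above is assumed to match the torch_musa build used for MTHREADS GPUs
# (musa_porting.py imports torch_musa); torch_musa itself is not pinned here and is expected to be
# installed separately.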
8 | -------------------------------------------------------------------------------- /requirements-neuron.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for Neuron devices 5 | transformers-neuronx >= 0.9.0 6 | torch-neuronx >= 2.1.0 7 | neuronx-cc 8 | -------------------------------------------------------------------------------- /requirements-rocm.txt: -------------------------------------------------------------------------------- 1 | # Common dependencies 2 | -r requirements-common.txt 3 | 4 | # Dependencies for AMD GPUs 5 | ray == 2.9.3 6 | -------------------------------------------------------------------------------- /rocm_patch/rocm_bf16.patch: -------------------------------------------------------------------------------- 1 | --- amd_hip_bf16.h 2024-02-06 18:28:58.268699142 +0000 2 | +++ amd_hip_bf16.h.new 2024-02-06 18:28:31.988647133 +0000 3 | @@ -90,10 +90,10 @@ 4 | #include "math_fwd.h" // ocml device functions 5 | 6 | #if defined(__HIPCC_RTC__) 7 | -#define __HOST_DEVICE__ __device__ 8 | +#define __HOST_DEVICE__ __device__ static 9 | #else 10 | #include 11 | -#define __HOST_DEVICE__ __host__ __device__ 12 | +#define __HOST_DEVICE__ __host__ __device__ static inline 13 | #endif 14 | 15 | // Since we are using unsigned short to represent data in bfloat16, it can be of different sizes on 16 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/tests/__init__.py -------------------------------------------------------------------------------- /tests/async_engine/api_server_async_engine.py: -------------------------------------------------------------------------------- 1 | """vllm.entrypoints.api_server with some extra logging for testing.""" 2 | import argparse 3 | from typing import Any, Dict 4 | 5 | import uvicorn 6 | from fastapi.responses import JSONResponse, Response 7 | 8 | import vllm.entrypoints.api_server 9 | from vllm.engine.arg_utils import AsyncEngineArgs 10 | from vllm.engine.async_llm_engine import AsyncLLMEngine 11 | 12 | app = vllm.entrypoints.api_server.app 13 | 14 | 15 | class AsyncLLMEngineWithStats(AsyncLLMEngine): 16 | 17 | def __init__(self, *args, **kwargs): 18 | super().__init__(*args, **kwargs) 19 | self._num_aborts = 0 20 | 21 | async def abort(self, request_id: str) -> None: 22 | await super().abort(request_id) 23 | self._num_aborts += 1 24 | 25 | def testing_stats(self) -> Dict[str, Any]: 26 | return {"num_aborted_requests": self._num_aborts} 27 | 28 | 29 | @app.get("/stats") 30 | def stats() -> Response: 31 | """Get the statistics of the engine.""" 32 | return JSONResponse(engine.testing_stats()) 33 | 34 | 35 | if __name__ == "__main__": 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument("--host", type=str, default="localhost") 38 | parser.add_argument("--port", type=int, default=8000) 39 | parser = AsyncEngineArgs.add_cli_args(parser) 40 | args = parser.parse_args() 41 | 42 | engine_args = AsyncEngineArgs.from_cli_args(args) 43 | engine = AsyncLLMEngineWithStats.from_engine_args(engine_args) 44 | vllm.entrypoints.api_server.engine = engine 45 | uvicorn.run( 46 | app, 47 | host=args.host, 48 | port=args.port, 49 | log_level="debug", 50 | 
timeout_keep_alive=vllm.entrypoints.api_server.TIMEOUT_KEEP_ALIVE) 51 | -------------------------------------------------------------------------------- /tests/async_engine/test_merge_async_iterators.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import AsyncIterator, Tuple 3 | 4 | import pytest 5 | 6 | from vllm.utils import merge_async_iterators 7 | 8 | 9 | @pytest.mark.asyncio 10 | async def test_merge_async_iterators(): 11 | 12 | async def mock_async_iterator(idx: int) -> AsyncIterator[str]: 13 | try: 14 | while True: 15 | yield f"item from iterator {idx}" 16 | await asyncio.sleep(0.1) 17 | except asyncio.CancelledError: 18 | pass 19 | 20 | iterators = [mock_async_iterator(i) for i in range(3)] 21 | merged_iterator: AsyncIterator[Tuple[int, str]] = merge_async_iterators( 22 | *iterators) 23 | 24 | async def stream_output(generator: AsyncIterator[Tuple[int, str]]): 25 | async for idx, output in generator: 26 | print(f"idx: {idx}, output: {output}") 27 | 28 | task = asyncio.create_task(stream_output(merged_iterator)) 29 | await asyncio.sleep(0.5) 30 | task.cancel() 31 | with pytest.raises(asyncio.CancelledError): 32 | await task 33 | 34 | for iterator in iterators: 35 | try: 36 | await asyncio.wait_for(anext(iterator), 1) 37 | except StopAsyncIteration: 38 | # All iterators should be cancelled and print this message. 39 | print("Iterator was cancelled normally") 40 | except (Exception, asyncio.CancelledError) as e: 41 | raise AssertionError() from e 42 | -------------------------------------------------------------------------------- /tests/basic_correctness/test_basic_correctness.py: -------------------------------------------------------------------------------- 1 | """Compare the short outputs of HF and vLLM when using greedy sampling. 2 | 3 | Run `pytest tests/basic_correctness/test_basic_correctness.py`. 
4 | """ 5 | import os 6 | 7 | import pytest 8 | 9 | MODELS = [ 10 | "facebook/opt-125m", 11 | "meta-llama/Llama-2-7b-hf", 12 | ] 13 | VLLM_ATTENTION_BACKEND = "VLLM_ATTENTION_BACKEND" 14 | 15 | 16 | @pytest.mark.parametrize("model", MODELS) 17 | @pytest.mark.parametrize("dtype", ["half"]) 18 | @pytest.mark.parametrize("max_tokens", [5]) 19 | @pytest.mark.parametrize("enforce_eager", [False, True]) 20 | def test_models( 21 | hf_runner, 22 | vllm_runner, 23 | example_prompts, 24 | model: str, 25 | dtype: str, 26 | max_tokens: int, 27 | enforce_eager: bool, 28 | ) -> None: 29 | backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND) 30 | if backend_by_env_var == "FLASHINFER" and enforce_eager is False: 31 | pytest.skip("Skipping non-eager test for FlashInferBackend.") 32 | 33 | hf_model = hf_runner(model, dtype=dtype) 34 | hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) 35 | del hf_model 36 | 37 | vllm_model = vllm_runner(model, 38 | dtype=dtype, 39 | enforce_eager=enforce_eager, 40 | gpu_memory_utilization=0.7) 41 | vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) 42 | del vllm_model 43 | 44 | for i in range(len(example_prompts)): 45 | hf_output_ids, hf_output_str = hf_outputs[i] 46 | vllm_output_ids, vllm_output_str = vllm_outputs[i] 47 | assert hf_output_str == vllm_output_str, ( 48 | f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") 49 | assert hf_output_ids == vllm_output_ids, ( 50 | f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") 51 | -------------------------------------------------------------------------------- /tests/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/tests/core/__init__.py -------------------------------------------------------------------------------- /tests/core/block/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/tests/core/block/__init__.py -------------------------------------------------------------------------------- /tests/core/block/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.fixture() 5 | def should_do_global_cleanup_after_test() -> bool: 6 | """Disable the global cleanup fixture for tests in this directory. This 7 | provides a ~10x speedup for unit tests that don't load a model to GPU. 8 | 9 | This requires that tests in this directory clean up after themselves if they 10 | use the GPU. 
11 | """ 12 | return False 13 | -------------------------------------------------------------------------------- /tests/core/block/e2e/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from tests.conftest import cleanup 4 | from vllm import LLM 5 | from vllm.model_executor.utils import set_random_seed 6 | 7 | 8 | @pytest.fixture 9 | def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, 10 | baseline_llm_kwargs, seed): 11 | return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, 12 | baseline_llm_kwargs, seed) 13 | 14 | 15 | @pytest.fixture 16 | def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, 17 | test_llm_kwargs, seed): 18 | return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, 19 | test_llm_kwargs, seed) 20 | 21 | 22 | def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, 23 | distinct_llm_kwargs, seed): 24 | kwargs = { 25 | **common_llm_kwargs, 26 | **per_test_common_llm_kwargs, 27 | **distinct_llm_kwargs, 28 | } 29 | 30 | def generator_inner(): 31 | llm = LLM(**kwargs) 32 | 33 | set_random_seed(seed) 34 | 35 | yield llm 36 | del llm 37 | cleanup() 38 | 39 | for llm in generator_inner(): 40 | yield llm 41 | del llm 42 | -------------------------------------------------------------------------------- /tests/core/block/test_common.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import pytest 4 | 5 | from vllm.core.block.common import RefCounter 6 | 7 | 8 | @pytest.mark.parametrize("seed", list(range(20))) 9 | @pytest.mark.parametrize("num_incrs", [1, 100]) 10 | @pytest.mark.parametrize("num_blocks", [1024]) 11 | def test_incr(seed: int, num_incrs: int, num_blocks: int): 12 | random.seed(seed) 13 | 14 | all_block_indices = list(range(num_blocks)) 15 | counter = RefCounter(all_block_indices=all_block_indices) 16 | 17 | block_id = random.randint(0, num_blocks - 1) 18 | for i in range(num_incrs): 19 | value = counter.incr(block_id) 20 | assert value == i + 1 21 | 22 | 23 | @pytest.mark.parametrize("seed", list(range(20))) 24 | @pytest.mark.parametrize("num_incrs", [1, 100]) 25 | @pytest.mark.parametrize("num_blocks", [1024]) 26 | def test_incr_decr(seed: int, num_incrs: int, num_blocks: int): 27 | random.seed(seed) 28 | 29 | all_block_indices = list(range(num_blocks)) 30 | counter = RefCounter(all_block_indices=all_block_indices) 31 | 32 | block_id = random.randint(0, num_blocks - 1) 33 | for i in range(num_incrs): 34 | value = counter.incr(block_id) 35 | assert value == i + 1 36 | 37 | for i in range(num_incrs): 38 | value = counter.decr(block_id) 39 | assert value == num_incrs - (i + 1) 40 | 41 | with pytest.raises(AssertionError): 42 | counter.decr(block_id) 43 | -------------------------------------------------------------------------------- /tests/core/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Iterable, Optional, Tuple 3 | 4 | from vllm import SamplingParams 5 | from vllm.lora.request import LoRARequest 6 | from vllm.sequence import Logprob, Sequence, SequenceGroup 7 | 8 | 9 | def create_dummy_prompt( 10 | request_id: str, 11 | prompt_length: int, 12 | block_size: Optional[int] = None, 13 | lora_request: Optional[LoRARequest] = None, 14 | use_beam_search: bool = False, 15 | best_of: int = 1, 16 | ) -> Tuple[Sequence, SequenceGroup]: 17 | if not block_size: 18 | block_size = 
prompt_length 19 | 20 | # Create dummy prompt sequence with tokens 0...block_size-1 21 | # and prompt "0 ... block_size". 22 | prompt_tokens = list(range(prompt_length)) 23 | prompt_str = " ".join([str(t) for t in prompt_tokens]) 24 | prompt = Sequence(int(request_id), prompt_str, prompt_tokens, block_size) 25 | seq_group = SequenceGroup( 26 | request_id, [prompt], 27 | SamplingParams(use_beam_search=use_beam_search, best_of=best_of), 28 | time.time(), lora_request) 29 | 30 | return prompt, seq_group 31 | 32 | 33 | def create_seq_group( 34 | seq_prompt_len: int = 1024, 35 | seq_output_lens: Iterable[int] = (128, ), 36 | request_id: str = '0', 37 | seq_id_start: int = 0, 38 | sampling_params: Optional[SamplingParams] = None) -> SequenceGroup: 39 | 40 | assert len(seq_output_lens) > 0 41 | 42 | if sampling_params is None: 43 | sampling_params = SamplingParams() 44 | 45 | prompt_token_ids = [0] * seq_prompt_len 46 | 47 | seqs = [] 48 | for seq_id_offset, output_len in enumerate(seq_output_lens): 49 | seq = Sequence( 50 | seq_id=seq_id_start + seq_id_offset, 51 | prompt="", 52 | prompt_token_ids=prompt_token_ids, 53 | block_size=16, 54 | ) 55 | 56 | for i in range(output_len): 57 | seq.append_token_id( 58 | token_id=i, 59 | logprobs={i: Logprob(0.0)}, 60 | ) 61 | seqs.append(seq) 62 | 63 | seq_group = SequenceGroup( 64 | request_id=request_id, 65 | seqs=seqs, 66 | sampling_params=sampling_params, 67 | arrival_time=time.time(), 68 | ) 69 | 70 | return seq_group 71 | 72 | 73 | def round_up_to_next_block(seq_len: int, block_size: int) -> int: 74 | return (seq_len + block_size - 1) // block_size 75 | -------------------------------------------------------------------------------- /tests/distributed/test_basic_distributed_correctness.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and distributed vLLM when using greedy sampling. 2 | vLLM will allocate all the available memory, so we need to run the tests one 3 | by one. The solution is to pass arguments (model name) by environment 4 | variables. 
5 | Run: 6 | ```sh 7 | TEST_DIST_MODEL=facebook/opt-125m pytest \ 8 | test_basic_distributed_correctness.py 9 | TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \ 10 | test_basic_distributed_correctness.py 11 | ``` 12 | """ 13 | import os 14 | 15 | import pytest 16 | import torch 17 | 18 | MODELS = [ 19 | os.environ["TEST_DIST_MODEL"], 20 | ] 21 | VLLM_ATTENTION_BACKEND = "VLLM_ATTENTION_BACKEND" 22 | 23 | 24 | @pytest.mark.skipif(torch.cuda.device_count() < 2, 25 | reason="Need at least 2 GPUs to run the test.") 26 | @pytest.mark.parametrize("model", MODELS) 27 | @pytest.mark.parametrize("dtype", ["half"]) 28 | @pytest.mark.parametrize("max_tokens", [5]) 29 | def test_models( 30 | hf_runner, 31 | vllm_runner, 32 | example_prompts, 33 | model: str, 34 | dtype: str, 35 | max_tokens: int, 36 | ) -> None: 37 | enforce_eager = False 38 | backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND) 39 | if backend_by_env_var == "FLASHINFER": 40 | enforce_eager = True 41 | 42 | hf_model = hf_runner(model, dtype=dtype) 43 | hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) 44 | del hf_model 45 | 46 | vllm_model = vllm_runner(model, 47 | dtype=dtype, 48 | tensor_parallel_size=2, 49 | enforce_eager=enforce_eager) 50 | vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) 51 | del vllm_model 52 | 53 | for i in range(len(example_prompts)): 54 | hf_output_ids, hf_output_str = hf_outputs[i] 55 | vllm_output_ids, vllm_output_str = vllm_outputs[i] 56 | assert hf_output_str == vllm_output_str, ( 57 | f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") 58 | assert hf_output_ids == vllm_output_ids, ( 59 | f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") 60 | -------------------------------------------------------------------------------- /tests/distributed/test_chunked_prefill_distributed.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and distributed vLLM when using greedy sampling. 2 | vLLM will allocate all the available memory, so we need to run the tests one 3 | by one. The solution is to pass arguments (model name) by environment 4 | variables. 5 | 6 | Run: 7 | ```sh 8 | TEST_DIST_MODEL=facebook/opt-125m pytest \ 9 | test_chunked_prefill_distributed.py 10 | TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \ 11 | test_chunked_prefill_distributed.py 12 | ``` 13 | """ 14 | import os 15 | 16 | import pytest 17 | import torch 18 | 19 | MODELS = [ 20 | os.environ["TEST_DIST_MODEL"], 21 | ] 22 | 23 | 24 | @pytest.mark.skipif(torch.cuda.device_count() < 2, 25 | reason="Need at least 2 GPUs to run the test.") 26 | @pytest.mark.parametrize("model", MODELS) 27 | @pytest.mark.parametrize("dtype", ["half"]) 28 | @pytest.mark.parametrize("max_tokens", [5]) 29 | @pytest.mark.parametrize("chunked_prefill_token_size", [16]) 30 | def test_models( 31 | hf_runner, 32 | vllm_runner, 33 | example_prompts, 34 | model: str, 35 | dtype: str, 36 | max_tokens: int, 37 | chunked_prefill_token_size: int, 38 | ) -> None: 39 | # Add a chunked prefill config. 
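# With chunked prefill enabled, max_num_batched_tokens acts as the per-step token budget, so prompts
# longer than chunked_prefill_token_size are prefilled across several scheduler steps; capping
# max_num_seqs at the same value below keeps the number of scheduled sequences within that budget.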
40 | max_num_seqs = min(chunked_prefill_token_size, 256) 41 | assert chunked_prefill_token_size != -1 42 | enable_chunked_prefill = True 43 | max_num_batched_tokens = chunked_prefill_token_size 44 | 45 | hf_model = hf_runner(model, dtype=dtype) 46 | hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) 47 | del hf_model 48 | 49 | vllm_model = vllm_runner( 50 | model, 51 | dtype=dtype, 52 | tensor_parallel_size=2, 53 | max_num_seqs=max_num_seqs, 54 | enable_chunked_prefill=enable_chunked_prefill, 55 | max_num_batched_tokens=max_num_batched_tokens, 56 | ) 57 | vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) 58 | del vllm_model 59 | 60 | for i in range(len(example_prompts)): 61 | hf_output_ids, hf_output_str = hf_outputs[i] 62 | vllm_output_ids, vllm_output_str = vllm_outputs[i] 63 | assert hf_output_str == vllm_output_str, ( 64 | f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") 65 | assert hf_output_ids == vllm_output_ids, ( 66 | f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") 67 | -------------------------------------------------------------------------------- /tests/distributed/test_pynccl_library.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import tempfile 3 | 4 | 5 | def target_fn(env, filepath): 6 | from vllm.utils import update_environment_variables 7 | update_environment_variables(env) 8 | from vllm.utils import nccl_integrity_check 9 | nccl_integrity_check(filepath) 10 | 11 | 12 | def test_library_file(): 13 | # note: don't import vllm.distributed.device_communicators.pynccl 14 | # before running this test, otherwise the library file will be loaded 15 | # and it might interfere with the test 16 | from vllm.utils import find_nccl_library 17 | so_file = find_nccl_library() 18 | with open(so_file, 'rb') as f: 19 | content = f.read() 20 | try: 21 | # corrupt the library file, should raise an exception 22 | with open(so_file, 'wb') as f: 23 | f.write(content[:len(content) // 2]) 24 | p = multiprocessing.Process(target=target_fn, args=({}, so_file)) 25 | p.start() 26 | p.join() 27 | assert p.exitcode != 0 28 | 29 | # move the library file to a tmp path 30 | # test VLLM_NCCL_SO_PATH 31 | fd, path = tempfile.mkstemp() 32 | with open(path, 'wb') as f: 33 | f.write(content) 34 | p = multiprocessing.Process(target=target_fn, 35 | args=({ 36 | "VLLM_NCCL_SO_PATH": path 37 | }, path)) 38 | p.start() 39 | p.join() 40 | assert p.exitcode == 0 41 | finally: 42 | with open(so_file, 'wb') as f: 43 | f.write(content) 44 | -------------------------------------------------------------------------------- /tests/engine/test_computed_prefix_blocks.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.engine.arg_utils import EngineArgs 4 | from vllm.engine.llm_engine import LLMEngine 5 | from vllm.sampling_params import SamplingParams 6 | 7 | 8 | @pytest.mark.parametrize("model", ["facebook/opt-125m"]) 9 | @pytest.mark.parametrize("block_size", [16]) 10 | def test_computed_prefix_blocks(model: str, block_size: int): 11 | # This test checks if we are able to run the engine to completion 12 | # without triggering asserts. 13 | # We are in a scenario where all blocks from the second request's prompt 14 | # are full and already computed when the second request arrives. 15 | prompt = ( 16 | "You are a helpful assistant. How do I build a car from cardboard and " 17 | "paper clips? 
Is there an easy to follow video tutorial available " 18 | "online for free?") 19 | prompt2 = ( 20 | " Please recommend to me some resources where I can learn not only to " 21 | "handle technical difficulties of building a car, but also " 22 | "decoration.") 23 | 24 | engine_args = EngineArgs(model=model, 25 | block_size=block_size, 26 | enable_prefix_caching=True) 27 | 28 | engine = LLMEngine.from_engine_args(engine_args) 29 | sampling_params = SamplingParams() 30 | 31 | engine.add_request("0", prompt + prompt2, sampling_params) 32 | engine.step() 33 | engine.add_request("1", prompt, sampling_params) 34 | engine.step() 35 | -------------------------------------------------------------------------------- /tests/engine/test_detokenization.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.entrypoints.llm import LLM 4 | from vllm.sampling_params import SamplingParams 5 | 6 | 7 | @pytest.mark.parametrize("model", ["facebook/opt-125m"]) 8 | def test_computed_prefix_blocks(model: str): 9 | # This test checks if the engine generates completions both with and 10 | # without optional detokenization, that detokenization includes text 11 | # and no-detokenization doesn't, and that both completions have the same 12 | # token_ids. 13 | prompt = ( 14 | "You are a helpful assistant. How do I build a car from cardboard and " 15 | "paper clips? Is there an easy to follow video tutorial available " 16 | "online for free?") 17 | 18 | llm = LLM(model=model) 19 | sampling_params = SamplingParams(max_tokens=10, 20 | temperature=0.0, 21 | detokenize=False) 22 | 23 | outputs_no_detokenization = llm.generate(prompt, 24 | sampling_params)[0].outputs[0] 25 | sampling_params.detokenize = True 26 | outputs_with_detokenization = llm.generate(prompt, 27 | sampling_params)[0].outputs[0] 28 | 29 | assert outputs_no_detokenization.text == '' 30 | assert outputs_with_detokenization.text != '' 31 | assert outputs_no_detokenization.token_ids == \ 32 | outputs_with_detokenization.token_ids 33 | -------------------------------------------------------------------------------- /tests/engine/test_skip_tokenizer_init.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.entrypoints.llm import LLM 4 | from vllm.sampling_params import SamplingParams 5 | 6 | 7 | @pytest.mark.parametrize("model", ["facebook/opt-125m"]) 8 | def test_skip_tokenizer_initialization(model: str): 9 | # This test checks if the flag skip_tokenizer_init skips the initialization 10 | # of tokenizer and detokenizer. The generated output is expected to contain 11 | # token ids. 12 | llm = LLM(model=model, skip_tokenizer_init=True) 13 | sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True) 14 | with pytest.raises(ValueError) as err: 15 | llm.generate("abc", sampling_params) 16 | assert "prompts must be None if" in str(err.value) 17 | outputs = llm.generate(prompt_token_ids=[[1, 2, 3]], 18 | sampling_params=sampling_params) 19 | assert len(outputs) > 0 20 | completions = outputs[0].outputs 21 | assert len(completions) > 0 22 | assert completions[0].text == "" 23 | assert completions[0].token_ids 24 | -------------------------------------------------------------------------------- /tests/engine/test_stop_reason.py: -------------------------------------------------------------------------------- 1 | """Test the different finish_reason="stop" situations during generation: 2 | 1. 
One of the provided stop strings 3 | 2. One of the provided stop tokens 4 | 3. The EOS token 5 | 6 | Run `pytest tests/engine/test_stop_reason.py`. 7 | """ 8 | 9 | import pytest 10 | import transformers 11 | 12 | from vllm import SamplingParams 13 | 14 | MODEL = "facebook/opt-350m" 15 | STOP_STR = "." 16 | SEED = 42 17 | MAX_TOKENS = 1024 18 | 19 | 20 | @pytest.fixture 21 | def vllm_model(vllm_runner): 22 | vllm_model = vllm_runner(MODEL) 23 | yield vllm_model 24 | del vllm_model 25 | 26 | 27 | def test_stop_reason(vllm_model, example_prompts): 28 | tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL) 29 | stop_token_id = tokenizer.convert_tokens_to_ids(STOP_STR) 30 | llm = vllm_model.model 31 | 32 | # test stop token 33 | outputs = llm.generate(example_prompts, 34 | sampling_params=SamplingParams( 35 | seed=SEED, 36 | max_tokens=MAX_TOKENS, 37 | stop_token_ids=[stop_token_id])) 38 | for output in outputs: 39 | output = output.outputs[0] 40 | assert output.finish_reason == "stop" 41 | assert output.stop_reason == stop_token_id 42 | 43 | # test stop string 44 | outputs = llm.generate(example_prompts, 45 | sampling_params=SamplingParams( 46 | seed=SEED, max_tokens=MAX_TOKENS, stop=".")) 47 | for output in outputs: 48 | output = output.outputs[0] 49 | assert output.finish_reason == "stop" 50 | assert output.stop_reason == STOP_STR 51 | 52 | # test EOS token 53 | outputs = llm.generate(example_prompts, 54 | sampling_params=SamplingParams( 55 | seed=SEED, max_tokens=MAX_TOKENS)) 56 | for output in outputs: 57 | output = output.outputs[0] 58 | assert output.finish_reason == "length" or ( 59 | output.finish_reason == "stop" and output.stop_reason is None) 60 | -------------------------------------------------------------------------------- /tests/entrypoints/openai/test_serving_chat.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from dataclasses import dataclass 3 | 4 | from vllm.entrypoints.openai.serving_chat import OpenAIServingChat 5 | 6 | MODEL_NAME = "openai-community/gpt2" 7 | CHAT_TEMPLATE = "Dummy chat template for testing {}" 8 | 9 | 10 | @dataclass 11 | class MockModelConfig: 12 | tokenizer = MODEL_NAME 13 | trust_remote_code = False 14 | tokenizer_mode = "auto" 15 | max_model_len = 100 16 | tokenizer_revision = None 17 | 18 | 19 | @dataclass 20 | class MockEngine: 21 | 22 | async def get_model_config(self): 23 | return MockModelConfig 24 | 25 | 26 | async def _async_serving_chat_init(): 27 | serving_completion = OpenAIServingChat(MockEngine(), 28 | served_model_names=[MODEL_NAME], 29 | response_role="assistant", 30 | chat_template=CHAT_TEMPLATE) 31 | return serving_completion 32 | 33 | 34 | def test_async_serving_chat_init(): 35 | serving_completion = asyncio.run(_async_serving_chat_init()) 36 | assert serving_completion.tokenizer is not None 37 | assert serving_completion.tokenizer.chat_template == CHAT_TEMPLATE 38 | -------------------------------------------------------------------------------- /tests/entrypoints/test_llm_generate.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm import LLM, SamplingParams 4 | 5 | 6 | def test_multiple_sampling_params(): 7 | 8 | llm = LLM(model="facebook/opt-125m", 9 | max_num_batched_tokens=4096, 10 | tensor_parallel_size=1) 11 | 12 | prompts = [ 13 | "Hello, my name is", 14 | "The president of the United States is", 15 | "The capital of France is", 16 | "The future of AI is", 17 | ] 18 | 19 | sampling_params 
= [ 20 | SamplingParams(temperature=0.01, top_p=0.95), 21 | SamplingParams(temperature=0.3, top_p=0.95), 22 | SamplingParams(temperature=0.7, top_p=0.95), 23 | SamplingParams(temperature=0.99, top_p=0.95), 24 | ] 25 | 26 | # Multiple SamplingParams should be matched with each prompt 27 | outputs = llm.generate(prompts, sampling_params=sampling_params) 28 | assert len(prompts) == len(outputs) 29 | 30 | # Exception raised, if the size of params does not match the size of prompts 31 | with pytest.raises(ValueError): 32 | outputs = llm.generate(prompts, sampling_params=sampling_params[:3]) 33 | 34 | # Single SamplingParams should be applied to every prompt 35 | single_sampling_params = SamplingParams(temperature=0.3, top_p=0.95) 36 | outputs = llm.generate(prompts, sampling_params=single_sampling_params) 37 | assert len(prompts) == len(outputs) 38 | 39 | # sampling_params is None, default params should be applied 40 | outputs = llm.generate(prompts, sampling_params=None) 41 | assert len(prompts) == len(outputs) -------------------------------------------------------------------------------- /tests/entrypoints/test_server_oot_registration.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import sys 3 | import time 4 | 5 | import torch 6 | from openai import OpenAI, OpenAIError 7 | 8 | from vllm import ModelRegistry 9 | from vllm.model_executor.models.opt import OPTForCausalLM 10 | from vllm.model_executor.sampling_metadata import SamplingMetadata 11 | from vllm.utils import get_open_port 12 | 13 | 14 | class MyOPTForCausalLM(OPTForCausalLM): 15 | 16 | def compute_logits(self, hidden_states: torch.Tensor, 17 | sampling_metadata: SamplingMetadata) -> torch.Tensor: 18 | # this dummy model always predicts the first token 19 | logits = super().compute_logits(hidden_states, sampling_metadata) 20 | logits.zero_() 21 | logits[:, 0] += 1.0 22 | return logits 23 | 24 | 25 | def server_function(port): 26 | # register our dummy model 27 | ModelRegistry.register_model("OPTForCausalLM", MyOPTForCausalLM) 28 | sys.argv = ["placeholder.py"] + \ 29 | ("--model facebook/opt-125m --dtype" 30 | f" float32 --api-key token-abc123 --port {port}").split() 31 | import runpy 32 | runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__') 33 | 34 | 35 | def test_oot_registration_for_api_server(): 36 | port = get_open_port() 37 | server = multiprocessing.Process(target=server_function, args=(port, )) 38 | server.start() 39 | client = OpenAI( 40 | base_url=f"http://localhost:{port}/v1", 41 | api_key="token-abc123", 42 | ) 43 | while True: 44 | try: 45 | completion = client.chat.completions.create( 46 | model="facebook/opt-125m", 47 | messages=[{ 48 | "role": "system", 49 | "content": "You are a helpful assistant." 50 | }, { 51 | "role": "user", 52 | "content": "Hello!" 
53 | }], 54 | temperature=0, 55 | ) 56 | break 57 | except OpenAIError as e: 58 | if "Connection error" in str(e): 59 | time.sleep(3) 60 | else: 61 | raise e 62 | server.kill() 63 | generated_text = completion.choices[0].message.content 64 | # make sure only the first token is generated 65 | rest = generated_text.replace("", "") 66 | assert rest == "" 67 | -------------------------------------------------------------------------------- /tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "llama", 3 | "kv_cache": { 4 | "dtype": "float8_e4m3fn", 5 | "scaling_factor": { 6 | "0": { 7 | "0": 0.0152239128947258, 8 | "1": 0.0188860222697258, 9 | "2": 0.0354178324341774, 10 | "3": 0.0376674123108387, 11 | "4": 0.0418526791036129, 12 | "5": 0.0433175228536129, 13 | "6": 0.0397600457072258, 14 | "7": 0.0424455925822258, 15 | "8": 0.0415387861430645, 16 | "9": 0.0408412404358387, 17 | "10": 0.0395856611430645, 18 | "11": 0.0377371683716774, 19 | "12": 0.0400739423930645, 20 | "13": 0.040771484375, 21 | "14": 0.0393415205180645, 22 | "15": 0.0369001142680645, 23 | "16": 0.03857421875, 24 | "17": 0.0387486070394516, 25 | "18": 0.0403180830180645, 26 | "19": 0.0396205373108387, 27 | "20": 0.0375627800822258, 28 | "21": 0.0407366082072258, 29 | "22": 0.0432477705180645, 30 | "23": 0.0377022884786129, 31 | "24": 0.0399693101644516, 32 | "25": 0.0374581478536129, 33 | "26": 0.0413295216858387, 34 | "27": 0.0442243330180645, 35 | "28": 0.0424804724752903, 36 | "29": 0.0456891767680645, 37 | "30": 0.0409109964966774, 38 | "31": 0.0482352152466774 39 | } 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /tests/kernels/allclose_default.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | # Reference default values of atol and rtol are from 4 | # https://github.com/pytorch/pytorch/blob/6d96beb6bec24d73ee3f080bac54d2104068f675/test/test_transformers.py#L67 5 | default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5} 6 | default_rtol = { 7 | torch.float16: 1e-3, 8 | torch.bfloat16: 1.6e-2, 9 | torch.float: 1.3e-6 10 | } 11 | 12 | 13 | def get_default_atol(output) -> float: 14 | return default_atol[output.dtype] 15 | 16 | 17 | def get_default_rtol(output) -> float: 18 | return default_rtol[output.dtype] 19 | -------------------------------------------------------------------------------- /tests/kernels/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.utils import (create_kv_caches_with_random, 4 | create_kv_caches_with_random_flash) 5 | 6 | 7 | @pytest.fixture() 8 | def kv_cache_factory(): 9 | return create_kv_caches_with_random 10 | 11 | 12 | @pytest.fixture() 13 | def kv_cache_factory_flashinfer(): 14 | return create_kv_caches_with_random_flash 15 | -------------------------------------------------------------------------------- /tests/kernels/test_layernorm.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from vllm.model_executor.layers.layernorm import RMSNorm 5 | 6 | DTYPES = [torch.half, torch.bfloat16, torch.float] 7 | NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing 8 | HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, 9 | 8199] # Arbitrary values for testing 10 | ADD_RESIDUAL = [False, 
True] 11 | SEEDS = [0] 12 | CUDA_DEVICES = [ 13 | f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) 14 | ] 15 | 16 | 17 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 18 | @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) 19 | @pytest.mark.parametrize("add_residual", ADD_RESIDUAL) 20 | @pytest.mark.parametrize("dtype", DTYPES) 21 | @pytest.mark.parametrize("seed", SEEDS) 22 | @pytest.mark.parametrize("device", CUDA_DEVICES) 23 | @torch.inference_mode() 24 | def test_rms_norm( 25 | num_tokens: int, 26 | hidden_size: int, 27 | add_residual: bool, 28 | dtype: torch.dtype, 29 | seed: int, 30 | device: str, 31 | ) -> None: 32 | torch.random.manual_seed(seed) 33 | if torch.cuda.is_available(): 34 | torch.cuda.manual_seed(seed) 35 | torch.set_default_device(device) 36 | layer = RMSNorm(hidden_size).to(dtype=dtype) 37 | layer.weight.data.normal_(mean=1.0, std=0.1) 38 | scale = 1 / (2 * hidden_size) 39 | x = torch.randn(num_tokens, hidden_size, dtype=dtype) 40 | x *= scale 41 | residual = torch.randn_like(x) * scale if add_residual else None 42 | 43 | # NOTE(woosuk): The reference implementation should be executed first 44 | # because the custom kernel is in-place. 45 | ref_out = layer._forward(x, residual) 46 | out = layer(x, residual) 47 | # NOTE(woosuk): LayerNorm operators (including RMS) typically have larger 48 | # numerical errors than other operators because they involve reductions. 49 | # Therefore, we use a larger tolerance. 50 | if add_residual: 51 | assert torch.allclose(out[0], ref_out[0], atol=1e-2, rtol=1e-2) 52 | assert torch.allclose(out[1], ref_out[1], atol=1e-2, rtol=1e-2) 53 | else: 54 | assert torch.allclose(out, ref_out, atol=1e-2, rtol=1e-2) 55 | -------------------------------------------------------------------------------- /tests/kernels/test_rand.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import pytest 4 | import torch 5 | 6 | from vllm.model_executor.layers.ops.rand import seeded_uniform 7 | from vllm.model_executor.utils import set_random_seed 8 | 9 | 10 | @pytest.mark.parametrize("dtype", 11 | [torch.float32, torch.float16, torch.bfloat16]) 12 | @pytest.mark.parametrize("use_3d", [True, False]) 13 | def test_seeded_uniform(dtype: torch.dtype, use_3d: bool): 14 | device = "cuda" 15 | for seed in range(512): 16 | set_random_seed(seed) 17 | rows = random.randint(1, 512) 18 | cols = random.randint(1, 64000) 19 | if use_3d: 20 | third_dim = random.randint(2, 10) 21 | dims = [rows, third_dim, cols] 22 | else: 23 | dims = [rows, cols] 24 | seeds = torch.randint(torch.iinfo(torch.long).min, 25 | torch.iinfo(torch.long).max, (rows, ), 26 | device=device) 27 | 28 | # Test that the same seed produces the same output 29 | out = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device) 30 | out2 = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device) 31 | torch.testing.assert_close(out, out2) 32 | # del to save memory 33 | del out2 34 | 35 | out3 = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device) 36 | torch.testing.assert_close(out, out3) 37 | # del to save memory 38 | del out3 39 | 40 | # Initialize out tensor with garbage to ensure that it is overwritten 41 | out_with_tensor = seeded_uniform( 42 | *dims, 43 | out=torch.full( 44 | (*dims, ), 45 | -1, 46 | dtype=dtype, 47 | device=device, 48 | ), 49 | seeds=seeds, 50 | dtype=dtype, 51 | ) 52 | torch.testing.assert_close(out, out_with_tensor) 53 | 
-------------------------------------------------------------------------------- /tests/lora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/tests/lora/__init__.py -------------------------------------------------------------------------------- /tests/lora/test_gemma.py: -------------------------------------------------------------------------------- 1 | import vllm 2 | from vllm.lora.request import LoRARequest 3 | 4 | MODEL_PATH = "google/gemma-7b" 5 | 6 | 7 | def do_sample(llm, lora_path: str, lora_id: int) -> str: 8 | prompts = [ 9 | "Quote: Imagination is", 10 | "Quote: Be yourself;", 11 | "Quote: So many books,", 12 | ] 13 | sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32) 14 | outputs = llm.generate( 15 | prompts, 16 | sampling_params, 17 | lora_request=LoRARequest(str(lora_id), lora_id, lora_path) 18 | if lora_id else None) 19 | # Print the outputs. 20 | generated_texts = [] 21 | for output in outputs: 22 | prompt = output.prompt 23 | generated_text = output.outputs[0].text.strip() 24 | generated_texts.append(generated_text) 25 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 26 | return generated_texts 27 | 28 | 29 | def test_gemma_lora(gemma_lora_files): 30 | llm = vllm.LLM(MODEL_PATH, 31 | max_model_len=1024, 32 | enable_lora=True, 33 | max_loras=4) 34 | 35 | expected_lora_output = [ 36 | "more important than knowledge.\nAuthor: Albert Einstein\n", 37 | "everyone else is already taken.\nAuthor: Oscar Wilde\n", 38 | "so little time\nAuthor: Frank Zappa\n", 39 | ] 40 | 41 | output1 = do_sample(llm, gemma_lora_files, lora_id=1) 42 | for i in range(len(expected_lora_output)): 43 | assert output1[i].startswith(expected_lora_output[i]) 44 | output2 = do_sample(llm, gemma_lora_files, lora_id=2) 45 | for i in range(len(expected_lora_output)): 46 | assert output2[i].startswith(expected_lora_output[i]) 47 | -------------------------------------------------------------------------------- /tests/lora/test_tokenizer_group.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from transformers import AutoTokenizer, PreTrainedTokenizerBase 3 | 4 | from vllm.lora.request import LoRARequest 5 | from vllm.transformers_utils.tokenizer import get_lora_tokenizer 6 | from vllm.transformers_utils.tokenizer_group import get_tokenizer_group 7 | 8 | from ..conftest import get_tokenizer_pool_config 9 | 10 | 11 | @pytest.mark.asyncio 12 | @pytest.mark.parametrize("tokenizer_group_type", [None, "ray"]) 13 | async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type): 14 | reference_tokenizer = AutoTokenizer.from_pretrained(sql_lora_files) 15 | tokenizer_group = get_tokenizer_group( 16 | get_tokenizer_pool_config(tokenizer_group_type), 17 | tokenizer_id="gpt2", 18 | enable_lora=True, 19 | max_num_seqs=1, 20 | max_input_length=None, 21 | ) 22 | lora_request = LoRARequest("1", 1, sql_lora_files) 23 | assert reference_tokenizer.encode("prompt") == tokenizer_group.encode( 24 | request_id="request_id", prompt="prompt", lora_request=lora_request) 25 | assert reference_tokenizer.encode( 26 | "prompt") == await tokenizer_group.encode_async( 27 | request_id="request_id", 28 | prompt="prompt", 29 | lora_request=lora_request) 30 | assert isinstance(tokenizer_group.get_lora_tokenizer(None), 31 | PreTrainedTokenizerBase) 32 | assert tokenizer_group.get_lora_tokenizer( 33 | None) 
== await tokenizer_group.get_lora_tokenizer_async(None) 34 | 35 | assert isinstance(tokenizer_group.get_lora_tokenizer(lora_request), 36 | PreTrainedTokenizerBase) 37 | assert tokenizer_group.get_lora_tokenizer( 38 | lora_request) != tokenizer_group.get_lora_tokenizer(None) 39 | assert tokenizer_group.get_lora_tokenizer( 40 | lora_request) == await tokenizer_group.get_lora_tokenizer_async( 41 | lora_request) 42 | 43 | 44 | def test_get_lora_tokenizer(sql_lora_files, tmpdir): 45 | lora_request = None 46 | tokenizer = get_lora_tokenizer(lora_request) 47 | assert not tokenizer 48 | 49 | lora_request = LoRARequest("1", 1, sql_lora_files) 50 | tokenizer = get_lora_tokenizer(lora_request) 51 | assert tokenizer.get_added_vocab() 52 | 53 | lora_request = LoRARequest("1", 1, str(tmpdir)) 54 | tokenizer = get_lora_tokenizer(lora_request) 55 | assert not tokenizer 56 | -------------------------------------------------------------------------------- /tests/model_executor/weight_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | import huggingface_hub.constants 5 | import pytest 6 | from huggingface_hub.utils import LocalEntryNotFoundError 7 | 8 | from vllm.model_executor.model_loader.weight_utils import ( 9 | download_weights_from_hf, enable_hf_transfer) 10 | 11 | 12 | def test_hf_transfer_auto_activation(): 13 | if "HF_HUB_ENABLE_HF_TRANSFER" in os.environ: 14 | # in case it is already set, we can't test the auto activation 15 | pytest.skip( 16 | "HF_HUB_ENABLE_HF_TRANSFER is set, can't test auto activation") 17 | enable_hf_transfer() 18 | try: 19 | # enable hf hub transfer if available 20 | import hf_transfer # type: ignore # noqa 21 | HF_TRANSFER_ACTIVE = True 22 | except ImportError: 23 | HF_TRANSFER_ACTIVE = False 24 | assert (huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER == 25 | HF_TRANSFER_ACTIVE) 26 | 27 | 28 | def test_download_weights_from_hf(): 29 | with tempfile.TemporaryDirectory() as tmpdir: 30 | # assert LocalEntryNotFoundError is thrown 31 | # if offline is set and model is not cached 32 | huggingface_hub.constants.HF_HUB_OFFLINE = True 33 | with pytest.raises(LocalEntryNotFoundError): 34 | download_weights_from_hf("facebook/opt-125m", 35 | allow_patterns=["*.safetensors", "*.bin"], 36 | cache_dir=tmpdir) 37 | 38 | # download the model 39 | huggingface_hub.constants.HF_HUB_OFFLINE = False 40 | download_weights_from_hf("facebook/opt-125m", 41 | allow_patterns=["*.safetensors", "*.bin"], 42 | cache_dir=tmpdir) 43 | 44 | # now it should work offline 45 | huggingface_hub.constants.HF_HUB_OFFLINE = True 46 | assert download_weights_from_hf( 47 | "facebook/opt-125m", 48 | allow_patterns=["*.safetensors", "*.bin"], 49 | cache_dir=tmpdir) is not None 50 | 51 | 52 | if __name__ == "__main__": 53 | test_hf_transfer_auto_activation() 54 | test_download_weights_from_hf() 55 | -------------------------------------------------------------------------------- /tests/models/test_big_models.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and vLLM when using greedy sampling. 2 | 3 | This tests bigger models and uses half precision. 4 | 5 | Run `pytest tests/models/test_big_models.py`.
6 | """ 7 | import pytest 8 | 9 | MODELS = [ 10 | "meta-llama/Llama-2-7b-hf", 11 | # "mistralai/Mistral-7B-v0.1", # Broken 12 | # "Deci/DeciLM-7b", # Broken 13 | # "tiiuae/falcon-7b", # Broken 14 | "EleutherAI/gpt-j-6b", 15 | "mosaicml/mpt-7b", 16 | # "Qwen/Qwen1.5-0.5B" # Broken, 17 | ] 18 | 19 | 20 | @pytest.mark.parametrize("model", MODELS) 21 | @pytest.mark.parametrize("dtype", ["half"]) 22 | @pytest.mark.parametrize("max_tokens", [32]) 23 | def test_models( 24 | hf_runner, 25 | vllm_runner, 26 | example_prompts, 27 | model: str, 28 | dtype: str, 29 | max_tokens: int, 30 | ) -> None: 31 | hf_model = hf_runner(model, dtype=dtype) 32 | hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) 33 | del hf_model 34 | 35 | vllm_model = vllm_runner(model, dtype=dtype) 36 | vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) 37 | del vllm_model 38 | 39 | for i in range(len(example_prompts)): 40 | hf_output_ids, hf_output_str = hf_outputs[i] 41 | vllm_output_ids, vllm_output_str = vllm_outputs[i] 42 | assert hf_output_str == vllm_output_str, ( 43 | f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") 44 | assert hf_output_ids == vllm_output_ids, ( 45 | f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") 46 | 47 | 48 | @pytest.mark.parametrize("model", MODELS) 49 | @pytest.mark.parametrize("dtype", ["half"]) 50 | def test_model_print( 51 | vllm_runner, 52 | model: str, 53 | dtype: str, 54 | ) -> None: 55 | vllm_model = vllm_runner(model, dtype=dtype) 56 | # This test is for verifying whether the model's extra_repr 57 | # can be printed correctly. 58 | print(vllm_model.model.llm_engine.model_executor.driver_worker. 59 | model_runner.model) 60 | del vllm_model 61 | -------------------------------------------------------------------------------- /tests/models/test_mistral.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and vLLM for Mistral models using greedy sampling. 2 | 3 | Run `pytest tests/models/test_mistral.py`. 4 | """ 5 | import pytest 6 | 7 | MODELS = [ 8 | "mistralai/Mistral-7B-Instruct-v0.1", 9 | ] 10 | 11 | 12 | @pytest.mark.parametrize("model", MODELS) 13 | @pytest.mark.parametrize("dtype", ["bfloat16"]) 14 | @pytest.mark.parametrize("max_tokens", [128]) 15 | @pytest.mark.skip( 16 | "Two problems: 1. Failing correctness tests. 2. 
RuntimeError: expected " 17 | "scalar type BFloat16 but found Half (only in CI).") 18 | def test_models( 19 | hf_runner, 20 | vllm_runner, 21 | example_long_prompts, 22 | model: str, 23 | dtype: str, 24 | max_tokens: int, 25 | ) -> None: 26 | hf_model = hf_runner(model, dtype=dtype) 27 | hf_outputs = hf_model.generate_greedy(example_long_prompts, max_tokens) 28 | del hf_model 29 | 30 | vllm_model = vllm_runner(model, dtype=dtype) 31 | vllm_outputs = vllm_model.generate_greedy(example_long_prompts, max_tokens) 32 | del vllm_model 33 | 34 | for i in range(len(example_long_prompts)): 35 | hf_output_ids, hf_output_str = hf_outputs[i] 36 | vllm_output_ids, vllm_output_str = vllm_outputs[i] 37 | assert hf_output_str == vllm_output_str, ( 38 | f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") 39 | assert hf_output_ids == vllm_output_ids, ( 40 | f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") 41 | -------------------------------------------------------------------------------- /tests/models/test_models.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and vLLM when using greedy sampling. 2 | 3 | This test only tests small models. Big models such as 7B should be tested from 4 | test_big_models.py because it could use a larger instance to run tests. 5 | 6 | Run `pytest tests/models/test_models.py`. 7 | """ 8 | import pytest 9 | 10 | MODELS = [ 11 | "facebook/opt-125m", 12 | "gpt2", 13 | "bigcode/tiny_starcoder_py", 14 | "EleutherAI/pythia-70m", 15 | "bigscience/bloom-560m", # Testing alibi slopes. 16 | "microsoft/phi-2", 17 | "stabilityai/stablelm-3b-4e1t", 18 | # "allenai/OLMo-1B", # Broken 19 | "bigcode/starcoder2-3b", 20 | ] 21 | 22 | 23 | @pytest.mark.parametrize("model", MODELS) 24 | @pytest.mark.parametrize("dtype", ["float"]) 25 | @pytest.mark.parametrize("max_tokens", [96]) 26 | def test_models( 27 | hf_runner, 28 | vllm_runner, 29 | example_prompts, 30 | model: str, 31 | dtype: str, 32 | max_tokens: int, 33 | ) -> None: 34 | # To pass the small model tests, we need full precision. 35 | assert dtype == "float" 36 | 37 | hf_model = hf_runner(model, dtype=dtype) 38 | hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) 39 | del hf_model 40 | 41 | vllm_model = vllm_runner(model, dtype=dtype) 42 | vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) 43 | del vllm_model 44 | 45 | for i in range(len(example_prompts)): 46 | hf_output_ids, hf_output_str = hf_outputs[i] 47 | vllm_output_ids, vllm_output_str = vllm_outputs[i] 48 | assert hf_output_str == vllm_output_str, ( 49 | f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") 50 | assert hf_output_ids == vllm_output_ids, ( 51 | f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") 52 | 53 | 54 | @pytest.mark.parametrize("model", MODELS) 55 | @pytest.mark.parametrize("dtype", ["float"]) 56 | def test_model_print( 57 | vllm_runner, 58 | model: str, 59 | dtype: str, 60 | ) -> None: 61 | vllm_model = vllm_runner(model, dtype=dtype) 62 | # This test is for verifying whether the model's extra_repr 63 | # can be printed correctly. 64 | print(vllm_model.model.llm_engine.model_executor.driver_worker. 
65 | model_runner.model) 66 | del vllm_model 67 | -------------------------------------------------------------------------------- /tests/models/test_oot_registration.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from vllm import LLM, ModelRegistry, SamplingParams 4 | from vllm.model_executor.models.opt import OPTForCausalLM 5 | from vllm.model_executor.sampling_metadata import SamplingMetadata 6 | 7 | 8 | class MyOPTForCausalLM(OPTForCausalLM): 9 | 10 | def compute_logits(self, hidden_states: torch.Tensor, 11 | sampling_metadata: SamplingMetadata) -> torch.Tensor: 12 | # this dummy model always predicts the first token 13 | logits = super().compute_logits(hidden_states, sampling_metadata) 14 | logits.zero_() 15 | logits[:, 0] += 1.0 16 | return logits 17 | 18 | 19 | def test_oot_registration(): 20 | # register our dummy model 21 | ModelRegistry.register_model("OPTForCausalLM", MyOPTForCausalLM) 22 | prompts = ["Hello, my name is", "The text does not matter"] 23 | sampling_params = SamplingParams(temperature=0) 24 | llm = LLM(model="facebook/opt-125m") 25 | first_token = llm.get_tokenizer().decode(0) 26 | outputs = llm.generate(prompts, sampling_params) 27 | 28 | for output in outputs: 29 | generated_text = output.outputs[0].text 30 | # make sure only the first token is generated 31 | rest = generated_text.replace(first_token, "") 32 | assert rest == "" 33 | -------------------------------------------------------------------------------- /tests/models/utils.py: -------------------------------------------------------------------------------- 1 | def check_logprobs_close(outputs_0_lst, outputs_1_lst, name_0, name_1): 2 | """Compare the logprobs of two sequences generated by different models, 3 | which should be similar but not necessarily equal. 4 | """ 5 | # Loop through responses to each prompt. 6 | for prompt_idx, (outputs_0, 7 | outputs_1) in enumerate(zip(outputs_0_lst, 8 | outputs_1_lst)): 9 | output_ids_0, output_str_0, logprobs_0 = outputs_0 10 | output_ids_1, output_str_1, logprobs_1 = outputs_1 11 | 12 | # Loop through generated tokens. 13 | for idx, (output_id_0, 14 | output_id_1) in enumerate(zip(output_ids_0, output_ids_1)): 15 | 16 | # If generated tokens don't match, then 17 | if output_id_0 != output_id_1: 18 | # Each predicted token must be in top N logprobs of the other 19 | assert output_id_0 in logprobs_1[idx], ( 20 | f"Test{prompt_idx}:" 21 | f"\n{name_0}:\t{output_str_0!r}" 22 | f"\n{name_1}:\t{output_str_1!r}") 23 | assert output_id_1 in logprobs_0[idx], ( 24 | f"Test{prompt_idx}:" 25 | f"\n{name_0}:\t{output_str_0!r}" 26 | f"\n{name_1}:\t{output_str_1!r}") 27 | 28 | # Break out since sequences will now diverge. 29 | break 30 | -------------------------------------------------------------------------------- /tests/prompts/example.txt: -------------------------------------------------------------------------------- 1 | vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. 2 | Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020. 3 | Compare and contrast artificial intelligence with human intelligence in terms of processing information. 4 | Describe the basic components of a neural network and how it can be trained. 5 | Write a short story about a robot that dreams for the first time. 6 | Analyze the impact of the COVID-19 pandemic on global economic structures and future business models. 
7 | Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies. 8 | Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.' 9 | -------------------------------------------------------------------------------- /tests/quantization/test_fp8.py: -------------------------------------------------------------------------------- 1 | """Tests whether FP8 computation is enabled correctly. 2 | 3 | Run `pytest tests/quantization/test_fp8.py --forked`. 4 | """ 5 | import pytest 6 | import torch 7 | 8 | from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS 9 | from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod 10 | 11 | capability = torch.cuda.get_device_capability() 12 | capability = capability[0] * 10 + capability[1] 13 | 14 | 15 | @pytest.mark.skipif( 16 | capability < QUANTIZATION_METHODS["fp8"].get_min_capability(), 17 | reason="FP8 is not supported on this GPU type.") 18 | def test_load_fp16_model(vllm_runner) -> None: 19 | llm = vllm_runner("facebook/opt-125m", quantization="fp8") 20 | 21 | model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model 22 | fc1 = model.model.decoder.layers[0].fc1 23 | assert isinstance(fc1.quant_method, Fp8LinearMethod) 24 | assert fc1.weight.dtype == torch.float8_e4m3fn 25 | -------------------------------------------------------------------------------- /tests/samplers/test_beam_search.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and vLLM when using beam search. 2 | 3 | Run `pytest tests/samplers/test_beam_search.py`. 4 | """ 5 | import gc 6 | 7 | import pytest 8 | import torch 9 | 10 | # FIXME(zhuohan): The test can not pass if we: 11 | # 1. Increase max_tokens to 256. 12 | # 2. Increase beam_width to 8. 13 | # 3. Use the model "huggyllama/llama-7b". 14 | MAX_TOKENS = [128] 15 | BEAM_WIDTHS = [4] 16 | MODELS = ["facebook/opt-125m"] 17 | 18 | 19 | @pytest.mark.parametrize("model", MODELS) 20 | @pytest.mark.parametrize("dtype", ["half"]) 21 | @pytest.mark.parametrize("max_tokens", MAX_TOKENS) 22 | @pytest.mark.parametrize("beam_width", BEAM_WIDTHS) 23 | def test_beam_search_single_input( 24 | hf_runner, 25 | vllm_runner, 26 | example_prompts, 27 | model: str, 28 | dtype: str, 29 | max_tokens: int, 30 | beam_width: int, 31 | ) -> None: 32 | example_prompts = example_prompts[:1] 33 | hf_model = hf_runner(model, dtype=dtype) 34 | hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width, 35 | max_tokens) 36 | del hf_model 37 | 38 | vllm_model = vllm_runner(model, dtype=dtype) 39 | vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width, 40 | max_tokens) 41 | del vllm_model 42 | # NOTE(woosuk): For some reason, the following GC is required to avoid 43 | # GPU OOM errors in the following tests using `vllm_runner`. 
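    # Deleting the runners only drops the Python references; gc.collect()
    # breaks any remaining reference cycles so the model tensors are actually
    # freed, and torch.cuda.empty_cache() then returns the freed blocks from
    # PyTorch's caching allocator to the device.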
44 | gc.collect() 45 | torch.cuda.empty_cache() 46 | 47 | for i in range(len(example_prompts)): 48 | hf_output_ids, _ = hf_outputs[i] 49 | vllm_output_ids, _ = vllm_outputs[i] 50 | assert len(hf_output_ids) == len(vllm_output_ids) 51 | for j in range(len(hf_output_ids)): 52 | assert hf_output_ids[j] == vllm_output_ids[j], ( 53 | f"Test{i} output{j}:\nHF: {hf_output_ids}\n" 54 | f"vLLM: {vllm_output_ids}") 55 | -------------------------------------------------------------------------------- /tests/samplers/test_ignore_eos.py: -------------------------------------------------------------------------------- 1 | """Make sure ignore_eos works. 2 | 3 | Run `pytest tests/samplers/test_ignore_eos.py`. 4 | """ 5 | 6 | import pytest 7 | 8 | from vllm import SamplingParams 9 | 10 | MODELS = ["facebook/opt-125m"] 11 | 12 | 13 | @pytest.mark.parametrize("model", MODELS) 14 | @pytest.mark.parametrize("dtype", ["half"]) 15 | @pytest.mark.parametrize("max_tokens", [1024]) 16 | def test_beam_search_single_input( 17 | vllm_runner, 18 | example_prompts, 19 | model: str, 20 | dtype: str, 21 | max_tokens: int, 22 | ) -> None: 23 | example_prompts = "1 + 1 is" 24 | 25 | vllm_model = vllm_runner(model, dtype=dtype) 26 | sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True) 27 | ignore_eos_output = vllm_model.model.generate( 28 | example_prompts, sampling_params=sampling_params) 29 | print(len(ignore_eos_output[0].outputs[0].token_ids)) 30 | assert max_tokens - len(ignore_eos_output[0].outputs[0].token_ids) < 10 31 | assert max_tokens - len(ignore_eos_output[0].outputs[0].token_ids) >= 0 32 | -------------------------------------------------------------------------------- /tests/samplers/test_logits_processor.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from vllm import SamplingParams 5 | 6 | MODELS = ["facebook/opt-125m"] 7 | 8 | 9 | @pytest.mark.parametrize("model", MODELS) 10 | @pytest.mark.parametrize("dtype", ["half"]) 11 | def test_logits_processor_force_generate( 12 | vllm_runner, 13 | example_prompts, 14 | model: str, 15 | dtype: str, 16 | ) -> None: 17 | vllm_model = vllm_runner(model, dtype=dtype) 18 | tokenizer = vllm_model.model.get_tokenizer() 19 | repeat_times = 2 20 | enforced_answers = " vLLM" 21 | vllm_token_ids = tokenizer.encode(enforced_answers, 22 | add_special_tokens=False) 23 | max_tokens = len(vllm_token_ids) * repeat_times 24 | 25 | def pick_vllm(token_ids, logits): 26 | token_id = vllm_token_ids[len(token_ids) % len(vllm_token_ids)] 27 | logits[token_id] = torch.finfo(logits.dtype).max 28 | return logits 29 | 30 | params_with_logprobs = SamplingParams( 31 | logits_processors=[pick_vllm], 32 | prompt_logprobs=3, 33 | max_tokens=max_tokens, 34 | ) 35 | 36 | # test logits_processors when prompt_logprobs is not None 37 | vllm_model.model._add_request( 38 | prompt=example_prompts[0], 39 | sampling_params=params_with_logprobs, 40 | prompt_token_ids=None, 41 | ) 42 | 43 | # test prompt_logprobs is not None 44 | vllm_model.model._add_request( 45 | prompt=example_prompts[1], 46 | sampling_params=SamplingParams( 47 | prompt_logprobs=3, 48 | max_tokens=max_tokens, 49 | ), 50 | prompt_token_ids=None, 51 | ) 52 | 53 | # test grouped requests 54 | vllm_model.model._add_request( 55 | prompt=example_prompts[2], 56 | sampling_params=SamplingParams(max_tokens=max_tokens), 57 | prompt_token_ids=None, 58 | ) 59 | 60 | outputs = vllm_model.model._run_engine(False) 61 | 62 | assert outputs[0].outputs[0].text 
== enforced_answers * repeat_times 63 | -------------------------------------------------------------------------------- /tests/samplers/test_ranks.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm import SamplingParams 4 | 5 | MODELS = ["facebook/opt-125m"] 6 | 7 | 8 | @pytest.mark.parametrize("model", MODELS) 9 | @pytest.mark.parametrize("dtype", ["half"]) 10 | def test_ranks( 11 | vllm_runner, 12 | model, 13 | dtype, 14 | example_prompts, 15 | ): 16 | max_tokens = 5 17 | num_top_logprobs = 5 18 | num_prompt_logprobs = 5 19 | 20 | vllm_model = vllm_runner(model, dtype=dtype, max_logprobs=num_top_logprobs) 21 | 22 | ## Test greedy logprobs ranks 23 | vllm_sampling_params = SamplingParams(temperature=0.0, 24 | top_p=1.0, 25 | max_tokens=max_tokens, 26 | logprobs=num_top_logprobs, 27 | prompt_logprobs=num_prompt_logprobs) 28 | vllm_results = vllm_model.generate_w_logprobs(example_prompts, 29 | vllm_sampling_params) 30 | for result in vllm_results: 31 | assert result[2] is not None 32 | assert len(result[2]) == len(result[0]) 33 | # check whether all chosen tokens have ranks = 1 34 | for token, logprobs in zip(result[0], result[2]): 35 | assert token in logprobs 36 | assert logprobs[token].rank == 1 37 | 38 | ## Test non-greedy logprobs ranks 39 | sampling_params = SamplingParams(temperature=1.0, 40 | top_p=1.0, 41 | max_tokens=max_tokens, 42 | logprobs=num_top_logprobs, 43 | prompt_logprobs=num_prompt_logprobs) 44 | res = vllm_model.generate_w_logprobs(example_prompts, sampling_params) 45 | for result in res: 46 | assert result[2] is not None 47 | assert len(result[2]) == len(result[0]) 48 | # check whether all chosen tokens have ranks 49 | for token, logprobs in zip(result[0], result[2]): 50 | assert logprobs[token].rank >= 1 51 | -------------------------------------------------------------------------------- /tests/spec_decode/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/tests/spec_decode/__init__.py -------------------------------------------------------------------------------- /tests/spec_decode/e2e/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/tests/spec_decode/e2e/__init__.py -------------------------------------------------------------------------------- /tests/tensorizer_loader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/tests/tensorizer_loader/__init__.py -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | from vllm.config import ModelConfig 2 | 3 | 4 | def test_get_sliding_window(): 5 | TEST_SLIDING_WINDOW = 4096 6 | # Test that the sliding window is correctly computed. 7 | # For Qwen1.5/Qwen2, get_sliding_window() should be None 8 | # when use_sliding_window is False. 
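    # Build a ModelConfig from the Qwen1.5-7B HF config, then toggle the
    # hf_config sliding-window fields directly to exercise both branches of
    # get_sliding_window().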
9 | qwen2_model_config = ModelConfig( 10 | "Qwen/Qwen1.5-7B", 11 | "Qwen/Qwen1.5-7B", 12 | tokenizer_mode="auto", 13 | trust_remote_code=False, 14 | seed=0, 15 | dtype="float16", 16 | revision=None, 17 | ) 18 | 19 | qwen2_model_config.hf_config.use_sliding_window = False 20 | qwen2_model_config.hf_config.sliding_window = TEST_SLIDING_WINDOW 21 | assert qwen2_model_config.get_sliding_window() is None 22 | 23 | qwen2_model_config.hf_config.use_sliding_window = True 24 | assert qwen2_model_config.get_sliding_window() == TEST_SLIDING_WINDOW 25 | 26 | mistral_model_config = ModelConfig( 27 | "mistralai/Mistral-7B-v0.1", 28 | "mistralai/Mistral-7B-v0.1", 29 | tokenizer_mode="auto", 30 | trust_remote_code=False, 31 | seed=0, 32 | dtype="float16", 33 | revision=None, 34 | ) 35 | mistral_model_config.hf_config.sliding_window = None 36 | assert mistral_model_config.get_sliding_window() is None 37 | 38 | mistral_model_config.hf_config.sliding_window = TEST_SLIDING_WINDOW 39 | assert mistral_model_config.get_sliding_window() == TEST_SLIDING_WINDOW -------------------------------------------------------------------------------- /tests/test_regression.py: -------------------------------------------------------------------------------- 1 | """Containing tests that check for regressions in vLLM's behavior. 2 | 3 | It should include tests that are reported by users and making sure they 4 | will never happen again. 5 | 6 | """ 7 | import gc 8 | 9 | import torch 10 | 11 | from vllm import LLM, SamplingParams 12 | 13 | 14 | def test_duplicated_ignored_sequence_group(): 15 | """https://github.com/vllm-project/vllm/issues/1655""" 16 | 17 | sampling_params = SamplingParams(temperature=0.01, 18 | top_p=0.1, 19 | max_tokens=256) 20 | llm = LLM(model="facebook/opt-125m", 21 | max_num_batched_tokens=4096, 22 | tensor_parallel_size=1) 23 | prompts = ["This is a short prompt", "This is a very long prompt " * 1000] 24 | outputs = llm.generate(prompts, sampling_params=sampling_params) 25 | 26 | assert len(prompts) == len(outputs) 27 | 28 | 29 | def test_max_tokens_none(): 30 | sampling_params = SamplingParams(temperature=0.01, 31 | top_p=0.1, 32 | max_tokens=None) 33 | llm = LLM(model="facebook/opt-125m", 34 | max_num_batched_tokens=4096, 35 | tensor_parallel_size=1) 36 | prompts = ["Just say hello!"] 37 | outputs = llm.generate(prompts, sampling_params=sampling_params) 38 | 39 | assert len(prompts) == len(outputs) 40 | 41 | 42 | def test_gc(): 43 | llm = LLM("facebook/opt-125m", enforce_eager=True) 44 | del llm 45 | 46 | gc.collect() 47 | torch.cuda.empty_cache() 48 | 49 | # The memory allocated for model and KV cache should be released. 50 | # The memory allocated for PyTorch and others should be less than 50MB. 51 | # Usually, it's around 10MB. 52 | allocated = torch.cuda.memory_allocated() 53 | assert allocated < 50 * 1024 * 1024 54 | 55 | 56 | if __name__ == "__main__": 57 | import pytest 58 | pytest.main([__file__]) 59 | -------------------------------------------------------------------------------- /tests/test_sampling_params.py: -------------------------------------------------------------------------------- 1 | """Tests for the SamplingParams class. 
2 | """ 3 | from vllm import SamplingParams 4 | 5 | 6 | def test_max_tokens_none(): 7 | """max_tokens=None should be allowed""" 8 | SamplingParams(temperature=0.01, top_p=0.1, max_tokens=None) 9 | 10 | 11 | if __name__ == "__main__": 12 | import pytest 13 | pytest.main([__file__]) 14 | -------------------------------------------------------------------------------- /tests/tokenization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/tests/tokenization/__init__.py -------------------------------------------------------------------------------- /tests/tokenization/test_cached_tokenizer.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | from transformers import AutoTokenizer 4 | 5 | from vllm.transformers_utils.tokenizer import get_cached_tokenizer 6 | 7 | 8 | def test_cached_tokenizer(): 9 | reference_tokenizer = AutoTokenizer.from_pretrained("gpt2") 10 | reference_tokenizer.add_special_tokens({"cls_token": ""}) 11 | reference_tokenizer.add_special_tokens( 12 | {"additional_special_tokens": [""]}) 13 | cached_tokenizer = get_cached_tokenizer(deepcopy(reference_tokenizer)) 14 | 15 | assert reference_tokenizer.encode("prompt") == cached_tokenizer.encode( 16 | "prompt") 17 | assert set(reference_tokenizer.all_special_ids) == set( 18 | cached_tokenizer.all_special_ids) 19 | assert set(reference_tokenizer.all_special_tokens) == set( 20 | cached_tokenizer.all_special_tokens) 21 | assert set(reference_tokenizer.all_special_tokens_extended) == set( 22 | cached_tokenizer.all_special_tokens_extended) 23 | -------------------------------------------------------------------------------- /tests/tokenization/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from transformers import PreTrainedTokenizerBase 3 | 4 | from vllm.transformers_utils.tokenizer import get_tokenizer 5 | 6 | TOKENIZER_NAMES = [ 7 | "facebook/opt-125m", 8 | "gpt2", 9 | ] 10 | 11 | 12 | @pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES) 13 | def test_tokenizer_revision(tokenizer_name: str): 14 | # Assume that "main" branch always exists 15 | tokenizer = get_tokenizer(tokenizer_name, revision="main") 16 | assert isinstance(tokenizer, PreTrainedTokenizerBase) 17 | 18 | # Assume that "never" branch always does not exist 19 | with pytest.raises(OSError, match='not a valid git identifier'): 20 | get_tokenizer(tokenizer_name, revision="never") 21 | -------------------------------------------------------------------------------- /tests/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/tests/worker/__init__.py -------------------------------------------------------------------------------- /vllm/__init__.py: -------------------------------------------------------------------------------- 1 | """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" 2 | 3 | from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs 4 | from vllm.engine.async_llm_engine import AsyncLLMEngine 5 | from vllm.engine.llm_engine import LLMEngine 6 | from vllm.entrypoints.llm import LLM 7 | from vllm.executor.ray_utils import initialize_ray_cluster 8 | from vllm.model_executor.models import ModelRegistry 9 | 
from vllm.outputs import CompletionOutput, RequestOutput 10 | from vllm.sampling_params import SamplingParams 11 | 12 | __version__ = "0.4.2" 13 | 14 | __all__ = [ 15 | "LLM", 16 | "ModelRegistry", 17 | "SamplingParams", 18 | "RequestOutput", 19 | "CompletionOutput", 20 | "LLMEngine", 21 | "EngineArgs", 22 | "AsyncLLMEngine", 23 | "AsyncEngineArgs", 24 | "initialize_ray_cluster", 25 | ] 26 | -------------------------------------------------------------------------------- /vllm/attention/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.attention.backends.abstract import (AttentionBackend, 2 | AttentionMetadata, 3 | AttentionMetadataPerStage) 4 | from vllm.attention.layer import Attention 5 | from vllm.attention.selector import get_attn_backend 6 | 7 | __all__ = [ 8 | "AttentionBackend", 9 | "AttentionMetadata", 10 | "Attention", 11 | "get_attn_backend", 12 | "AttentionMetadataPerStage", 13 | ] 14 | -------------------------------------------------------------------------------- /vllm/attention/backends/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/attention/backends/__init__.py -------------------------------------------------------------------------------- /vllm/attention/layer.py: -------------------------------------------------------------------------------- 1 | """Attention layer.""" 2 | from typing import List, Optional 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from vllm.attention.backends.abstract import (AttentionMetadata, 8 | AttentionMetadataPerStage) 9 | from vllm.attention.selector import get_attn_backend 10 | 11 | 12 | class Attention(nn.Module): 13 | """Attention layer. 14 | 15 | This class takes query, key, and value tensors as input. The input tensors 16 | can either contain prompt tokens or generation tokens. 17 | The class does the following: 18 | 19 | 1. Store the input key and value tensors in the KV cache. 20 | 2. Perform (multi-head/multi-query/grouped-query) attention. 21 | 3. Return the output tensor. 
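
    Illustrative construction (the head count and head size below are
    placeholders; real models derive them from their config):

        attn = Attention(num_heads=8, head_size=64, scale=64**-0.5)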
22 | """ 23 | 24 | def __init__( 25 | self, 26 | num_heads: int, 27 | head_size: int, 28 | scale: float, 29 | num_kv_heads: Optional[int] = None, 30 | alibi_slopes: Optional[List[float]] = None, 31 | sliding_window: Optional[int] = None, 32 | ) -> None: 33 | super().__init__() 34 | self.backend = get_attn_backend(torch.get_default_dtype()) 35 | impl_cls = self.backend.get_impl_cls() 36 | self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads, 37 | alibi_slopes, sliding_window) 38 | 39 | def forward( 40 | self, 41 | query: torch.Tensor, 42 | key: torch.Tensor, 43 | value: torch.Tensor, 44 | kv_cache: Optional[torch.Tensor], 45 | attn_metadata: AttentionMetadata[AttentionMetadataPerStage], 46 | kv_scale: float = 1.0, 47 | ) -> torch.Tensor: 48 | return self.impl.forward(query, key, value, kv_cache, attn_metadata, 49 | kv_scale) 50 | 51 | def extra_repr(self) -> str: 52 | s = f"head_size={self.impl.head_size}" # type: ignore 53 | s += f", num_heads={self.impl.num_heads}" # type: ignore 54 | s += f", num_kv_heads={self.impl.num_kv_heads}" # type: ignore 55 | s += f", scale={self.impl.scale}" # type: ignore 56 | return s 57 | -------------------------------------------------------------------------------- /vllm/attention/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/attention/ops/__init__.py -------------------------------------------------------------------------------- /vllm/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/core/__init__.py -------------------------------------------------------------------------------- /vllm/core/block/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/core/block/__init__.py -------------------------------------------------------------------------------- /vllm/core/policy.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | from typing import Deque 3 | 4 | from vllm.sequence import SequenceGroup 5 | 6 | 7 | class Policy: 8 | 9 | def get_priority( 10 | self, 11 | now: float, 12 | seq_group: SequenceGroup, 13 | ) -> float: 14 | raise NotImplementedError 15 | 16 | def sort_by_priority( 17 | self, 18 | now: float, 19 | seq_groups: Deque[SequenceGroup], 20 | ) -> Deque[SequenceGroup]: 21 | return deque( 22 | sorted( 23 | seq_groups, 24 | key=lambda seq_group: self.get_priority(now, seq_group), 25 | reverse=True, 26 | )) 27 | 28 | 29 | class FCFS(Policy): 30 | 31 | def get_priority( 32 | self, 33 | now: float, 34 | seq_group: SequenceGroup, 35 | ) -> float: 36 | return now - seq_group.metrics.arrival_time 37 | 38 | 39 | class PolicyFactory: 40 | 41 | _POLICY_REGISTRY = {'fcfs': FCFS} 42 | 43 | @classmethod 44 | def get_policy(cls, policy_name: str, **kwargs) -> Policy: 45 | return cls._POLICY_REGISTRY[policy_name](**kwargs) 46 | -------------------------------------------------------------------------------- /vllm/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | from .communication_op import * 2 | from .parallel_state import * 3 | from .utils import * 4 | 
-------------------------------------------------------------------------------- /vllm/distributed/device_communicators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/distributed/device_communicators/__init__.py -------------------------------------------------------------------------------- /vllm/distributed/device_communicators/pymccl_utils.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | from typing import Optional 3 | 4 | import torch 5 | from torch.distributed import ProcessGroup, ReduceOp 6 | 7 | from vllm.logger import init_logger 8 | 9 | logger = init_logger(__name__) 10 | 11 | try: 12 | from vllm.distributed.device_communicators.pymccl import (MCCLCommunicator, 13 | mcclGetVersion) 14 | except Exception as e: 15 | # in non-MTHREADS environments, we can't import the mccl module 16 | # e.g. when running on machines with AMD GPUs 17 | logger.info("Failed to import MCCL library: %s", e) 18 | logger.info("It is expected if you are not running on Mthreads GPUs.") 19 | pass 20 | 21 | comm: Optional["MCCLCommunicator"] = None 22 | 23 | 24 | def is_initialized() -> bool: 25 | """Returns whether the NCCL backend is initialized.""" 26 | return comm is not None 27 | 28 | 29 | @contextlib.contextmanager 30 | def set_pymccl_stream(stream: torch.cuda.Stream): 31 | """Set the cuda stream for communication""" 32 | try: 33 | assert comm is not None 34 | comm.stream = stream 35 | yield 36 | finally: 37 | pass 38 | 39 | 40 | def init_process_group(group: Optional[ProcessGroup] = None) -> None: 41 | assert not is_initialized() 42 | global comm 43 | logger.info("vLLM is using nccl==%s", mcclGetVersion()) 44 | comm = MCCLCommunicator(group=group) 45 | 46 | 47 | def all_reduce(input_: torch.Tensor, op=ReduceOp.SUM) -> None: 48 | """All-reduces the input tensor across the process group.""" 49 | assert input_.is_musa, f"{input_} should be a musa tensor" 50 | assert comm is not None 51 | comm.all_reduce(input_, op) 52 | 53 | 54 | def destroy_process_group() -> None: 55 | global comm 56 | comm = None 57 | 58 | 59 | def get_world_size() -> int: 60 | """Returns the world size.""" 61 | assert comm is not None 62 | return comm.world_size 63 | 64 | 65 | def get_nccl_backend() -> Optional["MCCLCommunicator"]: 66 | return comm 67 | -------------------------------------------------------------------------------- /vllm/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/engine/__init__.py -------------------------------------------------------------------------------- /vllm/engine/output_processor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/engine/output_processor/__init__.py -------------------------------------------------------------------------------- /vllm/engine/output_processor/util.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from vllm.sequence import SamplerOutput, SequenceGroupOutput 4 | 5 | 6 | def create_output_by_sequence_group( 7 | sampler_outputs: List[SamplerOutput], 8 | num_seq_groups: int) -> 
List[List[SequenceGroupOutput]]: 9 | """Helper method which transforms a 2d list organized by 10 | [step][sequence group] into [sequence group][step]. 11 | """ 12 | output_by_sequence_group: List[List[SamplerOutput]] = [ 13 | [] for _ in range(num_seq_groups) 14 | ] 15 | for step in sampler_outputs: 16 | for i, sequence_group_output in enumerate(step): 17 | output_by_sequence_group[i].append(sequence_group_output) 18 | 19 | return output_by_sequence_group 20 | -------------------------------------------------------------------------------- /vllm/entrypoints/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/entrypoints/__init__.py -------------------------------------------------------------------------------- /vllm/entrypoints/openai/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/entrypoints/openai/__init__.py -------------------------------------------------------------------------------- /vllm/executor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/executor/__init__.py -------------------------------------------------------------------------------- /vllm/logging/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.logging.formatter import NewLineFormatter 2 | 3 | __all__ = [ 4 | "NewLineFormatter", 5 | ] 6 | -------------------------------------------------------------------------------- /vllm/logging/formatter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | class NewLineFormatter(logging.Formatter): 5 | """Adds logging prefix to newlines to align multi-line messages.""" 6 | 7 | def __init__(self, fmt, datefmt=None, style="%"): 8 | logging.Formatter.__init__(self, fmt, datefmt, style) 9 | 10 | def format(self, record): 11 | msg = logging.Formatter.format(self, record) 12 | if record.message != "": 13 | parts = msg.split(record.message) 14 | msg = msg.replace("\n", "\n" + parts[0]) 15 | return msg 16 | -------------------------------------------------------------------------------- /vllm/lora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/lora/__init__.py -------------------------------------------------------------------------------- /vllm/lora/request.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class LoRARequest: 6 | """ 7 | Request for a LoRA adapter. 8 | 9 | Note that this class should be used internally. For online 10 | serving, it is recommended to not allow users to use this class but 11 | instead provide another layer of abstraction to prevent users from 12 | accessing unauthorized LoRA adapters. 13 | 14 | lora_int_id must be globally unique for a given adapter. 15 | This is currently not enforced in vLLM.
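
    Illustrative usage, mirroring tests/lora/test_gemma.py earlier in this
    tree (the model name and adapter path are placeholders):

        from vllm import LLM, SamplingParams
        from vllm.lora.request import LoRARequest

        llm = LLM("meta-llama/Llama-2-7b-hf", enable_lora=True)
        outputs = llm.generate(
            ["Hello, my name is"],
            SamplingParams(temperature=0, max_tokens=32),
            lora_request=LoRARequest("my-adapter", 1, "/path/to/adapter"))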
16 | """ 17 | 18 | lora_name: str 19 | lora_int_id: int 20 | lora_local_path: str 21 | 22 | def __post_init__(self): 23 | if self.lora_int_id < 1: 24 | raise ValueError( 25 | f"lora_int_id must be > 0, got {self.lora_int_id}") 26 | 27 | def __eq__(self, value: object) -> bool: 28 | return isinstance( 29 | value, LoRARequest) and self.lora_int_id == value.lora_int_id 30 | 31 | def __hash__(self) -> int: 32 | return self.lora_int_id 33 | -------------------------------------------------------------------------------- /vllm/model_executor/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.sampling_metadata import SamplingMetadata 2 | from vllm.model_executor.utils import set_random_seed 3 | 4 | __all__ = [ 5 | "SamplingMetadata", 6 | "set_random_seed", 7 | ] 8 | -------------------------------------------------------------------------------- /vllm/model_executor/guided_decoding/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Union 2 | 3 | from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, 4 | CompletionRequest) 5 | from vllm.model_executor.guided_decoding.lm_format_enforcer_decoding import ( 6 | get_lm_format_enforcer_guided_decoding_logits_processor) 7 | from vllm.model_executor.guided_decoding.outlines_decoding import ( 8 | get_outlines_guided_decoding_logits_processor) 9 | from vllm.sampling_params import LogitsProcessor 10 | 11 | 12 | async def get_guided_decoding_logits_processor( 13 | guided_decoding_backend: str, request: Union[CompletionRequest, 14 | ChatCompletionRequest], 15 | tokenizer) -> Optional[LogitsProcessor]: 16 | if guided_decoding_backend == 'outlines': 17 | return await get_outlines_guided_decoding_logits_processor( 18 | request, tokenizer) 19 | if guided_decoding_backend == 'lm-format-enforcer': 20 | return await get_lm_format_enforcer_guided_decoding_logits_processor( 21 | request, tokenizer) 22 | 23 | raise ValueError( 24 | f"Unknown guided decoding backend '{guided_decoding_backend}'. " 25 | "Must be one of 'outlines, 'lm-format-enforcer'") 26 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/model_executor/layers/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/fused_moe/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.layers.fused_moe.fused_moe import ( 2 | fused_moe, get_config_file_name) 3 | 4 | __all__ = [ 5 | "fused_moe", 6 | "get_config_file_name", 7 | ] 8 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/fused_moe/configs/README: -------------------------------------------------------------------------------- 1 | This directory contains tuned configurations for different settings of the fused_moe kernel. 2 | For different settings of 3 | - E (number of experts) 4 | - N (intermediate size) 5 | - device_name (torch.cuda.get_device_name()) 6 | the JSON file contains a mapping from M (batch size) to the chosen configuration. 7 | 8 | The example configurations provided are for the Mixtral model for TP2 on H100 9 | and TP4 on A100. 
Mixtral has intermediate size N = 14336, i.e. for TP2 we have 10 | N = 7168 and for TP4 we have N = 3584. 11 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/layernorm.py: -------------------------------------------------------------------------------- 1 | """Custom normalization layers.""" 2 | from typing import Optional, Tuple, Union 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from vllm import _custom_ops as ops 8 | 9 | 10 | class RMSNorm(nn.Module): 11 | """Root mean square normalization. 12 | 13 | Computes x -> w * x / sqrt(E[x^2] + eps) where w is the learned weight. 14 | Refer to https://arxiv.org/abs/1910.07467 15 | """ 16 | 17 | def __init__( 18 | self, 19 | hidden_size: int, 20 | eps: float = 1e-6, 21 | ) -> None: 22 | super().__init__() 23 | self.weight = nn.Parameter(torch.ones(hidden_size)) 24 | self.variance_epsilon = eps 25 | 26 | def _forward( 27 | self, 28 | x: torch.Tensor, 29 | residual: Optional[torch.Tensor] = None, 30 | ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: 31 | """PyTorch-native implementation equivalent to forward().""" 32 | orig_dtype = x.dtype 33 | x = x.to(torch.float32) 34 | if residual is not None: 35 | x = x + residual.to(torch.float32) 36 | residual = x.to(orig_dtype) 37 | 38 | variance = x.pow(2).mean(dim=-1, keepdim=True) 39 | x = x * torch.rsqrt(variance + self.variance_epsilon) 40 | x = x.to(orig_dtype) * self.weight 41 | if residual is None: 42 | return x 43 | else: 44 | return x, residual 45 | 46 | def forward( 47 | self, 48 | x: torch.Tensor, 49 | residual: Optional[torch.Tensor] = None, 50 | ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: 51 | if residual is not None: 52 | ops.fused_add_rms_norm( 53 | x, 54 | residual, 55 | self.weight.data, 56 | self.variance_epsilon, 57 | ) 58 | return x, residual 59 | out = torch.empty_like(x) 60 | ops.rms_norm( 61 | out, 62 | x, 63 | self.weight.data, 64 | self.variance_epsilon, 65 | ) 66 | return out 67 | 68 | def extra_repr(self) -> str: 69 | s = f"hidden_size={self.weight.data.size(0)}" 70 | s += f", eps={self.variance_epsilon}" 71 | return s 72 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/model_executor/layers/ops/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Type 2 | 3 | from vllm.model_executor.layers.quantization.aqlm import AQLMConfig 4 | from vllm.model_executor.layers.quantization.awq import AWQConfig 5 | from vllm.model_executor.layers.quantization.base_config import ( 6 | QuantizationConfig) 7 | from vllm.model_executor.layers.quantization.fp8 import Fp8Config 8 | from vllm.model_executor.layers.quantization.gptq import GPTQConfig 9 | from vllm.model_executor.layers.quantization.gptq_marlin import ( 10 | GPTQMarlinConfig) 11 | from vllm.model_executor.layers.quantization.marlin import MarlinConfig 12 | from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig 13 | 14 | QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = { 15 | "aqlm": AQLMConfig, 16 | "awq": AWQConfig, 17 | "fp8": Fp8Config, 18 | 
"gptq": GPTQConfig, 19 | "squeezellm": SqueezeLLMConfig, 20 | "gptq_marlin": GPTQMarlinConfig, 21 | "marlin": MarlinConfig, 22 | } 23 | 24 | 25 | def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: 26 | if quantization not in QUANTIZATION_METHODS: 27 | raise ValueError(f"Invalid quantization method: {quantization}") 28 | return QUANTIZATION_METHODS[quantization] 29 | 30 | 31 | __all__ = [ 32 | "QuantizationConfig", 33 | "get_quantization_config", 34 | "QUANTIZATION_METHODS", 35 | ] 36 | -------------------------------------------------------------------------------- /vllm/model_executor/model_loader/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from torch import nn 4 | 5 | from vllm.config import (DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, 6 | ParallelConfig, SchedulerConfig, VisionLanguageConfig) 7 | from vllm.model_executor.model_loader.loader import (BaseModelLoader, 8 | get_model_loader) 9 | from vllm.model_executor.model_loader.utils import ( 10 | get_architecture_class_name, get_model_architecture) 11 | 12 | 13 | def get_model( 14 | *, model_config: ModelConfig, load_config: LoadConfig, 15 | device_config: DeviceConfig, parallel_config: ParallelConfig, 16 | scheduler_config: SchedulerConfig, lora_config: Optional[LoRAConfig], 17 | vision_language_config: Optional[VisionLanguageConfig]) -> nn.Module: 18 | loader = get_model_loader(load_config) 19 | return loader.load_model(model_config=model_config, 20 | device_config=device_config, 21 | lora_config=lora_config, 22 | vision_language_config=vision_language_config, 23 | parallel_config=parallel_config, 24 | scheduler_config=scheduler_config) 25 | 26 | 27 | __all__ = [ 28 | "get_model", "get_model_loader", "BaseModelLoader", 29 | "get_architecture_class_name", "get_model_architecture" 30 | ] 31 | -------------------------------------------------------------------------------- /vllm/model_executor/model_loader/utils.py: -------------------------------------------------------------------------------- 1 | """Utilities for selecting and loading models.""" 2 | import contextlib 3 | from typing import Tuple, Type 4 | 5 | import torch 6 | from torch import nn 7 | 8 | from vllm.config import ModelConfig 9 | from vllm.model_executor.models import ModelRegistry 10 | 11 | 12 | @contextlib.contextmanager 13 | def set_default_torch_dtype(dtype: torch.dtype): 14 | """Sets the default torch dtype to the given dtype.""" 15 | old_dtype = torch.get_default_dtype() 16 | torch.set_default_dtype(dtype) 17 | yield 18 | torch.set_default_dtype(old_dtype) 19 | 20 | 21 | def get_model_architecture( 22 | model_config: ModelConfig) -> Tuple[Type[nn.Module], str]: 23 | architectures = getattr(model_config.hf_config, "architectures", []) 24 | # Special handling for quantized Mixtral. 25 | # FIXME(woosuk): This is a temporary hack. 26 | if (model_config.quantization is not None 27 | and model_config.quantization != "fp8" 28 | and "MixtralForCausalLM" in architectures): 29 | architectures = ["QuantMixtralForCausalLM"] 30 | 31 | for arch in architectures: 32 | model_cls = ModelRegistry.load_model_cls(arch) 33 | if model_cls is not None: 34 | return (model_cls, arch) 35 | raise ValueError( 36 | f"Model architectures {architectures} are not supported for now. 
" 37 | f"Supported architectures: {ModelRegistry.get_supported_archs()}") 38 | 39 | 40 | def get_architecture_class_name(model_config: ModelConfig) -> str: 41 | return get_model_architecture(model_config)[1] 42 | -------------------------------------------------------------------------------- /vllm/model_executor/utils.py: -------------------------------------------------------------------------------- 1 | """Utils for model executor.""" 2 | import random 3 | from typing import Any, Dict, Optional 4 | 5 | import numpy as np 6 | import torch 7 | 8 | 9 | def set_random_seed(seed: int) -> None: 10 | random.seed(seed) 11 | np.random.seed(seed) 12 | torch.manual_seed(seed) 13 | if torch.cuda.is_available(): 14 | torch.cuda.manual_seed_all(seed) 15 | elif torch.musa.is_available(): 16 | torch.musa.manual_seed_all(seed) 17 | 18 | 19 | def set_weight_attrs( 20 | weight: torch.Tensor, 21 | weight_attrs: Optional[Dict[str, Any]], 22 | ): 23 | """Set attributes on a weight tensor. 24 | 25 | This method is used to set attributes on a weight tensor. This method 26 | will not overwrite existing attributes. 27 | 28 | Args: 29 | weight: The weight tensor. 30 | weight_attrs: A dictionary of attributes to set on the weight tensor. 31 | """ 32 | if weight_attrs is None: 33 | return 34 | for key, value in weight_attrs.items(): 35 | assert not hasattr( 36 | weight, key), (f"Overwriting existing tensor attribute: {key}") 37 | setattr(weight, key, value) 38 | -------------------------------------------------------------------------------- /vllm/py.typed: -------------------------------------------------------------------------------- 1 | # Marker file for PEP 561. 2 | # The vllm package uses inline types. 3 | -------------------------------------------------------------------------------- /vllm/spec_decode/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/spec_decode/__init__.py -------------------------------------------------------------------------------- /vllm/spec_decode/interfaces.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from dataclasses import dataclass 3 | 4 | import torch 5 | 6 | from vllm.sequence import ExecuteModelRequest 7 | 8 | 9 | @dataclass 10 | class SpeculativeProposals: 11 | """Datastructure used to represent proposal tokens from some proposer. It 12 | also tracks how many speculative tokens each sequence has. 13 | """ 14 | 15 | # Speculative proposal tokens. 16 | proposal_token_ids: torch.Tensor 17 | 18 | # Probabilities of the proposal tokens according to the proposer. 19 | proposal_probs: torch.Tensor 20 | 21 | # The valid length of each proposal; can be zero. 22 | proposal_lens: torch.Tensor 23 | 24 | def __repr__(self): 25 | return (f"SpeculativeProposals(" 26 | f"proposal_token_ids={self.proposal_token_ids}, " 27 | f"proposal_probs={self.proposal_probs.shape}, " 28 | f"proposal_lens={self.proposal_lens})") 29 | 30 | 31 | @dataclass 32 | class SpeculativeScores: 33 | """Datastructure used to represent the scores of speculative tokens 34 | according to the scoring model. 35 | """ 36 | 37 | # Probabilities of the speculative tokens according to the scoring model. 38 | probs: torch.Tensor 39 | 40 | # Log-probabilities of the speculative tokens according to the scoring 41 | # model. 
These values can be used to generate Logprob objects that are 42 | # returned to the user. 43 | logprobs: torch.Tensor 44 | 45 | # Token ids sampled from the scoring model. Used for speculative bonus 46 | # tokens and also non-speculative normal decoding. 47 | token_ids: torch.Tensor 48 | 49 | def __repr__(self): 50 | return (f"SpeculativeScores(" 51 | f"probs={self.probs.shape}, " 52 | f"token_ids={self.token_ids.shape})") 53 | 54 | 55 | class SpeculativeProposer(ABC): 56 | 57 | @abstractmethod 58 | def get_proposals( 59 | self, 60 | execute_model_req: ExecuteModelRequest, 61 | ) -> SpeculativeProposals: 62 | raise NotImplementedError 63 | 64 | 65 | class SpeculativeScorer(ABC): 66 | 67 | @abstractmethod 68 | def score_proposals( 69 | self, 70 | execute_model_req: ExecuteModelRequest, 71 | proposals: SpeculativeProposals, 72 | ) -> SpeculativeScores: 73 | raise NotImplementedError 74 | -------------------------------------------------------------------------------- /vllm/test_utils.py: -------------------------------------------------------------------------------- 1 | import ray 2 | 3 | from vllm.distributed import (ensure_model_parallel_initialized, 4 | init_distributed_environment) 5 | from vllm.utils import get_open_port 6 | 7 | 8 | def init_test_distributed_environment( 9 | pipeline_parallel_size: int, 10 | tensor_parallel_size: int, 11 | rank: int, 12 | distributed_init_port: str, 13 | local_rank: int = -1, 14 | ) -> None: 15 | distributed_init_method = f"tcp://localhost:{distributed_init_port}" 16 | init_distributed_environment( 17 | world_size=pipeline_parallel_size * tensor_parallel_size, 18 | rank=rank, 19 | distributed_init_method=distributed_init_method, 20 | local_rank=local_rank) 21 | ensure_model_parallel_initialized(tensor_parallel_size, 22 | pipeline_parallel_size) 23 | 24 | 25 | def multi_process_tensor_parallel( 26 | tensor_parallel_size: int, 27 | test_target, 28 | ) -> None: 29 | # Using ray helps debugging the error when it failed 30 | # as compared to multiprocessing. 31 | ray.init() 32 | 33 | distributed_init_port = get_open_port() 34 | refs = [] 35 | for rank in range(tensor_parallel_size): 36 | refs.append( 37 | test_target.remote(tensor_parallel_size, rank, 38 | distributed_init_port)) 39 | ray.get(refs) 40 | 41 | ray.shutdown() 42 | -------------------------------------------------------------------------------- /vllm/transformers_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/transformers_utils/__init__.py -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.transformers_utils.configs.chatglm import ChatGLMConfig 2 | from vllm.transformers_utils.configs.dbrx import DbrxConfig 3 | # RWConfig is for the original tiiuae/falcon-40b(-instruct) and 4 | # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the 5 | # `FalconConfig` class from the official HuggingFace transformers library. 
--------------------------------------------------------------------------------
/vllm/transformers_utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/transformers_utils/__init__.py

--------------------------------------------------------------------------------
/vllm/transformers_utils/configs/__init__.py:
--------------------------------------------------------------------------------
from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
from vllm.transformers_utils.configs.dbrx import DbrxConfig
# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
# `FalconConfig` class from the official HuggingFace transformers library.
from vllm.transformers_utils.configs.falcon import RWConfig
from vllm.transformers_utils.configs.jais import JAISConfig
from vllm.transformers_utils.configs.mpt import MPTConfig

__all__ = [
    "ChatGLMConfig",
    "DbrxConfig",
    "MPTConfig",
    "RWConfig",
    "JAISConfig",
]

--------------------------------------------------------------------------------
/vllm/transformers_utils/tokenizer_group/__init__.py:
--------------------------------------------------------------------------------
from typing import Optional

from vllm.config import TokenizerPoolConfig
from vllm.executor.ray_utils import ray
from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
    BaseTokenizerGroup)
from vllm.transformers_utils.tokenizer_group.tokenizer_group import (
    TokenizerGroup)

if ray:
    from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import (
        RayTokenizerGroupPool)
else:
    RayTokenizerGroupPool = None  # type: ignore


def get_tokenizer_group(tokenizer_pool_config: Optional[TokenizerPoolConfig],
                        **init_kwargs) -> BaseTokenizerGroup:
    if tokenizer_pool_config is None:
        return TokenizerGroup(**init_kwargs)
    if tokenizer_pool_config.pool_type == "ray":
        if RayTokenizerGroupPool is None:
            raise ImportError(
                "RayTokenizerGroupPool is not available. Please install "
                "the ray package to use the Ray tokenizer group pool.")
        return RayTokenizerGroupPool.from_config(tokenizer_pool_config,
                                                 **init_kwargs)
    else:
        raise ValueError(
            f"Unknown pool type: {tokenizer_pool_config.pool_type}")


__all__ = ["get_tokenizer_group", "BaseTokenizerGroup"]

--------------------------------------------------------------------------------
/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py:
--------------------------------------------------------------------------------
from abc import ABC, abstractmethod
from typing import List, Optional

from transformers import PreTrainedTokenizer

from vllm.lora.request import LoRARequest


class BaseTokenizerGroup(ABC):
    """A group of tokenizers that can be used for LoRA adapters."""

    @abstractmethod
    def ping(self) -> bool:
        """Check if the tokenizer group is alive."""
        pass

    @abstractmethod
    def get_max_input_len(self,
                          lora_request: Optional[LoRARequest] = None
                          ) -> Optional[int]:
        """Get the maximum input length for the LoRA request."""
        pass

    @abstractmethod
    def encode(self,
               prompt: str,
               request_id: Optional[str] = None,
               lora_request: Optional[LoRARequest] = None) -> List[int]:
        """Encode a prompt using the tokenizer group."""
        pass

    @abstractmethod
    async def encode_async(
            self,
            prompt: str,
            request_id: Optional[str] = None,
            lora_request: Optional[LoRARequest] = None) -> List[int]:
        """Encode a prompt using the tokenizer group."""
        pass

    @abstractmethod
    def get_lora_tokenizer(
            self,
            lora_request: Optional[LoRARequest] = None
    ) -> "PreTrainedTokenizer":
        """Get a tokenizer for a LoRA request."""
        pass

    @abstractmethod
    async def get_lora_tokenizer_async(
            self,
            lora_request: Optional[LoRARequest] = None
    ) -> "PreTrainedTokenizer":
        """Get a tokenizer for a LoRA request."""
        pass

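As an editor-added usage sketch for get_tokenizer_group in /vllm/transformers_utils/tokenizer_group/__init__.py above: passing tokenizer_pool_config=None returns a plain in-process TokenizerGroup, while a pool config with pool_type="ray" selects RayTokenizerGroupPool (and raises ImportError when Ray is not installed). The keyword arguments below are assumptions about TokenizerGroup's constructor, which lives in tokenizer_group.py and is not shown here; adjust them to the real signature.

# Illustrative sketch (not part of the repository). The init kwargs are
# assumptions about TokenizerGroup's constructor; verify against
# tokenizer_group.py before relying on them.
from vllm.transformers_utils.tokenizer_group import get_tokenizer_group

tokenizer_group = get_tokenizer_group(
    tokenizer_pool_config=None,         # None -> plain in-process TokenizerGroup
    tokenizer_id="facebook/opt-125m",   # assumed kwarg: HF tokenizer name
    enable_lora=False,                  # assumed kwarg
    max_num_seqs=8,                     # assumed kwarg
    max_input_length=None,              # assumed kwarg
)

token_ids = tokenizer_group.encode("Hello from the MUSA port!")
print(len(token_ids), tokenizer_group.get_max_input_len())
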
--------------------------------------------------------------------------------
/vllm/transformers_utils/tokenizers/__init__.py:
--------------------------------------------------------------------------------
from vllm.transformers_utils.tokenizers.baichuan import BaichuanTokenizer

__all__ = [
    "BaichuanTokenizer",
]

--------------------------------------------------------------------------------
/vllm/usage/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/usage/__init__.py

--------------------------------------------------------------------------------
/vllm/worker/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MooreThreads/vllm_musa/5b191fb9840e276101d151482b5a871c72effbc0/vllm/worker/__init__.py
--------------------------------------------------------------------------------