├── .python-version ├── tests ├── __init__.py ├── vec_inf │ ├── __init__.py │ ├── cli │ │ ├── __init__.py │ │ └── test_utils.py │ └── client │ │ ├── __init__.py │ │ ├── test_vars.env │ │ ├── test_models.py │ │ ├── test_examples.py │ │ └── test_utils.py └── test_imports.py ├── vec_inf ├── __init__.py ├── cli │ ├── __init__.py │ ├── _vars.py │ ├── _utils.py │ └── _cli.py ├── config │ ├── README.md │ └── environment.yaml ├── client │ ├── __init__.py │ ├── _exceptions.py │ ├── _client_vars.py │ ├── _slurm_vars.py │ ├── config.py │ ├── models.py │ ├── _slurm_templates.py │ ├── api.py │ ├── _slurm_script_generator.py │ └── _utils.py ├── README.md └── find_port.sh ├── .github ├── ISSUE_TEMPLATE │ ├── config.yml │ ├── model-request.md │ ├── bug_report.md │ └── feature_request.md ├── pull_request_template.md ├── dependabot.yml └── workflows │ ├── publish.yml │ ├── code_checks.yml │ ├── docker.yml │ ├── unit_tests.yml │ └── docs.yml ├── docs ├── assets │ ├── favicon.ico │ └── vector-logo.svg ├── overrides │ └── partials │ │ ├── logo.html │ │ └── copyright.html ├── api.md ├── Makefile ├── make.bat ├── index.md ├── contributing.md └── stylesheets │ └── extra.css ├── examples ├── inference │ ├── llm │ │ ├── completions.sh │ │ ├── completions.py │ │ └── chat_completions.py │ ├── text_embedding │ │ └── embeddings.py │ └── vlm │ │ └── vision_completions.py ├── logits │ └── logits.py ├── slurm_dependency │ ├── run_workflow.sh │ ├── downstream_job.sbatch │ ├── run_downstream.py │ └── README.md ├── README.md └── api │ └── basic_usage.py ├── codecov.yml ├── venv.sh ├── LICENSE ├── .pre-commit-config.yaml ├── profile ├── avg_throughput.py └── gen.py ├── Dockerfile ├── .gitignore ├── mkdocs.yml ├── pyproject.toml ├── README.md └── MODEL_TRACKING.md /.python-version: -------------------------------------------------------------------------------- 1 | 3.10 2 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit tests module.""" 2 | -------------------------------------------------------------------------------- /vec_inf/__init__.py: -------------------------------------------------------------------------------- 1 | """vec_inf package.""" 2 | -------------------------------------------------------------------------------- /vec_inf/cli/__init__.py: -------------------------------------------------------------------------------- 1 | """vec_inf cli package.""" 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | -------------------------------------------------------------------------------- /tests/vec_inf/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit tests for vec_inf package.""" 2 | -------------------------------------------------------------------------------- /tests/vec_inf/cli/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit tests for vec_inf.cli subpackage.""" 2 | -------------------------------------------------------------------------------- /tests/vec_inf/client/__init__.py: -------------------------------------------------------------------------------- 1 | """Tests for the Vector Inference API.""" 2 | 
-------------------------------------------------------------------------------- /tests/vec_inf/client/test_vars.env: -------------------------------------------------------------------------------- 1 | MY_VAR=5 2 | VLLM_CACHE_ROOT=/cache/vllm 3 | -------------------------------------------------------------------------------- /docs/assets/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VectorInstitute/vector-inference/HEAD/docs/assets/favicon.ico -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # PR Type 2 | [Feature | Fix | Documentation | Other() ] 3 | 4 | # Short Description 5 | ... 6 | 7 | # Tests Added 8 | ... 9 | -------------------------------------------------------------------------------- /docs/overrides/partials/logo.html: -------------------------------------------------------------------------------- 1 | {% if config.theme.logo %} 2 | logo 3 | {% else %} 4 | {{ config.site_name }} 5 | {% endif %} 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/model-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Model request 3 | about: Request for new model weights or model config 4 | title: New model request for [MODEL_NAME] 5 | labels: new model 6 | assignees: XkunW 7 | 8 | --- 9 | 10 | ### Request Type 11 | Model weights | Model config | Both 12 | 13 | ### Model Name 14 | Name of the model requested 15 | -------------------------------------------------------------------------------- /examples/inference/llm/completions.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # The url can be found with vec-inf status $JOB_ID 4 | export API_BASE_URL=http://gpuXXX:XXXX/v1 5 | 6 | # Update the model path accordingly 7 | curl ${API_BASE_URL}/completions \ 8 | -H "Content-Type: application/json" \ 9 | -d '{ 10 | "model": "Meta-Llama-3.1-8B-Instruct", 11 | "prompt": "What is the capital of Canada?", 12 | "max_tokens": 20 13 | }' 14 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | branch: main 3 | require_ci_to_pass: true 4 | notify: 5 | after_n_builds: 2 6 | wait_for_ci: yes 7 | comment: 8 | behavior: default 9 | layout: reach,diff,flags,tree,reach 10 | show_carryforward_flags: false 11 | require_changes: true 12 | coverage: 13 | status: 14 | changes: true 15 | default_rules: 16 | flag_coverage_not_uploaded_behavior: include 17 | patch: true 18 | project: true 19 | github_checks: 20 | annotations: true 21 | -------------------------------------------------------------------------------- /examples/inference/llm/completions.py: -------------------------------------------------------------------------------- 1 | """Example of how to use the OpenAI API to generate completions.""" 2 | 3 | from openai import OpenAI 4 | 5 | 6 | # The url can be found with vec-inf status $JOB_ID 7 | client = OpenAI(base_url="http://gpuXXX:XXXX/v1", api_key="EMPTY") 8 | 9 | # Update the model path accordingly 10 | completion = client.completions.create( 11 | model="Meta-Llama-3.1-8B-Instruct", 12 | prompt="Where is the capital of Canada?", 13 | max_tokens=20, 14 | ) 15 | 16 | 
print(completion) 17 | -------------------------------------------------------------------------------- /examples/logits/logits.py: -------------------------------------------------------------------------------- 1 | """Example of how to get logits from the model.""" 2 | 3 | from openai import OpenAI 4 | 5 | 6 | # The url can be found with vec-inf status $JOB_ID 7 | client = OpenAI(base_url="http://gpuXXX:XXXX/v1", api_key="EMPTY") 8 | 9 | completion = client.completions.create( 10 | model="Meta-Llama-3.1-8B-Instruct", 11 | prompt="Where is the capital of Canada?", 12 | max_tokens=1, 13 | logprobs=128256, # Set to model vocab size to get logits 14 | ) 15 | 16 | print(completion.choices[0].logprobs) 17 | -------------------------------------------------------------------------------- /vec_inf/config/README.md: -------------------------------------------------------------------------------- 1 | # Configs 2 | 3 | * [`environment.yaml`](environment.yaml): Configuration for the Slurm cluster environment, including image paths, resource availability, default values, etc. 4 | * [`models.yaml`](models.yaml): Configuration for launching model inference servers, including Slurm parameters as well as `vllm serve` arguments. 5 | 6 | **NOTE**: These configs act as last-resort fallbacks in the `vec-inf` package; they will be updated to match the latest cached config on the Vector Killarney cluster with each new package version release. 7 | -------------------------------------------------------------------------------- /examples/slurm_dependency/run_workflow.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ---- Config ---- 4 | MODEL_NAME="Meta-Llama-3.1-8B-Instruct" 5 | LAUNCH_ARGS="$MODEL_NAME" 6 | 7 | # ---- Step 1: Launch the server 8 | RAW_JSON=$(vec-inf launch $LAUNCH_ARGS --json-mode) 9 | SERVER_JOB_ID=$(echo "$RAW_JSON" | python3 -c "import sys, json; print(json.load(sys.stdin)['slurm_job_id'])") 10 | echo "Launched server as job $SERVER_JOB_ID" 11 | echo "$RAW_JSON" 12 | 13 | # ---- Step 2: Submit downstream job 14 | sbatch --dependency=after:$SERVER_JOB_ID --export=SERVER_JOB_ID=$SERVER_JOB_ID downstream_job.sbatch 15 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "github-actions" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | -------------------------------------------------------------------------------- /docs/api.md: -------------------------------------------------------------------------------- 1 | # Python API Reference 2 | 3 | This section documents the Python API for vector-inference.
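A minimal end-to-end sketch of the client workflow is shown below. The model name is only an example taken from the bundled configuration; replace it with a model available on your cluster (see `vec-inf list`), and note that launching will submit a real Slurm job.

```python
from vec_inf.client import VecInfClient

client = VecInfClient()

# Launch a model and wait for the inference server to come up
response = client.launch_model("Meta-Llama-3.1-8B-Instruct")
status = client.wait_until_ready(response.slurm_job_id)
print(f"Server ready at {status.base_url}")

# Shut the server down when finished
client.shutdown_model(response.slurm_job_id)
```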
4 | 5 | ## Client Interface 6 | 7 | ::: vec_inf.client.api.VecInfClient 8 | options: 9 | show_root_heading: true 10 | show_root_full_path: true 11 | members: true 12 | 13 | ## Model Config 14 | 15 | ::: vec_inf.client.config.ModelConfig 16 | options: 17 | show_root_heading: true 18 | show_root_full_path: true 19 | members: true 20 | 21 | 22 | ## Data Models 23 | 24 | ::: vec_inf.client.models 25 | options: 26 | show_root_heading: true 27 | members: true 28 | -------------------------------------------------------------------------------- /examples/inference/text_embedding/embeddings.py: -------------------------------------------------------------------------------- 1 | """Example of how to use the OpenAI API to generate embeddings.""" 2 | 3 | from openai import OpenAI 4 | 5 | 6 | # The url can be found with vec-inf status $JOB_ID 7 | client = OpenAI(base_url="http://gpuXXX:XXXX/v1", api_key="EMPTY") 8 | 9 | model_name = "bge-base-en-v1.5" 10 | 11 | input_texts = [ 12 | "The chef prepared a delicious meal.", 13 | ] 14 | 15 | # test single embedding 16 | embedding_response = client.embeddings.create( 17 | model=model_name, 18 | input=input_texts, 19 | encoding_format="float", 20 | ) 21 | 22 | print(embedding_response) 23 | -------------------------------------------------------------------------------- /tests/vec_inf/cli/test_utils.py: -------------------------------------------------------------------------------- 1 | """Tests for the utils functions in the vec-inf cli.""" 2 | 3 | from vec_inf.cli._utils import create_table 4 | 5 | 6 | def test_create_table_with_header(): 7 | """Test that create_table creates a table with the correct header.""" 8 | table = create_table("Key", "Value") 9 | assert table.columns[0].header == "Key" 10 | assert table.columns[1].header == "Value" 11 | assert table.show_header is True 12 | 13 | 14 | def test_create_table_without_header(): 15 | """Test create_table without header.""" 16 | table = create_table(show_header=False) 17 | assert table.show_header is False 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: XkunW 7 | 8 | --- 9 | 10 | ### Describe the bug 11 | A clear and concise description of what the bug is. 12 | 13 | ### To Reproduce 14 | Code snippet or clear steps to reproduce behaviour. 15 | 16 | ### Expected behavior 17 | A clear and concise description of what you expected to happen. 18 | 19 | ### Screenshots 20 | If applicable, add screenshots to help explain your problem. 21 | 22 | ### Version 23 | - Version info such as v0.1.5 24 | 25 | ### Additional context 26 | Add any other context about the problem here. 27 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ### Is your feature request related to a problem? Please describe. 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | ### Describe the solution you'd like 14 | A clear and concise description of what you want to happen. 
15 | 16 | ### Describe alternatives you've considered 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | ### Additional context 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /examples/inference/llm/chat_completions.py: -------------------------------------------------------------------------------- 1 | """Example of how to use the OpenAI API to generate chat completions.""" 2 | 3 | from openai import OpenAI 4 | 5 | 6 | # The url can be found with vec-inf status $JOB_ID 7 | client = OpenAI(base_url="http://gpuXXX:XXXX/v1", api_key="EMPTY") 8 | 9 | # Update the model path accordingly 10 | completion = client.chat.completions.create( 11 | model="Meta-Llama-3.1-8B-Instruct", 12 | messages=[ 13 | { 14 | "role": "system", 15 | "content": "You are a pirate chatbot who always responds in pirate speak!", 16 | }, 17 | {"role": "user", "content": "Who are you?"}, 18 | ], 19 | ) 20 | 21 | print(completion.model_dump_json()) 22 | -------------------------------------------------------------------------------- /docs/overrides/partials/copyright.html: -------------------------------------------------------------------------------- 1 | 23 | -------------------------------------------------------------------------------- /examples/slurm_dependency/downstream_job.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=Meta-Llama-3.1-8B-Instruct-downstream 3 | #SBATCH --partition=a40 4 | #SBATCH --qos=m2 5 | #SBATCH --time=08:00:00 6 | #SBATCH --nodes=1 7 | #SBATCH --gpus-per-node=1 8 | #SBATCH --cpus-per-task=4 9 | #SBATCH --mem=8G 10 | #SBATCH --output=$HOME/.vec-inf-logs/Meta-Llama-3.1-8B-Instruct-downstream.%j.out 11 | #SBATCH --error=$HOME/.vec-inf-logs/Meta-Llama-3.1-8B-Instruct-downstream.%j.err 12 | 13 | # Activate your environment 14 | # TODO: update this path to match your venv location 15 | source $HOME/vector-inference/.venv/bin/activate 16 | 17 | # Wait for the server to be ready using the job ID passed as CLI arg 18 | python run_downstream.py "$SERVER_JOB_ID" 19 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: publish package 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Install apt dependencies 12 | run: | 13 | sudo apt-get update 14 | sudo apt-get install libcurl4-openssl-dev libssl-dev 15 | 16 | - uses: actions/checkout@v5.0.0 17 | 18 | - name: Install uv 19 | uses: astral-sh/setup-uv@v7 20 | with: 21 | version: "0.6.6" 22 | enable-cache: true 23 | 24 | - uses: actions/setup-python@v6 25 | with: 26 | python-version: '3.10' 27 | 28 | - name: Build package 29 | run: uv build 30 | 31 | - name: Publish package 32 | run: uv publish --token ${{ secrets.PYPI_API_TOKEN }} 33 | -------------------------------------------------------------------------------- /examples/slurm_dependency/run_downstream.py: -------------------------------------------------------------------------------- 1 | """Example script to query a launched model via the OpenAI-compatible API.""" 2 | 3 | import sys 4 | 5 | from openai import OpenAI 6 | 7 | from vec_inf.client import VecInfClient 8 | 9 | 10 | if len(sys.argv) < 2: 11 | raise ValueError("Expected 
server job ID as the first argument.") 12 | job_id = sys.argv[1] 13 | 14 | vi_client = VecInfClient() 15 | print(f"Waiting for SLURM job {job_id} to be ready...") 16 | status = vi_client.wait_until_ready(slurm_job_id=job_id) 17 | print(f"Server is ready at {status.base_url}") 18 | 19 | api_client = OpenAI(base_url=status.base_url, api_key="EMPTY") 20 | resp = api_client.completions.create( 21 | model="Meta-Llama-3.1-8B-Instruct", 22 | prompt="Where is the capital of Canada?", 23 | max_tokens=20, 24 | ) 25 | 26 | print(resp) 27 | -------------------------------------------------------------------------------- /vec_inf/config/environment.yaml: -------------------------------------------------------------------------------- 1 | paths: 2 | image_path: "/model-weights/vec-inf-shared/vector-inference_latest.sif" 3 | 4 | containerization: 5 | module_load_cmd: "module load apptainer" 6 | module_name: "apptainer" 7 | 8 | limits: 9 | max_gpus_per_node: 8 10 | max_num_nodes: 178 11 | max_cpus_per_task: 64 12 | 13 | allowed_values: 14 | qos: [] 15 | partition: [] 16 | resource_type: ["l40s", "h100"] 17 | 18 | required_args: 19 | account: "VEC_INF_ACCOUNT" 20 | work_dir: "VEC_INF_WORK_DIR" 21 | 22 | default_args: 23 | cpus_per_task: "16" 24 | mem_per_node: "64G" 25 | time: "08:00:00" 26 | qos: "" 27 | partition: "" 28 | resource_type: "" 29 | exclude: "" 30 | nodelist: "" 31 | bind: "" 32 | venv: "apptainer" 33 | data_type: "auto" 34 | log_dir: "~/.vec-inf-logs" 35 | model_weights_parent_dir: "/model-weights" 36 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | 22 | # NOTE: This is not generated from `sphinx-quickstart` and manually added 23 | serve: 24 | sphinx-autobuild $(SOURCEDIR) $(BUILDDIR) 25 | -------------------------------------------------------------------------------- /vec_inf/client/__init__.py: -------------------------------------------------------------------------------- 1 | """Programmatic API for Vector Inference. 2 | 3 | This module provides a Python API for launching and managing inference servers 4 | using `vec_inf`. It is an alternative to the command-line interface, and allows 5 | users direct control over the lifecycle of inference servers via python scripts. 
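A minimal usage sketch (the model name is illustrative and must exist in your model configuration)::

    from vec_inf.client import VecInfClient

    client = VecInfClient()
    response = client.launch_model("Meta-Llama-3.1-8B-Instruct")
    status = client.wait_until_ready(response.slurm_job_id)
    client.shutdown_model(response.slurm_job_id)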
6 | """ 7 | 8 | from vec_inf.client.api import VecInfClient 9 | from vec_inf.client.config import ModelConfig 10 | from vec_inf.client.models import ( 11 | LaunchOptions, 12 | LaunchResponse, 13 | MetricsResponse, 14 | ModelInfo, 15 | ModelStatus, 16 | ModelType, 17 | StatusResponse, 18 | ) 19 | 20 | 21 | __all__ = [ 22 | "VecInfClient", 23 | "LaunchResponse", 24 | "StatusResponse", 25 | "ModelInfo", 26 | "MetricsResponse", 27 | "ModelStatus", 28 | "ModelType", 29 | "LaunchOptions", 30 | "ModelConfig", 31 | ] 32 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /vec_inf/cli/_vars.py: -------------------------------------------------------------------------------- 1 | """Constants for CLI rendering. 2 | 3 | This module defines constant mappings for model type priorities and colors 4 | used in the CLI display formatting. 5 | 6 | Constants 7 | --------- 8 | MODEL_TYPE_PRIORITY : dict 9 | Mapping of model types to their display priority (lower numbers shown first) 10 | 11 | MODEL_TYPE_COLORS : dict 12 | Mapping of model types to their display colors in Rich 13 | 14 | Notes 15 | ----- 16 | These constants are used primarily by the ListCmdDisplay class to ensure 17 | consistent sorting and color coding of different model types in the CLI output. 
18 | """ 19 | 20 | MODEL_TYPE_PRIORITY = { 21 | "LLM": 0, 22 | "VLM": 1, 23 | "Text_Embedding": 2, 24 | "Reward_Modeling": 3, 25 | } 26 | 27 | MODEL_TYPE_COLORS = { 28 | "LLM": "cyan", 29 | "VLM": "bright_blue", 30 | "Text_Embedding": "purple", 31 | "Reward_Modeling": "bright_magenta", 32 | } 33 | -------------------------------------------------------------------------------- /vec_inf/client/_exceptions.py: -------------------------------------------------------------------------------- 1 | """Exceptions for the vector inference package.""" 2 | 3 | 4 | class ModelConfigurationError(Exception): 5 | """Raised when the model config or weights are missing or invalid.""" 6 | 7 | pass 8 | 9 | 10 | class MissingRequiredFieldsError(ValueError): 11 | """Raised when required fields are missing from the provided parameters.""" 12 | 13 | pass 14 | 15 | 16 | class ModelNotFoundError(KeyError): 17 | """Raised when the specified model name is not found in the configuration.""" 18 | 19 | pass 20 | 21 | 22 | class SlurmJobError(RuntimeError): 23 | """Raised when there's an error with a Slurm job.""" 24 | 25 | pass 26 | 27 | 28 | class APIError(Exception): 29 | """Base exception for API errors.""" 30 | 31 | pass 32 | 33 | 34 | class ServerError(Exception): 35 | """Exception raised when there's an error with the inference server.""" 36 | 37 | pass 38 | -------------------------------------------------------------------------------- /examples/inference/vlm/vision_completions.py: -------------------------------------------------------------------------------- 1 | """Example of using the OpenAI API to generate completions for vision tasks.""" 2 | 3 | from openai import OpenAI 4 | 5 | 6 | # The url can be found with vec-inf status $JOB_ID 7 | client = OpenAI(base_url="http://gpuXXX:XXXX/v1", api_key="EMPTY") 8 | 9 | # Update the model path accordingly 10 | completion = client.chat.completions.create( 11 | model="llava-1.5-13b-hf", 12 | messages=[ 13 | { 14 | "role": "user", 15 | "content": [ 16 | {"type": "text", "text": "What's in this image?"}, 17 | { 18 | "type": "image_url", 19 | "image_url": { 20 | "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", 21 | }, 22 | }, 23 | ], 24 | } 25 | ], 26 | max_tokens=50, 27 | ) 28 | 29 | print(completion) 30 | -------------------------------------------------------------------------------- /venv.sh: -------------------------------------------------------------------------------- 1 | #!bin/bash 2 | 3 | # Load python module if you are on Vector cluster and install uv 4 | module load python/3.10.13 5 | module load rust 6 | curl -LsSf https://astral.sh/uv/install.sh | sh 7 | 8 | # Optional: it's recommended to change the cache directory to somewhere in the scratch space to avoid 9 | # running out of space in your home directory, below is an example for the Vector cluster 10 | mkdir -p /scratch/$(whoami)/uv_cache 11 | export UV_CACHE_DIR=/scratch/$(whoami)/uv_cache 12 | 13 | # To see if the cache directory is set correctly, run the following command 14 | # uv config get cache-dir 15 | echo "Cache directory set to: $(uv config get cache-dir)" 16 | 17 | # Install dependencies via uv 18 | uv sync 19 | 20 | # Activate the virtual environment 21 | source .venv/bin/activate 22 | 23 | # Deactivate the virtual environment 24 | # deactivate 25 | 26 | # To check where your virtual environment is located, run the following command 27 | # uv venv --show-path 28 | 29 | # 
Alternatively, to activate your virtual environment without running uv shell, run the following command 30 | # source $(uv venv --show-path)/bin/activate 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Vector Institute 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /vec_inf/cli/_utils.py: -------------------------------------------------------------------------------- 1 | """Helper functions for the CLI. 2 | 3 | This module provides utility functions for creating consistent table displays 4 | in the command-line interface. 5 | """ 6 | 7 | from rich.table import Table 8 | 9 | 10 | def create_table( 11 | key_title: str = "", value_title: str = "", show_header: bool = True 12 | ) -> Table: 13 | """Create a table for displaying model status. 14 | 15 | Creates a two-column Rich table with consistent styling for displaying 16 | key-value pairs in the CLI. 
17 | 18 | Parameters 19 | ---------- 20 | key_title : str, default="" 21 | Title for the key column 22 | value_title : str, default="" 23 | Title for the value column 24 | show_header : bool, default=True 25 | Whether to display column headers 26 | 27 | Returns 28 | ------- 29 | Table 30 | Rich Table instance with configured styling: 31 | - Headers in bold magenta 32 | - Key column in dim style 33 | - Value column in default style 34 | """ 35 | table = Table(show_header=show_header, header_style="bold magenta") 36 | table.add_column(key_title, style="dim") 37 | table.add_column(value_title) 38 | return table 39 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | - [`inference`](inference): Examples for sending inference requests 3 | - [`llm/chat_completions.py`](inference/llm/chat_completions.py): Python example of sending chat completion requests to OpenAI compatible server 4 | - [`llm/completions.py`](inference/llm/completions.py): Python example of sending completion requests to OpenAI compatible server 5 | - [`llm/completions.sh`](inference/llm/completions.sh): Bash example of sending completion requests to OpenAI compatible server, supports JSON mode 6 | - [`text_embedding/embeddings.py`](inference/text_embedding/embeddings.py): Python example of sending text embedding requests to OpenAI compatible server 7 | - [`vlm/vision_completions.py`](inference/vlm/vision_completions.py): Python example of sending chat completion requests with image attached to prompt to OpenAI compatible server for vision language models 8 | - [`logits`](logits): Example for logits generation 9 | - [`logits.py`](logits/logits.py): Python example of getting logits from hosted model. 10 | - [`api`](api): Examples for using the Python API 11 | - [`basic_usage.py`](api/basic_usage.py): Basic Python example demonstrating the Vector Inference API 12 | - [`slurm_dependency`](slurm_dependency): Example of launching a model with `vec-inf` and running a downstream SLURM job that waits for the server to be ready before sending a request. 
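All of the inference examples above follow the same basic pattern: once a server launched with `vec-inf launch` is ready, point an OpenAI client at the URL reported by `vec-inf status $JOB_ID`. A minimal sketch (the host, port, and model name are placeholders) looks like:

```python
from openai import OpenAI

# Replace with the URL reported by `vec-inf status $JOB_ID`
client = OpenAI(base_url="http://gpuXXX:XXXX/v1", api_key="EMPTY")

completion = client.completions.create(
    model="Meta-Llama-3.1-8B-Instruct",
    prompt="What is the capital of Canada?",
    max_tokens=20,
)
print(completion)
```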
13 | -------------------------------------------------------------------------------- /tests/test_imports.py: -------------------------------------------------------------------------------- 1 | """Test the imports of the vec_inf package.""" 2 | 3 | import unittest 4 | 5 | import pytest 6 | 7 | 8 | class TestVecInfImports(unittest.TestCase): 9 | """Test the imports of the vec_inf package.""" 10 | 11 | def test_imports(self): 12 | """Test that all modules can be imported.""" 13 | try: 14 | # CLI imports 15 | import vec_inf.cli # noqa: PLC0415 16 | import vec_inf.cli._cli # noqa: PLC0415 17 | import vec_inf.cli._helper # noqa: PLC0415 18 | 19 | # Client imports 20 | import vec_inf.client # noqa: PLC0415 21 | import vec_inf.client._client_vars # noqa: F401, PLC0415 22 | import vec_inf.client._exceptions # noqa: PLC0415 23 | import vec_inf.client._helper # noqa: PLC0415 24 | import vec_inf.client._slurm_script_generator # noqa: PLC0415 25 | import vec_inf.client._slurm_templates # noqa: PLC0415 26 | import vec_inf.client._slurm_vars # noqa: PLC0415 27 | import vec_inf.client._utils # noqa: PLC0415 28 | import vec_inf.client.api # noqa: PLC0415 29 | import vec_inf.client.config # noqa: PLC0415 30 | import vec_inf.client.models # noqa: F401, PLC0415 31 | 32 | except ImportError as e: 33 | pytest.fail(f"Import failed: {e}") 34 | -------------------------------------------------------------------------------- /examples/slurm_dependency/README.md: -------------------------------------------------------------------------------- 1 | # SLURM Dependency Workflow Example 2 | 3 | This example demonstrates how to launch a model server using `vec-inf`, and run a downstream SLURM job that waits for the server to become ready before querying it. 4 | 5 | ## Files 6 | 7 | This directory contains the following: 8 | 9 | 1. [run_workflow.sh](run_workflow.sh) 10 | Launches the model server and submits the downstream job with a dependency, so it starts only after the server job begins running. 11 | 12 | 2. [downstream_job.sbatch](downstream_job.sbatch) 13 | A SLURM job script that runs the downstream logic (e.g., prompting the model). 14 | 15 | 3. [run_downstream.py](run_downstream.py) 16 | A Python script that waits until the inference server is ready, then sends a request using the OpenAI-compatible API. 17 | 18 | ## What to update 19 | 20 | Before running this example, update the following in [downstream_job.sbatch](downstream_job.sbatch): 21 | 22 | - `--job-name`, `--output`, and `--error` paths 23 | - Virtual environment path in the `source` line 24 | - SLURM resource configuration (e.g., partition, memory, GPU) 25 | 26 | Also update the model name in [run_downstream.py](run_downstream.py) to match what you're launching. 27 | 28 | ## Running the example 29 | 30 | First, activate a virtual environment where `vec-inf` is installed. Then, from this directory, run: 31 | 32 | ```bash 33 | bash run_workflow.sh 34 | -------------------------------------------------------------------------------- /vec_inf/README.md: -------------------------------------------------------------------------------- 1 | ## `vec-inf` CLI Commands 2 | 3 | * `launch`: Specify a model family and other optional parameters to launch an OpenAI compatible inference server. 4 | * `batch-launch`: Specify a list of models to launch multiple OpenAI compatible inference servers at the same time. 5 | * `status`: Check the status of all `vec-inf` jobs, or a specific job by providing its job ID. 
6 | * `metrics`: Streams performance metrics to the console. 7 | * `shutdown`: Shut down a model by providing its Slurm job ID. 8 | * `list`: List all available model names, or view the default/cached configuration of a specific model. 9 | * `cleanup`: Remove old log directories. You can filter by `--model-family`, `--model-name`, `--job-id`, and/or `--before-job-id`. Use `--dry-run` to preview what would be deleted. 10 | 11 | Use `--help` to see all available options. 12 | 13 | ## `VecInfClient` API 14 | 15 | * `launch_model`: Launch an OpenAI compatible inference server. 16 | * `batch_launch_models`: Launch multiple OpenAI compatible inference servers. 17 | * `fetch_running_jobs`: Get the running `vec-inf` job IDs. 18 | * `get_status`: Get the status of a running model. 19 | * `get_metrics`: Get the performance metrics of a running model. 20 | * `shutdown_model`: Shut down a running model. 21 | * `list_models`: List all available models. 22 | * `get_model_config`: Get the configuration for a specific model. 23 | * `wait_until_ready`: Wait until a model is ready or fails. 24 | * `cleanup_logs`: Remove logs from the log directory. 25 | -------------------------------------------------------------------------------- /examples/api/basic_usage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Basic example of Vector Inference API usage. 3 | 4 | This script demonstrates the core features of the Vector Inference API 5 | for launching and interacting with models. 6 | """ 7 | 8 | from vec_inf.client import VecInfClient 9 | 10 | 11 | # Create the API client 12 | client = VecInfClient() 13 | 14 | # List available models 15 | print("Listing available models...") 16 | models = client.list_models() 17 | print(f"Found {len(models)} models") 18 | for model in models[:3]: # Show just the first few 19 | print(f"- {model.name} ({model.model_type})") 20 | 21 | # Launch a model (replace with an actual model name from your environment) 22 | model_name = "Meta-Llama-3.1-8B-Instruct" # Use an available model from your list 23 | print(f"\nLaunching {model_name}...") 24 | response = client.launch_model(model_name) 25 | job_id = response.slurm_job_id 26 | print(f"Launched with job ID: {job_id}") 27 | 28 | # Wait for the model to be ready 29 | print("Waiting for model to be ready...") 30 | status = client.wait_until_ready(job_id) 31 | print(f"Model is ready at: {status.base_url}") 32 | 33 | # Get metrics 34 | print("\nRetrieving metrics...") 35 | metrics = client.get_metrics(job_id) 36 | if isinstance(metrics.metrics, dict): 37 | for key, value in metrics.metrics.items(): 38 | print(f"- {key}: {value}") 39 | 40 | # Shutdown when done 41 | print("\nShutting down model...") 42 | client.shutdown_model(job_id) 43 | print("Model shutdown complete") 44 | -------------------------------------------------------------------------------- /vec_inf/find_port.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Function to check if a port is available on the specified IP 4 | is_port_available() { 5 | local ip=$1 6 | local port=$2 7 | # Attempt to listen on the specified port and IP. Use & to background the process. 8 | nc -l $ip $port &> /dev/null & 9 | 10 | # Capture the PID of the background process 11 | local pid=$! 12 | # Wait a short moment to ensure nc had time to bind to the port 13 | sleep 0.1 14 | 15 | # Check if nc is still running. If so, the port was available. 
16 | if kill -0 $pid &> /dev/null; then 17 | # Kill the background nc process 18 | kill $pid &> /dev/null 19 | return 0 # True, port is available 20 | else 21 | return 1 # False, port is not available 22 | fi 23 | } 24 | 25 | # Function to find an available port on the specified IP 26 | find_available_port() { 27 | local ip=$1 28 | local base_port=$2 29 | local max_port=$3 30 | 31 | # Generate shuffled list of ports; fallback to sequential if shuf not present 32 | if command -v shuf >/dev/null 2>&1; then 33 | local port_list 34 | port_list=$(shuf -i "${base_port}-${max_port}") 35 | else 36 | local port_list 37 | port_list=$(seq $base_port $max_port) 38 | fi 39 | 40 | for port in $port_list; do 41 | if is_port_available $ip $port; then 42 | echo $port 43 | return 44 | fi 45 | done 46 | echo "No available port between $base_port and $max_port for $ip." >&2 47 | return 1 48 | } 49 | -------------------------------------------------------------------------------- /.github/workflows/code_checks.yml: -------------------------------------------------------------------------------- 1 | name: code checks 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - develop 8 | paths: 9 | - .pre-commit-config.yaml 10 | - .github/workflows/code_checks.yml 11 | - '**.py' 12 | - uv.lock 13 | - pyproject.toml 14 | - '**.ipynb' 15 | pull_request: 16 | branches: 17 | - main 18 | - develop 19 | paths: 20 | - .pre-commit-config.yaml 21 | - .github/workflows/code_checks.yml 22 | - '**.py' 23 | - uv.lock 24 | - pyproject.toml 25 | - '**.ipynb' 26 | 27 | jobs: 28 | run-code-check: 29 | runs-on: ubuntu-latest 30 | steps: 31 | - uses: actions/checkout@v5.0.0 32 | - name: Install uv 33 | uses: astral-sh/setup-uv@v7 34 | with: 35 | # Install a specific version of uv. 36 | version: "0.5.21" 37 | enable-cache: true 38 | - name: "Set up Python" 39 | uses: actions/setup-python@v6 40 | with: 41 | python-version-file: ".python-version" 42 | - name: Install the project 43 | run: uv sync --dev --prerelease=allow 44 | - name: Install dependencies and check code 45 | run: | 46 | source .venv/bin/activate 47 | pre-commit run --all-files 48 | - name: pip-audit (gh-action-pip-audit) 49 | uses: pypa/gh-action-pip-audit@v1.1.0 50 | with: 51 | virtual-environment: .venv/ 52 | # Temporary: ignore pip advisory until fixed in pip>=25.3 53 | ignore-vulns: GHSA-4xh5-x5gv-qwph 54 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v6.0.0 # Use the ref you want to point at 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: check-ast 7 | - id: check-builtin-literals 8 | - id: check-docstring-first 9 | - id: check-executables-have-shebangs 10 | - id: debug-statements 11 | - id: end-of-file-fixer 12 | - id: mixed-line-ending 13 | args: [--fix=lf] 14 | - id: requirements-txt-fixer 15 | - id: check-yaml 16 | args: [--unsafe] 17 | - id: check-toml 18 | 19 | - repo: https://github.com/astral-sh/ruff-pre-commit 20 | rev: 'v0.14.5' 21 | hooks: 22 | - id: ruff 23 | args: [--fix, --exit-non-zero-on-fix] 24 | types_or: [python, jupyter] 25 | - id: ruff-format 26 | types_or: [python, jupyter] 27 | 28 | - repo: https://github.com/pre-commit/mirrors-mypy 29 | rev: v1.18.2 30 | hooks: 31 | - id: mypy 32 | entry: python3 -m mypy --config-file pyproject.toml 33 | language: system 34 | types: [python] 35 | exclude: "tests" 36 | 37 | - repo: 
https://github.com/nbQA-dev/nbQA 38 | rev: 1.9.1 39 | hooks: 40 | - id: nbqa-ruff 41 | args: [--fix, --exit-non-zero-on-fix] 42 | 43 | - repo: local 44 | hooks: 45 | - id: pytest 46 | name: pytest 47 | entry: python3 -m pytest -m "not integration_test" 48 | language: system 49 | pass_filenames: false 50 | always_run: true 51 | 52 | ci: 53 | autofix_commit_msg: | 54 | [pre-commit.ci] Add auto fixes from pre-commit.com hooks 55 | 56 | for more information, see https://pre-commit.ci 57 | autofix_prs: true 58 | autoupdate_branch: '' 59 | autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate' 60 | autoupdate_schedule: weekly 61 | skip: [pytest,mypy] 62 | submodules: false 63 | -------------------------------------------------------------------------------- /.github/workflows/docker.yml: -------------------------------------------------------------------------------- 1 | name: docker 2 | 3 | on: 4 | release: 5 | types: [published] 6 | push: 7 | branches: 8 | - main 9 | paths: 10 | - Dockerfile 11 | - .github/workflows/docker.yml 12 | - uv.lock 13 | pull_request: 14 | branches: 15 | - main 16 | paths: 17 | - Dockerfile 18 | - .github/workflows/docker.yml 19 | - uv.lock 20 | 21 | jobs: 22 | push_to_registry: 23 | name: Push Docker image to Docker Hub 24 | runs-on: 25 | - self-hosted 26 | - docker 27 | steps: 28 | - name: Checkout repository 29 | uses: actions/checkout@v5.0.0 30 | 31 | - name: Extract vLLM version 32 | id: vllm-version 33 | run: | 34 | VERSION=$(grep -A 1 'name = "vllm"' uv.lock | grep version | cut -d '"' -f 2) 35 | echo "version=$VERSION" >> $GITHUB_OUTPUT 36 | 37 | - name: Set up Docker Buildx 38 | uses: docker/setup-buildx-action@v3 39 | 40 | - name: Log in to Docker Hub 41 | uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef 42 | with: 43 | username: ${{ secrets.DOCKER_USERNAME }} 44 | password: ${{ secrets.DOCKER_PASSWORD }} 45 | 46 | - name: Extract metadata (tags, labels) for Docker 47 | id: meta 48 | uses: docker/metadata-action@318604b99e75e41977312d83839a89be02ca4893 49 | with: 50 | images: vectorinstitute/vector-inference 51 | 52 | - name: Build and push Docker image 53 | uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 54 | with: 55 | context: . 
56 | file: ./Dockerfile 57 | push: true 58 | tags: | 59 | ${{ steps.meta.outputs.tags }} 60 | vectorinstitute/vector-inference:${{ steps.vllm-version.outputs.version }} 61 | labels: ${{ steps.meta.outputs.labels }} 62 | -------------------------------------------------------------------------------- /tests/vec_inf/client/test_models.py: -------------------------------------------------------------------------------- 1 | """Tests for the Vector Inference API data models.""" 2 | 3 | from vec_inf.client import LaunchOptions, ModelInfo, ModelStatus, ModelType 4 | 5 | 6 | def test_model_info_creation(): 7 | """Test creating a ModelInfo instance.""" 8 | model = ModelInfo( 9 | name="test-model", 10 | family="test-family", 11 | variant="test-variant", 12 | model_type=ModelType.LLM, 13 | config={"gpus_per_node": 1}, 14 | ) 15 | 16 | assert model.name == "test-model" 17 | assert model.family == "test-family" 18 | assert model.variant == "test-variant" 19 | assert model.model_type == ModelType.LLM 20 | assert model.config["gpus_per_node"] == 1 21 | 22 | 23 | def test_model_info_optional_fields(): 24 | """Test ModelInfo with optional fields omitted.""" 25 | model = ModelInfo( 26 | name="test-model", 27 | family="test-family", 28 | variant=None, 29 | model_type=ModelType.LLM, 30 | config={}, 31 | ) 32 | 33 | assert model.name == "test-model" 34 | assert model.family == "test-family" 35 | assert model.variant is None 36 | assert model.model_type == ModelType.LLM 37 | 38 | 39 | def test_launch_options_default_values(): 40 | """Test LaunchOptions with default values.""" 41 | options = LaunchOptions() 42 | 43 | assert options.gpus_per_node is None 44 | assert options.partition is None 45 | assert options.data_type is None 46 | assert options.num_nodes is None 47 | assert options.model_family is None 48 | 49 | 50 | def test_model_status_enum(): 51 | """Test ModelStatus enum values.""" 52 | assert ModelStatus.PENDING.value == "PENDING" 53 | assert ModelStatus.LAUNCHING.value == "LAUNCHING" 54 | assert ModelStatus.READY.value == "READY" 55 | assert ModelStatus.FAILED.value == "FAILED" 56 | assert ModelStatus.SHUTDOWN.value == "SHUTDOWN" 57 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Vector Inference: Easy inference on Slurm clusters 2 | 3 | This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/stable/). **This package runs natively on the Vector Institute cluster environment**. To adapt to other environments, follow the instructions in [Installation](#installation). 4 | 5 | **NOTE**: Supported models on Killarney are tracked [here](https://github.com/VectorInstitute/vector-inference/blob/main/MODEL_TRACKING.md) 6 | 7 | ## Installation 8 | 9 | If you are using the Vector cluster environment, and you don't need any customization to the inference server environment, run the following to install package: 10 | 11 | ```bash 12 | pip install vec-inf 13 | ``` 14 | 15 | Otherwise, we recommend using the provided [`Dockerfile`](https://github.com/VectorInstitute/vector-inference/blob/main/Dockerfile) to set up your own environment with the package. The latest image has `vLLM` version `0.11.0`. 
16 | 17 | If you'd like to use `vec-inf` on your own Slurm cluster, you will need to update the configuration files; there are three ways to do this: 18 | * Clone the repository and update the `environment.yaml` and the `models.yaml` file in [`vec_inf/config`](https://github.com/VectorInstitute/vector-inference/blob/main/vec_inf/config), then install from source by running `pip install .`. 19 | * The package will look for cached configuration files in your environment before using the default configuration. The default cached configuration directory path points to `/model-weights/vec-inf-shared`; you would need to create an `environment.yaml` and a `models.yaml` following the format of these files in [`vec_inf/config`](https://github.com/VectorInstitute/vector-inference/blob/main/vec_inf/config). 20 | * The package also looks for an environment variable `VEC_INF_CONFIG_DIR`. You can put your `environment.yaml` and `models.yaml` in a directory of your choice and set the environment variable `VEC_INF_CONFIG_DIR` to point to that location. 21 | -------------------------------------------------------------------------------- /profile/avg_throughput.py: -------------------------------------------------------------------------------- 1 | """Calculate the average prompt and generation throughput from a log file.""" 2 | 3 | import argparse 4 | import re 5 | 6 | 7 | def filter_throughput(log_file_path: str) -> None: 8 | """Filter log file for non-zero entries and calculate the avg throughput.""" 9 | avg_prompt_throughput = [] 10 | avg_generation_throughput = [] 11 | # Define a regular expression pattern to extract throughput values 12 | pattern = r"Avg prompt throughput: ([^,]+) tokens/s, Avg generation throughput: ([^,]+) tokens/s" 13 | 14 | # Open the log file 15 | with open(log_file_path, "r") as file: 16 | # Iterate over each line in the file 17 | for line in file: 18 | # Use regex to find matches 19 | match = re.search(pattern, line) 20 | if match: 21 | # Extract prompt and generation throughput values 22 | prompt_throughput = match.group(1).strip() 23 | generation_throughput = match.group(2).strip() 24 | 25 | # Check if both throughput values are not zero 26 | if prompt_throughput != "0.0": 27 | avg_prompt_throughput.append(float(prompt_throughput)) 28 | if generation_throughput != "0.0": 29 | avg_generation_throughput.append(float(generation_throughput)) 30 | 31 | print( 32 | f"Average prompt throughput: {sum(avg_prompt_throughput) / len(avg_prompt_throughput)} tokens/s" 33 | ) 34 | print( 35 | f"Average generation throughput: {sum(avg_generation_throughput) / len(avg_generation_throughput)} tokens/s" 36 | ) 37 | 38 | 39 | def main() -> None: 40 | """Run the main function.""" 41 | # Create the parser 42 | parser = argparse.ArgumentParser( 43 | description="Filter log file for non-zero throughput entries." 
44 | ) 45 | 46 | # Add the arguments 47 | parser.add_argument("--path", type=str, help="The path to the log file") 48 | 49 | # Execute the parse_args() method 50 | args = parser.parse_args() 51 | 52 | # Use the provided arguments 53 | filter_throughput(args.path) 54 | 55 | 56 | if __name__ == "__main__": 57 | main() 58 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:12.8.1-cudnn-devel-ubuntu24.04 2 | 3 | # Non-interactive apt-get commands 4 | ARG DEBIAN_FRONTEND=noninteractive 5 | 6 | # No GPUs visible during build 7 | ARG CUDA_VISIBLE_DEVICES=none 8 | 9 | # Specify CUDA architectures -> 7.5: Quadro RTX 6000 & T4, 8.0: A100, 8.6: A40, 8.9: L40S, 9.0: H100 10 | ARG TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;8.9;9.0+PTX" 11 | 12 | # Set the Python version 13 | ARG PYTHON_VERSION=3.10.12 14 | 15 | # Install system dependencies 16 | RUN apt-get update && apt-get install -y \ 17 | wget build-essential libssl-dev zlib1g-dev libbz2-dev \ 18 | libreadline-dev libsqlite3-dev libffi-dev libncursesw5-dev \ 19 | xz-utils tk-dev libxml2-dev libxmlsec1-dev liblzma-dev git vim \ 20 | && rm -rf /var/lib/apt/lists/* 21 | 22 | # Install Python 23 | RUN wget https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz && \ 24 | tar -xzf Python-$PYTHON_VERSION.tgz && \ 25 | cd Python-$PYTHON_VERSION && \ 26 | ./configure --enable-optimizations && \ 27 | make -j$(nproc) && \ 28 | make altinstall && \ 29 | cd .. && \ 30 | rm -rf Python-$PYTHON_VERSION.tgz Python-$PYTHON_VERSION 31 | 32 | # Install pip and core Python tools 33 | RUN wget https://bootstrap.pypa.io/get-pip.py && \ 34 | python3.10 get-pip.py && \ 35 | rm get-pip.py && \ 36 | python3.10 -m pip install --upgrade pip setuptools wheel uv 37 | 38 | # Install RDMA support 39 | RUN apt-get update && apt-get install -y \ 40 | libibverbs1 libibverbs-dev ibverbs-utils \ 41 | librdmacm1 librdmacm-dev rdmacm-utils \ 42 | rdma-core ibverbs-providers infiniband-diags perftest \ 43 | && rm -rf /var/lib/apt/lists/* 44 | 45 | # Set up RDMA environment (these will persist in the final container) 46 | ENV LD_LIBRARY_PATH="/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH" 47 | ENV NCCL_IB_DISABLE=0 48 | ENV NCCL_SOCKET_IFNAME="^lo,docker0" 49 | ENV NCCL_NET_GDR_LEVEL=PHB 50 | ENV NCCL_IB_TIMEOUT=22 51 | ENV NCCL_IB_RETRY_CNT=7 52 | ENV NCCL_DEBUG=INFO 53 | 54 | # Set up project 55 | WORKDIR /vec-inf 56 | COPY . 
/vec-inf 57 | 58 | # Install project dependencies with build requirements 59 | RUN uv pip install --system -e .[dev] --prerelease=allow 60 | 61 | # Install a single, system NCCL (from NVIDIA CUDA repo in base image) 62 | RUN apt-get update && apt-get install -y --allow-change-held-packages\ 63 | libnccl2 libnccl-dev \ 64 | && rm -rf /var/lib/apt/lists/* 65 | 66 | # Set the default command to start an interactive shell 67 | CMD ["bash"] 68 | -------------------------------------------------------------------------------- /.github/workflows/unit_tests.yml: -------------------------------------------------------------------------------- 1 | name: unit tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - develop 8 | paths: 9 | - .pre-commit-config.yaml 10 | - .github/workflows/code_checks.yml 11 | - .github/workflows/docs_build.yml 12 | - .github/workflows/docs_deploy.yml 13 | - .github/workflows/unit_tests.yml 14 | - .github/workflows/integration_tests.yml 15 | - '**.py' 16 | - '**.ipynb' 17 | - uv.lock 18 | - pyproject.toml 19 | - '**.rst' 20 | - '**.md' 21 | pull_request: 22 | branches: 23 | - main 24 | - develop 25 | paths: 26 | - .pre-commit-config.yaml 27 | - .github/workflows/code_checks.yml 28 | - .github/workflows/docs_build.yml 29 | - .github/workflows/docs_deploy.yml 30 | - .github/workflows/unit_tests.yml 31 | - .github/workflows/integration_tests.yml 32 | - '**.py' 33 | - '**.ipynb' 34 | - uv.lock 35 | - pyproject.toml 36 | - '**.rst' 37 | - '**.md' 38 | 39 | jobs: 40 | unit-tests: 41 | runs-on: ubuntu-latest 42 | strategy: 43 | matrix: 44 | python-version: ["3.10", "3.11", "3.12"] 45 | steps: 46 | - uses: actions/checkout@v5.0.0 47 | 48 | - name: Install uv 49 | uses: astral-sh/setup-uv@v7 50 | with: 51 | # Install a specific version of uv. 52 | version: "0.5.21" 53 | enable-cache: true 54 | 55 | - name: "Set up Python ${{ matrix.python-version }}" 56 | uses: actions/setup-python@v6 57 | with: 58 | python-version: ${{ matrix.python-version }} 59 | 60 | - name: Install the project 61 | run: uv sync --dev --prerelease=allow 62 | 63 | - name: Install dependencies and check code 64 | run: | 65 | uv run --frozen pytest -m "not integration_test" --cov vec_inf --cov-report=xml tests 66 | 67 | - name: Install the core package only 68 | run: uv sync --no-dev 69 | 70 | - name: Run package import tests 71 | run: | 72 | uv run --frozen pytest tests/test_imports.py 73 | 74 | - name: Import Codecov GPG public key 75 | run: | 76 | gpg --keyserver keyserver.ubuntu.com --recv-keys 806BB28AED779869 77 | 78 | - name: Upload coverage to Codecov 79 | uses: codecov/codecov-action@v5.5.1 80 | with: 81 | token: ${{ secrets.CODECOV_TOKEN }} 82 | files: ./coverage.xml 83 | name: codecov-umbrella 84 | fail_ci_if_error: true 85 | verbose: true 86 | -------------------------------------------------------------------------------- /vec_inf/client/_client_vars.py: -------------------------------------------------------------------------------- 1 | """Global variables for Vector Inference. 2 | 3 | This module contains configuration constants and templates used throughout the 4 | Vector Inference package, including model configurations, and metric definitions. 
5 | 6 | Constants 7 | --------- 8 | MODEL_READY_SIGNATURE : str 9 | Signature string indicating successful model server startup 10 | SRC_DIR : str 11 | Absolute path to the package source directory 12 | KEY_METRICS : dict 13 | Mapping of vLLM metrics to their human-readable names 14 | SLURM_JOB_CONFIG_ARGS : dict 15 | Mapping of SLURM configuration arguments to their parameter names 16 | VLLM_SHORT_TO_LONG_MAP : dict 17 | Mapping of vLLM short arguments to their long names 18 | """ 19 | 20 | from pathlib import Path 21 | 22 | 23 | MODEL_READY_SIGNATURE = "INFO: Application startup complete." 24 | SRC_DIR = str(Path(__file__).parent.parent) 25 | 26 | 27 | # Key production metrics for inference servers 28 | KEY_METRICS = { 29 | "vllm:prompt_tokens_total": "total_prompt_tokens", 30 | "vllm:generation_tokens_total": "total_generation_tokens", 31 | "vllm:e2e_request_latency_seconds_sum": "request_latency_sum", 32 | "vllm:e2e_request_latency_seconds_count": "request_latency_count", 33 | "vllm:request_queue_time_seconds_sum": "queue_time_sum", 34 | "vllm:request_success_total": "successful_requests_total", 35 | "vllm:num_requests_running": "requests_running", 36 | "vllm:num_requests_waiting": "requests_waiting", 37 | "vllm:num_requests_swapped": "requests_swapped", 38 | "vllm:gpu_cache_usage_perc": "gpu_cache_usage", 39 | "vllm:cpu_cache_usage_perc": "cpu_cache_usage", 40 | } 41 | 42 | # Slurm job configuration arguments 43 | SLURM_JOB_CONFIG_ARGS = { 44 | "job-name": "model_name", 45 | "partition": "partition", 46 | "account": "account", 47 | "chdir": "work_dir", 48 | "qos": "qos", 49 | "time": "time", 50 | "nodes": "num_nodes", 51 | "exclude": "exclude", 52 | "nodelist": "node_list", 53 | "gres": "gres", 54 | "cpus-per-task": "cpus_per_task", 55 | "mem": "mem_per_node", 56 | "output": "out_file", 57 | "error": "err_file", 58 | } 59 | 60 | # vLLM engine args mapping between short and long names 61 | VLLM_SHORT_TO_LONG_MAP = { 62 | "-tp": "--tensor-parallel-size", 63 | "-pp": "--pipeline-parallel-size", 64 | "-dp": "--data-parallel-size", 65 | "-dpl": "--data-parallel-size-local", 66 | "-dpa": "--data-parallel-address", 67 | "-dpp": "--data-parallel-rpc-port", 68 | "-O": "--compilation-config", 69 | "-q": "--quantization", 70 | } 71 | 72 | # Required matching arguments for batch mode 73 | BATCH_MODE_REQUIRED_MATCHING_ARGS = ["venv", "log_dir"] 74 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pipenv 85 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 86 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 87 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 88 | # install all needed dependencies. 89 | #Pipfile.lock 90 | 91 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 92 | __pypackages__/ 93 | 94 | # Celery stuff 95 | celerybeat-schedule 96 | celerybeat.pid 97 | 98 | # SageMath parsed files 99 | *.sage.py 100 | 101 | # Environments 102 | .env 103 | .venv 104 | env/ 105 | venv/ 106 | ENV/ 107 | env.bak/ 108 | venv.bak/ 109 | 110 | # Spyder project settings 111 | .spyderproject 112 | .spyproject 113 | 114 | # Rope project settings 115 | .ropeproject 116 | 117 | # mkdocs documentation 118 | /site 119 | 120 | # mypy 121 | .mypy_cache/ 122 | .dmypy.json 123 | dmypy.json 124 | 125 | # Pyre type checker 126 | .pyre/ 127 | 128 | # pycharm 129 | .idea/ 130 | 131 | # VS Code 132 | .vscode/ 133 | 134 | # MacOS 135 | .DS_Store 136 | 137 | # Slurm logs 138 | *.out 139 | *.err 140 | 141 | # Server url files 142 | *_url 143 | 144 | logs/ 145 | 146 | local/ 147 | slurm/ 148 | scripts/ 149 | 150 | # vLLM bug reporting files 151 | collect_env.py 152 | 153 | # build files 154 | dist/ 155 | 156 | # type stubs 157 | stubs/ 158 | mypy.ini 159 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | extra_css: 2 | - stylesheets/extra.css 3 | extra: 4 | generator: false 5 | social: 6 | - icon: fontawesome/brands/discord 7 | link: 404.html 8 | - icon: fontawesome/brands/github 9 | link: https://github.com/VectorInstitute/vector-inference 10 | version: 11 | provider: mike 12 | default: latest 13 | markdown_extensions: 14 | - attr_list 15 | - admonition 16 | - md_in_html 17 | - pymdownx.highlight: 18 | anchor_linenums: true 19 | line_spans: __span 20 | pygments_lang_class: true 21 | - pymdownx.inlinehilite 22 | - pymdownx.details 23 | - pymdownx.snippets 24 | - pymdownx.superfences 25 | - pymdownx.emoji: 26 | emoji_index: !!python/name:material.extensions.emoji.twemoji 27 | emoji_generator: !!python/name:material.extensions.emoji.to_svg 28 | - toc: 29 | permalink: true 30 | - meta 31 | - footnotes 32 | nav: 33 | - Home: index.md 34 | - User Guide: user_guide.md 35 | - API Reference: api.md 36 | - Contributing: contributing.md 37 | plugins: 38 | - search 39 | - mike: 40 | version_selector: true 41 | css_dir: stylesheets 42 | canonical_version: latest 43 | alias_type: symlink 44 | deploy_prefix: '' 45 | - mkdocstrings: 46 | default_handler: python 47 | handlers: 
48 | python: 49 | paths: [../vec_inf] 50 | options: 51 | docstring_style: numpy 52 | members_order: source 53 | separate_signature: true 54 | show_overloads: true 55 | show_submodules: true 56 | show_root_heading: false 57 | show_root_full_path: true 58 | show_root_toc_entry: false 59 | show_symbol_type_heading: true 60 | show_symbol_type_toc: true 61 | repo_url: https://github.com/VectorInstitute/vector-inference 62 | repo_name: VectorInstitute/vector-inference 63 | site_name: Vector Inference 64 | site_url: https://vectorinstitute.github.io/vector-inference/ 65 | theme: 66 | name: material 67 | custom_dir: docs/overrides 68 | favicon: assets/favicon-48x48.svg 69 | features: 70 | - content.code.annotate 71 | - content.code.copy 72 | - navigation.footer 73 | - navigation.indexes 74 | - navigation.instant 75 | - navigation.tabs 76 | - navigation.tabs.sticky 77 | - navigation.top 78 | - search.suggest 79 | - search.highlight 80 | - toc.follow 81 | icon: 82 | repo: fontawesome/brands/github 83 | logo: assets/vector-logo.svg 84 | logo_footer: assets/vector-logo.svg 85 | palette: 86 | - media: "(prefers-color-scheme: light)" 87 | scheme: default 88 | primary: vector 89 | accent: vector-teal 90 | toggle: 91 | icon: material/brightness-7 92 | name: Switch to dark mode 93 | - media: "(prefers-color-scheme: dark)" 94 | scheme: slate 95 | primary: black 96 | accent: vector-teal 97 | toggle: 98 | icon: material/brightness-4 99 | name: Switch to light mode 100 | -------------------------------------------------------------------------------- /vec_inf/client/_slurm_vars.py: -------------------------------------------------------------------------------- 1 | """Slurm cluster configuration variables.""" 2 | 3 | import os 4 | import warnings 5 | from pathlib import Path 6 | from typing import Any, TypeAlias 7 | 8 | import yaml 9 | from typing_extensions import Literal 10 | 11 | 12 | CACHED_CONFIG_DIR = Path("/model-weights/vec-inf-shared") 13 | 14 | 15 | def load_env_config() -> dict[str, Any]: 16 | """Load the environment configuration.""" 17 | 18 | def load_yaml_config(path: Path) -> dict[str, Any]: 19 | """Load YAML config with error handling.""" 20 | try: 21 | with path.open() as f: 22 | return yaml.safe_load(f) or {} 23 | except FileNotFoundError as err: 24 | raise FileNotFoundError(f"Could not find config: {path}") from err 25 | except yaml.YAMLError as err: 26 | raise ValueError(f"Error parsing YAML config at {path}: {err}") from err 27 | 28 | cached_config_path = CACHED_CONFIG_DIR / "environment.yaml" 29 | default_path = ( 30 | cached_config_path 31 | if cached_config_path.exists() 32 | else Path(__file__).resolve().parent.parent / "config" / "environment.yaml" 33 | ) 34 | config = load_yaml_config(default_path) 35 | 36 | user_path = os.getenv("VEC_INF_CONFIG_DIR") 37 | if user_path: 38 | user_path_obj = Path(user_path, "environment.yaml") 39 | if user_path_obj.exists(): 40 | user_config = load_yaml_config(user_path_obj) 41 | config.update(user_config) 42 | else: 43 | warnings.warn( 44 | f"WARNING: Could not find user config directory: {user_path}, revert to default config located at {default_path}", 45 | UserWarning, 46 | stacklevel=2, 47 | ) 48 | 49 | return config 50 | 51 | 52 | _config = load_env_config() 53 | 54 | # Extract path values 55 | IMAGE_PATH = _config["paths"]["image_path"] 56 | 57 | # Extract containerization info 58 | CONTAINER_LOAD_CMD = _config["containerization"]["module_load_cmd"] 59 | CONTAINER_MODULE_NAME = _config["containerization"]["module_name"] 60 | 61 | # Extract 
limits 62 | MAX_GPUS_PER_NODE = _config["limits"]["max_gpus_per_node"] 63 | MAX_NUM_NODES = _config["limits"]["max_num_nodes"] 64 | MAX_CPUS_PER_TASK = _config["limits"]["max_cpus_per_task"] 65 | 66 | 67 | # Create dynamic Literal types 68 | def create_literal_type(values: list[str], fallback: str = "") -> Any: 69 | """Create a Literal type from a list, with configurable fallback.""" 70 | if not values: 71 | return Literal[fallback] 72 | return Literal[tuple(values)] 73 | 74 | 75 | QOS: TypeAlias = create_literal_type(_config["allowed_values"]["qos"]) # type: ignore[valid-type] 76 | PARTITION: TypeAlias = create_literal_type(_config["allowed_values"]["partition"]) # type: ignore[valid-type] 77 | RESOURCE_TYPE: TypeAlias = create_literal_type( # type: ignore[valid-type] 78 | _config["allowed_values"]["resource_type"] 79 | ) 80 | 81 | # Extract required arguments, for launching jobs that don't have a default value and 82 | # their corresponding environment variables 83 | REQUIRED_ARGS: dict[str, str] = _config["required_args"] 84 | 85 | # Extract default arguments 86 | DEFAULT_ARGS: dict[str, str] = _config["default_args"] 87 | -------------------------------------------------------------------------------- /tests/vec_inf/client/test_examples.py: -------------------------------------------------------------------------------- 1 | """Tests to verify the API examples function properly.""" 2 | 3 | from pathlib import Path 4 | from unittest.mock import MagicMock, patch 5 | 6 | import pytest 7 | 8 | from vec_inf.client import ModelStatus, ModelType, VecInfClient 9 | 10 | 11 | @pytest.fixture 12 | def mock_client(): 13 | """Create a mocked VecInfClient.""" 14 | client = MagicMock(spec=VecInfClient) 15 | 16 | # Set up mock responses 17 | mock_model1 = MagicMock() 18 | mock_model1.name = "test-model" 19 | mock_model1.family = "test-family" 20 | mock_model1.type = ModelType.LLM 21 | 22 | mock_model2 = MagicMock() 23 | mock_model2.name = "test-model-2" 24 | mock_model2.family = "test-family-2" 25 | mock_model2.type = ModelType.VLM 26 | 27 | client.list_models.return_value = [mock_model1, mock_model2] 28 | 29 | launch_response = MagicMock() 30 | launch_response.slurm_job_id = "123456" 31 | launch_response.model_name = "Meta-Llama-3.1-8B-Instruct" 32 | client.launch_model.return_value = launch_response 33 | 34 | status_response = MagicMock() 35 | status_response.status = ModelStatus.READY 36 | status_response.base_url = "http://gpu123:8080/v1" 37 | client.wait_until_ready.return_value = status_response 38 | 39 | metrics_response = MagicMock() 40 | metrics_response.metrics = {"throughput": "10.5"} 41 | client.get_metrics.return_value = metrics_response 42 | 43 | return client 44 | 45 | 46 | @pytest.mark.skipif( 47 | not ( 48 | Path(__file__).parent.parent.parent.parent 49 | / "examples" 50 | / "api" 51 | / "basic_usage.py" 52 | ).exists(), 53 | reason="Example file not found", 54 | ) 55 | def test_api_usage_example(): 56 | """Test the basic API usage example.""" 57 | example_path = ( 58 | Path(__file__).parent.parent.parent.parent 59 | / "examples" 60 | / "api" 61 | / "basic_usage.py" 62 | ) 63 | 64 | # Create a mock client 65 | mock_client = MagicMock(spec=VecInfClient) 66 | 67 | # Set up mock responses 68 | mock_model = MagicMock() 69 | mock_model.name = "Meta-Llama-3.1-8B-Instruct" 70 | mock_model.type = ModelType.LLM 71 | mock_client.list_models.return_value = [mock_model] 72 | 73 | launch_response = MagicMock() 74 | launch_response.slurm_job_id = "123456" 75 | mock_client.launch_model.return_value = 
launch_response 76 | 77 | status_response = MagicMock() 78 | status_response.status = ModelStatus.READY 79 | status_response.base_url = "http://gpu123:8080/v1" 80 | mock_client.wait_until_ready.return_value = status_response 81 | 82 | metrics_response = MagicMock() 83 | metrics_response.metrics = {"throughput": "10.5"} 84 | mock_client.get_metrics.return_value = metrics_response 85 | 86 | # Mock the VecInfClient class 87 | with ( 88 | patch("vec_inf.client.VecInfClient", return_value=mock_client), 89 | patch("builtins.print"), 90 | example_path.open() as f, 91 | ): 92 | exec(f.read()) 93 | 94 | # Verify the client methods were called 95 | mock_client.list_models.assert_called_once() 96 | mock_client.launch_model.assert_called_once() 97 | mock_client.wait_until_ready.assert_called_once() 98 | mock_client.get_metrics.assert_called_once() 99 | mock_client.shutdown_model.assert_called_once() 100 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "vec-inf" 3 | version = "0.7.3" 4 | description = "Efficient LLM inference on Slurm clusters using vLLM." 5 | readme = "README.md" 6 | authors = [{name = "Marshall Wang", email = "marshall.wang@vectorinstitute.ai"}] 7 | license = "MIT" 8 | requires-python = ">=3.10" 9 | dependencies = [ 10 | "requests>=2.31.0", 11 | "click>=8.1.0", 12 | "rich>=13.7.0", 13 | "pydantic>=2.10.6", 14 | "pyyaml>=6.0.2", 15 | ] 16 | 17 | [dependency-groups] 18 | dev = [ 19 | "codecov>=2.1.13", 20 | "mypy>=1.15.0", 21 | "nbqa>=1.9.1", 22 | "openai>=1.65.1", 23 | "pip-audit>=2.8.0", 24 | "pre-commit>=4.1.0", 25 | "pytest>=8.3.4", 26 | "pytest-asyncio>=0.25.3", 27 | "pytest-cov>=6.0.0", 28 | "pytest-mock>=3.14.0", 29 | "ruff>=0.9.6", 30 | ] 31 | docs = [ 32 | "mkdocs>=1.5.3", 33 | "mkdocs-material>=9.5.12", 34 | "mkdocstrings>=0.24.1", 35 | "mkdocstrings-python>=1.8.0", 36 | "pymdown-extensions>=10.7.1", 37 | "mike>=2.0.0", 38 | ] 39 | 40 | [project.optional-dependencies] 41 | dev = [ 42 | "xgrammar>=0.1.11", 43 | "torch>=2.7.0", 44 | "vllm>=0.10.0", 45 | "ray[default]>=2.50.0", 46 | "cupy-cuda12x==12.1.0", 47 | "flashinfer-python>=0.4.0", 48 | "sglang>=0.5.0", 49 | ] 50 | 51 | [project.scripts] 52 | vec-inf = "vec_inf.cli._cli:cli" 53 | 54 | [build-system] 55 | requires = ["hatchling"] 56 | build-backend = "hatchling.build" 57 | 58 | [tool.hatch.build.targets.wheel] 59 | packages = ["vec_inf"] 60 | 61 | [tool.mypy] 62 | ignore_missing_imports = true 63 | install_types = true 64 | pretty = true 65 | namespace_packages = true 66 | explicit_package_bases = true 67 | non_interactive = true 68 | warn_unused_configs = true 69 | allow_any_generics = false 70 | allow_subclassing_any = false 71 | allow_untyped_calls = false 72 | allow_untyped_defs = false 73 | allow_incomplete_defs = false 74 | check_untyped_defs = true 75 | allow_untyped_decorators = false 76 | warn_redundant_casts = true 77 | warn_unused_ignores = true 78 | warn_return_any = true 79 | implicit_reexport = false 80 | strict_equality = true 81 | extra_checks = true 82 | 83 | [tool.ruff] 84 | include = ["*.py", "pyproject.toml", "*.ipynb"] 85 | line-length = 88 86 | 87 | [tool.ruff.format] 88 | quote-style = "double" 89 | indent-style = "space" 90 | docstring-code-format = true 91 | 92 | [tool.ruff.lint] 93 | select = [ 94 | "A", # flake8-builtins 95 | "B", # flake8-bugbear 96 | "COM", # flake8-commas 97 | "C4", # flake8-comprehensions 98 | "RET", # flake8-return 99 | 
"SIM", # flake8-simplify 100 | "ICN", # flake8-import-conventions 101 | "Q", # flake8-quotes 102 | "RSE", # flake8-raise 103 | "D", # pydocstyle 104 | "E", # pycodestyle 105 | "F", # pyflakes 106 | "I", # isort 107 | "W", # pycodestyle 108 | "N", # pep8-naming 109 | "ERA", # eradicate 110 | "PL", # pylint 111 | ] 112 | fixable = ["A", "B", "COM", "C4", "RET", "SIM", "ICN", "Q", "RSE", "D", "E", "F", "I", "W", "N", "ERA", "PL"] 113 | ignore = [ 114 | "B905", # `zip()` without an explicit `strict=` parameter 115 | "E501", # line too long 116 | "D203", # 1 blank line required before class docstring 117 | "D213", # Multi-line docstring summary should start at the second line 118 | "PLR2004", # Replace magic number with named constant 119 | "PLR0913", # Too many arguments 120 | "COM812", # Missing trailing comma 121 | ] 122 | 123 | # Ignore import violations in all `__init__.py` files. 124 | [tool.ruff.lint.per-file-ignores] 125 | "__init__.py" = ["E402", "F401", "F403", "F811"] 126 | 127 | [tool.ruff.lint.pep8-naming] 128 | ignore-names = ["X*", "setUp"] 129 | 130 | [tool.ruff.lint.isort] 131 | lines-after-imports = 2 132 | 133 | [tool.ruff.lint.pydocstyle] 134 | convention = "numpy" 135 | 136 | [tool.ruff.lint.pycodestyle] 137 | max-doc-length = 88 138 | 139 | [tool.pytest.ini_options] 140 | markers = [ 141 | "integration_test: marks tests as integration tests", 142 | ] 143 | 144 | [tool.coverage] 145 | [tool.coverage.run] 146 | source=["vec_inf"] 147 | omit=["tests/*", "*__init__.py"] 148 | -------------------------------------------------------------------------------- /profile/gen.py: -------------------------------------------------------------------------------- 1 | """Testing script.""" 2 | 3 | import time 4 | from typing import List, Union 5 | 6 | import requests 7 | 8 | 9 | # Change the ENDPOINT and MODEL_PATH to match your setup 10 | ENDPOINT = "http://gpuXXX:XXXX/v1" 11 | MODEL_PATH = "Meta-Llama-3-70B" 12 | 13 | # Configuration 14 | API_KEY = "EMPTY" 15 | HEADERS = { 16 | "Authorization": f"Bearer {API_KEY}", 17 | "Content-Type": "application/json", 18 | } 19 | 20 | # Sample prompts for testing 21 | PROMPTS = [ 22 | "Translate the following English text to French: 'Hello, how are you?'", 23 | "What is the square root of 144?", 24 | "Summarize the following paragraph: 'Artificial intelligence refers to the simulation of human intelligence in machines...'", 25 | "Explain the process of photosynthesis in plants.", 26 | "What are the main differences between classical and quantum physics?", 27 | "Summarize the plot of 'To Kill a Mockingbird' by Harper Lee.", 28 | "Describe the economic impacts of climate change on agriculture.", 29 | "Translate the following sentence into Spanish: 'Where is the closest grocery store?'", 30 | "How does a lithium-ion battery work?", 31 | "Provide a brief biography of Marie Curie.", 32 | "What are the key factors that led to the end of the Cold War?", 33 | "Write a poem about the sunset over the ocean.", 34 | "Explain the rules of chess.", 35 | "What is blockchain technology and how does it work?", 36 | "Give a step-by-step guide on how to bake chocolate chip cookies.", 37 | "Describe the human digestive system.", 38 | "What is the theory of relativity?", 39 | "How to perform a basic oil change on a car.", 40 | "What are the symptoms and treatments for type 2 diabetes?", 41 | "Summarize the last episode of 'Game of Thrones'.", 42 | "Explain the role of the United Nations in world peace.", 43 | "Describe the culture and traditions of Japan.", 44 | "Provide 
a detailed explanation of the stock market.", 45 | "How do solar panels generate electricity?", 46 | "What is machine learning and how is it applied in daily life?", 47 | "Discuss the impact of the internet on modern education.", 48 | "Write a short story about a lost dog finding its way home.", 49 | "What are the benefits of meditation?", 50 | "Explain the process of recycling plastic.", 51 | "What is the significance of the Magna Carta?", 52 | "How does the human immune system fight viruses?", 53 | "Describe the stages of a frog's life cycle.", 54 | "Explain Newton's three laws of motion.", 55 | "What are the best practices for sustainable farming?", 56 | "Give a history of the Olympic Games.", 57 | "What are the causes and effects of global warming?", 58 | "Write an essay on the importance of voting.", 59 | "How is artificial intelligence used in healthcare?", 60 | "What is the function of the Federal Reserve?", 61 | "Describe the geography of South America.", 62 | "Explain how to set up a freshwater aquarium.", 63 | "What are the major works of William Shakespeare?", 64 | "How do antibiotics work against bacterial infections?", 65 | "Discuss the role of art in society.", 66 | "What are the main sources of renewable energy?", 67 | "How to prepare for a job interview.", 68 | "Describe the life cycle of a butterfly.", 69 | "What are the main components of a computer?", 70 | "Write a review of the latest Marvel movie.", 71 | "What are the ethical implications of cloning?", 72 | "Explain the significance of the Pyramids of Giza.", 73 | "Describe the process of making wine.", 74 | "How does the GPS system work?", 75 | ] 76 | 77 | 78 | def send_request(prompt: List[str]) -> Union[float, None]: 79 | """Send a request to the API.""" 80 | data = {"model": f"{MODEL_PATH}", "prompt": prompt, "max_tokens": 100} 81 | start_time = time.time() 82 | response = requests.post(f"{ENDPOINT}/completions", headers=HEADERS, json=data) 83 | duration = time.time() - start_time 84 | if response.status_code == 200: 85 | return duration 86 | return None 87 | 88 | 89 | def main() -> None: 90 | """Run main function.""" 91 | for _ in range(10): 92 | print("Sending 20x requests 0-52...") 93 | send_request(PROMPTS * 20) 94 | print("Done!") 95 | 96 | 97 | if __name__ == "__main__": 98 | main() 99 | -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing to Vector Inference 2 | 3 | Thank you for your interest in contributing to Vector Inference! This guide will help you get started with development, testing, and documentation contributions. 4 | 5 | ## Development Setup 6 | 7 | ### Prerequisites 8 | 9 | - Python 3.10 or newer 10 | - [uv](https://github.com/astral-sh/uv) for dependency management 11 | 12 | ### Setting Up Development Environment 13 | 14 | 1. Clone the repository: 15 | ```bash 16 | git clone https://github.com/VectorInstitute/vector-inference.git 17 | cd vector-inference 18 | ``` 19 | 20 | 2. Install development dependencies: 21 | ```bash 22 | uv sync --all-extras --group dev 23 | ``` 24 | 25 | 3. Install pre-commit hooks: 26 | ```bash 27 | pre-commit install 28 | ``` 29 | 30 | !!! 
tip "Using Virtual Environments" 31 | If you prefer using virtual environments, you can use `uv venv` to create one: 32 | ```bash 33 | uv venv 34 | source .venv/bin/activate 35 | ``` 36 | 37 | ## Development Workflow 38 | 39 | ### Code Style and Linting 40 | 41 | We use several tools to ensure code quality: 42 | 43 | - **ruff** for linting and formatting 44 | - **mypy** for type checking 45 | 46 | You can run these tools with: 47 | 48 | ```bash 49 | # Linting 50 | uv run ruff check . 51 | 52 | # Type checking 53 | uv run mypy 54 | 55 | # Format code 56 | uv run ruff format . 57 | ``` 58 | 59 | !!! note "Pre-commit Hooks" 60 | The pre-commit hooks will automatically run these checks before each commit. 61 | If the hooks fail, you will need to fix the issues before you can commit. 62 | 63 | ### Testing 64 | 65 | All new features and bug fixes should include tests. We use pytest for testing: 66 | 67 | ```bash 68 | # Run all tests 69 | uv run pytest 70 | 71 | # Run tests with coverage 72 | uv run pytest --cov=vec_inf 73 | ``` 74 | 75 | ## Documentation 76 | 77 | ### Documentation Setup 78 | 79 | Install the documentation dependencies: 80 | 81 | ```bash 82 | uv sync --group docs 83 | ``` 84 | 85 | ### Building Documentation 86 | 87 | Build and serve the documentation locally: 88 | 89 | ```bash 90 | # Standard build 91 | mkdocs build 92 | 93 | # Serve locally with hot-reload 94 | mkdocs serve 95 | ``` 96 | 97 | ### Versioned Documentation 98 | 99 | Vector Inference uses [mike](https://github.com/jimporter/mike) to manage versioned documentation. This allows users to access documentation for specific versions of the library. 100 | 101 | #### Available Versions 102 | 103 | The documentation is available in multiple versions: 104 | 105 | - `latest` - Always points to the most recent stable release 106 | - Version-specific documentation (e.g., `0.5.0`, `0.4.0`) 107 | 108 | #### Versioning Strategy 109 | 110 | Our versioning strategy follows these rules: 111 | 112 | 1. Each release gets its own version number matching the package version (e.g., `0.5.0`) 113 | 2. The `latest` alias always points to the most recent stable release 114 | 3. Documentation is automatically deployed when changes are pushed to the main branch 115 | 116 | #### Working with Mike Locally 117 | 118 | To preview or work with versioned documentation: 119 | 120 | ```bash 121 | # Build and deploy a specific version to your local gh-pages branch 122 | mike deploy 0.5.0 123 | 124 | # Add an alias for the latest version 125 | mike deploy 0.5.0 latest 126 | 127 | # Set the default version to redirect to 128 | mike set-default latest 129 | 130 | # View the deployed versions 131 | mike list 132 | 133 | # Serve the versioned documentation locally 134 | mike serve 135 | ``` 136 | 137 | #### Automatic Documentation Deployment 138 | 139 | Documentation is automatically deployed through GitHub Actions: 140 | 141 | - On pushes to `main`, documentation is deployed with the version from `pyproject.toml` and the `latest` alias 142 | - Through manual trigger in the GitHub Actions workflow, where you can specify the version to deploy 143 | 144 | !!! info "When to Update Documentation" 145 | - When adding new features 146 | - When changing existing APIs 147 | - When fixing bugs that affect user experience 148 | - When improving explanations or examples 149 | 150 | ## Pull Request Process 151 | 152 | 1. **Fork the repository** and create your branch from `main` 153 | 2. **Make your changes** and add appropriate tests 154 | 3. 
**Ensure tests pass** and code meets style guidelines 155 | 4. **Write clear documentation** for your changes 156 | 5. **Submit a pull request** with a clear description of the changes 157 | 158 | !!! important "Checklist Before Submitting PR" 159 | - [ ] All tests pass 160 | - [ ] Code is formatted with ruff 161 | - [ ] Type annotations are correct 162 | - [ ] Documentation is updated 163 | - [ ] Commit messages are clear and descriptive 164 | 165 | ## Release Process 166 | 167 | 1. Update version in `pyproject.toml` 168 | 2. Update changelogs and documentation as needed 169 | 3. Create a new tag and release on GitHub 170 | 4. Documentation for the new version will be automatically deployed 171 | 172 | ## License 173 | 174 | By contributing to Vector Inference, you agree that your contributions will be licensed under the project's [MIT License](https://github.com/VectorInstitute/vector-inference/blob/main/LICENSE). 175 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: docs 2 | permissions: 3 | contents: write 4 | pull-requests: write 5 | 6 | on: 7 | push: 8 | branches: 9 | - main 10 | paths: 11 | - .pre-commit-config.yaml 12 | - .github/workflows/docs.yml 13 | - '**.py' 14 | - '**.ipynb' 15 | - '**.html' 16 | - '**.js' 17 | - '**.md' 18 | - uv.lock 19 | - pyproject.toml 20 | - mkdocs.yml 21 | - '**.png' 22 | - '**.svg' 23 | pull_request: 24 | branches: 25 | - main 26 | paths: 27 | - .pre-commit-config.yaml 28 | - .github/workflows/docs.yml 29 | - '**.py' 30 | - '**.ipynb' 31 | - '**.js' 32 | - '**.html' 33 | - uv.lock 34 | - pyproject.toml 35 | - '**.md' 36 | - mkdocs.yml 37 | - '**.png' 38 | - '**.svg' 39 | release: 40 | types: [published] 41 | # Allow manual trigger 42 | workflow_dispatch: 43 | inputs: 44 | version: 45 | description: 'Version to deploy (e.g., 0.5.0, latest)' 46 | required: true 47 | default: 'latest' 48 | 49 | jobs: 50 | build: 51 | runs-on: ubuntu-latest 52 | steps: 53 | - name: Checkout code 54 | uses: actions/checkout@v5.0.0 55 | with: 56 | fetch-depth: 0 # Fetch all history for proper versioning 57 | 58 | - name: Install uv 59 | uses: astral-sh/setup-uv@v7 60 | with: 61 | version: "0.5.21" 62 | enable-cache: true 63 | 64 | - name: Set up Python 65 | uses: actions/setup-python@v6 66 | with: 67 | python-version-file: ".python-version" 68 | 69 | - name: Install the project 70 | run: uv sync --all-extras --group docs --prerelease=allow 71 | 72 | - name: Build docs 73 | run: uv run --frozen mkdocs build 74 | 75 | - name: Create .nojekyll file 76 | run: touch site/.nojekyll 77 | 78 | - name: Upload artifact 79 | uses: actions/upload-artifact@v5 80 | with: 81 | name: docs-site 82 | path: site/ 83 | retention-days: 1 84 | 85 | deploy: 86 | needs: build 87 | if: (github.event_name == 'push' && github.ref == 'refs/heads/main') || github.event_name == 'workflow_dispatch' || github.event_name == 'release' 88 | runs-on: ubuntu-latest 89 | steps: 90 | - name: Checkout code 91 | uses: actions/checkout@v5.0.0 92 | with: 93 | fetch-depth: 0 # Fetch all history for proper versioning 94 | 95 | - name: Install uv 96 | uses: astral-sh/setup-uv@v7 97 | with: 98 | version: "0.5.21" 99 | enable-cache: true 100 | 101 | - name: Set up Python 102 | uses: actions/setup-python@v6 103 | with: 104 | python-version-file: ".python-version" 105 | 106 | - name: Install the project 107 | run: uv sync --all-extras --group docs --frozen 108 | 109 | - 
name: Configure Git Credentials 110 | run: | 111 | git config user.name github-actions[bot] 112 | git config user.email 41898282+github-actions[bot]@users.noreply.github.com 113 | 114 | - name: Download artifact 115 | uses: actions/download-artifact@v6 116 | with: 117 | name: docs-site 118 | path: site 119 | 120 | - name: Ensure .nojekyll exists 121 | run: touch site/.nojekyll 122 | 123 | - name: Determine version 124 | id: version 125 | run: | 126 | if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then 127 | # Use the version provided in the workflow dispatch 128 | echo "VERSION=${{ github.event.inputs.version }}" >> $GITHUB_OUTPUT 129 | echo "VERSION_ALIAS=latest" >> $GITHUB_OUTPUT 130 | elif [[ "${{ github.event_name }}" == "release" ]]; then 131 | # Use the tag from the release 132 | VERSION="${{ github.ref_name }}" 133 | # Remove 'v' prefix if present 134 | VERSION="${VERSION#v}" 135 | echo "VERSION=$VERSION" >> $GITHUB_OUTPUT 136 | echo "VERSION_ALIAS=latest" >> $GITHUB_OUTPUT 137 | elif [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/main" ]]; then 138 | # For pushes to main, tag as "main" 139 | echo "VERSION=main" >> $GITHUB_OUTPUT 140 | # No alias for main 141 | echo "VERSION_ALIAS=" >> $GITHUB_OUTPUT 142 | else 143 | # Get version from pyproject.toml as fallback 144 | VERSION=$(grep -m 1 '^version = ' pyproject.toml | sed 's/^version = "\(.*\)"$/\1/') 145 | echo "VERSION=$VERSION" >> $GITHUB_OUTPUT 146 | echo "VERSION_ALIAS=latest" >> $GITHUB_OUTPUT 147 | fi 148 | 149 | - name: Deploy docs with mike 150 | run: | 151 | VERSION=${{ steps.version.outputs.VERSION }} 152 | ALIAS=${{ steps.version.outputs.VERSION_ALIAS }} 153 | 154 | # Add a temporary remote to fetch gh-pages if it exists 155 | git remote add temp https://github.com/${{ github.repository }}.git || true 156 | git fetch temp gh-pages || true 157 | 158 | DEPLOY_ARGS="--push --update-aliases $VERSION" 159 | 160 | if [[ ! -z "$ALIAS" ]]; then 161 | DEPLOY_ARGS="$DEPLOY_ARGS $ALIAS" 162 | fi 163 | 164 | # Activate the virtual environment 165 | source .venv/bin/activate 166 | 167 | echo "Running: mike deploy $DEPLOY_ARGS" 168 | mike deploy $DEPLOY_ARGS 169 | 170 | # Set default version to latest only if we're deploying a version with the latest alias 171 | if [[ ! 
-z "$ALIAS" && "$ALIAS" == "latest" ]]; then 172 | mike set-default --push latest 173 | fi 174 | 175 | # Remove the temporary remote 176 | git remote remove temp || true 177 | -------------------------------------------------------------------------------- /docs/stylesheets/extra.css: -------------------------------------------------------------------------------- 1 | [data-md-color-primary="vector"] { 2 | --md-primary-fg-color: #eb088a; 3 | --md-primary-fg-color--light: #f252a5; 4 | --md-primary-fg-color--dark: #b00068; 5 | --md-primary-bg-color: hsla(0, 0%, 100%, 1); 6 | --md-primary-bg-color--light: hsla(0, 0%, 100%, 0.7); 7 | } 8 | 9 | [data-md-color-primary="black"] { 10 | --md-primary-fg-color: #181818; 11 | --md-primary-fg-color--light: #f252a5; 12 | --md-primary-fg-color--dark: #b00068; 13 | --md-primary-bg-color: #eb088a; 14 | } 15 | 16 | [data-md-color-accent="vector-teal"] { 17 | --md-accent-fg-color: #48c0d9; 18 | --md-accent-fg-color--transparent: #526cfe1a; 19 | --md-accent-bg-color: #fff; 20 | --md-accent-bg-color--light: #ffffffb3; 21 | } 22 | 23 | [data-md-color-scheme="slate"][data-md-color-primary="black"] { 24 | --md-typeset-a-color: #eb088a; 25 | } 26 | 27 | [data-md-color-scheme="default"] { 28 | /* Default light mode styling */ 29 | } 30 | 31 | [data-md-color-scheme="slate"] { 32 | --md-typeset-a-color: #eb088a; 33 | /* Dark mode styling */ 34 | } 35 | 36 | /* Vector logo css styling to match overrides/partial/copyright.html */ 37 | .md-footer-vector { 38 | display: flex; 39 | align-items: center; 40 | padding: 0 0.6rem; 41 | } 42 | 43 | .md-footer-vector img { 44 | height: 24px; /* Reduce height to a fixed value */ 45 | width: auto; /* Maintain aspect ratio */ 46 | transition: opacity 0.25s; 47 | opacity: 0.7; 48 | } 49 | 50 | .md-footer-vector img:hover { 51 | opacity: 1; 52 | } 53 | 54 | /* Make the inner footer grid elements distribute evenly */ 55 | .md-footer-meta__inner { 56 | display: flex; 57 | justify-content: space-between; 58 | align-items: center; 59 | } 60 | 61 | /* To make socials and Vector logo not stack when viewing on mobile */ 62 | @media screen and (max-width: 76.234375em) { 63 | .md-footer-meta__inner.md-grid { 64 | flex-direction: row; 65 | justify-content: space-between; 66 | align-items: center; 67 | } 68 | 69 | .md-copyright, 70 | .md-social { 71 | width: auto; 72 | max-width: 49%; 73 | } 74 | 75 | /* Prevent margin that causes stacking */ 76 | .md-social { 77 | margin: 0; 78 | } 79 | } 80 | 81 | /* Reduce margins for h2 when using grid cards */ 82 | .grid.cards h2 { 83 | margin-top: 0; /* Remove top margin completely in cards */ 84 | margin-bottom: 0.5rem; /* Smaller bottom margin in cards */ 85 | } 86 | 87 | .vector-icon { 88 | color: #eb088a; 89 | opacity: 0.7; 90 | margin-right: 0.2em; 91 | } 92 | 93 | /* Version selector styling - Material theme */ 94 | 95 | /* Version selector container */ 96 | .md-version { 97 | position: relative; 98 | display: inline-block; 99 | margin-left: 0.25rem; 100 | } 101 | 102 | /* Current version button styling */ 103 | .md-version__current { 104 | display: inline-flex; 105 | align-items: center; 106 | font-size: 0.7rem; 107 | font-weight: 600; 108 | color: var(--md-primary-bg-color); 109 | padding: 0.4rem 0.8rem; 110 | margin: 0.4rem 0; 111 | background-color: rgba(255, 255, 255, 0.1); 112 | border-radius: 4px; 113 | border: 1px solid rgba(255, 255, 255, 0.2); 114 | cursor: pointer; 115 | transition: all 0.15s ease-in-out; 116 | } 117 | 118 | /* Hover effect for current version button */ 119 | 
.md-version__current:hover { 120 | background-color: rgba(255, 255, 255, 0.2); 121 | box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1); 122 | } 123 | 124 | /* Down arrow for version dropdown */ 125 | .md-version__current:after { 126 | display: inline-block; 127 | margin-left: 0.5rem; 128 | content: ""; 129 | vertical-align: middle; 130 | border-top: 0.3em solid; 131 | border-right: 0.3em solid transparent; 132 | border-bottom: 0; 133 | border-left: 0.3em solid transparent; 134 | } 135 | 136 | /* Dropdown menu */ 137 | .md-version__list { 138 | position: absolute; 139 | top: 100%; 140 | left: 0; 141 | z-index: 10; 142 | min-width: 125%; 143 | margin: 0.1rem 0 0; 144 | padding: 0; 145 | background-color: var(--md-primary-fg-color); 146 | border-radius: 4px; 147 | box-shadow: 0 4px 16px rgba(0, 0, 0, 0.2); 148 | opacity: 0; 149 | visibility: hidden; 150 | transform: translateY(-8px); 151 | transition: all 0.2s ease; 152 | } 153 | 154 | /* Show dropdown when parent is hovered */ 155 | .md-version:hover .md-version__list { 156 | opacity: 1; 157 | visibility: visible; 158 | transform: translateY(0); 159 | } 160 | 161 | /* Version list items */ 162 | .md-version__item { 163 | list-style: none; 164 | padding: 0; 165 | } 166 | 167 | /* Version links */ 168 | .md-version__link { 169 | display: block; 170 | padding: 0.5rem 1rem; 171 | font-size: 0.75rem; 172 | color: var(--md-primary-bg-color); 173 | transition: background-color 0.15s; 174 | text-decoration: none; 175 | } 176 | 177 | /* Version link hover */ 178 | .md-version__link:hover { 179 | background-color: var(--md-primary-fg-color--dark); 180 | text-decoration: none; 181 | } 182 | 183 | /* Active version in dropdown */ 184 | .md-version__link--active { 185 | background-color: var(--md-accent-fg-color); 186 | color: var(--md-accent-bg-color); 187 | font-weight: 700; 188 | } 189 | 190 | /* For the Material selector */ 191 | .md-header__option { 192 | display: flex; 193 | align-items: center; 194 | } 195 | 196 | /* Version selector in Material 9.x */ 197 | .md-select { 198 | position: relative; 199 | margin-left: 0.5rem; 200 | } 201 | 202 | .md-select__label { 203 | font-size: 0.7rem; 204 | font-weight: 600; 205 | color: var(--md-primary-bg-color); 206 | cursor: pointer; 207 | padding: 0.4rem 0.8rem; 208 | background-color: rgba(255, 255, 255, 0.1); 209 | border-radius: 4px; 210 | border: 1px solid rgba(255, 255, 255, 0.2); 211 | transition: all 0.15s ease-in-out; 212 | } 213 | 214 | .md-select__label:hover { 215 | background-color: rgba(255, 255, 255, 0.2); 216 | box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1); 217 | } 218 | 219 | /* Version selector in Material 9.2+ */ 220 | .md-header__button.md-select { 221 | display: inline-flex; 222 | align-items: center; 223 | margin: 0 0.8rem; 224 | } 225 | 226 | /* For Material 9.x+ with specific version selector */ 227 | .md-typeset .md-version-warn { 228 | padding: 0.6rem 1rem; 229 | margin: 1.5rem 0; 230 | background-color: rgba(235, 8, 138, 0.1); 231 | border-left: 4px solid #eb088a; 232 | border-radius: 0.2rem; 233 | color: var(--md-default-fg-color); 234 | font-size: 0.8rem; 235 | } 236 | -------------------------------------------------------------------------------- /vec_inf/client/config.py: -------------------------------------------------------------------------------- 1 | """Model configuration. 2 | 3 | This module provides a Pydantic model for validating and managing model deployment 4 | configurations, including hardware requirements and model specifications. 
5 | """ 6 | 7 | from pathlib import Path 8 | from typing import Any, Optional, Union 9 | 10 | from pydantic import BaseModel, ConfigDict, Field 11 | from typing_extensions import Literal 12 | 13 | from vec_inf.client._slurm_vars import ( 14 | DEFAULT_ARGS, 15 | MAX_CPUS_PER_TASK, 16 | MAX_GPUS_PER_NODE, 17 | MAX_NUM_NODES, 18 | PARTITION, 19 | QOS, 20 | RESOURCE_TYPE, 21 | ) 22 | 23 | 24 | class ModelConfig(BaseModel): 25 | """Pydantic model for validating and managing model deployment configurations. 26 | 27 | A configuration class that handles validation and management of model deployment 28 | settings, including model specifications, hardware requirements, and runtime 29 | parameters. 30 | 31 | Parameters 32 | ---------- 33 | model_name : str 34 | Name of the model, must be alphanumeric with allowed characters: '-', '_', '.' 35 | model_family : str 36 | Family/architecture of the model 37 | model_variant : str, optional 38 | Specific variant or version of the model family 39 | model_type : {'LLM', 'VLM', 'Text_Embedding', 'Reward_Modeling'} 40 | Type of model architecture 41 | gpus_per_node : int 42 | Number of GPUs to use per node (1-MAX_GPUS_PER_NODE) 43 | num_nodes : int 44 | Number of nodes to use for deployment (1-MAX_NUM_NODES) 45 | cpus_per_task : int, optional 46 | Number of CPU cores per task (1-MAX_CPUS_PER_TASK) 47 | mem_per_node : str, optional 48 | Memory allocation per node in GB format (e.g., '32G') 49 | vocab_size : int 50 | Size of the model's vocabulary (1-1,000,000) 51 | account : str, optional 52 | Charge resources used by this job to specified account. 53 | work_dir : str, optional 54 | Set working directory for the batch job 55 | qos : Union[QOS, str], optional 56 | Quality of Service tier for job scheduling 57 | time : str, optional 58 | Time limit for the job in HH:MM:SS format 59 | partition : Union[PARTITION, str], optional 60 | Slurm partition for job scheduling 61 | resource_type : Union[RESOURCE_TYPE, str], optional 62 | Type of resource to request for the job 63 | venv : str, optional 64 | Virtual environment or container system to use 65 | log_dir : Path, optional 66 | Directory path for storing logs 67 | model_weights_parent_dir : Path, optional 68 | Base directory containing model weights 69 | vllm_args : dict[str, Any], optional 70 | Additional arguments for vLLM engine configuration 71 | 72 | Notes 73 | ----- 74 | All fields are validated using Pydantic's validation system. The model is 75 | configured to be immutable (frozen) and forbids extra fields. 
76 | """ 77 | 78 | model_name: str = Field(..., min_length=3, pattern=r"^[a-zA-Z0-9\-_\.]+$") 79 | model_family: str = Field(..., min_length=2) 80 | model_variant: Optional[str] = Field( 81 | default=None, description="Specific variant/version of the model family" 82 | ) 83 | model_type: Literal["LLM", "VLM", "Text_Embedding", "Reward_Modeling"] = Field( 84 | ..., description="Type of model architecture" 85 | ) 86 | gpus_per_node: int = Field( 87 | ..., gt=0, le=MAX_GPUS_PER_NODE, description="GPUs per node" 88 | ) 89 | num_nodes: int = Field(..., gt=0, le=MAX_NUM_NODES, description="Number of nodes") 90 | cpus_per_task: int = Field( 91 | default=int(DEFAULT_ARGS["cpus_per_task"]), 92 | gt=0, 93 | le=MAX_CPUS_PER_TASK, 94 | description="CPUs per task", 95 | ) 96 | mem_per_node: str = Field( 97 | default=DEFAULT_ARGS["mem_per_node"], 98 | pattern=r"^\d{1,4}G$", 99 | description="Memory per node", 100 | ) 101 | vocab_size: int = Field(..., gt=0, le=1_000_000) 102 | account: Optional[str] = Field( 103 | default=None, description="Account name for job scheduling" 104 | ) 105 | work_dir: Optional[str] = Field( 106 | default=None, description="Working directory for the job" 107 | ) 108 | qos: Optional[Union[QOS, str]] = Field( 109 | default=DEFAULT_ARGS["qos"] if DEFAULT_ARGS["qos"] != "" else None, 110 | description="Quality of Service tier", 111 | ) 112 | time: str = Field( 113 | default=DEFAULT_ARGS["time"], 114 | pattern=r"^\d{2}:\d{2}:\d{2}$", 115 | description="HH:MM:SS time limit", 116 | ) 117 | partition: Optional[Union[PARTITION, str]] = Field( 118 | default=DEFAULT_ARGS["partition"] if DEFAULT_ARGS["partition"] != "" else None, 119 | description="GPU partition type", 120 | ) 121 | resource_type: Optional[Union[RESOURCE_TYPE, str]] = Field( 122 | default=DEFAULT_ARGS["resource_type"] 123 | if DEFAULT_ARGS["resource_type"] != "" 124 | else None, 125 | description="Resource type", 126 | ) 127 | exclude: Optional[str] = Field( 128 | default=DEFAULT_ARGS["exclude"], 129 | description="Exclude certain nodes from the resources granted to the job", 130 | ) 131 | nodelist: Optional[str] = Field( 132 | default=DEFAULT_ARGS["nodelist"], 133 | description="Request a specific list of nodes for deployment", 134 | ) 135 | bind: Optional[str] = Field( 136 | default=DEFAULT_ARGS["bind"], 137 | description="Additional binds for the container", 138 | ) 139 | venv: str = Field( 140 | default=DEFAULT_ARGS["venv"], 141 | description="Virtual environment/container system", 142 | ) 143 | log_dir: Path = Field( 144 | default=Path(DEFAULT_ARGS["log_dir"]), 145 | description="Log directory path", 146 | ) 147 | model_weights_parent_dir: Path = Field( 148 | default=Path(DEFAULT_ARGS["model_weights_parent_dir"]), 149 | description="Base directory for model weights", 150 | ) 151 | vllm_args: Optional[dict[str, Any]] = Field( 152 | default={}, description="vLLM engine arguments" 153 | ) 154 | env: Optional[dict[str, Any]] = Field( 155 | default={}, description="Environment variables to be set" 156 | ) 157 | model_config = ConfigDict( 158 | extra="forbid", str_strip_whitespace=True, validate_default=True, frozen=True 159 | ) 160 | -------------------------------------------------------------------------------- /vec_inf/client/models.py: -------------------------------------------------------------------------------- 1 | """Data models for Vector Inference API. 2 | 3 | This module contains the data model classes used by the Vector Inference API 4 | for both request parameters and response objects. 
5 | 6 | Classes 7 | ------- 8 | ModelStatus : Enum 9 | Status states of a model 10 | ModelType : Enum 11 | Types of supported models 12 | LaunchResponse : dataclass 13 | Response from model launch operation 14 | StatusResponse : dataclass 15 | Response from model status check 16 | MetricsResponse : dataclass 17 | Response from metrics collection 18 | LaunchOptions : dataclass 19 | Options for model launch 20 | BatchLaunchResponse : dataclass 21 | Response from launching multiple models in batch mode 22 | ModelInfo : dataclass 23 | Information about available models 24 | """ 25 | 26 | from dataclasses import dataclass, field 27 | from enum import Enum 28 | from typing import Any, Optional, Union 29 | 30 | 31 | class ModelStatus(str, Enum): 32 | """Enum representing the possible status states of a model. 33 | 34 | Attributes 35 | ---------- 36 | PENDING : str 37 | Model is waiting for Slurm to allocate resources 38 | LAUNCHING : str 39 | Model is in the process of starting 40 | READY : str 41 | Model is running and ready to serve requests 42 | FAILED : str 43 | Model failed to start or encountered an error 44 | SHUTDOWN : str 45 | Model was intentionally stopped 46 | UNAVAILABLE : str 47 | Model status cannot be determined 48 | """ 49 | 50 | PENDING = "PENDING" 51 | LAUNCHING = "LAUNCHING" 52 | READY = "READY" 53 | FAILED = "FAILED" 54 | SHUTDOWN = "SHUTDOWN" 55 | UNAVAILABLE = "UNAVAILABLE" 56 | 57 | 58 | class ModelType(str, Enum): 59 | """Enum representing the possible model types. 60 | 61 | Attributes 62 | ---------- 63 | LLM : str 64 | Large Language Model 65 | VLM : str 66 | Vision Language Model 67 | TEXT_EMBEDDING : str 68 | Text Embedding Model 69 | REWARD_MODELING : str 70 | Reward Modeling Model 71 | """ 72 | 73 | LLM = "LLM" 74 | VLM = "VLM" 75 | TEXT_EMBEDDING = "Text_Embedding" 76 | REWARD_MODELING = "Reward_Modeling" 77 | 78 | 79 | @dataclass 80 | class LaunchResponse: 81 | """Response from launching a model. 82 | 83 | Parameters 84 | ---------- 85 | slurm_job_id : str 86 | ID of the launched SLURM job 87 | model_name : str 88 | Name of the launched model 89 | config : dict[str, Any] 90 | Configuration used for the launch 91 | raw_output : str 92 | Raw output from the launch command (hidden from repr) 93 | """ 94 | 95 | slurm_job_id: str 96 | model_name: str 97 | config: dict[str, Any] 98 | raw_output: str = field(repr=False) 99 | 100 | 101 | @dataclass 102 | class BatchLaunchResponse: 103 | """Response from launching multiple models in batch mode. 104 | 105 | Parameters 106 | ---------- 107 | slurm_job_id : str 108 | ID of the launched SLURM job 109 | slurm_job_name : str 110 | Name of the launched SLURM job 111 | model_names : list[str] 112 | Names of the launched models 113 | config : dict[str, Any] 114 | Configuration used for the launch 115 | raw_output : str 116 | Raw output from the launch command (hidden from repr) 117 | """ 118 | 119 | slurm_job_id: str 120 | slurm_job_name: str 121 | model_names: list[str] 122 | config: dict[str, Any] 123 | raw_output: str = field(repr=False) 124 | 125 | 126 | @dataclass 127 | class StatusResponse: 128 | """Response from checking a model's status.
129 | 130 | Parameters 131 | ---------- 132 | model_name : str 133 | Name of the model 134 | log_dir : str 135 | Path to the SLURM log directory 136 | server_status : ModelStatus 137 | Current status of the server 138 | job_state : Union[str, ModelStatus] 139 | Current state of the SLURM job 140 | raw_output : str 141 | Raw output from status check (hidden from repr) 142 | base_url : str, optional 143 | Base URL of the model server if ready 144 | pending_reason : str, optional 145 | Reason for pending state if applicable 146 | failed_reason : str, optional 147 | Reason for failure if applicable 148 | """ 149 | 150 | model_name: str 151 | log_dir: str 152 | server_status: ModelStatus 153 | job_state: Union[str, ModelStatus] 154 | raw_output: str = field(repr=False) 155 | base_url: Optional[str] = None 156 | pending_reason: Optional[str] = None 157 | failed_reason: Optional[str] = None 158 | 159 | 160 | @dataclass 161 | class MetricsResponse: 162 | """Response from retrieving model metrics. 163 | 164 | Parameters 165 | ---------- 166 | model_name : str 167 | Name of the model 168 | metrics : Union[dict[str, float], str] 169 | Either a dictionary of metrics or an error message 170 | timestamp : float 171 | Unix timestamp of when metrics were collected 172 | """ 173 | 174 | model_name: str 175 | metrics: Union[dict[str, float], str] 176 | timestamp: float 177 | 178 | 179 | @dataclass 180 | class LaunchOptions: 181 | """Options for launching a model. 182 | 183 | Parameters 184 | ---------- 185 | model_family : str, optional 186 | Family/architecture of the model 187 | model_variant : str, optional 188 | Specific variant/version of the model 189 | partition : str, optional 190 | SLURM partition to use 191 | resource_type : str, optional 192 | Type of resource to request for the job 193 | num_nodes : int, optional 194 | Number of nodes to allocate 195 | gpus_per_node : int, optional 196 | Number of GPUs per node 197 | cpus_per_task : int, optional 198 | Number of CPUs per task 199 | mem_per_node : str, optional 200 | Memory per node 201 | account : str, optional 202 | Account name for job scheduling 203 | work_dir : str, optional 204 | Set working directory for the batch job 205 | qos : str, optional 206 | Quality of Service level 207 | time : str, optional 208 | Time limit for the job 209 | exclude : str, optional 210 | Exclude certain nodes from the resources granted to the job 211 | node_list : str, optional 212 | Request a specific list of nodes for deployment 213 | bind : str, optional 214 | Additional binds for the container as a comma separated list of bind paths 215 | vocab_size : int, optional 216 | Size of model vocabulary 217 | data_type : str, optional 218 | Data type for model weights 219 | venv : str, optional 220 | Virtual environment to use 221 | log_dir : str, optional 222 | Directory for logs 223 | model_weights_parent_dir : str, optional 224 | Parent directory containing model weights 225 | vllm_args : str, optional 226 | Additional arguments for vLLM 227 | env : str, optional 228 | Environment variables to be set 229 | config : str, optional 230 | Path to custom model config yaml 231 | """ 232 | 233 | model_family: Optional[str] = None 234 | model_variant: Optional[str] = None 235 | partition: Optional[str] = None 236 | resource_type: Optional[str] = None 237 | num_nodes: Optional[int] = None 238 | gpus_per_node: Optional[int] = None 239 | cpus_per_task: Optional[int] = None 240 | mem_per_node: Optional[str] = None 241 | account: Optional[str] = None 242 | work_dir: 
Optional[str] = None 243 | qos: Optional[str] = None 244 | exclude: Optional[str] = None 245 | nodelist: Optional[str] = None 246 | bind: Optional[str] = None 247 | time: Optional[str] = None 248 | vocab_size: Optional[int] = None 249 | data_type: Optional[str] = None 250 | venv: Optional[str] = None 251 | log_dir: Optional[str] = None 252 | model_weights_parent_dir: Optional[str] = None 253 | vllm_args: Optional[str] = None 254 | env: Optional[str] = None 255 | config: Optional[str] = None 256 | 257 | 258 | @dataclass 259 | class ModelInfo: 260 | """Information about an available model. 261 | 262 | Parameters 263 | ---------- 264 | name : str 265 | Name of the model 266 | family : str 267 | Family/architecture of the model 268 | variant : str, optional 269 | Specific variant/version of the model 270 | model_type : ModelType 271 | Type of the model 272 | config : dict[str, Any] 273 | Additional configuration parameters 274 | """ 275 | 276 | name: str 277 | family: str 278 | variant: Optional[str] 279 | model_type: ModelType 280 | config: dict[str, Any] 281 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Vector Inference: Easy inference on Slurm clusters 2 | 3 | ---------------------------------------------------- 4 | 5 | [![PyPI](https://img.shields.io/pypi/v/vec-inf)](https://pypi.org/project/vec-inf) 6 | [![downloads](https://img.shields.io/pypi/dm/vec-inf)](https://pypistats.org/packages/vec-inf) 7 | [![code checks](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml) 8 | [![docs](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs.yml) 9 | [![codecov](https://codecov.io/github/VectorInstitute/vector-inference/branch/main/graph/badge.svg?token=NI88QSIGAC)](https://app.codecov.io/github/VectorInstitute/vector-inference/tree/main) 10 | [![vLLM](https://img.shields.io/badge/vLLM-0.11.0-blue)](https://docs.vllm.ai/en/v0.11.0/) 11 | ![GitHub License](https://img.shields.io/github/license/VectorInstitute/vector-inference) 12 | 13 | This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **This package runs natively on the Vector Institute cluster environments**. To adapt to other environments, follow the instructions in [Installation](#installation). 14 | 15 | **NOTE**: Supported models on Killarney are tracked [here](./MODEL_TRACKING.md). 16 | 17 | ## Installation 18 | If you are using the Vector cluster environment, and you don't need any customization to the inference server environment, run the following to install the package: 19 | 20 | ```bash 21 | pip install vec-inf 22 | ``` 23 | Otherwise, we recommend using the provided [`Dockerfile`](Dockerfile) to set up your own environment with the package. The latest image has `vLLM` version `0.11.0`. 24 | 25 | If you'd like to use `vec-inf` on your own Slurm cluster, you would need to update the configuration files. There are 3 ways to do it: 26 | * Clone the repository and update the `environment.yaml` and the `models.yaml` file in [`vec_inf/config`](vec_inf/config/), then install from source by running `pip install .`.
27 | * The package will look for cached configuration files in your environment before using the default configuration. The default cached configuration directory path points to `/model-weights/vec-inf-shared`; you would need to create an `environment.yaml` and a `models.yaml` following the format of these files in [`vec_inf/config`](vec_inf/config/). 28 | * The package also looks for an environment variable `VEC_INF_CONFIG_DIR`. You can put your `environment.yaml` and `models.yaml` in a directory of your choice and set the environment variable `VEC_INF_CONFIG_DIR` to point to that location. 29 | 30 | ## Usage 31 | 32 | Vector Inference provides 2 user interfaces: a CLI and an API. 33 | 34 | ### CLI 35 | 36 | The `launch` command allows users to deploy a model as a Slurm job. If the job successfully launches, a URL endpoint is exposed for the user to send requests for inference. 37 | 38 | We will use the Llama 3.1 model as an example. To launch an OpenAI-compatible inference server for Meta-Llama-3.1-8B-Instruct, run: 39 | 40 | ```bash 41 | vec-inf launch Meta-Llama-3.1-8B-Instruct 42 | ``` 43 | You should see an output like the following: 44 | 45 | launch_image 46 | 47 | **NOTE**: You can set the required fields in the environment configuration (`environment.yaml`), which is a mapping between required arguments and their corresponding environment variables. On the Vector **Killarney** Cluster environment, the required fields are: 48 | * `--account`, `-A`: The Slurm account. This argument can be set to a default by setting the environment variable `VEC_INF_ACCOUNT`. 49 | * `--work-dir`, `-D`: A working directory other than your home directory. This argument can be set to a default by setting the environment variable `VEC_INF_WORK_DIR`. 50 | 51 | Models that are already supported by `vec-inf` are launched using the cached configuration (set in [_slurm_vars.py](vec_inf/client/_slurm_vars.py)) or the [default configuration](vec_inf/config/models.yaml). You can override these values by providing additional parameters. Use `vec-inf launch --help` to see the full list of parameters that can be overridden. You can also launch your own custom model as long as the model architecture is [supported by vLLM](https://docs.vllm.ai/en/stable/models/supported_models.html). For detailed instructions on how to customize your model launch, check out the [`launch` command section in the User Guide](https://vectorinstitute.github.io/vector-inference/latest/user_guide/#launch-command). 52 | 53 | #### Other commands 54 | 55 | * `batch-launch`: Launch multiple model inference servers at once; currently ONLY single-node models are supported. 56 | * `status`: Check the status of all `vec-inf` jobs, or a specific job by providing its job ID. 57 | * `metrics`: Stream performance metrics to the console. 58 | * `shutdown`: Shut down a model by providing its Slurm job ID. 59 | * `list`: List all available model names, or view the default/cached configuration of a specific model. 60 | * `cleanup`: Remove old log directories; use `--help` to see the supported filters. Use `--dry-run` to preview what would be deleted.
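As a rough sketch, a typical session combining these commands might look like the following (the job ID is a placeholder taken from the `launch` output, and exact argument forms can be confirmed with `--help` on each subcommand):

```bash
# Launch a model and note the Slurm job ID printed in the output
vec-inf launch Meta-Llama-3.1-8B-Instruct

# Check the status of that job and find the server URL once it is ready
vec-inf status 123456

# Stream performance metrics for the running server
vec-inf metrics 123456

# Shut the server down when finished
vec-inf shutdown 123456

# Preview which old log directories would be removed
vec-inf cleanup --dry-run
```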
61 |
62 | For more details on the usage of these commands, refer to the [User Guide](https://vectorinstitute.github.io/vector-inference/user_guide/).
63 |
64 | ### API
65 |
66 | Example:
67 |
68 | ```python
69 | >>> from vec_inf.api import VecInfClient
70 | >>> client = VecInfClient()
71 | >>> # Assume VEC_INF_ACCOUNT and VEC_INF_WORK_DIR are set
72 | >>> response = client.launch_model("Meta-Llama-3.1-8B-Instruct")
73 | >>> job_id = response.slurm_job_id
74 | >>> status = client.get_status(job_id)
75 | >>> if status.status == ModelStatus.READY:
76 | ...     print(f"Model is ready at {status.base_url}")
77 | >>> # Alternatively, use wait_until_ready, which either returns a StatusResponse or raises a ServerError
78 | >>> try:
79 | ...     status = client.wait_until_ready(job_id)
80 | ... except ServerError as e:
81 | ...     print(f"Model launch failed: {e}")
82 | >>> client.shutdown_model(job_id)
83 | ```
84 |
85 | For details on the usage of the API, refer to the [API Reference](https://vectorinstitute.github.io/vector-inference/api/).
86 |
87 | ## Check Job Configuration
88 |
89 | With every model launch, a Slurm script will be generated dynamically based on the job and model configuration. Once the Slurm job is queued, the generated Slurm script will be moved to the log directory for reproducibility, located at `$log_dir/$model_family/$model_name.$slurm_job_id/$model_name.$slurm_job_id.slurm`. In the same directory you can also find a JSON file with the same name that captures the launch configuration; an entry with the server URL is added to it once the server is ready.
90 |
91 | ## Send inference requests
92 |
93 | Once the inference server is ready, you can start sending inference requests. We provide example scripts for sending inference requests in the [`examples`](examples) folder. Make sure to update the model server URL and the model weights location in the scripts. For example, you can run `python examples/inference/llm/chat_completions.py`, and you should expect to see an output like the following:
94 |
95 | ```json
96 | {
97 |   "id":"chatcmpl-387c2579231948ffaf66cdda5439d3dc",
98 |   "choices": [
99 |     {
100 |       "finish_reason":"stop",
101 |       "index":0,
102 |       "logprobs":null,
103 |       "message": {
104 |         "content":"Arrr, I be Captain Chatbeard, the scurviest chatbot on the seven seas! Ye be wantin' to know me identity, eh? Well, matey, I be a swashbucklin' AI, here to provide ye with answers and swappin' tales, savvy?",
105 |         "role":"assistant",
106 |         "function_call":null,
107 |         "tool_calls":[],
108 |         "reasoning_content":null
109 |       },
110 |       "stop_reason":null
111 |     }
112 |   ],
113 |   "created":1742496683,
114 |   "model":"Meta-Llama-3.1-8B-Instruct",
115 |   "object":"chat.completion",
116 |   "system_fingerprint":null,
117 |   "usage": {
118 |     "completion_tokens":66,
119 |     "prompt_tokens":32,
120 |     "total_tokens":98,
121 |     "prompt_tokens_details":null
122 |   },
123 |   "prompt_logprobs":null
124 | }
125 |
126 | ```
127 | **NOTE**: Certain models don't adhere to OpenAI's default chat template, e.g. the Mistral family. For these models, you can either change your prompt to follow the model's default chat template or provide your own chat template via `--chat-template: TEMPLATE_PATH` (see the sketch below).
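As a rough sketch of the second option: if you maintain your own model configuration (for example under `VEC_INF_CONFIG_DIR`, as described in [Installation](#installation)), a chat template can be supplied through the model's `vllm_args`, which are forwarded to `vllm serve`. The model name, key layout, and template path below are illustrative, not a definitive recipe; adapt them to your own config:

```yaml
# models.yaml (user override); the template path is a placeholder
models:
  Mistral-7B-Instruct-v0.3:
    vllm_args:
      --chat-template: /path/to/your_chat_template.jinja
```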
128 |
129 | ## SSH tunnel from your local device
130 | If you want to run inference from your local device, you can open an SSH tunnel to your cluster environment like the following:
131 | ```bash
132 | ssh -L 8081:10.1.1.29:8081 username@v.vectorinstitute.ai -N
133 | ```
134 | The example above is for the Vector Killarney cluster; change the variables accordingly for your environment. The IP addresses of the compute nodes on Killarney follow the `10.1.1.XX` pattern, where `XX` is the node number (`kn029` -> `29` in this example).
135 |
136 | ## Reference
137 | If you found Vector Inference useful in your research or applications, please cite it using the following BibTeX template:
138 | ```
139 | @software{vector_inference,
140 |   title = {Vector Inference: Efficient LLM inference on Slurm clusters using vLLM},
141 |   author = {Wang, Marshall},
142 |   organization = {Vector Institute},
143 |   year = {},
144 |   version = {},
145 |   url = {https://github.com/VectorInstitute/vector-inference}
146 | }
147 | ```
148 |
--------------------------------------------------------------------------------
/MODEL_TRACKING.md:
--------------------------------------------------------------------------------
1 | # Model Weights Tracking
2 |
3 | This document tracks all model weights available in the `/model-weights` directory on the Killarney cluster and indicates which ones have an existing configuration in the cached model config (`/model-weights/vec-inf-shared/models.yaml`). By default, `vec-inf` uses the cached model config. To request new model weights to be downloaded or a model configuration to be added, please open a "Model request" issue.
4 |
5 | **NOTE**: The [`models.yaml`](./vec_inf/config/models.yaml) file in the package is not always up to date with the latest cached model config on the Killarney cluster; new model configs are added to the cached config first. `models.yaml` is updated to reflect the cached model config when a new version of the package is released.
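To check what is available and configured on your cluster at any time, the CLI's `list` command reflects the cached config. A quick sketch (exact output depends on your environment, and the per-model form assumes the model name is passed as an argument; see `vec-inf list --help` for details):

```bash
# List all available model names
vec-inf list

# View the default/cached configuration of a specific model
vec-inf list Meta-Llama-3.1-8B-Instruct
```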
6 | 7 | ## Legend 8 | - ✅ **Configured**: Model has a complete configuration in `models.yaml` 9 | - ❌ **Not Configured**: Model exists in `/model-weights` but lacks configuration 10 | 11 | --- 12 | 13 | ## Text Generation Models (LLM) 14 | 15 | ### Cohere for AI: Command R 16 | | Model | Configuration | 17 | |:------|:-------------| 18 | | `c4ai-command-r-plus-08-2024` | ✅ | 19 | | `c4ai-command-r-08-2024` | ✅ | 20 | 21 | ### Code Llama 22 | | Model | Configuration | 23 | |:------|:-------------| 24 | | `CodeLlama-7b-hf` | ✅ | 25 | | `CodeLlama-7b-Instruct-hf` | ✅ | 26 | | `CodeLlama-13b-hf` | ✅ | 27 | | `CodeLlama-13b-Instruct-hf` | ✅ | 28 | | `CodeLlama-34b-hf` | ✅ | 29 | | `CodeLlama-34b-Instruct-hf` | ✅ | 30 | | `CodeLlama-70b-hf` | ✅ | 31 | | `CodeLlama-70b-Instruct-hf` | ✅ | 32 | | `CodeLlama-7b-Python-hf` | ❌ | 33 | | `CodeLlama-13b-Python-hf` | ❌ | 34 | | `CodeLlama-70b-Python-hf` | ❌ | 35 | 36 | ### Google: Gemma 37 | | Model | Configuration | 38 | |:------|:-------------| 39 | | `gemma-2b` | ❌ | 40 | | `gemma-2b-it` | ❌ | 41 | | `gemma-7b` | ❌ | 42 | | `gemma-7b-it` | ❌ | 43 | | `gemma-2-2b-it` | ✅ | 44 | | `gemma-2-9b` | ✅ | 45 | | `gemma-2-9b-it` | ✅ | 46 | | `gemma-2-27b` | ✅ | 47 | | `gemma-2-27b-it` | ✅ | 48 | | `gemma-3-1b-it` | ❌ | 49 | | `gemma-3-4b-it` | ❌ | 50 | | `gemma-3-12b-it` | ❌ | 51 | | `gemma-3-27b-it` | ❌ | 52 | 53 | ### Meta: Llama 2 54 | | Model | Configuration | 55 | |:------|:-------------| 56 | | `Llama-2-7b-hf` | ✅ | 57 | | `Llama-2-7b-chat-hf` | ✅ | 58 | | `Llama-2-13b-hf` | ✅ | 59 | | `Llama-2-13b-chat-hf` | ✅ | 60 | | `Llama-2-70b-hf` | ✅ | 61 | | `Llama-2-70b-chat-hf` | ✅ | 62 | 63 | ### Meta: Llama 3 64 | | Model | Configuration | 65 | |:------|:-------------| 66 | | `Meta-Llama-3-8B` | ✅ | 67 | | `Meta-Llama-3-8B-Instruct` | ✅ | 68 | | `Meta-Llama-3-70B` | ✅ | 69 | | `Meta-Llama-3-70B-Instruct` | ✅ | 70 | 71 | ### Meta: Llama 3.1 72 | | Model | Configuration | 73 | |:------|:-------------| 74 | | `Meta-Llama-3.1-8B` | ✅ | 75 | | `Meta-Llama-3.1-8B-Instruct` | ✅ | 76 | | `Meta-Llama-3.1-70B` | ✅ | 77 | | `Meta-Llama-3.1-70B-Instruct` | ✅ | 78 | | `Meta-Llama-3.1-405B-Instruct` | ✅ | 79 | 80 | ### Meta: Llama 3.2 81 | | Model | Configuration | 82 | |:------|:-------------| 83 | | `Llama-3.2-1B` | ✅ | 84 | | `Llama-3.2-1B-Instruct` | ✅ | 85 | | `Llama-3.2-3B` | ✅ | 86 | | `Llama-3.2-3B-Instruct` | ✅ | 87 | 88 | ### Meta: Llama 3.3 89 | | Model | Configuration | 90 | |:------|:-------------| 91 | | `Llama-3.3-70B-Instruct` | ✅ | 92 | 93 | ### Meta: Llama 4 94 | | Model | Configuration | 95 | |:------|:-------------| 96 | | `Llama-4-Scout-17B-16E-Instruct` | ❌ | 97 | 98 | ### Mistral AI: Mistral 99 | | Model | Configuration | 100 | |:------|:-------------| 101 | | `Mistral-7B-v0.3` | ✅ | 102 | | `Mistral-7B-Instruct-v0.1` | ✅ | 103 | | `Mistral-7B-Instruct-v0.2` | ✅ | 104 | | `Mistral-7B-Instruct-v0.3` | ✅ | 105 | | `Mistral-Large-Instruct-2407` | ✅ | 106 | | `Mistral-Large-Instruct-2411` | ✅ | 107 | 108 | ### Mistral AI: Mixtral 109 | | Model | Configuration | 110 | |:------|:-------------| 111 | | `Mixtral-8x7B-Instruct-v0.1` | ✅ | 112 | | `Mixtral-8x22B-v0.1` | ✅ | 113 | | `Mixtral-8x22B-Instruct-v0.1` | ✅ | 114 | 115 | ### Microsoft: Phi 116 | | Model | Configuration | 117 | |:------|:-------------| 118 | | `Phi-3-medium-128k-instruct` | ✅ | 119 | | `phi-4` | ❌ | 120 | 121 | ### Nvidia: Llama-3.1-Nemotron 122 | | Model | Configuration | 123 | |:------|:-------------| 124 | | `Llama-3.1-Nemotron-70B-Instruct-HF` | ✅ | 125 | 126 | ### Qwen: Qwen2.5 127 
| | Model | Configuration | 128 | |:------|:-------------| 129 | | `Qwen2.5-0.5B-Instruct` | ✅ | 130 | | `Qwen2.5-1.5B-Instruct` | ✅ | 131 | | `Qwen2.5-3B-Instruct` | ✅ | 132 | | `Qwen2.5-7B-Instruct` | ✅ | 133 | | `Qwen2.5-14B-Instruct` | ✅ | 134 | | `Qwen2.5-32B-Instruct` | ✅ | 135 | | `Qwen2.5-72B-Instruct` | ✅ | 136 | 137 | ### Qwen: Qwen2.5-Math 138 | | Model | Configuration | 139 | |:------|:-------------| 140 | | `Qwen2.5-Math-1.5B-Instruct` | ✅ | 141 | | `Qwen2.5-Math-7B-Instruct` | ✅ | 142 | | `Qwen2.5-Math-72B-Instruct` | ✅ | 143 | 144 | ### Qwen: Qwen2.5-Coder 145 | | Model | Configuration | 146 | |:------|:-------------| 147 | | `Qwen2.5-Coder-7B-Instruct` | ✅ | 148 | 149 | ### Qwen: QwQ 150 | | Model | Configuration | 151 | |:------|:-------------| 152 | | `QwQ-32B` | ✅ | 153 | 154 | ### Qwen: Qwen2 155 | | Model | Configuration | 156 | |:------|:-------------| 157 | | `Qwen2-1.5B-Instruct` | ❌ | 158 | | `Qwen2-7B-Instruct` | ❌ | 159 | | `Qwen2-Math-1.5B-Instruct` | ❌ | 160 | | `Qwen2-Math-7B-Instruct` | ❌ | 161 | | `Qwen2-Math-72B` | ❌ | 162 | | `Qwen2-Math-72B-Instruct` | ❌ | 163 | | `Qwen2-VL-7B-Instruct` | ❌ | 164 | 165 | ### Qwen: Qwen3 166 | | Model | Configuration | 167 | |:------|:-------------| 168 | | `Qwen3-14B` | ✅ | 169 | | `Qwen3-8B` | ✅ | 170 | | `Qwen3-32B` | ✅ | 171 | | `Qwen3-235B-A22B` | ❌ | 172 | | `Qwen3-Embedding-8B` | ❌ | 173 | 174 | ### DeepSeek: DeepSeek-R1 175 | | Model | Configuration | 176 | |:------|:-------------| 177 | | `DeepSeek-R1-Distill-Llama-8B` | ✅ | 178 | | `DeepSeek-R1-Distill-Llama-70B` | ✅ | 179 | | `DeepSeek-R1-Distill-Qwen-1.5B` | ✅ | 180 | | `DeepSeek-R1-Distill-Qwen-7B` | ✅ | 181 | | `DeepSeek-R1-Distill-Qwen-14B` | ✅ | 182 | | `DeepSeek-R1-Distill-Qwen-32B` | ✅ | 183 | 184 | ### DeepSeek: Other Models 185 | | Model | Configuration | 186 | |:------|:-------------| 187 | | `DeepSeek-Coder-V2-Lite-Instruct` | ❌ | 188 | | `deepseek-math-7b-instruct` | ❌ | 189 | 190 | ### OpenAI: GPT-OSS 191 | | Model | Configuration | 192 | |:------|:-------------| 193 | | `gpt-oss-120b` | ✅ | 194 | 195 | ### Other LLM Models 196 | | Model | Configuration | 197 | |:------|:-------------| 198 | | `AI21-Jamba-1.5-Mini` | ❌ | 199 | | `aya-expanse-32b` | ✅ (as Aya-Expanse-32B) | 200 | | `gpt2-large` | ❌ | 201 | | `gpt2-xl` | ❌ | 202 | | `gpt-oss-120b` | ❌ | 203 | | `instructblip-vicuna-7b` | ❌ | 204 | | `internlm2-math-plus-7b` | ❌ | 205 | | `Janus-Pro-7B` | ❌ | 206 | | `Kimi-K2-Instruct` | ❌ | 207 | | `Ministral-8B-Instruct-2410` | ❌ | 208 | | `Molmo-7B-D-0924` | ✅ | 209 | | `OLMo-1B-hf` | ❌ | 210 | | `OLMo-7B-hf` | ❌ | 211 | | `OLMo-7B-SFT` | ❌ | 212 | | `pythia` | ❌ | 213 | | `Qwen1.5-72B-Chat` | ❌ | 214 | | `ReasonFlux-PRM-7B` | ❌ | 215 | | `t5-large-lm-adapt` | ❌ | 216 | | `t5-xl-lm-adapt` | ❌ | 217 | | `mt5-xl-lm-adapt` | ❌ | 218 | 219 | --- 220 | 221 | ## Vision Language Models (VLM) 222 | 223 | ### LLaVa 224 | | Model | Configuration | 225 | |:------|:-------------| 226 | | `llava-1.5-7b-hf` | ✅ | 227 | | `llava-1.5-13b-hf` | ✅ | 228 | | `llava-v1.6-mistral-7b-hf` | ✅ | 229 | | `llava-v1.6-34b-hf` | ✅ | 230 | | `llava-med-v1.5-mistral-7b` | ❌ | 231 | 232 | ### Microsoft: Phi 3 Vision 233 | | Model | Configuration | 234 | |:------|:-------------| 235 | | `Phi-3-vision-128k-instruct` | ✅ | 236 | | `Phi-3.5-vision-instruct` | ✅ | 237 | 238 | ### Meta: Llama 3.2 Vision 239 | | Model | Configuration | 240 | |:------|:-------------| 241 | | `Llama-3.2-11B-Vision` | ✅ | 242 | | `Llama-3.2-11B-Vision-Instruct` | ✅ | 243 | | `Llama-3.2-90B-Vision` | ✅ | 
244 | | `Llama-3.2-90B-Vision-Instruct` | ✅ | 245 | 246 | ### Mistral: Pixtral 247 | | Model | Configuration | 248 | |:------|:-------------| 249 | | `Pixtral-12B-2409` | ✅ | 250 | 251 | ### OpenGVLab: InternVL2.5 252 | | Model | Configuration | 253 | |:------|:-------------| 254 | | `InternVL2_5-8B` | ✅ | 255 | | `InternVL2_5-26B` | ✅ | 256 | | `InternVL2_5-38B` | ✅ | 257 | 258 | ### THUDM: GLM-4 259 | | Model | Configuration | 260 | |:------|:-------------| 261 | | `glm-4v-9b` | ✅ | 262 | 263 | ### DeepSeek: DeepSeek-VL2 264 | | Model | Configuration | 265 | |:------|:-------------| 266 | | `deepseek-vl2` | ✅ | 267 | | `deepseek-vl2-small` | ✅ | 268 | 269 | ### Other VLM Models 270 | | Model | Configuration | 271 | |:------|:-------------| 272 | | `MiniCPM-Llama3-V-2_5` | ❌ | 273 | 274 | --- 275 | 276 | ## Text Embedding Models 277 | 278 | ### Liang Wang: e5 279 | | Model | Configuration | 280 | |:------|:-------------| 281 | | `e5-mistral-7b-instruct` | ✅ | 282 | 283 | ### BAAI: bge 284 | | Model | Configuration | 285 | |:------|:-------------| 286 | | `bge-base-en-v1.5` | ✅ | 287 | | `bge-m3` | ❌ | 288 | | `bge-multilingual-gemma2` | ❌ | 289 | 290 | ### Sentence Transformers: MiniLM 291 | | Model | Configuration | 292 | |:------|:-------------| 293 | | `all-MiniLM-L6-v2` | ✅ | 294 | 295 | ### Other Embedding Models 296 | | Model | Configuration | 297 | |:------|:-------------| 298 | | `data2vec` | ❌ | 299 | | `gte-modernbert-base` | ❌ | 300 | | `gte-Qwen2-7B-instruct` | ❌ | 301 | | `m2-bert-80M-32k-retrieval` | ❌ | 302 | | `m2-bert-80M-8k-retrieval` | ❌ | 303 | 304 | --- 305 | 306 | ## Reward Modeling Models 307 | 308 | ### Qwen: Qwen2.5-Math 309 | | Model | Configuration | 310 | |:------|:-------------| 311 | | `Qwen2.5-Math-RM-72B` | ✅ | 312 | | `Qwen2.5-Math-PRM-7B` | ✅ | 313 | 314 | --- 315 | 316 | ## Multimodal Models 317 | 318 | ### CLIP 319 | | Model | Configuration | 320 | |:------|:-------------| 321 | | `clip-vit-base-patch16` | ❌ | 322 | | `clip-vit-large-patch14-336` | ❌ | 323 | 324 | ### Stable Diffusion 325 | | Model | Configuration | 326 | |:------|:-------------| 327 | | `sd-v1-4-full-ema` | ❌ | 328 | | `stable-diffusion-v1-4` | ❌ | 329 | 330 | --- 331 | -------------------------------------------------------------------------------- /vec_inf/client/_slurm_templates.py: -------------------------------------------------------------------------------- 1 | """SLURM script templates for Vector Inference. 2 | 3 | This module contains the SLURM script templates for Vector Inference, including 4 | single-node, multi-node, and batch mode templates. 5 | """ 6 | 7 | from typing import TypedDict 8 | 9 | from vec_inf.client._slurm_vars import ( 10 | CONTAINER_LOAD_CMD, 11 | CONTAINER_MODULE_NAME, 12 | IMAGE_PATH, 13 | ) 14 | 15 | 16 | CONTAINER_MODULE_NAME_UPPER = CONTAINER_MODULE_NAME.upper() 17 | 18 | 19 | class ShebangConfig(TypedDict): 20 | """TypedDict for SLURM script shebang configuration. 21 | 22 | Parameters 23 | ---------- 24 | base : str 25 | Base shebang line for all SLURM scripts 26 | multinode : list[str] 27 | Additional SLURM directives for multi-node configurations 28 | """ 29 | 30 | base: str 31 | multinode: list[str] 32 | 33 | 34 | class ServerSetupConfig(TypedDict): 35 | """TypedDict for server setup configuration. 
36 | 37 | Parameters 38 | ---------- 39 | single_node : list[str] 40 | Setup commands for single-node deployments 41 | multinode : list[str] 42 | Setup commands for multi-node deployments, including Ray initialization 43 | """ 44 | 45 | single_node: list[str] 46 | multinode: list[str] 47 | 48 | 49 | class SlurmScriptTemplate(TypedDict): 50 | """TypedDict for complete SLURM script template configuration. 51 | 52 | Parameters 53 | ---------- 54 | shebang : ShebangConfig 55 | Shebang and SLURM directive configuration 56 | container_setup : list[str] 57 | Commands for container setup 58 | imports : str 59 | Import statements and source commands 60 | bind_path : str 61 | Bind path environment variable for the container 62 | container_command : str 63 | Template for container execution command 64 | activate_venv : str 65 | Template for virtual environment activation 66 | server_setup : ServerSetupConfig 67 | Server initialization commands for different deployment modes 68 | find_vllm_port : list[str] 69 | Commands to find available ports for vLLM server 70 | write_to_json : list[str] 71 | Commands to write server configuration to JSON 72 | launch_cmd : list[str] 73 | vLLM server launch commands 74 | """ 75 | 76 | shebang: ShebangConfig 77 | container_setup: list[str] 78 | imports: str 79 | bind_path: str 80 | container_command: str 81 | activate_venv: str 82 | server_setup: ServerSetupConfig 83 | find_vllm_port: list[str] 84 | write_to_json: list[str] 85 | launch_cmd: list[str] 86 | 87 | 88 | SLURM_SCRIPT_TEMPLATE: SlurmScriptTemplate = { 89 | "shebang": { 90 | "base": "#!/bin/bash", 91 | "multinode": [ 92 | "#SBATCH --exclusive", 93 | "#SBATCH --tasks-per-node=1", 94 | ], 95 | }, 96 | "container_setup": [ 97 | CONTAINER_LOAD_CMD, 98 | f"{CONTAINER_MODULE_NAME} exec {IMAGE_PATH} ray stop", 99 | ], 100 | "imports": "source {src_dir}/find_port.sh", 101 | "bind_path": f"export {CONTAINER_MODULE_NAME.upper()}_BINDPATH=${CONTAINER_MODULE_NAME.upper()}_BINDPATH,/dev,/tmp,{{model_weights_path}}{{additional_binds}}", 102 | "container_command": f"{CONTAINER_MODULE_NAME} exec --nv {{env_str}} --containall {IMAGE_PATH} \\", 103 | "activate_venv": "source {venv}/bin/activate", 104 | "server_setup": { 105 | "single_node": [ 106 | "\n# Find available port", 107 | "head_node_ip=${SLURMD_NODENAME}", 108 | ], 109 | "multinode": [ 110 | "\n# Get list of nodes", 111 | 'nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")', 112 | "nodes_array=($nodes)", 113 | "head_node=${{nodes_array[0]}}", 114 | 'head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)', 115 | "\n# Check for RDMA devices and set environment variable accordingly", 116 | "if ! command -v ibv_devices >/dev/null 2>&1; then", 117 | ' echo "ibv_devices not found; forcing TCP. 
(No RDMA userland on host?)"', 118 | " export NCCL_IB_DISABLE=1", 119 | ' export NCCL_ENV_ARG="--env NCCL_IB_DISABLE=1"', 120 | "else", 121 | " # Pick GID index based on link layer (IB vs RoCE)", 122 | ' if ibv_devinfo 2>/dev/null | grep -q "link_layer:.*Ethernet"; then', 123 | " # RoCEv2 typically needs a nonzero GID index; 3 is common, try 2 if your fabric uses it", 124 | " export NCCL_IB_GID_INDEX={{NCCL_IB_GID_INDEX:-3}}", 125 | ' export NCCL_ENV_ARG="--env NCCL_IB_GID_INDEX={{NCCL_IB_GID_INDEX:-3}}"', 126 | " else", 127 | " # Native InfiniBand => GID 0", 128 | " export NCCL_IB_GID_INDEX={{NCCL_IB_GID_INDEX:-0}}", 129 | ' export NCCL_ENV_ARG="--env NCCL_IB_GID_INDEX={{NCCL_IB_GID_INDEX:-0}}"', 130 | " fi", 131 | "fi", 132 | "\n# Start Ray head node", 133 | "head_node_port=$(find_available_port $head_node_ip 8080 65535)", 134 | "ray_head=$head_node_ip:$head_node_port", 135 | 'echo "Ray Head IP: $ray_head"', 136 | 'echo "Starting HEAD at $head_node"', 137 | 'srun --nodes=1 --ntasks=1 -w "$head_node" \\', 138 | " CONTAINER_PLACEHOLDER", 139 | ' ray start --head --node-ip-address="$head_node_ip" --port=$head_node_port \\', 140 | ' --num-cpus "$SLURM_CPUS_PER_TASK" --num-gpus {gpus_per_node} --block &', 141 | "sleep 10", 142 | "\n# Start Ray worker nodes", 143 | "worker_num=$((SLURM_JOB_NUM_NODES - 1))", 144 | "for ((i = 1; i <= worker_num; i++)); do", 145 | " node_i=${{nodes_array[$i]}}", 146 | ' echo "Starting WORKER $i at $node_i"', 147 | ' srun --nodes=1 --ntasks=1 -w "$node_i" \\', 148 | " CONTAINER_PLACEHOLDER", 149 | ' ray start --address "$ray_head" \\', 150 | ' --num-cpus "$SLURM_CPUS_PER_TASK" --num-gpus {gpus_per_node} --block &', 151 | " sleep 5", 152 | "done", 153 | ], 154 | }, 155 | "find_vllm_port": [ 156 | "\nvllm_port_number=$(find_available_port $head_node_ip 8080 65535)", 157 | 'server_address="http://${head_node_ip}:${vllm_port_number}/v1"', 158 | ], 159 | "write_to_json": [ 160 | '\njson_path="{log_dir}/{model_name}.$SLURM_JOB_ID/{model_name}.$SLURM_JOB_ID.json"', 161 | 'jq --arg server_addr "$server_address" \\', 162 | " '. + {{\"server_address\": $server_addr}}' \\", 163 | ' "$json_path" > temp.json \\', 164 | ' && mv temp.json "$json_path"', 165 | ], 166 | "launch_cmd": [ 167 | "vllm serve {model_weights_path} \\", 168 | " --served-model-name {model_name} \\", 169 | ' --host "0.0.0.0" \\', 170 | " --port $vllm_port_number \\", 171 | ], 172 | } 173 | 174 | 175 | class BatchSlurmScriptTemplate(TypedDict): 176 | """TypedDict for batch SLURM script template configuration. 177 | 178 | Parameters 179 | ---------- 180 | shebang : str 181 | Shebang line for the script 182 | hetjob : str 183 | SLURM directive for hetjob 184 | permission_update : str 185 | Command to update permissions of the script 186 | launch_model_scripts : list[str] 187 | Commands to launch the vLLM server 188 | """ 189 | 190 | shebang: str 191 | hetjob: str 192 | permission_update: str 193 | launch_model_scripts: list[str] 194 | 195 | 196 | BATCH_SLURM_SCRIPT_TEMPLATE: BatchSlurmScriptTemplate = { 197 | "shebang": "#!/bin/bash", 198 | "hetjob": "#SBATCH hetjob\n", 199 | "permission_update": "chmod +x {script_name}", 200 | "launch_model_scripts": [ 201 | "\nsrun --het-group={het_group_id} \\", 202 | " --output={out_file} \\", 203 | " --error={err_file} \\", 204 | " {script_name} &\n", 205 | ], 206 | } 207 | 208 | 209 | class BatchModelLaunchScriptTemplate(TypedDict): 210 | """TypedDict for batch model launch script template configuration. 
211 | 212 | Parameters 213 | ---------- 214 | shebang : str 215 | Shebang line for the script 216 | container_setup : list[str] 217 | Commands for container setup 218 | bind_path : str 219 | Bind path environment variable for the container 220 | server_address_setup : list[str] 221 | Commands to setup the server address 222 | launch_cmd : list[str] 223 | Commands to launch the vLLM server 224 | container_command : str 225 | Commands to setup the container command 226 | """ 227 | 228 | shebang: str 229 | container_setup: str 230 | bind_path: str 231 | server_address_setup: list[str] 232 | write_to_json: list[str] 233 | launch_cmd: list[str] 234 | container_command: str 235 | 236 | 237 | BATCH_MODEL_LAUNCH_SCRIPT_TEMPLATE: BatchModelLaunchScriptTemplate = { 238 | "shebang": "#!/bin/bash\n", 239 | "container_setup": f"{CONTAINER_LOAD_CMD}\n", 240 | "bind_path": f"export {CONTAINER_MODULE_NAME.upper()}_BINDPATH=${CONTAINER_MODULE_NAME.upper()}_BINDPATH,/dev,/tmp,{{model_weights_path}}{{additional_binds}}", 241 | "server_address_setup": [ 242 | "source {src_dir}/find_port.sh", 243 | "head_node_ip=${{SLURMD_NODENAME}}", 244 | "vllm_port_number=$(find_available_port $head_node_ip 8080 65535)", 245 | 'server_address="http://${{head_node_ip}}:${{vllm_port_number}}/v1"\n', 246 | "echo $server_address\n", 247 | ], 248 | "write_to_json": [ 249 | "het_job_id=$(($SLURM_JOB_ID+{het_group_id}))", 250 | 'json_path="{log_dir}/{slurm_job_name}.$het_job_id/{model_name}.$het_job_id.json"', 251 | 'jq --arg server_addr "$server_address" \\', 252 | " '. + {{\"server_address\": $server_addr}}' \\", 253 | ' "$json_path" > temp_{model_name}.json \\', 254 | ' && mv temp_{model_name}.json "$json_path"\n', 255 | ], 256 | "container_command": f"{CONTAINER_MODULE_NAME} exec --nv --containall {IMAGE_PATH} \\", 257 | "launch_cmd": [ 258 | "vllm serve {model_weights_path} \\", 259 | " --served-model-name {model_name} \\", 260 | ' --host "0.0.0.0" \\', 261 | " --port $vllm_port_number \\", 262 | ], 263 | } 264 | -------------------------------------------------------------------------------- /docs/assets/vector-logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 15 | 17 | 26 | 31 | 34 | 38 | 42 | 46 | 50 | 54 | 58 | 62 | 66 | 70 | 74 | 78 | 82 | 86 | 90 | 94 | 98 | 102 | 106 | 110 | 114 | 118 | 122 | 126 | 130 | 134 | 138 | 142 | 146 | 150 | 154 | 158 | 162 | 166 | 170 | 171 | 172 | 173 | -------------------------------------------------------------------------------- /tests/vec_inf/client/test_utils.py: -------------------------------------------------------------------------------- 1 | """Tests for the utility functions in the vec-inf client.""" 2 | 3 | import os 4 | from unittest.mock import MagicMock, patch 5 | 6 | import pytest 7 | import requests 8 | 9 | from vec_inf.client._utils import ( 10 | MODEL_READY_SIGNATURE, 11 | find_matching_dirs, 12 | get_base_url, 13 | is_server_running, 14 | load_config, 15 | model_health_check, 16 | read_slurm_log, 17 | run_bash_command, 18 | ) 19 | 20 | 21 | @pytest.fixture 22 | def mock_log_dir(tmp_path): 23 | """Create a temporary directory for log files.""" 24 | log_dir = tmp_path / "logs" 25 | log_dir.mkdir() 26 | return log_dir 27 | 28 | 29 | def test_run_bash_command_success(): 30 | """Test that run_bash_command returns the output of the command.""" 31 | with patch("subprocess.Popen") as mock_popen: 32 | mock_process = MagicMock() 33 | mock_process.communicate.return_value = ("test output", "") 34 | mock_popen.return_value = 
mock_process 35 | result, stderr = run_bash_command("echo test") 36 | assert result == "test output" 37 | assert stderr == "" 38 | 39 | 40 | def test_run_bash_command_error(): 41 | """Test run_bash_command with error output.""" 42 | with patch("subprocess.Popen") as mock_popen: 43 | mock_process = MagicMock() 44 | mock_process.communicate.return_value = ("", "error output") 45 | mock_popen.return_value = mock_process 46 | result, stderr = run_bash_command("invalid_command") 47 | assert result == "" 48 | assert stderr == "error output" 49 | 50 | 51 | def test_read_slurm_log_found(mock_log_dir): 52 | """Test that read_slurm_log reads the content of a log file.""" 53 | test_content = ["line1\n", "line2\n"] 54 | log_file = mock_log_dir / "test_job.123.err" 55 | log_file.parent.mkdir(parents=True, exist_ok=True) 56 | log_file.write_text("".join(test_content)) 57 | result = read_slurm_log("test_job", "123", "err", mock_log_dir) 58 | assert result == test_content 59 | 60 | 61 | def test_read_slurm_log_not_found(): 62 | """Test read_slurm_log, return an error message if the log file is not found.""" 63 | result = read_slurm_log("missing_job", "456", "err", "/nonexistent") 64 | assert result == "LOG FILE NOT FOUND: /nonexistent/missing_job.456.err" 65 | 66 | 67 | @pytest.mark.parametrize( 68 | "log_content,expected", 69 | [ 70 | ([MODEL_READY_SIGNATURE], "RUNNING"), 71 | (["ERROR: something wrong"], ("FAILED", "ERROR: something wrong")), 72 | ([], "LAUNCHING"), 73 | (["some other content"], "LAUNCHING"), 74 | ], 75 | ) 76 | def test_is_server_running_statuses(log_content, expected): 77 | """Test that is_server_running returns the correct status.""" 78 | with patch("vec_inf.client._utils.read_slurm_log") as mock_read: 79 | mock_read.return_value = log_content 80 | result = is_server_running("test_job", "123", None) 81 | assert result == expected 82 | 83 | 84 | def test_get_base_url_found(): 85 | """Test that get_base_url returns the correct base URL.""" 86 | test_dict = {"server_address": "http://localhost:8000"} 87 | with patch("vec_inf.client._utils.read_slurm_log") as mock_read: 88 | mock_read.return_value = test_dict 89 | result = get_base_url("test_job", "123", None) 90 | assert result == "http://localhost:8000" 91 | 92 | 93 | def test_get_base_url_not_found(): 94 | """Test get_base_url when URL is not found in logs.""" 95 | with patch("vec_inf.client._utils.read_slurm_log") as mock_read: 96 | mock_read.return_value = {"random_key": "123"} 97 | result = get_base_url("test_job", "123", None) 98 | assert result == "URL NOT FOUND" 99 | 100 | 101 | @pytest.mark.parametrize( 102 | "url,status_code,expected", 103 | [ 104 | ("http://localhost:8000", 200, ("READY", 200)), 105 | ("http://localhost:8000", 500, ("FAILED", 500)), 106 | ("not_a_url", None, ("FAILED", "not_a_url")), 107 | ], 108 | ) 109 | def test_model_health_check(url, status_code, expected): 110 | """Test model_health_check with various scenarios.""" 111 | with patch("vec_inf.client._utils.get_base_url") as mock_url: 112 | mock_url.return_value = url 113 | if url.startswith("http"): 114 | with patch("requests.get") as mock_get: 115 | mock_get.return_value.status_code = status_code 116 | result = model_health_check("test_job", "123", None) 117 | assert result == expected 118 | else: 119 | result = model_health_check("test_job", "123", None) 120 | assert result == expected 121 | 122 | 123 | def test_model_health_check_request_exception(): 124 | """Test model_health_check when request raises an exception.""" 125 | with ( 126 | 
patch("vec_inf.client._utils.get_base_url") as mock_url, 127 | patch("requests.get") as mock_get, 128 | ): 129 | mock_url.return_value = "http://localhost:8000" 130 | mock_get.side_effect = requests.exceptions.RequestException("Connection error") 131 | result = model_health_check("test_job", "123", None) 132 | assert result == ("FAILED", "Connection error") 133 | 134 | 135 | def test_load_config_default_only(): 136 | """Test loading the actual default configuration file from the filesystem.""" 137 | configs = load_config() 138 | 139 | # Verify at least one known model exists 140 | model_names = {m.model_name for m in configs} 141 | assert "c4ai-command-r-plus-08-2024" in model_names 142 | 143 | # Verify full configuration of a sample model 144 | model = next(m for m in configs if m.model_name == "c4ai-command-r-plus-08-2024") 145 | assert model.model_family == "c4ai-command-r" 146 | assert model.model_type == "LLM" 147 | assert model.gpus_per_node == 4 148 | assert model.num_nodes == 2 149 | assert model.vllm_args["--max-model-len"] == 65536 150 | 151 | 152 | def test_load_config_with_user_override(tmp_path, monkeypatch): 153 | """Test user config overriding default values.""" 154 | # Create user config directory and file 155 | user_config_dir = tmp_path / "user_config_dir" 156 | user_config_dir.mkdir() 157 | user_config_file = user_config_dir / "models.yaml" 158 | user_config_file.write_text("""\ 159 | models: 160 | c4ai-command-r-plus-08-2024: 161 | gpus_per_node: 8 162 | new-model: 163 | model_family: new-family 164 | model_type: VLM 165 | gpus_per_node: 4 166 | num_nodes: 1 167 | vocab_size: 256000 168 | vllm_args: 169 | --max-model-len: 4096 170 | """) 171 | 172 | with monkeypatch.context() as m: 173 | m.setenv("VEC_INF_CONFIG_DIR", str(user_config_dir)) 174 | configs = load_config() 175 | config_map = {m.model_name: m for m in configs} 176 | 177 | # Verify override (merged with defaults) 178 | assert config_map["c4ai-command-r-plus-08-2024"].gpus_per_node == 8 179 | assert config_map["c4ai-command-r-plus-08-2024"].num_nodes == 2 180 | assert config_map["c4ai-command-r-plus-08-2024"].vocab_size == 256000 181 | 182 | # Verify new model 183 | new_model = config_map["new-model"] 184 | assert new_model.model_family == "new-family" 185 | assert new_model.model_type == "VLM" 186 | assert new_model.gpus_per_node == 4 187 | assert new_model.vocab_size == 256000 188 | assert new_model.vllm_args["--max-model-len"] == 4096 189 | 190 | 191 | def test_load_config_invalid_user_model(tmp_path): 192 | """Test validation of user-provided model configurations.""" 193 | # Create user config directory and file 194 | invalid_config_dir = tmp_path / "bad_config_dir" 195 | invalid_config_dir.mkdir() 196 | invalid_config_file = invalid_config_dir / "models.yaml" 197 | invalid_config_file.write_text("""\ 198 | models: 199 | invalid-model: 200 | model_family: "" 201 | model_type: INVALID_TYPE 202 | num_gpus: 0 203 | num_nodes: -1 204 | """) 205 | 206 | with ( 207 | pytest.raises(ValueError) as excinfo, 208 | patch.dict(os.environ, {"VEC_INF_CONFIG_DIR": str(invalid_config_dir)}), 209 | ): 210 | load_config() 211 | 212 | assert "validation error" in str(excinfo.value).lower() 213 | assert "model_type" in str(excinfo.value) 214 | assert "num_gpus" in str(excinfo.value) 215 | 216 | 217 | def test_find_matching_dirs_only_model_family(tmp_path): 218 | """Return model_family directory when only model_family is provided.""" 219 | fam_dir = tmp_path / "fam_a" 220 | fam_dir.mkdir() 221 | (fam_dir / "model_a.1").mkdir() 
222 | (fam_dir / "model_b.2").mkdir() 223 | 224 | other_dir = tmp_path / "fam_b" 225 | other_dir.mkdir() 226 | (other_dir / "model_c.3").mkdir() 227 | 228 | matches = find_matching_dirs(log_dir=tmp_path, model_family="fam_a") 229 | assert len(matches) == 1 230 | assert matches[0].name == "fam_a" 231 | 232 | 233 | def test_find_matching_dirs_only_model_name(tmp_path): 234 | """Return directories matching when only model_name is provided.""" 235 | fam_a = tmp_path / "fam_a" 236 | fam_a.mkdir() 237 | (fam_a / "target.1").mkdir() 238 | (fam_a / "other.2").mkdir() 239 | 240 | fam_b = tmp_path / "fam_b" 241 | fam_b.mkdir() 242 | (fam_b / "different.3").mkdir() 243 | 244 | matches = find_matching_dirs(log_dir=tmp_path, model_name="target") 245 | result_names = [p.name for p in matches] 246 | 247 | assert "target.1" in result_names 248 | assert "other.2" not in result_names 249 | assert "different.3" not in result_names 250 | 251 | 252 | def test_find_matching_dirs_only_job_id(tmp_path): 253 | """Return directories matching exact job_id.""" 254 | fam_dir = tmp_path / "fam" 255 | fam_dir.mkdir() 256 | (fam_dir / "model_a.10").mkdir() 257 | (fam_dir / "model_b.20").mkdir() 258 | (fam_dir / "model_c.30").mkdir() 259 | 260 | matches = find_matching_dirs(log_dir=tmp_path, job_id=10) 261 | result_names = [p.name for p in matches] 262 | 263 | assert "model_a.10" in result_names 264 | assert "model_b.20" not in result_names 265 | assert "model_c.30" not in result_names 266 | 267 | 268 | def test_find_matching_dirs_only_before_job_id(tmp_path): 269 | """Return directories with job_id < before_job_id.""" 270 | fam_dir = tmp_path / "fam_a" 271 | fam_dir.mkdir() 272 | (fam_dir / "model_a.1").mkdir() 273 | (fam_dir / "model_a.5").mkdir() 274 | (fam_dir / "model_a.100").mkdir() 275 | 276 | fam_dir = tmp_path / "fam_b" 277 | fam_dir.mkdir() 278 | (fam_dir / "model_b.30").mkdir() 279 | 280 | matches = find_matching_dirs(log_dir=tmp_path, before_job_id=50) 281 | result_names = [p.name for p in matches] 282 | 283 | assert "model_a.1" in result_names 284 | assert "model_a.5" in result_names 285 | assert "model_a.100" not in result_names 286 | assert "model_b.30" in result_names 287 | 288 | 289 | def test_find_matching_dirs_family_and_before_job_id(tmp_path): 290 | """Return directories under a given family with job IDs less than before_job_id.""" 291 | fam_dir = tmp_path / "targetfam" 292 | fam_dir.mkdir() 293 | (fam_dir / "model_a.10").mkdir() 294 | (fam_dir / "model_a.20").mkdir() 295 | (fam_dir / "model_a.99").mkdir() 296 | (fam_dir / "model_a.150").mkdir() 297 | 298 | other_fam = tmp_path / "otherfam" 299 | other_fam.mkdir() 300 | (other_fam / "model_b.5").mkdir() 301 | (other_fam / "model_b.10").mkdir() 302 | (other_fam / "model_b.100").mkdir() 303 | 304 | matches = find_matching_dirs( 305 | log_dir=tmp_path, 306 | model_family="targetfam", 307 | before_job_id=100, 308 | ) 309 | 310 | result_names = [p.name for p in matches] 311 | 312 | assert "model_a.10" in result_names 313 | assert "model_a.20" in result_names 314 | assert "model_a.99" in result_names 315 | assert "model_a.150" not in result_names 316 | assert all("otherfam" not in str(p) for p in matches) 317 | 318 | 319 | def test_find_matching_dirs_with_family_model_name_and_before_job_id(tmp_path): 320 | """Return matching dirs with model_family, model_name, and before_job_id filters.""" 321 | fam_dir = tmp_path / "targetfam" 322 | fam_dir.mkdir() 323 | (fam_dir / "model_a.1").mkdir() 324 | (fam_dir / "model_a.50").mkdir() 325 | (fam_dir / 
"model_a.150").mkdir() 326 | (fam_dir / "model_b.40").mkdir() 327 | 328 | other_fam = tmp_path / "otherfam" 329 | other_fam.mkdir() 330 | (other_fam / "model_c.20").mkdir() 331 | 332 | matches = find_matching_dirs( 333 | log_dir=tmp_path, 334 | model_family="targetfam", 335 | model_name="model_a", 336 | before_job_id=100, 337 | ) 338 | 339 | result_names = [p.name for p in matches] 340 | 341 | assert "model_a.1" in result_names 342 | assert "model_a.50" in result_names 343 | assert "model_a.150" not in result_names 344 | assert "model_b.40" not in result_names 345 | assert all("model_b" not in p for p in result_names) 346 | assert all("otherfam" not in str(p) for p in matches) 347 | -------------------------------------------------------------------------------- /vec_inf/client/api.py: -------------------------------------------------------------------------------- 1 | """Vector Inference client for programmatic access. 2 | 3 | This module provides the main client class for interacting with Vector Inference 4 | services programmatically. It includes functionality for launching models, monitoring 5 | their status, collecting metrics, and managing their lifecycle. 6 | 7 | See Also 8 | -------- 9 | vec_inf.client._helper : Helper classes for model inference server management 10 | vec_inf.client.models : Data models for API responses 11 | """ 12 | 13 | import re 14 | import shutil 15 | import subprocess 16 | import time 17 | import warnings 18 | from pathlib import Path 19 | from typing import Any, Optional, Union 20 | 21 | from vec_inf.client._exceptions import ( 22 | ServerError, 23 | SlurmJobError, 24 | ) 25 | from vec_inf.client._helper import ( 26 | BatchModelLauncher, 27 | ModelLauncher, 28 | ModelRegistry, 29 | ModelStatusMonitor, 30 | PerformanceMetricsCollector, 31 | ) 32 | from vec_inf.client._utils import find_matching_dirs, run_bash_command 33 | from vec_inf.client.config import ModelConfig 34 | from vec_inf.client.models import ( 35 | BatchLaunchResponse, 36 | LaunchOptions, 37 | LaunchResponse, 38 | MetricsResponse, 39 | ModelInfo, 40 | ModelStatus, 41 | StatusResponse, 42 | ) 43 | 44 | 45 | class VecInfClient: 46 | """Client for interacting with Vector Inference programmatically. 47 | 48 | This class provides methods for launching models, checking their status, 49 | retrieving metrics, and shutting down models using the Vector Inference 50 | infrastructure. 51 | 52 | Methods 53 | ------- 54 | list_models() 55 | List all available models 56 | get_model_config(model_name) 57 | Get configuration for a specific model 58 | launch_model(model_name, options) 59 | Launch a model on the cluster 60 | get_status(slurm_job_id, log_dir) 61 | Get status of a running model 62 | get_metrics(slurm_job_id, log_dir) 63 | Get performance metrics of a running model 64 | shutdown_model(slurm_job_id) 65 | Shutdown a running model 66 | wait_until_ready(slurm_job_id, timeout_seconds, poll_interval_seconds, log_dir) 67 | Wait for a model to become ready 68 | 69 | cleanup_logs(log_dir, model_name, model_family, job_id, dry_run) 70 | Remove logs from the log directory. 71 | 72 | Examples 73 | -------- 74 | >>> from vec_inf.api import VecInfClient 75 | >>> client = VecInfClient() 76 | >>> response = client.launch_model("Meta-Llama-3.1-8B-Instruct") 77 | >>> job_id = response.slurm_job_id 78 | >>> status = client.get_status(job_id) 79 | >>> if status.status == ModelStatus.READY: 80 | ... 
print(f"Model is ready at {status.base_url}") 81 | >>> client.shutdown_model(job_id) 82 | """ 83 | 84 | def __init__(self) -> None: 85 | """Initialize the Vector Inference client.""" 86 | self._metrics_collectors: dict[str, PerformanceMetricsCollector] = {} 87 | 88 | def list_models(self) -> list[ModelInfo]: 89 | """List all available models. 90 | 91 | Returns 92 | ------- 93 | list[ModelInfo] 94 | List of ModelInfo objects containing information about available models, 95 | including their configurations and specifications. 96 | """ 97 | model_registry = ModelRegistry() 98 | return model_registry.get_all_models() 99 | 100 | def get_model_config(self, model_name: str) -> ModelConfig: 101 | """Get the configuration for a specific model. 102 | 103 | Parameters 104 | ---------- 105 | model_name : str 106 | Name of the model to get configuration for 107 | 108 | Returns 109 | ------- 110 | ModelConfig 111 | Complete configuration for the specified model 112 | 113 | Raises 114 | ------ 115 | ModelNotFoundError 116 | If the specified model is not found in the configuration 117 | """ 118 | model_registry = ModelRegistry() 119 | return model_registry.get_single_model_config(model_name) 120 | 121 | def launch_model( 122 | self, model_name: str, options: Optional[LaunchOptions] = None 123 | ) -> LaunchResponse: 124 | """Launch a model on the cluster. 125 | 126 | Parameters 127 | ---------- 128 | model_name : str 129 | Name of the model to launch 130 | options : LaunchOptions, optional 131 | Launch options to override default configuration 132 | 133 | Returns 134 | ------- 135 | LaunchResponse 136 | Response containing launch details including: 137 | - SLURM job ID 138 | - Model configuration 139 | - Launch status 140 | 141 | Raises 142 | ------ 143 | ModelConfigurationError 144 | If the model configuration is invalid 145 | SlurmJobError 146 | If there's an error launching the SLURM job 147 | """ 148 | # Convert LaunchOptions to dictionary if provided 149 | options_dict: dict[str, Any] = {} 150 | if options: 151 | options_dict = {k: v for k, v in vars(options).items() if v is not None} 152 | 153 | # Create and use the API Launch Helper 154 | model_launcher = ModelLauncher(model_name, options_dict) 155 | return model_launcher.launch() 156 | 157 | def batch_launch_models( 158 | self, 159 | model_names: list[str], 160 | batch_config: Optional[str] = None, 161 | account: Optional[str] = None, 162 | work_dir: Optional[str] = None, 163 | ) -> BatchLaunchResponse: 164 | """Launch multiple models on the cluster. 165 | 166 | Parameters 167 | ---------- 168 | model_names : list[str] 169 | List of model names to launch 170 | 171 | Returns 172 | ------- 173 | BatchLaunchResponse 174 | Response containing launch details for each model 175 | 176 | Raises 177 | ------ 178 | ModelConfigurationError 179 | If the model configuration is invalid 180 | """ 181 | model_launcher = BatchModelLauncher( 182 | model_names, batch_config, account, work_dir 183 | ) 184 | return model_launcher.launch() 185 | 186 | def fetch_running_jobs(self) -> list[str]: 187 | """ 188 | Fetch the list of running vec-inf job IDs for the current user. 189 | 190 | Returns 191 | ------- 192 | list[str] 193 | List of matching job names; empty list if squeue unavailable. 
194 | """ 195 | try: 196 | res = subprocess.run( 197 | ["squeue", "--me", "--noheader"], 198 | capture_output=True, 199 | text=True, 200 | check=True, 201 | ) 202 | job_ids = [ 203 | ln.strip().split()[0] for ln in res.stdout.splitlines() if ln.strip() 204 | ] 205 | 206 | if not job_ids: 207 | return [] 208 | 209 | # For each job, fetch the full JobName and filter by suffix 210 | matching_ids = [] 211 | for jid in job_ids: 212 | try: 213 | sctl = subprocess.run( 214 | ["scontrol", "show", "job", "-o", jid], 215 | capture_output=True, 216 | text=True, 217 | check=True, 218 | ) 219 | m = re.search(r"\bJobName=([^\s]+)", sctl.stdout) 220 | if m and m.group(1).endswith("-vec-inf"): 221 | matching_ids.append(jid) 222 | except subprocess.CalledProcessError: 223 | # Job might have finished between squeue and scontrol; skip 224 | continue 225 | 226 | return matching_ids 227 | 228 | except subprocess.CalledProcessError as e: 229 | raise SlurmJobError(f"Error running slurm command: {e}") from e 230 | 231 | def get_status(self, slurm_job_id: str) -> StatusResponse: 232 | """Get the status of a running model. 233 | 234 | Parameters 235 | ---------- 236 | slurm_job_id : str 237 | The SLURM job ID to check 238 | 239 | Returns 240 | ------- 241 | StatusResponse 242 | Status information including: 243 | - Model name 244 | - Server status 245 | - Job state 246 | - Base URL (if ready) 247 | - Error information (if failed) 248 | """ 249 | model_status_monitor = ModelStatusMonitor(slurm_job_id) 250 | return model_status_monitor.process_model_status() 251 | 252 | def get_metrics(self, slurm_job_id: str) -> MetricsResponse: 253 | """Get the performance metrics of a running model. 254 | 255 | Parameters 256 | ---------- 257 | slurm_job_id : str 258 | The SLURM job ID to get metrics for 259 | 260 | Returns 261 | ------- 262 | MetricsResponse 263 | Response containing: 264 | - Model name 265 | - Performance metrics or error message 266 | - Timestamp of collection 267 | """ 268 | # Use cached collector to preserve state between calls to compute throughput 269 | if slurm_job_id not in self._metrics_collectors: 270 | self._metrics_collectors[slurm_job_id] = PerformanceMetricsCollector( 271 | slurm_job_id 272 | ) 273 | 274 | performance_metrics_collector = self._metrics_collectors[slurm_job_id] 275 | 276 | metrics: Union[dict[str, float], str] 277 | if not performance_metrics_collector.metrics_url.startswith("http"): 278 | metrics = performance_metrics_collector.metrics_url 279 | else: 280 | metrics = performance_metrics_collector.fetch_metrics() 281 | 282 | return MetricsResponse( 283 | model_name=performance_metrics_collector.status_info.model_name, 284 | metrics=metrics, 285 | timestamp=time.time(), 286 | ) 287 | 288 | def shutdown_model(self, slurm_job_id: str) -> bool: 289 | """Shutdown a running model. 
290 | 291 | Parameters 292 | ---------- 293 | slurm_job_id : str 294 | The SLURM job ID to shut down 295 | 296 | Returns 297 | ------- 298 | bool 299 | True if the model was successfully shutdown 300 | 301 | Raises 302 | ------ 303 | SlurmJobError 304 | If there was an error shutting down the model 305 | """ 306 | shutdown_cmd = f"scancel {slurm_job_id}" 307 | _, stderr = run_bash_command(shutdown_cmd) 308 | if stderr: 309 | raise SlurmJobError(f"Failed to shutdown model: {stderr}") 310 | return True 311 | 312 | def wait_until_ready( 313 | self, 314 | slurm_job_id: str, 315 | timeout_seconds: int = 1800, 316 | poll_interval_seconds: int = 10, 317 | ) -> StatusResponse: 318 | """Wait until a model is ready or fails. 319 | 320 | Parameters 321 | ---------- 322 | slurm_job_id : str 323 | The SLURM job ID to wait for 324 | timeout_seconds : int, optional 325 | Maximum time to wait in seconds, by default 1800 (30 mins) 326 | poll_interval_seconds : int, optional 327 | How often to check status in seconds, by default 10 328 | 329 | Returns 330 | ------- 331 | StatusResponse 332 | Status information when the model becomes ready 333 | 334 | Raises 335 | ------ 336 | SlurmJobError 337 | If the specified job is not found or there's an error with the job 338 | ServerError 339 | If the server fails to start within the timeout period 340 | APIError 341 | If there was an error checking the status 342 | 343 | Notes 344 | ----- 345 | The timeout is reset if the model is still in PENDING state after the 346 | initial timeout period. This allows for longer queue times in the SLURM 347 | scheduler. 348 | """ 349 | start_time = time.time() 350 | 351 | while True: 352 | status_info = self.get_status(slurm_job_id) 353 | 354 | if status_info.server_status == ModelStatus.READY: 355 | return status_info 356 | 357 | if status_info.server_status == ModelStatus.FAILED: 358 | error_message = status_info.failed_reason or "Unknown error" 359 | raise ServerError(f"Model failed to start: {error_message}") 360 | 361 | if status_info.server_status == ModelStatus.SHUTDOWN: 362 | raise ServerError("Model was shutdown before it became ready") 363 | 364 | # Check timeout 365 | if time.time() - start_time > timeout_seconds: 366 | if status_info.server_status == ModelStatus.PENDING: 367 | warnings.warn( 368 | f"Model is still pending after {timeout_seconds} seconds, resetting timer...", 369 | UserWarning, 370 | stacklevel=2, 371 | ) 372 | start_time = time.time() 373 | raise ServerError( 374 | f"Timed out waiting for model to become ready after {timeout_seconds} seconds" 375 | ) 376 | 377 | # Wait before checking again 378 | time.sleep(poll_interval_seconds) 379 | 380 | def cleanup_logs( 381 | self, 382 | log_dir: Optional[Union[str, Path]] = None, 383 | model_family: Optional[str] = None, 384 | model_name: Optional[str] = None, 385 | job_id: Optional[int] = None, 386 | before_job_id: Optional[int] = None, 387 | dry_run: bool = False, 388 | ) -> list[Path]: 389 | """Remove logs from the log directory. 390 | 391 | Parameters 392 | ---------- 393 | log_dir : str or Path, optional 394 | Root directory containing log files. Defaults to ~/.vec-inf-logs. 395 | model_family : str, optional 396 | Only delete logs for this model family. 397 | model_name : str, optional 398 | Only delete logs for this model name. 399 | job_id : int, optional 400 | If provided, only match directories with this exact SLURM job ID. 401 | before_job_id : int, optional 402 | If provided, only delete logs with job ID less than this value. 
403 | dry_run : bool 404 | If True, return matching files without deleting them. 405 | 406 | Returns 407 | ------- 408 | list[Path] 409 | List of deleted (or matched if dry_run) log file paths. 410 | """ 411 | log_root = Path(log_dir) if log_dir else Path.home() / ".vec-inf-logs" 412 | matched = find_matching_dirs( 413 | log_dir=log_root, 414 | model_family=model_family, 415 | model_name=model_name, 416 | job_id=job_id, 417 | before_job_id=before_job_id, 418 | ) 419 | 420 | if dry_run: 421 | return matched 422 | 423 | for path in matched: 424 | shutil.rmtree(path) 425 | 426 | return matched 427 | -------------------------------------------------------------------------------- /vec_inf/client/_slurm_script_generator.py: -------------------------------------------------------------------------------- 1 | """Class for generating Slurm scripts to run vLLM servers. 2 | 3 | This module provides functionality to generate Slurm scripts for running vLLM servers 4 | in both single-node and multi-node configurations. 5 | """ 6 | 7 | from datetime import datetime 8 | from pathlib import Path 9 | from typing import Any 10 | 11 | from vec_inf.client._client_vars import SLURM_JOB_CONFIG_ARGS 12 | from vec_inf.client._slurm_templates import ( 13 | BATCH_MODEL_LAUNCH_SCRIPT_TEMPLATE, 14 | BATCH_SLURM_SCRIPT_TEMPLATE, 15 | SLURM_SCRIPT_TEMPLATE, 16 | ) 17 | from vec_inf.client._slurm_vars import CONTAINER_MODULE_NAME 18 | 19 | 20 | class SlurmScriptGenerator: 21 | """A class to generate Slurm scripts for running vLLM servers. 22 | 23 | This class handles the generation of Slurm scripts for both single-node and 24 | multi-node configurations, supporting different virtualization environments 25 | (venv or singularity/apptainer). 26 | 27 | Parameters 28 | ---------- 29 | params : dict[str, Any] 30 | Configuration parameters for the Slurm script. 31 | """ 32 | 33 | def __init__(self, params: dict[str, Any]): 34 | self.params = params 35 | self.is_multinode = int(self.params["num_nodes"]) > 1 36 | self.use_container = self.params["venv"] == CONTAINER_MODULE_NAME 37 | self.additional_binds = ( 38 | f",{self.params['bind']}" if self.params.get("bind") else "" 39 | ) 40 | self.model_weights_path = str( 41 | Path(self.params["model_weights_parent_dir"], self.params["model_name"]) 42 | ) 43 | self.env_str = self._generate_env_str() 44 | 45 | def _generate_env_str(self) -> str: 46 | """Generate the environment variables string for the Slurm script. 47 | 48 | Returns 49 | ------- 50 | str 51 | Formatted env vars string for container or shell export commands. 52 | """ 53 | env_dict: dict[str, str] = self.params.get("env", {}) 54 | 55 | if not env_dict: 56 | return "" 57 | 58 | if self.use_container: 59 | # Format for container: --env KEY1=VAL1,KEY2=VAL2 60 | env_pairs = [f"{key}={val}" for key, val in env_dict.items()] 61 | return f"--env {','.join(env_pairs)}" 62 | # Format for shell: export KEY1=VAL1\nexport KEY2=VAL2 63 | export_lines = [f"export {key}={val}" for key, val in env_dict.items()] 64 | return "\n".join(export_lines) 65 | 66 | def _generate_script_content(self) -> str: 67 | """Generate the complete Slurm script content. 68 | 69 | Returns 70 | ------- 71 | str 72 | The complete Slurm script as a string. 
73 | """ 74 | script_content = [] 75 | script_content.append(self._generate_shebang()) 76 | script_content.append(self._generate_server_setup()) 77 | script_content.append(self._generate_launch_cmd()) 78 | return "\n".join(script_content) 79 | 80 | def _generate_shebang(self) -> str: 81 | """Generate the Slurm script shebang with job specifications. 82 | 83 | Returns 84 | ------- 85 | str 86 | Slurm shebang containing job specifications. 87 | """ 88 | shebang = [SLURM_SCRIPT_TEMPLATE["shebang"]["base"]] 89 | for arg, value in SLURM_JOB_CONFIG_ARGS.items(): 90 | if self.params.get(value): 91 | shebang.append(f"#SBATCH --{arg}={self.params[value]}") 92 | if value == "model_name": 93 | shebang[-1] += "-vec-inf" 94 | if self.is_multinode: 95 | shebang += SLURM_SCRIPT_TEMPLATE["shebang"]["multinode"] 96 | return "\n".join(shebang) 97 | 98 | def _generate_server_setup(self) -> str: 99 | """Generate the server initialization script. 100 | 101 | Creates the script section that handles server setup, including Ray 102 | initialization for multi-node setups and port configuration. 103 | 104 | Returns 105 | ------- 106 | str 107 | Server initialization script content. 108 | """ 109 | server_script = ["\n"] 110 | if self.use_container: 111 | server_script.append("\n".join(SLURM_SCRIPT_TEMPLATE["container_setup"])) 112 | server_script.append( 113 | SLURM_SCRIPT_TEMPLATE["bind_path"].format( 114 | model_weights_path=self.model_weights_path, 115 | additional_binds=self.additional_binds, 116 | ) 117 | ) 118 | else: 119 | server_script.append( 120 | SLURM_SCRIPT_TEMPLATE["activate_venv"].format(venv=self.params["venv"]) 121 | ) 122 | server_script.append(self.env_str) 123 | server_script.append( 124 | SLURM_SCRIPT_TEMPLATE["imports"].format(src_dir=self.params["src_dir"]) 125 | ) 126 | if self.is_multinode: 127 | server_setup_str = "\n".join( 128 | SLURM_SCRIPT_TEMPLATE["server_setup"]["multinode"] 129 | ).format(gpus_per_node=self.params["gpus_per_node"]) 130 | if self.use_container: 131 | server_setup_str = server_setup_str.replace( 132 | "CONTAINER_PLACEHOLDER", 133 | SLURM_SCRIPT_TEMPLATE["container_command"].format( 134 | model_weights_path=self.model_weights_path, 135 | env_str=self.env_str, 136 | ), 137 | ) 138 | else: 139 | server_setup_str = server_setup_str.replace( 140 | "CONTAINER_PLACEHOLDER", 141 | "\\", 142 | ) 143 | else: 144 | server_setup_str = "\n".join( 145 | SLURM_SCRIPT_TEMPLATE["server_setup"]["single_node"] 146 | ) 147 | server_script.append(server_setup_str) 148 | server_script.append("\n".join(SLURM_SCRIPT_TEMPLATE["find_vllm_port"])) 149 | server_script.append( 150 | "\n".join(SLURM_SCRIPT_TEMPLATE["write_to_json"]).format( 151 | log_dir=self.params["log_dir"], model_name=self.params["model_name"] 152 | ) 153 | ) 154 | return "\n".join(server_script) 155 | 156 | def _generate_launch_cmd(self) -> str: 157 | """Generate the vLLM server launch command. 158 | 159 | Creates the command to launch the vLLM server, handling different virtualization 160 | environments (venv or singularity/apptainer). 161 | 162 | Returns 163 | ------- 164 | str 165 | Server launch command. 
166 | """ 167 | launcher_script = ["\n"] 168 | if self.use_container: 169 | launcher_script.append( 170 | SLURM_SCRIPT_TEMPLATE["container_command"].format( 171 | model_weights_path=self.model_weights_path, 172 | env_str=self.env_str, 173 | ) 174 | ) 175 | 176 | launcher_script.append( 177 | "\n".join(SLURM_SCRIPT_TEMPLATE["launch_cmd"]).format( 178 | model_weights_path=self.model_weights_path, 179 | model_name=self.params["model_name"], 180 | ) 181 | ) 182 | 183 | for arg, value in self.params["vllm_args"].items(): 184 | if isinstance(value, bool): 185 | launcher_script.append(f" {arg} \\") 186 | else: 187 | launcher_script.append(f" {arg} {value} \\") 188 | return "\n".join(launcher_script) 189 | 190 | def write_to_log_dir(self) -> Path: 191 | """Write the generated Slurm script to the log directory. 192 | 193 | Creates a timestamped script file in the configured log directory. 194 | 195 | Returns 196 | ------- 197 | Path 198 | Path to the generated Slurm script file. 199 | """ 200 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 201 | script_path: Path = ( 202 | Path(self.params["log_dir"]) 203 | / f"launch_{self.params['model_name']}_{timestamp}.sbatch" 204 | ) 205 | 206 | content = self._generate_script_content() 207 | script_path.write_text(content) 208 | return script_path 209 | 210 | 211 | class BatchSlurmScriptGenerator: 212 | """A class to generate Slurm scripts for batch mode. 213 | 214 | This class handles the generation of Slurm scripts for batch mode, which 215 | launches multiple vLLM servers with different configurations in parallel. 216 | """ 217 | 218 | def __init__(self, params: dict[str, Any]): 219 | self.params = params 220 | self.script_paths: list[Path] = [] 221 | self.use_container = self.params["venv"] == CONTAINER_MODULE_NAME 222 | for model_name in self.params["models"]: 223 | self.params["models"][model_name]["additional_binds"] = ( 224 | f",{self.params['models'][model_name]['bind']}" 225 | if self.params["models"][model_name].get("bind") 226 | else "" 227 | ) 228 | self.params["models"][model_name]["model_weights_path"] = str( 229 | Path( 230 | self.params["models"][model_name]["model_weights_parent_dir"], 231 | model_name, 232 | ) 233 | ) 234 | 235 | def _write_to_log_dir(self, script_content: list[str], script_name: str) -> Path: 236 | """Write the generated Slurm script to the log directory. 237 | 238 | Returns 239 | ------- 240 | Path 241 | The Path object to the generated Slurm script file. 242 | """ 243 | script_path = Path(self.params["log_dir"]) / script_name 244 | script_path.touch(exist_ok=True) 245 | script_path.write_text("\n".join(script_content)) 246 | return script_path 247 | 248 | def _generate_model_launch_script(self, model_name: str) -> Path: 249 | """Generate the bash script for launching individual vLLM servers. 250 | 251 | Parameters 252 | ---------- 253 | model_name : str 254 | The name of the model to launch. 255 | 256 | Returns 257 | ------- 258 | Path 259 | The bash script path for launching the vLLM server. 
260 | """ 261 | # Generate the bash script content 262 | script_content = [] 263 | model_params = self.params["models"][model_name] 264 | script_content.append(BATCH_MODEL_LAUNCH_SCRIPT_TEMPLATE["shebang"]) 265 | if self.use_container: 266 | script_content.append(BATCH_MODEL_LAUNCH_SCRIPT_TEMPLATE["container_setup"]) 267 | script_content.append( 268 | BATCH_MODEL_LAUNCH_SCRIPT_TEMPLATE["bind_path"].format( 269 | model_weights_path=model_params["model_weights_path"], 270 | additional_binds=model_params["additional_binds"], 271 | ) 272 | ) 273 | script_content.append( 274 | "\n".join( 275 | BATCH_MODEL_LAUNCH_SCRIPT_TEMPLATE["server_address_setup"] 276 | ).format(src_dir=self.params["src_dir"]) 277 | ) 278 | script_content.append( 279 | "\n".join(BATCH_MODEL_LAUNCH_SCRIPT_TEMPLATE["write_to_json"]).format( 280 | het_group_id=model_params["het_group_id"], 281 | log_dir=self.params["log_dir"], 282 | slurm_job_name=self.params["slurm_job_name"], 283 | model_name=model_name, 284 | ) 285 | ) 286 | if self.use_container: 287 | script_content.append( 288 | BATCH_MODEL_LAUNCH_SCRIPT_TEMPLATE["container_command"].format( 289 | model_weights_path=model_params["model_weights_path"], 290 | ) 291 | ) 292 | script_content.append( 293 | "\n".join(BATCH_MODEL_LAUNCH_SCRIPT_TEMPLATE["launch_cmd"]).format( 294 | model_weights_path=model_params["model_weights_path"], 295 | model_name=model_name, 296 | ) 297 | ) 298 | for arg, value in model_params["vllm_args"].items(): 299 | if isinstance(value, bool): 300 | script_content.append(f" {arg} \\") 301 | else: 302 | script_content.append(f" {arg} {value} \\") 303 | script_content[-1] = script_content[-1].replace("\\", "") 304 | # Write the bash script to the log directory 305 | launch_script_path = self._write_to_log_dir( 306 | script_content, f"launch_{model_name}.sh" 307 | ) 308 | self.script_paths.append(launch_script_path) 309 | return launch_script_path 310 | 311 | def _generate_batch_slurm_script_shebang(self) -> str: 312 | """Generate the shebang for batch mode Slurm script. 313 | 314 | Returns 315 | ------- 316 | str 317 | The shebang for batch mode Slurm script. 318 | """ 319 | shebang = [BATCH_SLURM_SCRIPT_TEMPLATE["shebang"]] 320 | 321 | for arg, value in SLURM_JOB_CONFIG_ARGS.items(): 322 | if self.params.get(value): 323 | shebang.append(f"#SBATCH --{arg}={self.params[value]}") 324 | shebang.append("#SBATCH --ntasks=1") 325 | shebang.append("\n") 326 | 327 | for model_name in self.params["models"]: 328 | shebang.append(f"# ===== Resource group for {model_name} =====") 329 | for arg, value in SLURM_JOB_CONFIG_ARGS.items(): 330 | model_params = self.params["models"][model_name] 331 | if model_params.get(value) and value not in ["out_file", "err_file"]: 332 | shebang.append(f"#SBATCH --{arg}={model_params[value]}") 333 | if value == "model_name": 334 | shebang[-1] += "-vec-inf" 335 | shebang[-1] += "\n" 336 | shebang.append(BATCH_SLURM_SCRIPT_TEMPLATE["hetjob"]) 337 | # Remove the last hetjob line 338 | shebang.pop() 339 | return "\n".join(shebang) 340 | 341 | def generate_batch_slurm_script(self) -> Path: 342 | """Generate the Slurm script for launching multiple vLLM servers in batch mode. 343 | 344 | Returns 345 | ------- 346 | Path 347 | The Slurm script for launching multiple vLLM servers in batch mode. 
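# -- Editor's note: illustrative sketch, not part of the original file --
# Rough shape of the `params` dict that BatchSlurmScriptGenerator reads, limited to
# the keys referenced in this file; real launches assemble this dict elsewhere in the
# package and include additional Slurm resource fields. All values are hypothetical,
# and `log_dir` must already exist because the per-model launch scripts are written there.
from vec_inf.client._slurm_script_generator import BatchSlurmScriptGenerator

batch_params = {
    "slurm_job_name": "demo-batch",
    "log_dir": "/home/user/.vec-inf-logs/demo-batch",
    "src_dir": "/path/to/vec_inf",
    "venv": "/path/to/venv",  # or the container module name to enable container mode
    "models": {
        "demo-model": {
            "model_weights_parent_dir": "/model-weights",
            "vllm_args": {"--max-model-len": 8192, "--enforce-eager": True},
            "het_group_id": 0,
            "out_file": "/home/user/.vec-inf-logs/demo-batch/demo-model.out",
            "err_file": "/home/user/.vec-inf-logs/demo-batch/demo-model.err",
        },
    },
}

batch_script_path = BatchSlurmScriptGenerator(batch_params).generate_batch_slurm_script()
print(batch_script_path)  # <log_dir>/demo-batch_<timestamp>.sbatch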
348 | """ 349 | script_content = [] 350 | 351 | script_content.append(self._generate_batch_slurm_script_shebang()) 352 | 353 | for model_name in self.params["models"]: 354 | model_params = self.params["models"][model_name] 355 | script_content.append(f"# ===== Launching {model_name} =====") 356 | launch_script_path = str(self._generate_model_launch_script(model_name)) 357 | script_content.append( 358 | BATCH_SLURM_SCRIPT_TEMPLATE["permission_update"].format( 359 | script_name=launch_script_path 360 | ) 361 | ) 362 | script_content.append( 363 | "\n".join(BATCH_SLURM_SCRIPT_TEMPLATE["launch_model_scripts"]).format( 364 | het_group_id=model_params["het_group_id"], 365 | out_file=model_params["out_file"], 366 | err_file=model_params["err_file"], 367 | script_name=launch_script_path, 368 | ) 369 | ) 370 | script_content.append("wait") 371 | 372 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 373 | script_name = f"{self.params['slurm_job_name']}_{timestamp}.sbatch" 374 | return self._write_to_log_dir(script_content, script_name) 375 | -------------------------------------------------------------------------------- /vec_inf/client/_utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions shared between CLI and API. 2 | 3 | This module provides utility functions for managing SLURM jobs, server status checks, 4 | and configuration handling for the vector inference package. 5 | """ 6 | 7 | import json 8 | import os 9 | import subprocess 10 | import warnings 11 | from pathlib import Path 12 | from typing import Any, Optional, Union, cast 13 | 14 | import requests 15 | import yaml 16 | 17 | from vec_inf.client._client_vars import MODEL_READY_SIGNATURE 18 | from vec_inf.client._exceptions import MissingRequiredFieldsError 19 | from vec_inf.client._slurm_vars import CACHED_CONFIG_DIR, REQUIRED_ARGS 20 | from vec_inf.client.config import ModelConfig 21 | from vec_inf.client.models import ModelStatus 22 | 23 | 24 | def run_bash_command(command: str) -> tuple[str, str]: 25 | """Run a bash command and return the output. 26 | 27 | Parameters 28 | ---------- 29 | command : str 30 | The bash command to execute 31 | 32 | Returns 33 | ------- 34 | tuple[str, str] 35 | A tuple containing (stdout, stderr) from the command execution 36 | """ 37 | process = subprocess.Popen( 38 | command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True 39 | ) 40 | return process.communicate() 41 | 42 | 43 | def read_slurm_log( 44 | slurm_job_name: str, 45 | slurm_job_id: str, 46 | slurm_log_type: str, 47 | log_dir: str, 48 | ) -> Union[list[str], str, dict[str, str]]: 49 | """Read the slurm log file. 
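# -- Editor's note: quick illustration of run_bash_command defined above --
# The helper returns whatever Popen.communicate() produced, so callers check both
# streams; the squeue invocation is just a hypothetical example command.
from vec_inf.client._utils import run_bash_command

stdout, stderr = run_bash_command("squeue --me --noheader")
if stderr:
    print(f"squeue reported an error: {stderr}")
else:
    print(stdout.splitlines())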
50 | 51 | Parameters 52 | ---------- 53 | slurm_job_name : str 54 | Name of the SLURM job 55 | slurm_job_id : str 56 | ID of the SLURM job 57 | slurm_log_type : str 58 | Type of log file to read ('out', 'err', or 'json') 59 | log_dir : str 60 | Directory containing log files 61 | 62 | Returns 63 | ------- 64 | Union[list[str], str, dict[str, str]] 65 | Contents of the log file: 66 | - list[str] for 'out' and 'err' logs 67 | - dict[str, str] for 'json' logs 68 | - str for error messages if file not found 69 | """ 70 | try: 71 | if "+" in slurm_job_id: 72 | main_job_id, het_job_id = slurm_job_id.split("+") 73 | slurm_job_id = str(int(main_job_id) + int(het_job_id)) 74 | file_path = Path(log_dir, f"{slurm_job_name}.{slurm_job_id}.{slurm_log_type}") 75 | if slurm_log_type == "json": 76 | with file_path.open("r") as file: 77 | json_content: dict[str, str] = json.load(file) 78 | return json_content 79 | else: 80 | with file_path.open("r") as file: 81 | return file.readlines() 82 | except FileNotFoundError: 83 | return f"LOG FILE NOT FOUND: {file_path}" 84 | 85 | 86 | def is_server_running( 87 | slurm_job_name: str, slurm_job_id: str, log_dir: str 88 | ) -> Union[str, ModelStatus, tuple[ModelStatus, str]]: 89 | """Check if a model is ready to serve requests. 90 | 91 | Parameters 92 | ---------- 93 | slurm_job_name : str 94 | Name of the SLURM job 95 | slurm_job_id : str 96 | ID of the SLURM job 97 | log_dir : str 98 | Directory containing log files 99 | 100 | Returns 101 | ------- 102 | Union[str, ModelStatus, tuple[ModelStatus, str]] 103 | - str: Error message if logs cannot be read 104 | - ModelStatus: Current status of the server 105 | - tuple[ModelStatus, str]: Status and error message if server failed 106 | """ 107 | log_content = read_slurm_log(slurm_job_name, slurm_job_id, "err", log_dir) 108 | if isinstance(log_content, str): 109 | return log_content 110 | 111 | # Patterns that indicate fatal errors (not just warnings) 112 | fatal_error_patterns = [ 113 | "traceback", 114 | "exception", 115 | "fatal error", 116 | "critical error", 117 | "failed to", 118 | "could not", 119 | "unable to", 120 | "error:", 121 | ] 122 | 123 | # Patterns to ignore (non-fatal warnings/info messages) 124 | ignore_patterns = [ 125 | "deprecated", 126 | "futurewarning", 127 | "userwarning", 128 | "deprecationwarning", 129 | "slurmstepd: error:", # SLURM cancellation messages (often after server started) 130 | ] 131 | 132 | ready_signature_found = False 133 | fatal_error_line = None 134 | 135 | for line in log_content: 136 | line_lower = line.lower() 137 | 138 | # Check for ready signature first - if found, server is running 139 | if MODEL_READY_SIGNATURE in line: 140 | ready_signature_found = True 141 | # Continue checking to see if there are errors after startup 142 | 143 | # Check for fatal errors (only if we haven't seen ready signature yet) 144 | if not ready_signature_found: 145 | # Skip lines that match ignore patterns 146 | if any(ignore_pattern in line_lower for ignore_pattern in ignore_patterns): 147 | continue 148 | 149 | # Check for fatal error patterns 150 | for pattern in fatal_error_patterns: 151 | if pattern in line_lower: 152 | # Additional check: skip if it's part of a warning message 153 | # (warnings often contain "error:" but aren't fatal) 154 | if "warning" in line_lower and "error:" in line_lower: 155 | continue 156 | fatal_error_line = line.strip("\n") 157 | break 158 | 159 | # If we found a fatal error, mark as failed 160 | if fatal_error_line: 161 | return (ModelStatus.FAILED, 
fatal_error_line) 162 | 163 | # If ready signature was found and no fatal errors, server is running 164 | if ready_signature_found: 165 | return "RUNNING" 166 | 167 | # Otherwise, still launching 168 | return ModelStatus.LAUNCHING 169 | 170 | 171 | def get_base_url(slurm_job_name: str, slurm_job_id: str, log_dir: str) -> str: 172 | """Get the base URL of a model. 173 | 174 | Parameters 175 | ---------- 176 | slurm_job_name : str 177 | Name of the SLURM job 178 | slurm_job_id : str 179 | ID of the SLURM job 180 | log_dir : str 181 | Directory containing log files 182 | 183 | Returns 184 | ------- 185 | str 186 | Base URL of the model server or error message if not found 187 | """ 188 | log_content = read_slurm_log(slurm_job_name, slurm_job_id, "json", log_dir) 189 | if isinstance(log_content, str): 190 | return log_content 191 | 192 | server_addr = cast(dict[str, str], log_content).get("server_address") 193 | return server_addr if server_addr else "URL NOT FOUND" 194 | 195 | 196 | def model_health_check( 197 | slurm_job_name: str, slurm_job_id: str, log_dir: str 198 | ) -> tuple[ModelStatus, Union[str, int]]: 199 | """Check the health of a running model on the cluster. 200 | 201 | Parameters 202 | ---------- 203 | slurm_job_name : str 204 | Name of the SLURM job 205 | slurm_job_id : str 206 | ID of the SLURM job 207 | log_dir : str 208 | Directory containing log files 209 | 210 | Returns 211 | ------- 212 | tuple[ModelStatus, Union[str, int]] 213 | Tuple containing: 214 | - ModelStatus: Current status of the model 215 | - Union[str, int]: Either HTTP status code or error message 216 | """ 217 | base_url = get_base_url(slurm_job_name, slurm_job_id, log_dir) 218 | if not base_url.startswith("http"): 219 | return (ModelStatus.FAILED, base_url) 220 | health_check_url = base_url.replace("v1", "health") 221 | 222 | try: 223 | response = requests.get(health_check_url) 224 | # Check if the request was successful 225 | if response.status_code == 200: 226 | return (ModelStatus.READY, response.status_code) 227 | return (ModelStatus.FAILED, response.status_code) 228 | except requests.exceptions.RequestException as e: 229 | return (ModelStatus.FAILED, str(e)) 230 | 231 | 232 | def load_config(config_path: Optional[str] = None) -> list[ModelConfig]: 233 | """Load the model configuration. 234 | 235 | Loads configuration from default and user-specified paths, merging them 236 | if both exist. User configuration takes precedence over default values. 237 | 238 | Parameters 239 | ---------- 240 | config_path : Optional[str] 241 | Path to the configuration file 242 | 243 | Returns 244 | ------- 245 | list[ModelConfig] 246 | List of validated model configurations 247 | 248 | Notes 249 | ----- 250 | Configuration is loaded from: 251 | 1. User path: specified by config_path, used on its own if provided 252 | 2. Default path: CACHED_CONFIG_DIR/models.yaml if it exists, otherwise the package's config/models.yaml 253 | 3. Environment variables: VEC_INF_MODEL_CONFIG (path to a models.yaml file) or VEC_INF_CONFIG_DIR (directory containing models.yaml), 254 | merged with the default config 255 | 256 | If user configuration exists, it will be merged with default configuration, 257 | with user values taking precedence for overlapping fields.
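# -- Editor's note: illustrative sketch of the precedence described above --
# Paths and model entries are hypothetical; the environment variable names match
# resolve_config_path_from_env_var() below. An explicit config_path skips merging
# entirely, while the env-var file is merged on top of the defaults.
import os

from vec_inf.client._utils import load_config

os.environ["VEC_INF_MODEL_CONFIG"] = "/home/user/my-models.yaml"
merged_configs = load_config()  # defaults merged with the user file, user values win
only_user_configs = load_config("/home/user/my-models.yaml")  # this file only
print([cfg.model_name for cfg in merged_configs])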
258 | """ 259 | 260 | def load_yaml_config(path: Path) -> dict[str, Any]: 261 | """Load YAML config with error handling.""" 262 | try: 263 | with path.open() as f: 264 | return yaml.safe_load(f) or {} 265 | except FileNotFoundError as err: 266 | raise FileNotFoundError(f"Could not find config: {path}") from err 267 | except yaml.YAMLError as err: 268 | raise ValueError(f"Error parsing YAML config at {path}: {err}") from err 269 | 270 | def process_config(config: dict[str, Any]) -> list[ModelConfig]: 271 | """Process the config based on the config type.""" 272 | return [ 273 | ModelConfig(model_name=name, **model_data) 274 | for name, model_data in config.get("models", {}).items() 275 | ] 276 | 277 | def resolve_config_path_from_env_var() -> Path | None: 278 | """Resolve the config path from the environment variable.""" 279 | config_dir = os.getenv("VEC_INF_CONFIG_DIR") 280 | config_path = os.getenv("VEC_INF_MODEL_CONFIG") 281 | if config_path: 282 | return Path(config_path) 283 | if config_dir: 284 | return Path(config_dir, "models.yaml") 285 | return None 286 | 287 | def update_config( 288 | config: dict[str, Any], user_config: dict[str, Any] 289 | ) -> dict[str, Any]: 290 | """Update the config with the user config.""" 291 | for name, data in user_config.get("models", {}).items(): 292 | if name in config.get("models", {}): 293 | config["models"][name].update(data) 294 | else: 295 | config.setdefault("models", {})[name] = data 296 | 297 | return config 298 | 299 | # 1. If config_path is given, use only that 300 | if config_path: 301 | config = load_yaml_config(Path(config_path)) 302 | return process_config(config) 303 | 304 | # 2. Otherwise, load default config 305 | default_path = ( 306 | CACHED_CONFIG_DIR / "models.yaml" 307 | if CACHED_CONFIG_DIR.exists() 308 | else Path(__file__).resolve().parent.parent / "config" / "models.yaml" 309 | ) 310 | config = load_yaml_config(default_path) 311 | 312 | # 3. If user config exists, merge it 313 | user_path = resolve_config_path_from_env_var() 314 | if user_path and user_path.exists(): 315 | user_config = load_yaml_config(user_path) 316 | config = update_config(config, user_config) 317 | elif user_path: 318 | warnings.warn( 319 | f"WARNING: Could not find user config: {str(user_path)}, revert to default config located at {default_path}", 320 | UserWarning, 321 | stacklevel=2, 322 | ) 323 | 324 | return process_config(config) 325 | 326 | 327 | def parse_launch_output(output: str) -> tuple[str, dict[str, str]]: 328 | """Parse output from model launch command. 329 | 330 | Parameters 331 | ---------- 332 | output : str 333 | Raw output from the launch command 334 | 335 | Returns 336 | ------- 337 | tuple[str, dict[str, str]] 338 | Tuple containing: 339 | - str: SLURM job ID 340 | - dict[str, str]: Dictionary of parsed configuration parameters 341 | 342 | Notes 343 | ----- 344 | Extracts the SLURM job ID and configuration parameters from the launch 345 | command output. Configuration parameters are parsed from key-value pairs 346 | in the output text. 347 | """ 348 | slurm_job_id = output.split(" ")[-1].strip().strip("\n") 349 | 350 | # Extract config parameters 351 | config_dict = {} 352 | output_lines = output.split("\n")[:-2] 353 | for line in output_lines: 354 | if ": " in line: 355 | key, value = line.split(": ", 1) 356 | config_dict[key.lower().replace(" ", "_")] = value 357 | 358 | return slurm_job_id, config_dict 359 | 360 | 361 | def is_power_of_two(n: int) -> bool: 362 | """Check if a number is a power of two. 
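# -- Editor's note: worked example for parse_launch_output above --
# The sample text is hypothetical but follows the expected shape: "Key: Value"
# lines, then sbatch's "Submitted batch job <id>" line, then a trailing newline.
from vec_inf.client._utils import parse_launch_output

sample_output = "Model Name: demo-model\nPartition: gpu\nSubmitted batch job 123456\n"
job_id, config_dict = parse_launch_output(sample_output)
assert job_id == "123456"
assert config_dict == {"model_name": "demo-model", "partition": "gpu"}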
363 | 364 | Parameters 365 | ---------- 366 | n : int 367 | The number to check 368 | """ 369 | return n > 0 and (n & (n - 1)) == 0 370 | 371 | 372 | def find_matching_dirs( 373 | log_dir: Path, 374 | model_family: Optional[str] = None, 375 | model_name: Optional[str] = None, 376 | job_id: Optional[int] = None, 377 | before_job_id: Optional[int] = None, 378 | ) -> list[Path]: 379 | """ 380 | Find log directories based on filtering criteria. 381 | 382 | Parameters 383 | ---------- 384 | log_dir : Path 385 | The base directory containing model family directories. 386 | model_family : str, optional 387 | Filter to only search inside this family. 388 | model_name : str, optional 389 | Filter to only match model names. 390 | job_id : int, optional 391 | Filter to only match this exact SLURM job ID. 392 | before_job_id : int, optional 393 | Filter to only include job IDs less than this value. 394 | 395 | Returns 396 | ------- 397 | list[Path] 398 | List of directories that match the criteria and can be deleted. 399 | """ 400 | matched = [] 401 | 402 | if not log_dir.exists() or not log_dir.is_dir(): 403 | raise FileNotFoundError(f"Log directory does not exist: {log_dir}") 404 | 405 | if not model_family and not model_name and not job_id and not before_job_id: 406 | return [log_dir] 407 | 408 | for family_dir in log_dir.iterdir(): 409 | if not family_dir.is_dir(): 410 | continue 411 | if model_family and family_dir.name != model_family: 412 | continue 413 | 414 | if model_family and not model_name and not job_id and not before_job_id: 415 | return [family_dir] 416 | 417 | for job_dir in family_dir.iterdir(): 418 | if not job_dir.is_dir(): 419 | continue 420 | 421 | try: 422 | name_part, id_part = job_dir.name.rsplit(".", 1) 423 | parsed_id = int(id_part) 424 | except ValueError: 425 | continue 426 | 427 | if model_name and name_part != model_name: 428 | continue 429 | if job_id is not None and parsed_id != job_id: 430 | continue 431 | if before_job_id is not None and parsed_id >= before_job_id: 432 | continue 433 | 434 | matched.append(job_dir) 435 | 436 | return matched 437 | 438 | 439 | def check_required_fields(params: dict[str, Any]) -> dict[str, Any]: 440 | """Check for required fields without default vals and their corresponding env vars. 441 | 442 | Parameters 443 | ---------- 444 | params : dict[str, Any] 445 | Dictionary of parameters to check. 446 | """ 447 | env_overrides = {} 448 | for arg in REQUIRED_ARGS: 449 | if not params.get(arg): 450 | default_value = os.getenv(REQUIRED_ARGS[arg]) 451 | if default_value: 452 | params[arg] = default_value 453 | env_overrides[arg] = default_value 454 | else: 455 | raise MissingRequiredFieldsError( 456 | f"{arg} is required, please set it in the command arguments or environment variables" 457 | ) 458 | return env_overrides 459 | -------------------------------------------------------------------------------- /vec_inf/cli/_cli.py: -------------------------------------------------------------------------------- 1 | """Command line interface for Vector Inference. 2 | 3 | This module provides the command-line interface for interacting with Vector 4 | Inference services, including model launching, status checking, metrics 5 | monitoring, and shutdown operations. 
6 | 7 | Commands 8 | -------- 9 | launch 10 | Launch a model on the cluster 11 | status 12 | Check the status of a running model 13 | shutdown 14 | Stop a running model 15 | list 16 | List available models or get specific model configuration 17 | metrics 18 | Stream real-time performance metrics 19 | """ 20 | 21 | import json 22 | import time 23 | from typing import Optional, Union 24 | 25 | import click 26 | from rich.console import Console 27 | from rich.live import Live 28 | 29 | from vec_inf.cli._helper import ( 30 | BatchLaunchResponseFormatter, 31 | LaunchResponseFormatter, 32 | ListCmdDisplay, 33 | ListStatusDisplay, 34 | MetricsResponseFormatter, 35 | StatusResponseFormatter, 36 | ) 37 | from vec_inf.client import LaunchOptions, VecInfClient 38 | 39 | 40 | CONSOLE = Console() 41 | 42 | 43 | @click.group() 44 | def cli() -> None: 45 | """Vector Inference CLI.""" 46 | pass 47 | 48 | 49 | @cli.command("launch", help="Launch a model on the cluster.") 50 | @click.argument("model-name", type=str, nargs=1) 51 | @click.option("--model-family", type=str, help="The model family") 52 | @click.option("--model-variant", type=str, help="The model variant") 53 | @click.option( 54 | "--partition", 55 | type=str, 56 | help="Type of Slurm partition", 57 | ) 58 | @click.option( 59 | "--resource-type", 60 | type=str, 61 | help="Type of resource to request for the job", 62 | ) 63 | @click.option( 64 | "--num-nodes", 65 | type=int, 66 | help="Number of nodes to use, default to suggested resource allocation for model", 67 | ) 68 | @click.option( 69 | "--gpus-per-node", 70 | type=int, 71 | help="Number of GPUs/node to use, default to suggested resource allocation for model", 72 | ) 73 | @click.option( 74 | "--cpus-per-task", 75 | type=int, 76 | help="Number of CPU cores per task", 77 | ) 78 | @click.option( 79 | "--mem-per-node", 80 | type=str, 81 | help="Memory allocation per node in GB format (e.g., '32G')", 82 | ) 83 | @click.option( 84 | "--account", 85 | "-A", 86 | type=str, 87 | help="Charge resources used by this job to specified account.", 88 | ) 89 | @click.option( 90 | "--work-dir", 91 | "-D", 92 | type=str, 93 | help="Set working directory for the batch job", 94 | ) 95 | @click.option( 96 | "--qos", 97 | type=str, 98 | help="Quality of service", 99 | ) 100 | @click.option( 101 | "--exclude", 102 | type=str, 103 | help="Exclude certain nodes from the resources granted to the job", 104 | ) 105 | @click.option( 106 | "--nodelist", 107 | type=str, 108 | help="Request a specific list of nodes for deployment", 109 | ) 110 | @click.option( 111 | "--bind", 112 | type=str, 113 | help="Additional binds for the container as a comma separated list of bind paths", 114 | ) 115 | @click.option( 116 | "--time", 117 | type=str, 118 | help="Time limit for job, this should comply with QoS limits", 119 | ) 120 | @click.option( 121 | "--venv", 122 | type=str, 123 | help="Path to virtual environment", 124 | ) 125 | @click.option( 126 | "--log-dir", 127 | type=str, 128 | help="Path to slurm log directory", 129 | ) 130 | @click.option( 131 | "--model-weights-parent-dir", 132 | type=str, 133 | help="Path to parent directory containing model weights", 134 | ) 135 | @click.option( 136 | "--vllm-args", 137 | type=str, 138 | help="vLLM engine arguments to be set, use the format as specified in vLLM documentation and separate arguments with commas, e.g. 
--vllm-args '--max-model-len=8192,--max-num-seqs=256,--enable-prefix-caching'", 139 | ) 140 | @click.option( 141 | "--json-mode", 142 | is_flag=True, 143 | help="Output in JSON string", 144 | ) 145 | @click.option( 146 | "--env", 147 | type=str, 148 | help="Environment variables to be set. Separate variables with commas. Can also include a path to a file containing environment variables separated by newlines. e.g. --env 'TRITON_CACHE_DIR=/scratch/.cache/triton,my_custom_vars_file.env'", 149 | ) 150 | @click.option( 151 | "--config", 152 | type=str, 153 | help="Path to a model config yaml file to use in place of the default", 154 | ) 155 | def launch( 156 | model_name: str, 157 | **cli_kwargs: Optional[Union[str, int, float, bool]], 158 | ) -> None: 159 | """Launch a model on the cluster. 160 | 161 | Parameters 162 | ---------- 163 | model_name : str 164 | Name of the model to launch 165 | **cli_kwargs : dict 166 | Additional launch options including: 167 | - model_family : str, optional 168 | Family/architecture of the model 169 | - model_variant : str, optional 170 | Specific variant of the model 171 | - partition : str, optional 172 | Type of Slurm partition 173 | - resource_type : str, optional 174 | Type of resource to request for the job 175 | - num_nodes : int, optional 176 | Number of nodes to use 177 | - gpus_per_node : int, optional 178 | Number of GPUs per node 179 | - cpus_per_task : int, optional 180 | Number of CPU cores per task 181 | - mem_per_node : str, optional 182 | Memory allocation per node in GB format (e.g., '32G') 183 | - account : str, optional 184 | Charge resources used by this job to specified account 185 | - work_dir : str, optional 186 | Set working directory for the batch job 187 | - qos : str, optional 188 | Quality of service tier 189 | - exclude : str, optional 190 | Exclude certain nodes from the resources granted to the job 191 | - nodelist : str, optional 192 | Request a specific list of nodes for deployment 193 | - bind : str, optional 194 | Additional binds for the container as a comma separated list of bind paths 195 | - time : str, optional 196 | Time limit for job 197 | - venv : str, optional 198 | Path to virtual environment 199 | - log_dir : str, optional 200 | Path to SLURM log directory 201 | - model_weights_parent_dir : str, optional 202 | Path to model weights directory 203 | - vllm_args : str, optional 204 | vLLM engine arguments 205 | - env : str, optional 206 | Environment variables 207 | - config : str, optional 208 | Path to custom model config yaml file 209 | - json_mode : bool, optional 210 | Output in JSON format 211 | 212 | Raises 213 | ------ 214 | click.ClickException 215 | If launch fails for any reason 216 | """ 217 | try: 218 | # Convert cli_kwargs to LaunchOptions 219 | json_mode = cli_kwargs["json_mode"] 220 | del cli_kwargs["json_mode"] 221 | 222 | launch_options = LaunchOptions(**cli_kwargs) # type: ignore 223 | 224 | # Start the client and launch model inference server 225 | client = VecInfClient() 226 | launch_response = client.launch_model(model_name, launch_options) 227 | 228 | # Display launch information 229 | if json_mode: 230 | click.echo(json.dumps(launch_response.config)) 231 | else: 232 | launch_formatter = LaunchResponseFormatter( 233 | model_name, launch_response.config 234 | ) 235 | launch_info_table = launch_formatter.format_table_output() 236 | CONSOLE.print(launch_info_table) 237 | 238 | except click.ClickException as e: 239 | raise e 240 | except Exception as e: 241 | raise click.ClickException(f"Launch
failed: {str(e)}") from e 242 | 243 | 244 | @cli.command( 245 | "batch-launch", 246 | help="Launch multiple models in a batch, separate model names with spaces.", 247 | ) 248 | @click.argument("model-names", type=str, nargs=-1) 249 | @click.option( 250 | "--batch-config", 251 | type=str, 252 | help="Model configuration for batch launch", 253 | ) 254 | @click.option( 255 | "--account", 256 | "-A", 257 | type=str, 258 | help="Charge resources used by this job to specified account.", 259 | ) 260 | @click.option( 261 | "--work-dir", 262 | "-D", 263 | type=str, 264 | help="Set working directory for the batch job", 265 | ) 266 | @click.option( 267 | "--json-mode", 268 | is_flag=True, 269 | help="Output in JSON string", 270 | ) 271 | def batch_launch( 272 | model_names: tuple[str, ...], 273 | batch_config: Optional[str] = None, 274 | account: Optional[str] = None, 275 | work_dir: Optional[str] = None, 276 | json_mode: Optional[bool] = False, 277 | ) -> None: 278 | """Launch multiple models in a batch. 279 | 280 | Parameters 281 | ---------- 282 | model_names : tuple[str, ...] 283 | Names of the models to launch 284 | batch_config : str 285 | Model configuration for batch launch 286 | json_mode : bool, default=False 287 | Whether to output in JSON format 288 | 289 | Raises 290 | ------ 291 | click.ClickException 292 | If batch launch fails 293 | """ 294 | try: 295 | # Start the client and launch models in batch mode 296 | client = VecInfClient() 297 | batch_launch_response = client.batch_launch_models( 298 | list(model_names), batch_config, account, work_dir 299 | ) 300 | 301 | # Display batch launch information 302 | if json_mode: 303 | click.echo(json.dumps(batch_launch_response.config, indent=4)) 304 | else: 305 | batch_launch_formatter = BatchLaunchResponseFormatter( 306 | batch_launch_response.config 307 | ) 308 | batch_launch_info_table = batch_launch_formatter.format_table_output() 309 | CONSOLE.print(batch_launch_info_table) 310 | 311 | except click.ClickException as e: 312 | raise e 313 | except Exception as e: 314 | raise click.ClickException(f"Batch launch failed: {str(e)}") from e 315 | 316 | 317 | @cli.command("status", help="Check the status of running vec-inf jobs on the cluster.") 318 | @click.argument("slurm_job_id", required=False) 319 | @click.option( 320 | "--json-mode", 321 | is_flag=True, 322 | help="Output in JSON string", 323 | ) 324 | def status(slurm_job_id: Optional[str] = None, json_mode: bool = False) -> None: 325 | """Get the status of a running model on the cluster. 
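# -- Editor's note: illustrative sketch of the Python API these commands wrap --
# The model name is hypothetical, and the LaunchOptions field names are assumed to
# mirror the CLI flags above (unset fields are simply omitted). Status responses are
# printed as-is because their fields are not shown in this excerpt.
from vec_inf.client import LaunchOptions, VecInfClient

client = VecInfClient()
options = LaunchOptions(partition="gpu", num_nodes=1, gpus_per_node=1)
launch_response = client.launch_model("demo-model", options)
print(launch_response.config)  # the same dict the CLI renders as a table or JSON

for job_id in client.fetch_running_jobs():  # what `status` iterates when no job id is given
    print(job_id, client.get_status(job_id))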
326 | 327 | Parameters 328 | ---------- 329 | slurm_job_id : str 330 | ID of the SLURM job to check 331 | json_mode : bool, default=False 332 | Whether to output in JSON format 333 | 334 | Raises 335 | ------ 336 | click.ClickException 337 | If status check fails 338 | """ 339 | try: 340 | # Start the client and get model inference server status 341 | client = VecInfClient() 342 | if not slurm_job_id: 343 | slurm_job_ids = client.fetch_running_jobs() 344 | if not slurm_job_ids: 345 | click.echo("No running jobs found.") 346 | return 347 | else: 348 | slurm_job_ids = [slurm_job_id] 349 | responses = [] 350 | for job_id in slurm_job_ids: 351 | responses.append(client.get_status(job_id)) 352 | 353 | # Display status information 354 | if slurm_job_id: 355 | status_formatter = StatusResponseFormatter(responses[0]) 356 | if json_mode: 357 | status_formatter.output_json() 358 | else: 359 | status_info_table = status_formatter.output_table() 360 | CONSOLE.print(status_info_table) 361 | else: 362 | list_status_display = ListStatusDisplay(slurm_job_ids, responses, json_mode) 363 | list_status_display.display_multiple_status_output(CONSOLE) 364 | 365 | except click.ClickException as e: 366 | raise e 367 | except Exception as e: 368 | raise click.ClickException(f"Status check failed: {str(e)}") from e 369 | 370 | 371 | @cli.command("shutdown", help="Shutdown a running model on the cluster.") 372 | @click.argument("slurm_job_id", type=str, nargs=1) 373 | def shutdown(slurm_job_id: str) -> None: 374 | """Shutdown a running model on the cluster. 375 | 376 | Parameters 377 | ---------- 378 | slurm_job_id : str 379 | ID of the SLURM job to shut down 380 | 381 | Raises 382 | ------ 383 | click.ClickException 384 | If shutdown operation fails 385 | """ 386 | try: 387 | client = VecInfClient() 388 | client.shutdown_model(slurm_job_id) 389 | click.echo(f"Shutting down model with Slurm Job ID: {slurm_job_id}") 390 | except Exception as e: 391 | raise click.ClickException(f"Shutdown failed: {str(e)}") from e 392 | 393 | 394 | @cli.command("list", help="List available models or get specific model configuration.") 395 | @click.argument("model-name", required=False) 396 | @click.option( 397 | "--json-mode", 398 | is_flag=True, 399 | help="Output in JSON string", 400 | ) 401 | def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> None: 402 | """List all available models, or get default setup of a specific model. 403 | 404 | Parameters 405 | ---------- 406 | model_name : str, optional 407 | Name of specific model to get information for 408 | json_mode : bool, default=False 409 | Whether to output in JSON format 410 | 411 | Raises 412 | ------ 413 | click.ClickException 414 | If list operation fails 415 | """ 416 | try: 417 | # Start the client 418 | client = VecInfClient() 419 | list_display = ListCmdDisplay(CONSOLE, json_mode) 420 | if model_name: 421 | model_config = client.get_model_config(model_name) 422 | list_display.display_single_model_output(model_config) 423 | else: 424 | model_infos = client.list_models() 425 | list_display.display_all_models_output(model_infos) 426 | except click.ClickException as e: 427 | raise e 428 | except Exception as e: 429 | raise click.ClickException(f"List models failed: {str(e)}") from e 430 | 431 | 432 | @cli.command( 433 | "metrics", help="Stream real-time performance metrics from the model endpoint." 
434 | ) 435 | @click.argument("slurm_job_id", type=str, nargs=1) 436 | def metrics(slurm_job_id: str) -> None: 437 | """Stream real-time performance metrics from the model endpoint. 438 | 439 | Parameters 440 | ---------- 441 | slurm_job_id : str 442 | ID of the SLURM job to monitor 443 | 444 | Raises 445 | ------ 446 | click.ClickException 447 | If metrics collection fails 448 | 449 | Notes 450 | ----- 451 | This command continuously streams metrics with a 2-second refresh interval 452 | until interrupted. If metrics are not available, it will display status 453 | information instead. 454 | """ 455 | try: 456 | # Start the client and get inference server metrics 457 | client = VecInfClient() 458 | metrics_response = client.get_metrics(slurm_job_id) 459 | metrics_formatter = MetricsResponseFormatter(metrics_response.metrics) 460 | 461 | # Check if metrics response is ready 462 | if isinstance(metrics_response.metrics, str): 463 | metrics_formatter.format_failed_metrics(metrics_response.metrics) 464 | CONSOLE.print(metrics_formatter.table) 465 | return 466 | 467 | with Live(refresh_per_second=1, console=CONSOLE) as live: 468 | while True: 469 | metrics_response = client.get_metrics(slurm_job_id) 470 | metrics_formatter = MetricsResponseFormatter(metrics_response.metrics) 471 | 472 | if isinstance(metrics_response.metrics, str): 473 | # Show status information if metrics aren't available 474 | metrics_formatter.format_failed_metrics(metrics_response.metrics) 475 | else: 476 | metrics_formatter.format_metrics() 477 | 478 | live.update(metrics_formatter.table) 479 | time.sleep(1) 480 | except click.ClickException as e: 481 | raise e 482 | except Exception as e: 483 | raise click.ClickException(f"Metrics check failed: {str(e)}") from e 484 | 485 | 486 | @cli.command("cleanup", help="Clean up log files based on optional filters.") 487 | @click.option("--log-dir", type=str, help="Path to SLURM log directory") 488 | @click.option("--model-family", type=str, help="Filter by model family") 489 | @click.option("--model-name", type=str, help="Filter by model name") 490 | @click.option( 491 | "--job-id", type=int, help="Only remove logs with this exact SLURM job ID" 492 | ) 493 | @click.option( 494 | "--before-job-id", 495 | type=int, 496 | help="Remove logs with job ID less than this value", 497 | ) 498 | @click.option("--dry-run", is_flag=True, help="List matching logs without deleting") 499 | def cleanup_logs_cli( 500 | log_dir: Optional[str], 501 | model_family: Optional[str], 502 | model_name: Optional[str], 503 | job_id: Optional[int], 504 | before_job_id: Optional[int], 505 | dry_run: bool, 506 | ) -> None: 507 | """Clean up log files based on optional filters. 508 | 509 | Parameters 510 | ---------- 511 | log_dir : str or Path, optional 512 | Root directory containing log files. Defaults to ~/.vec-inf-logs. 513 | model_family : str, optional 514 | Only delete logs for this model family. 515 | model_name : str, optional 516 | Only delete logs for this model name. 517 | job_id : int, optional 518 | If provided, only match directories with this exact SLURM job ID. 519 | before_job_id : int, optional 520 | If provided, only delete logs with job ID less than this value. 521 | dry_run : bool 522 | If True, return matching files without deleting them. 
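# -- Editor's note: illustrative sketch combining the metrics and cleanup paths --
# As in the CLI code above, the metrics payload is a plain string when metrics are
# not yet available; the job id and filters below are hypothetical, and dry_run=True
# previews matching log directories without deleting anything.
from vec_inf.client import VecInfClient

client = VecInfClient()

metrics_response = client.get_metrics("1234567")
if isinstance(metrics_response.metrics, str):
    print(f"Metrics not available yet: {metrics_response.metrics}")
else:
    print(metrics_response.metrics)

would_delete = client.cleanup_logs(model_family="demo-family", before_job_id=1000000, dry_run=True)
print(f"{len(would_delete)} log directories would be removed")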
523 | """ 524 | try: 525 | client = VecInfClient() 526 | matched = client.cleanup_logs( 527 | log_dir=log_dir, 528 | model_family=model_family, 529 | model_name=model_name, 530 | job_id=job_id, 531 | before_job_id=before_job_id, 532 | dry_run=dry_run, 533 | ) 534 | 535 | if not matched: 536 | if dry_run: 537 | click.echo("Dry run: no matching log directories found.") 538 | else: 539 | click.echo("No matching log directories were deleted.") 540 | elif dry_run: 541 | click.echo(f"Dry run: {len(matched)} directories would be deleted:") 542 | for f in matched: 543 | click.echo(f" - {f}") 544 | else: 545 | click.echo(f"Deleted {len(matched)} log directory(ies).") 546 | except Exception as e: 547 | raise click.ClickException(f"Cleanup failed: {str(e)}") from e 548 | 549 | 550 | if __name__ == "__main__": 551 | cli() 552 | --------------------------------------------------------------------------------
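# -- Editor's closing note: programmatic equivalents of the remaining commands --
# Each CLI command above is a thin wrapper around VecInfClient, so scripted
# workflows can call the same methods directly. The model name and job id are
# hypothetical placeholders.
from vec_inf.client import VecInfClient

client = VecInfClient()
print(client.list_models())                   # `vec-inf list`
print(client.get_model_config("demo-model"))  # `vec-inf list demo-model`
client.shutdown_model("1234567")              # `vec-inf shutdown 1234567`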