├── .python-version
├── tests
│   ├── __init__.py
│   ├── vec_inf
│   │   ├── __init__.py
│   │   ├── cli
│   │   │   ├── __init__.py
│   │   │   └── test_utils.py
│   │   └── client
│   │       ├── __init__.py
│   │       ├── test_vars.env
│   │       ├── test_models.py
│   │       ├── test_examples.py
│   │       └── test_utils.py
│   └── test_imports.py
├── vec_inf
│   ├── __init__.py
│   ├── cli
│   │   ├── __init__.py
│   │   ├── _vars.py
│   │   ├── _utils.py
│   │   └── _cli.py
│   ├── config
│   │   ├── README.md
│   │   └── environment.yaml
│   ├── client
│   │   ├── __init__.py
│   │   ├── _exceptions.py
│   │   ├── _client_vars.py
│   │   ├── _slurm_vars.py
│   │   ├── config.py
│   │   ├── models.py
│   │   ├── _slurm_templates.py
│   │   ├── api.py
│   │   ├── _slurm_script_generator.py
│   │   └── _utils.py
│   ├── README.md
│   └── find_port.sh
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── config.yml
│   │   ├── model-request.md
│   │   ├── bug_report.md
│   │   └── feature_request.md
│   ├── pull_request_template.md
│   ├── dependabot.yml
│   └── workflows
│       ├── publish.yml
│       ├── code_checks.yml
│       ├── docker.yml
│       ├── unit_tests.yml
│       └── docs.yml
├── docs
│   ├── assets
│   │   ├── favicon.ico
│   │   └── vector-logo.svg
│   ├── overrides
│   │   └── partials
│   │       ├── logo.html
│   │       └── copyright.html
│   ├── api.md
│   ├── Makefile
│   ├── make.bat
│   ├── index.md
│   ├── contributing.md
│   └── stylesheets
│       └── extra.css
├── examples
│   ├── inference
│   │   ├── llm
│   │   │   ├── completions.sh
│   │   │   ├── completions.py
│   │   │   └── chat_completions.py
│   │   ├── text_embedding
│   │   │   └── embeddings.py
│   │   └── vlm
│   │       └── vision_completions.py
│   ├── logits
│   │   └── logits.py
│   ├── slurm_dependency
│   │   ├── run_workflow.sh
│   │   ├── downstream_job.sbatch
│   │   ├── run_downstream.py
│   │   └── README.md
│   ├── README.md
│   └── api
│       └── basic_usage.py
├── codecov.yml
├── venv.sh
├── LICENSE
├── .pre-commit-config.yaml
├── profile
│   ├── avg_throughput.py
│   └── gen.py
├── Dockerfile
├── .gitignore
├── mkdocs.yml
├── pyproject.toml
├── README.md
└── MODEL_TRACKING.md
/.python-version:
--------------------------------------------------------------------------------
1 | 3.10
2 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | """Unit tests module."""
2 |
--------------------------------------------------------------------------------
/vec_inf/__init__.py:
--------------------------------------------------------------------------------
1 | """vec_inf package."""
2 |
--------------------------------------------------------------------------------
/vec_inf/cli/__init__.py:
--------------------------------------------------------------------------------
1 | """vec_inf cli package."""
2 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false
2 |
--------------------------------------------------------------------------------
/tests/vec_inf/__init__.py:
--------------------------------------------------------------------------------
1 | """Unit tests for vec_inf package."""
2 |
--------------------------------------------------------------------------------
/tests/vec_inf/cli/__init__.py:
--------------------------------------------------------------------------------
1 | """Unit tests for vec_inf.cli subpackage."""
2 |
--------------------------------------------------------------------------------
/tests/vec_inf/client/__init__.py:
--------------------------------------------------------------------------------
1 | """Tests for the Vector Inference API."""
2 |
--------------------------------------------------------------------------------
/tests/vec_inf/client/test_vars.env:
--------------------------------------------------------------------------------
1 | MY_VAR=5
2 | VLLM_CACHE_ROOT=/cache/vllm
3 |
--------------------------------------------------------------------------------
/docs/assets/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VectorInstitute/vector-inference/HEAD/docs/assets/favicon.ico
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | # PR Type
2 | [Feature | Fix | Documentation | Other() ]
3 |
4 | # Short Description
5 | ...
6 |
7 | # Tests Added
8 | ...
9 |
--------------------------------------------------------------------------------
/docs/overrides/partials/logo.html:
--------------------------------------------------------------------------------
1 | {% if config.theme.logo %}
2 |
3 | {% else %}
4 |
5 | {% endif %}
6 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/model-request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Model request
3 | about: Request for new model weights or model config
4 | title: New model request for [MODEL_NAME]
5 | labels: new model
6 | assignees: XkunW
7 |
8 | ---
9 |
10 | ### Request Type
11 | Model weights | Model config | Both
12 |
13 | ### Model Name
14 | Name of the model requested
15 |
--------------------------------------------------------------------------------
/examples/inference/llm/completions.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # The url can be found with vec-inf status $JOB_ID
4 | export API_BASE_URL=http://gpuXXX:XXXX/v1
5 |
6 | # Update the model path accordingly
7 | curl ${API_BASE_URL}/completions \
8 | -H "Content-Type: application/json" \
9 | -d '{
10 | "model": "Meta-Llama-3.1-8B-Instruct",
11 | "prompt": "What is the capital of Canada?",
12 | "max_tokens": 20
13 | }'
14 |
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | codecov:
2 | branch: main
3 | require_ci_to_pass: true
4 | notify:
5 | after_n_builds: 2
6 | wait_for_ci: yes
7 | comment:
8 | behavior: default
9 | layout: reach,diff,flags,tree,reach
10 | show_carryforward_flags: false
11 | require_changes: true
12 | coverage:
13 | status:
14 | changes: true
15 | default_rules:
16 | flag_coverage_not_uploaded_behavior: include
17 | patch: true
18 | project: true
19 | github_checks:
20 | annotations: true
21 |
--------------------------------------------------------------------------------
/examples/inference/llm/completions.py:
--------------------------------------------------------------------------------
1 | """Example of how to use the OpenAI API to generate completions."""
2 |
3 | from openai import OpenAI
4 |
5 |
6 | # The url can be found with vec-inf status $JOB_ID
7 | client = OpenAI(base_url="http://gpuXXX:XXXX/v1", api_key="EMPTY")
8 |
9 | # Update the model path accordingly
10 | completion = client.completions.create(
11 | model="Meta-Llama-3.1-8B-Instruct",
12 | prompt="Where is the capital of Canada?",
13 | max_tokens=20,
14 | )
15 |
16 | print(completion)
17 |
--------------------------------------------------------------------------------
/examples/logits/logits.py:
--------------------------------------------------------------------------------
1 | """Example of how to get logits from the model."""
2 |
3 | from openai import OpenAI
4 |
5 |
6 | # The url can be found with vec-inf status $JOB_ID
7 | client = OpenAI(base_url="http://gpuXXX:XXXX/v1", api_key="EMPTY")
8 |
9 | completion = client.completions.create(
10 | model="Meta-Llama-3.1-8B-Instruct",
11 | prompt="Where is the capital of Canada?",
12 | max_tokens=1,
13 | logprobs=128256, # Set to model vocab size to get logits
14 | )
15 |
16 | print(completion.choices[0].logprobs)
17 |
--------------------------------------------------------------------------------
/vec_inf/config/README.md:
--------------------------------------------------------------------------------
1 | # Configs
2 |
3 | * [`environment.yaml`](environment.yaml): Configuration for the Slurm cluster environment, including image paths, resource availability limits, default values, etc.
4 | * [`models.yaml`](models.yaml): Configuration for launching model inference servers, including Slurm parameters as well as `vllm serve` arguments.
5 |
6 | **NOTE**: These configs act as last-resort fallbacks in the `vec-inf` package; they are updated to match the latest cached config on the Vector Killarney cluster with each new package version release.
7 |
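8 | To sanity-check the bundled fallback values, you can load `environment.yaml` directly. Below is a minimal sketch using PyYAML (assuming PyYAML is installed and the snippet is run from the repository root):
9 |
10 | ```python
11 | import yaml
12 |
13 | # Load the fallback environment config shipped with the package
14 | with open("vec_inf/config/environment.yaml") as f:
15 |     env_cfg = yaml.safe_load(f)
16 |
17 | # Print the default Slurm arguments used when nothing else overrides them
18 | for key, value in env_cfg["default_args"].items():
19 |     print(f"{key}: {value}")
20 | ```
21 |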
--------------------------------------------------------------------------------
/examples/slurm_dependency/run_workflow.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # ---- Config ----
4 | MODEL_NAME="Meta-Llama-3.1-8B-Instruct"
5 | LAUNCH_ARGS="$MODEL_NAME"
6 |
7 | # ---- Step 1: Launch the server
8 | RAW_JSON=$(vec-inf launch $LAUNCH_ARGS --json-mode)
9 | SERVER_JOB_ID=$(echo "$RAW_JSON" | python3 -c "import sys, json; print(json.load(sys.stdin)['slurm_job_id'])")
10 | echo "Launched server as job $SERVER_JOB_ID"
11 | echo "$RAW_JSON"
12 |
13 | # ---- Step 2: Submit downstream job
14 | sbatch --dependency=after:$SERVER_JOB_ID --export=SERVER_JOB_ID=$SERVER_JOB_ID downstream_job.sbatch
15 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # To get started with Dependabot version updates, you'll need to specify which
2 | # package ecosystems to update and where the package manifests are located.
3 | # Please see the documentation for all configuration options:
4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
5 |
6 | version: 2
7 | updates:
8 | - package-ecosystem: "github-actions" # See documentation for possible values
9 | directory: "/" # Location of package manifests
10 | schedule:
11 | interval: "weekly"
12 |
--------------------------------------------------------------------------------
/docs/api.md:
--------------------------------------------------------------------------------
1 | # Python API Reference
2 |
3 | This section documents the Python API for vector-inference.
4 |
5 | ## Client Interface
6 |
7 | ::: vec_inf.client.api.VecInfClient
8 | options:
9 | show_root_heading: true
10 | show_root_full_path: true
11 | members: true
12 |
13 | ## Model Config
14 |
15 | ::: vec_inf.client.config.ModelConfig
16 | options:
17 | show_root_heading: true
18 | show_root_full_path: true
19 | members: true
20 |
21 |
22 | ## Data Models
23 |
24 | ::: vec_inf.client.models
25 | options:
26 | show_root_heading: true
27 | members: true
28 |
--------------------------------------------------------------------------------
/examples/inference/text_embedding/embeddings.py:
--------------------------------------------------------------------------------
1 | """Example of how to use the OpenAI API to generate embeddings."""
2 |
3 | from openai import OpenAI
4 |
5 |
6 | # The url can be found with vec-inf status $JOB_ID
7 | client = OpenAI(base_url="http://gpuXXX:XXXX/v1", api_key="EMPTY")
8 |
9 | model_name = "bge-base-en-v1.5"
10 |
11 | input_texts = [
12 | "The chef prepared a delicious meal.",
13 | ]
14 |
15 | # test single embedding
16 | embedding_response = client.embeddings.create(
17 | model=model_name,
18 | input=input_texts,
19 | encoding_format="float",
20 | )
21 |
22 | print(embedding_response)
23 |
--------------------------------------------------------------------------------
/tests/vec_inf/cli/test_utils.py:
--------------------------------------------------------------------------------
1 | """Tests for the utils functions in the vec-inf cli."""
2 |
3 | from vec_inf.cli._utils import create_table
4 |
5 |
6 | def test_create_table_with_header():
7 | """Test that create_table creates a table with the correct header."""
8 | table = create_table("Key", "Value")
9 | assert table.columns[0].header == "Key"
10 | assert table.columns[1].header == "Value"
11 | assert table.show_header is True
12 |
13 |
14 | def test_create_table_without_header():
15 | """Test create_table without header."""
16 | table = create_table(show_header=False)
17 | assert table.show_header is False
18 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: ''
6 | assignees: XkunW
7 |
8 | ---
9 |
10 | ### Describe the bug
11 | A clear and concise description of what the bug is.
12 |
13 | ### To Reproduce
14 | Code snippet or clear steps to reproduce behaviour.
15 |
16 | ### Expected behavior
17 | A clear and concise description of what you expected to happen.
18 |
19 | ### Screenshots
20 | If applicable, add screenshots to help explain your problem.
21 |
22 | ### Version
23 | - Version info such as v0.1.5
24 |
25 | ### Additional context
26 | Add any other context about the problem here.
27 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | ### Is your feature request related to a problem? Please describe.
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | ### Describe the solution you'd like
14 | A clear and concise description of what you want to happen.
15 |
16 | ### Describe alternatives you've considered
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | ### Additional context
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/examples/inference/llm/chat_completions.py:
--------------------------------------------------------------------------------
1 | """Example of how to use the OpenAI API to generate chat completions."""
2 |
3 | from openai import OpenAI
4 |
5 |
6 | # The url can be found with vec-inf status $JOB_ID
7 | client = OpenAI(base_url="http://gpuXXX:XXXX/v1", api_key="EMPTY")
8 |
9 | # Update the model path accordingly
10 | completion = client.chat.completions.create(
11 | model="Meta-Llama-3.1-8B-Instruct",
12 | messages=[
13 | {
14 | "role": "system",
15 | "content": "You are a pirate chatbot who always responds in pirate speak!",
16 | },
17 | {"role": "user", "content": "Who are you?"},
18 | ],
19 | )
20 |
21 | print(completion.model_dump_json())
22 |
--------------------------------------------------------------------------------
/docs/overrides/partials/copyright.html:
--------------------------------------------------------------------------------
1 |
2 |
11 | {% if config.copyright %}
12 | {{ config.copyright }}
13 | {% endif %} {% if not config.extra.generator == false %} Made with
14 |
19 | Material for MkDocs
20 |
21 | {% endif %}
22 |
23 |
--------------------------------------------------------------------------------
/examples/slurm_dependency/downstream_job.sbatch:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=Meta-Llama-3.1-8B-Instruct-downstream
3 | #SBATCH --partition=a40
4 | #SBATCH --qos=m2
5 | #SBATCH --time=08:00:00
6 | #SBATCH --nodes=1
7 | #SBATCH --gpus-per-node=1
8 | #SBATCH --cpus-per-task=4
9 | #SBATCH --mem=8G
10 | #SBATCH --output=$HOME/.vec-inf-logs/Meta-Llama-3.1-8B-Instruct-downstream.%j.out
11 | #SBATCH --error=$HOME/.vec-inf-logs/Meta-Llama-3.1-8B-Instruct-downstream.%j.err
12 |
13 | # Activate your environment
14 | # TODO: update this path to match your venv location
15 | source $HOME/vector-inference/.venv/bin/activate
16 |
17 | # Wait for the server to be ready using the job ID passed as CLI arg
18 | python run_downstream.py "$SERVER_JOB_ID"
19 |
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: publish package
2 |
3 | on:
4 | release:
5 | types: [published]
6 |
7 | jobs:
8 | deploy:
9 | runs-on: ubuntu-latest
10 | steps:
11 | - name: Install apt dependencies
12 | run: |
13 | sudo apt-get update
14 | sudo apt-get install libcurl4-openssl-dev libssl-dev
15 |
16 | - uses: actions/checkout@v5.0.0
17 |
18 | - name: Install uv
19 | uses: astral-sh/setup-uv@v7
20 | with:
21 | version: "0.6.6"
22 | enable-cache: true
23 |
24 | - uses: actions/setup-python@v6
25 | with:
26 | python-version: '3.10'
27 |
28 | - name: Build package
29 | run: uv build
30 |
31 | - name: Publish package
32 | run: uv publish --token ${{ secrets.PYPI_API_TOKEN }}
33 |
--------------------------------------------------------------------------------
/examples/slurm_dependency/run_downstream.py:
--------------------------------------------------------------------------------
1 | """Example script to query a launched model via the OpenAI-compatible API."""
2 |
3 | import sys
4 |
5 | from openai import OpenAI
6 |
7 | from vec_inf.client import VecInfClient
8 |
9 |
10 | if len(sys.argv) < 2:
11 | raise ValueError("Expected server job ID as the first argument.")
12 | job_id = sys.argv[1]
13 |
14 | vi_client = VecInfClient()
15 | print(f"Waiting for SLURM job {job_id} to be ready...")
16 | status = vi_client.wait_until_ready(slurm_job_id=job_id)
17 | print(f"Server is ready at {status.base_url}")
18 |
19 | api_client = OpenAI(base_url=status.base_url, api_key="EMPTY")
20 | resp = api_client.completions.create(
21 | model="Meta-Llama-3.1-8B-Instruct",
22 | prompt="Where is the capital of Canada?",
23 | max_tokens=20,
24 | )
25 |
26 | print(resp)
27 |
--------------------------------------------------------------------------------
/vec_inf/config/environment.yaml:
--------------------------------------------------------------------------------
1 | paths:
2 | image_path: "/model-weights/vec-inf-shared/vector-inference_latest.sif"
3 |
4 | containerization:
5 | module_load_cmd: "module load apptainer"
6 | module_name: "apptainer"
7 |
8 | limits:
9 | max_gpus_per_node: 8
10 | max_num_nodes: 178
11 | max_cpus_per_task: 64
12 |
13 | allowed_values:
14 | qos: []
15 | partition: []
16 | resource_type: ["l40s", "h100"]
17 |
18 | required_args:
19 | account: "VEC_INF_ACCOUNT"
20 | work_dir: "VEC_INF_WORK_DIR"
21 |
22 | default_args:
23 | cpus_per_task: "16"
24 | mem_per_node: "64G"
25 | time: "08:00:00"
26 | qos: ""
27 | partition: ""
28 | resource_type: ""
29 | exclude: ""
30 | nodelist: ""
31 | bind: ""
32 | venv: "apptainer"
33 | data_type: "auto"
34 | log_dir: "~/.vec-inf-logs"
35 | model_weights_parent_dir: "/model-weights"
36 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
22 | # NOTE: This target is not generated by `sphinx-quickstart`; it was added manually
23 | serve:
24 | sphinx-autobuild $(SOURCEDIR) $(BUILDDIR)
25 |
--------------------------------------------------------------------------------
/vec_inf/client/__init__.py:
--------------------------------------------------------------------------------
1 | """Programmatic API for Vector Inference.
2 |
3 | This module provides a Python API for launching and managing inference servers
4 | using `vec_inf`. It is an alternative to the command-line interface, and allows
5 | users direct control over the lifecycle of inference servers via python scripts.
6 | """
7 |
8 | from vec_inf.client.api import VecInfClient
9 | from vec_inf.client.config import ModelConfig
10 | from vec_inf.client.models import (
11 | LaunchOptions,
12 | LaunchResponse,
13 | MetricsResponse,
14 | ModelInfo,
15 | ModelStatus,
16 | ModelType,
17 | StatusResponse,
18 | )
19 |
20 |
21 | __all__ = [
22 | "VecInfClient",
23 | "LaunchResponse",
24 | "StatusResponse",
25 | "ModelInfo",
26 | "MetricsResponse",
27 | "ModelStatus",
28 | "ModelType",
29 | "LaunchOptions",
30 | "ModelConfig",
31 | ]
32 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | echo.
16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | echo.installed, then set the SPHINXBUILD environment variable to point
18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | echo.may add the Sphinx directory to PATH.
20 | echo.
21 | echo.If you don't have Sphinx installed, grab it from
22 | echo.https://www.sphinx-doc.org/
23 | exit /b 1
24 | )
25 |
26 | if "%1" == "" goto help
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/vec_inf/cli/_vars.py:
--------------------------------------------------------------------------------
1 | """Constants for CLI rendering.
2 |
3 | This module defines constant mappings for model type priorities and colors
4 | used in the CLI display formatting.
5 |
6 | Constants
7 | ---------
8 | MODEL_TYPE_PRIORITY : dict
9 | Mapping of model types to their display priority (lower numbers shown first)
10 |
11 | MODEL_TYPE_COLORS : dict
12 | Mapping of model types to their display colors in Rich
13 |
14 | Notes
15 | -----
16 | These constants are used primarily by the ListCmdDisplay class to ensure
17 | consistent sorting and color coding of different model types in the CLI output.
18 | """
19 |
20 | MODEL_TYPE_PRIORITY = {
21 | "LLM": 0,
22 | "VLM": 1,
23 | "Text_Embedding": 2,
24 | "Reward_Modeling": 3,
25 | }
26 |
27 | MODEL_TYPE_COLORS = {
28 | "LLM": "cyan",
29 | "VLM": "bright_blue",
30 | "Text_Embedding": "purple",
31 | "Reward_Modeling": "bright_magenta",
32 | }
33 |
--------------------------------------------------------------------------------
/vec_inf/client/_exceptions.py:
--------------------------------------------------------------------------------
1 | """Exceptions for the vector inference package."""
2 |
3 |
4 | class ModelConfigurationError(Exception):
5 | """Raised when the model config or weights are missing or invalid."""
6 |
7 | pass
8 |
9 |
10 | class MissingRequiredFieldsError(ValueError):
11 | """Raised when required fields are missing from the provided parameters."""
12 |
13 | pass
14 |
15 |
16 | class ModelNotFoundError(KeyError):
17 | """Raised when the specified model name is not found in the configuration."""
18 |
19 | pass
20 |
21 |
22 | class SlurmJobError(RuntimeError):
23 | """Raised when there's an error with a Slurm job."""
24 |
25 | pass
26 |
27 |
28 | class APIError(Exception):
29 | """Base exception for API errors."""
30 |
31 | pass
32 |
33 |
34 | class ServerError(Exception):
35 | """Exception raised when there's an error with the inference server."""
36 |
37 | pass
38 |
--------------------------------------------------------------------------------
/examples/inference/vlm/vision_completions.py:
--------------------------------------------------------------------------------
1 | """Example of using the OpenAI API to generate completions for vision tasks."""
2 |
3 | from openai import OpenAI
4 |
5 |
6 | # The url can be found with vec-inf status $JOB_ID
7 | client = OpenAI(base_url="http://gpuXXX:XXXX/v1", api_key="EMPTY")
8 |
9 | # Update the model path accordingly
10 | completion = client.chat.completions.create(
11 | model="llava-1.5-13b-hf",
12 | messages=[
13 | {
14 | "role": "user",
15 | "content": [
16 | {"type": "text", "text": "What's in this image?"},
17 | {
18 | "type": "image_url",
19 | "image_url": {
20 | "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
21 | },
22 | },
23 | ],
24 | }
25 | ],
26 | max_tokens=50,
27 | )
28 |
29 | print(completion)
30 |
--------------------------------------------------------------------------------
/venv.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Load python module if you are on Vector cluster and install uv
4 | module load python/3.10.13
5 | module load rust
6 | curl -LsSf https://astral.sh/uv/install.sh | sh
7 |
8 | # Optional: it's recommended to change the cache directory to somewhere in the scratch space to avoid
9 | # running out of space in your home directory; below is an example for the Vector cluster
10 | mkdir -p /scratch/$(whoami)/uv_cache
11 | export UV_CACHE_DIR=/scratch/$(whoami)/uv_cache
12 |
13 | # To see if the cache directory is set correctly, run the following command
14 | # uv cache dir
15 | echo "Cache directory set to: $UV_CACHE_DIR"
16 |
17 | # Install dependencies via uv
18 | uv sync
19 |
20 | # Activate the virtual environment
21 | source .venv/bin/activate
22 |
23 | # Deactivate the virtual environment
24 | # deactivate
25 |
26 | # By default, `uv sync` creates the virtual environment in .venv at the project root
27 |
28 | # To re-activate the virtual environment in a new shell session without re-running uv sync,
29 | # run the following command
30 | # source .venv/bin/activate
31 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Vector Institute
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/vec_inf/cli/_utils.py:
--------------------------------------------------------------------------------
1 | """Helper functions for the CLI.
2 |
3 | This module provides utility functions for creating consistent table displays
4 | in the command-line interface.
5 | """
6 |
7 | from rich.table import Table
8 |
9 |
10 | def create_table(
11 | key_title: str = "", value_title: str = "", show_header: bool = True
12 | ) -> Table:
13 | """Create a table for displaying model status.
14 |
15 | Creates a two-column Rich table with consistent styling for displaying
16 | key-value pairs in the CLI.
17 |
18 | Parameters
19 | ----------
20 | key_title : str, default=""
21 | Title for the key column
22 | value_title : str, default=""
23 | Title for the value column
24 | show_header : bool, default=True
25 | Whether to display column headers
26 |
27 | Returns
28 | -------
29 | Table
30 | Rich Table instance with configured styling:
31 | - Headers in bold magenta
32 | - Key column in dim style
33 | - Value column in default style
34 | """
35 | table = Table(show_header=show_header, header_style="bold magenta")
36 | table.add_column(key_title, style="dim")
37 | table.add_column(value_title)
38 | return table
39 |
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
1 | # Examples
2 | - [`inference`](inference): Examples for sending inference requests
3 | - [`llm/chat_completions.py`](inference/llm/chat_completions.py): Python example of sending chat completion requests to OpenAI compatible server
4 | - [`llm/completions.py`](inference/llm/completions.py): Python example of sending completion requests to OpenAI compatible server
5 | - [`llm/completions.sh`](inference/llm/completions.sh): Bash example of sending completion requests to OpenAI compatible server, supports JSON mode
6 | - [`text_embedding/embeddings.py`](inference/text_embedding/embeddings.py): Python example of sending text embedding requests to OpenAI compatible server
7 | - [`vlm/vision_completions.py`](inference/vlm/vision_completions.py): Python example of sending chat completion requests with image attached to prompt to OpenAI compatible server for vision language models
8 | - [`logits`](logits): Example for logits generation
9 | - [`logits.py`](logits/logits.py): Python example of getting logits from hosted model.
10 | - [`api`](api): Examples for using the Python API
11 | - [`basic_usage.py`](api/basic_usage.py): Basic Python example demonstrating the Vector Inference API
12 | - [`slurm_dependency`](slurm_dependency): Example of launching a model with `vec-inf` and running a downstream SLURM job that waits for the server to be ready before sending a request.
13 |
--------------------------------------------------------------------------------
/tests/test_imports.py:
--------------------------------------------------------------------------------
1 | """Test the imports of the vec_inf package."""
2 |
3 | import unittest
4 |
5 | import pytest
6 |
7 |
8 | class TestVecInfImports(unittest.TestCase):
9 | """Test the imports of the vec_inf package."""
10 |
11 | def test_imports(self):
12 | """Test that all modules can be imported."""
13 | try:
14 | # CLI imports
15 | import vec_inf.cli # noqa: PLC0415
16 | import vec_inf.cli._cli # noqa: PLC0415
17 | import vec_inf.cli._helper # noqa: PLC0415
18 |
19 | # Client imports
20 | import vec_inf.client # noqa: PLC0415
21 | import vec_inf.client._client_vars # noqa: F401, PLC0415
22 | import vec_inf.client._exceptions # noqa: PLC0415
23 | import vec_inf.client._helper # noqa: PLC0415
24 | import vec_inf.client._slurm_script_generator # noqa: PLC0415
25 | import vec_inf.client._slurm_templates # noqa: PLC0415
26 | import vec_inf.client._slurm_vars # noqa: PLC0415
27 | import vec_inf.client._utils # noqa: PLC0415
28 | import vec_inf.client.api # noqa: PLC0415
29 | import vec_inf.client.config # noqa: PLC0415
30 | import vec_inf.client.models # noqa: F401, PLC0415
31 |
32 | except ImportError as e:
33 | pytest.fail(f"Import failed: {e}")
34 |
--------------------------------------------------------------------------------
/examples/slurm_dependency/README.md:
--------------------------------------------------------------------------------
1 | # SLURM Dependency Workflow Example
2 |
3 | This example demonstrates how to launch a model server using `vec-inf`, and run a downstream SLURM job that waits for the server to become ready before querying it.
4 |
5 | ## Files
6 |
7 | This directory contains the following:
8 |
9 | 1. [run_workflow.sh](run_workflow.sh)
10 | Launches the model server and submits the downstream job with a dependency, so it starts only after the server job begins running.
11 |
12 | 2. [downstream_job.sbatch](downstream_job.sbatch)
13 | A SLURM job script that runs the downstream logic (e.g., prompting the model).
14 |
15 | 3. [run_downstream.py](run_downstream.py)
16 | A Python script that waits until the inference server is ready, then sends a request using the OpenAI-compatible API.
17 |
18 | ## What to update
19 |
20 | Before running this example, update the following in [downstream_job.sbatch](downstream_job.sbatch):
21 |
22 | - `--job-name`, `--output`, and `--error` paths
23 | - Virtual environment path in the `source` line
24 | - SLURM resource configuration (e.g., partition, memory, GPU)
25 |
26 | Also update the model name in [run_downstream.py](run_downstream.py) to match what you're launching.
27 |
28 | ## Running the example
29 |
30 | First, activate a virtual environment where `vec-inf` is installed. Then, from this directory, run:
31 |
32 | ```bash
33 | bash run_workflow.sh
34 | ```
--------------------------------------------------------------------------------
/vec_inf/README.md:
--------------------------------------------------------------------------------
1 | ## `vec-inf` CLI Commands
2 |
3 | * `launch`: Specify a model family and other optional parameters to launch an OpenAI compatible inference server.
4 | * `batch-launch`: Specify a list of models to launch multiple OpenAI compatible inference servers at the same time.
5 | * `status`: Check the status of all `vec-inf` jobs, or a specific job by providing its job ID.
6 | * `metrics`: Streams performance metrics to the console.
7 | * `shutdown`: Shutdown a model by providing its Slurm job ID.
8 | * `list`: List all available model names, or view the default/cached configuration of a specific model.
9 | * `cleanup`: Remove old log directories. You can filter by `--model-family`, `--model-name`, `--job-id`, and/or `--before-job-id`. Use `--dry-run` to preview what would be deleted.
10 |
11 | Use `--help` to see all available options
12 |
13 | ## `VecInfClient` API
14 |
15 | * `launch_model`: Launch an OpenAI compatible inference server.
16 | * `batch_launch_models`: Launch multiple OpenAI compatible inference servers.
17 | * `fetch_running_jobs`: Get the running `vec-inf` job IDs.
18 | * `get_status`: Get the status of a running model.
19 | * `get_metrics`: Get the performance metrics of a running model.
20 | * `shutdown_model`: Shutdown a running model.
21 | * `list_models`: List all available models.
22 | * `get_model_config`: Get the configuration for a specific model.
23 | * `wait_until_ready`: Wait until a model is ready or fails.
24 | * `cleanup_logs`: Remove logs from the log directory.
25 |
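26 | Below is a minimal sketch of a typical `VecInfClient` workflow; the model name is just an example, and the methods used are the ones listed above (see [`examples/api/basic_usage.py`](../examples/api/basic_usage.py) for a fuller walkthrough):
27 |
28 | ```python
29 | from vec_inf.client import VecInfClient
30 |
31 | client = VecInfClient()
32 |
33 | # Launch a server and block until it reports READY
34 | response = client.launch_model("Meta-Llama-3.1-8B-Instruct")
35 | job_id = response.slurm_job_id
36 | status = client.wait_until_ready(job_id)
37 | print(f"Server ready at {status.base_url}")
38 |
39 | # Inspect metrics, then shut the server down
40 | metrics = client.get_metrics(job_id)
41 | print(metrics.metrics)
42 | client.shutdown_model(job_id)
43 | ```
44 |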
--------------------------------------------------------------------------------
/examples/api/basic_usage.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Basic example of Vector Inference API usage.
3 |
4 | This script demonstrates the core features of the Vector Inference API
5 | for launching and interacting with models.
6 | """
7 |
8 | from vec_inf.client import VecInfClient
9 |
10 |
11 | # Create the API client
12 | client = VecInfClient()
13 |
14 | # List available models
15 | print("Listing available models...")
16 | models = client.list_models()
17 | print(f"Found {len(models)} models")
18 | for model in models[:3]: # Show just the first few
19 | print(f"- {model.name} ({model.model_type})")
20 |
21 | # Launch a model (replace with an actual model name from your environment)
22 | model_name = "Meta-Llama-3.1-8B-Instruct" # Use an available model from your list
23 | print(f"\nLaunching {model_name}...")
24 | response = client.launch_model(model_name)
25 | job_id = response.slurm_job_id
26 | print(f"Launched with job ID: {job_id}")
27 |
28 | # Wait for the model to be ready
29 | print("Waiting for model to be ready...")
30 | status = client.wait_until_ready(job_id)
31 | print(f"Model is ready at: {status.base_url}")
32 |
33 | # Get metrics
34 | print("\nRetrieving metrics...")
35 | metrics = client.get_metrics(job_id)
36 | if isinstance(metrics.metrics, dict):
37 | for key, value in metrics.metrics.items():
38 | print(f"- {key}: {value}")
39 |
40 | # Shutdown when done
41 | print("\nShutting down model...")
42 | client.shutdown_model(job_id)
43 | print("Model shutdown complete")
44 |
--------------------------------------------------------------------------------
/vec_inf/find_port.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Function to check if a port is available on the specified IP
4 | is_port_available() {
5 | local ip=$1
6 | local port=$2
7 | # Attempt to listen on the specified port and IP. Use & to background the process.
8 | nc -l $ip $port &> /dev/null &
9 |
10 | # Capture the PID of the background process
11 | local pid=$!
12 | # Wait a short moment to ensure nc had time to bind to the port
13 | sleep 0.1
14 |
15 | # Check if nc is still running. If so, the port was available.
16 | if kill -0 $pid &> /dev/null; then
17 | # Kill the background nc process
18 | kill $pid &> /dev/null
19 | return 0 # True, port is available
20 | else
21 | return 1 # False, port is not available
22 | fi
23 | }
24 |
25 | # Function to find an available port on the specified IP
26 | find_available_port() {
27 | local ip=$1
28 | local base_port=$2
29 | local max_port=$3
30 |
31 | # Generate shuffled list of ports; fallback to sequential if shuf not present
32 | if command -v shuf >/dev/null 2>&1; then
33 | local port_list
34 | port_list=$(shuf -i "${base_port}-${max_port}")
35 | else
36 | local port_list
37 | port_list=$(seq $base_port $max_port)
38 | fi
39 |
40 | for port in $port_list; do
41 | if is_port_available $ip $port; then
42 | echo $port
43 | return
44 | fi
45 | done
46 | echo "No available port between $base_port and $max_port for $ip." >&2
47 | return 1
48 | }
49 |
--------------------------------------------------------------------------------
/.github/workflows/code_checks.yml:
--------------------------------------------------------------------------------
1 | name: code checks
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | - develop
8 | paths:
9 | - .pre-commit-config.yaml
10 | - .github/workflows/code_checks.yml
11 | - '**.py'
12 | - uv.lock
13 | - pyproject.toml
14 | - '**.ipynb'
15 | pull_request:
16 | branches:
17 | - main
18 | - develop
19 | paths:
20 | - .pre-commit-config.yaml
21 | - .github/workflows/code_checks.yml
22 | - '**.py'
23 | - uv.lock
24 | - pyproject.toml
25 | - '**.ipynb'
26 |
27 | jobs:
28 | run-code-check:
29 | runs-on: ubuntu-latest
30 | steps:
31 | - uses: actions/checkout@v5.0.0
32 | - name: Install uv
33 | uses: astral-sh/setup-uv@v7
34 | with:
35 | # Install a specific version of uv.
36 | version: "0.5.21"
37 | enable-cache: true
38 | - name: "Set up Python"
39 | uses: actions/setup-python@v6
40 | with:
41 | python-version-file: ".python-version"
42 | - name: Install the project
43 | run: uv sync --dev --prerelease=allow
44 | - name: Install dependencies and check code
45 | run: |
46 | source .venv/bin/activate
47 | pre-commit run --all-files
48 | - name: pip-audit (gh-action-pip-audit)
49 | uses: pypa/gh-action-pip-audit@v1.1.0
50 | with:
51 | virtual-environment: .venv/
52 | # Temporary: ignore pip advisory until fixed in pip>=25.3
53 | ignore-vulns: GHSA-4xh5-x5gv-qwph
54 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pre-commit/pre-commit-hooks
3 | rev: v6.0.0 # Use the ref you want to point at
4 | hooks:
5 | - id: trailing-whitespace
6 | - id: check-ast
7 | - id: check-builtin-literals
8 | - id: check-docstring-first
9 | - id: check-executables-have-shebangs
10 | - id: debug-statements
11 | - id: end-of-file-fixer
12 | - id: mixed-line-ending
13 | args: [--fix=lf]
14 | - id: requirements-txt-fixer
15 | - id: check-yaml
16 | args: [--unsafe]
17 | - id: check-toml
18 |
19 | - repo: https://github.com/astral-sh/ruff-pre-commit
20 | rev: 'v0.14.5'
21 | hooks:
22 | - id: ruff
23 | args: [--fix, --exit-non-zero-on-fix]
24 | types_or: [python, jupyter]
25 | - id: ruff-format
26 | types_or: [python, jupyter]
27 |
28 | - repo: https://github.com/pre-commit/mirrors-mypy
29 | rev: v1.18.2
30 | hooks:
31 | - id: mypy
32 | entry: python3 -m mypy --config-file pyproject.toml
33 | language: system
34 | types: [python]
35 | exclude: "tests"
36 |
37 | - repo: https://github.com/nbQA-dev/nbQA
38 | rev: 1.9.1
39 | hooks:
40 | - id: nbqa-ruff
41 | args: [--fix, --exit-non-zero-on-fix]
42 |
43 | - repo: local
44 | hooks:
45 | - id: pytest
46 | name: pytest
47 | entry: python3 -m pytest -m "not integration_test"
48 | language: system
49 | pass_filenames: false
50 | always_run: true
51 |
52 | ci:
53 | autofix_commit_msg: |
54 | [pre-commit.ci] Add auto fixes from pre-commit.com hooks
55 |
56 | for more information, see https://pre-commit.ci
57 | autofix_prs: true
58 | autoupdate_branch: ''
59 | autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate'
60 | autoupdate_schedule: weekly
61 | skip: [pytest,mypy]
62 | submodules: false
63 |
--------------------------------------------------------------------------------
/.github/workflows/docker.yml:
--------------------------------------------------------------------------------
1 | name: docker
2 |
3 | on:
4 | release:
5 | types: [published]
6 | push:
7 | branches:
8 | - main
9 | paths:
10 | - Dockerfile
11 | - .github/workflows/docker.yml
12 | - uv.lock
13 | pull_request:
14 | branches:
15 | - main
16 | paths:
17 | - Dockerfile
18 | - .github/workflows/docker.yml
19 | - uv.lock
20 |
21 | jobs:
22 | push_to_registry:
23 | name: Push Docker image to Docker Hub
24 | runs-on:
25 | - self-hosted
26 | - docker
27 | steps:
28 | - name: Checkout repository
29 | uses: actions/checkout@v5.0.0
30 |
31 | - name: Extract vLLM version
32 | id: vllm-version
33 | run: |
34 | VERSION=$(grep -A 1 'name = "vllm"' uv.lock | grep version | cut -d '"' -f 2)
35 | echo "version=$VERSION" >> $GITHUB_OUTPUT
36 |
37 | - name: Set up Docker Buildx
38 | uses: docker/setup-buildx-action@v3
39 |
40 | - name: Log in to Docker Hub
41 | uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef
42 | with:
43 | username: ${{ secrets.DOCKER_USERNAME }}
44 | password: ${{ secrets.DOCKER_PASSWORD }}
45 |
46 | - name: Extract metadata (tags, labels) for Docker
47 | id: meta
48 | uses: docker/metadata-action@318604b99e75e41977312d83839a89be02ca4893
49 | with:
50 | images: vectorinstitute/vector-inference
51 |
52 | - name: Build and push Docker image
53 | uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83
54 | with:
55 | context: .
56 | file: ./Dockerfile
57 | push: true
58 | tags: |
59 | ${{ steps.meta.outputs.tags }}
60 | vectorinstitute/vector-inference:${{ steps.vllm-version.outputs.version }}
61 | labels: ${{ steps.meta.outputs.labels }}
62 |
--------------------------------------------------------------------------------
/tests/vec_inf/client/test_models.py:
--------------------------------------------------------------------------------
1 | """Tests for the Vector Inference API data models."""
2 |
3 | from vec_inf.client import LaunchOptions, ModelInfo, ModelStatus, ModelType
4 |
5 |
6 | def test_model_info_creation():
7 | """Test creating a ModelInfo instance."""
8 | model = ModelInfo(
9 | name="test-model",
10 | family="test-family",
11 | variant="test-variant",
12 | model_type=ModelType.LLM,
13 | config={"gpus_per_node": 1},
14 | )
15 |
16 | assert model.name == "test-model"
17 | assert model.family == "test-family"
18 | assert model.variant == "test-variant"
19 | assert model.model_type == ModelType.LLM
20 | assert model.config["gpus_per_node"] == 1
21 |
22 |
23 | def test_model_info_optional_fields():
24 | """Test ModelInfo with optional fields omitted."""
25 | model = ModelInfo(
26 | name="test-model",
27 | family="test-family",
28 | variant=None,
29 | model_type=ModelType.LLM,
30 | config={},
31 | )
32 |
33 | assert model.name == "test-model"
34 | assert model.family == "test-family"
35 | assert model.variant is None
36 | assert model.model_type == ModelType.LLM
37 |
38 |
39 | def test_launch_options_default_values():
40 | """Test LaunchOptions with default values."""
41 | options = LaunchOptions()
42 |
43 | assert options.gpus_per_node is None
44 | assert options.partition is None
45 | assert options.data_type is None
46 | assert options.num_nodes is None
47 | assert options.model_family is None
48 |
49 |
50 | def test_model_status_enum():
51 | """Test ModelStatus enum values."""
52 | assert ModelStatus.PENDING.value == "PENDING"
53 | assert ModelStatus.LAUNCHING.value == "LAUNCHING"
54 | assert ModelStatus.READY.value == "READY"
55 | assert ModelStatus.FAILED.value == "FAILED"
56 | assert ModelStatus.SHUTDOWN.value == "SHUTDOWN"
57 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # Vector Inference: Easy inference on Slurm clusters
2 |
3 | This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/stable/). **This package runs natively on the Vector Institute cluster environment**. To adapt to other environments, follow the instructions in [Installation](#installation).
4 |
5 | **NOTE**: Supported models on Killarney are tracked [here](https://github.com/VectorInstitute/vector-inference/blob/main/MODEL_TRACKING.md)
6 |
7 | ## Installation
8 |
9 | If you are using the Vector cluster environment, and you don't need any customization to the inference server environment, run the following to install the package:
10 |
11 | ```bash
12 | pip install vec-inf
13 | ```
14 |
15 | Otherwise, we recommend using the provided [`Dockerfile`](https://github.com/VectorInstitute/vector-inference/blob/main/Dockerfile) to set up your own environment with the package. The latest image has `vLLM` version `0.11.0`.
16 |
17 | If you'd like to use `vec-inf` on your own Slurm cluster, you will need to update the configuration files. There are 3 ways to do it:
18 | * Clone the repository and update the `environment.yaml` and the `models.yaml` file in [`vec_inf/config`](https://github.com/VectorInstitute/vector-inference/blob/main/vec_inf/config), then install from source by running `pip install .`.
19 | * The package looks for cached configuration files in your environment before falling back to the default configuration. The default cached configuration directory is `/model-weights/vec-inf-shared`; you would need to create an `environment.yaml` and a `models.yaml` following the format of these files in [`vec_inf/config`](https://github.com/VectorInstitute/vector-inference/blob/main/vec_inf/config).
20 | * The package also looks for an environment variable `VEC_INF_CONFIG_DIR`. You can put your `environment.yaml` and `models.yaml` in a directory of your choice and set the environment variable `VEC_INF_CONFIG_DIR` to point to that location (see the example below).
21 |
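22 | For the third option, here is a minimal sketch of pointing `vec-inf` at a custom configuration directory from Python; the directory path below is hypothetical, it just needs to contain your `environment.yaml` and `models.yaml`:
23 |
24 | ```python
25 | import os
26 |
27 | # Set the config directory before importing/using vec-inf so your files are picked up
28 | os.environ["VEC_INF_CONFIG_DIR"] = "/path/to/my/vec-inf-configs"
29 |
30 | from vec_inf.client import VecInfClient
31 |
32 | client = VecInfClient()
33 | print([model.name for model in client.list_models()])
34 | ```
35 |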
--------------------------------------------------------------------------------
/profile/avg_throughput.py:
--------------------------------------------------------------------------------
1 | """Calculate the average prompt and generation throughput from a log file."""
2 |
3 | import argparse
4 | import re
5 |
6 |
7 | def filter_throughput(log_file_path: str) -> None:
8 | """Filter log file for non-zero entries and calculate the avg throughput."""
9 | avg_prompt_throughput = []
10 | avg_generation_throughput = []
11 | # Define a regular expression pattern to extract throughput values
12 | pattern = r"Avg prompt throughput: ([^,]+) tokens/s, Avg generation throughput: ([^,]+) tokens/s"
13 |
14 | # Open the log file
15 | with open(log_file_path, "r") as file:
16 | # Iterate over each line in the file
17 | for line in file:
18 | # Use regex to find matches
19 | match = re.search(pattern, line)
20 | if match:
21 | # Extract prompt and generation throughput values
22 | prompt_throughput = match.group(1).strip()
23 | generation_throughput = match.group(2).strip()
24 |
25 | # Check if both throughput values are not zero
26 | if prompt_throughput != "0.0":
27 | avg_prompt_throughput.append(float(prompt_throughput))
28 | if generation_throughput != "0.0":
29 | avg_generation_throughput.append(float(generation_throughput))
30 |
31 | print(
32 | f"Average prompt throughput: {sum(avg_prompt_throughput) / len(avg_prompt_throughput)} tokens/s"
33 | )
34 | print(
35 | f"Average generation throughput: {sum(avg_generation_throughput) / len(avg_generation_throughput)} tokens/s"
36 | )
37 |
38 |
39 | def main() -> None:
40 | """Run the main function."""
41 | # Create the parser
42 | parser = argparse.ArgumentParser(
43 | description="Filter log file for non-zero throughput entries."
44 | )
45 |
46 | # Add the arguments
47 | parser.add_argument("--path", type=str, help="The path to the log file")
48 |
49 | # Execute the parse_args() method
50 | args = parser.parse_args()
51 |
52 | # Use the provided arguments
53 | filter_throughput(args.path)
54 |
55 |
56 | if __name__ == "__main__":
57 | main()
58 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nvidia/cuda:12.8.1-cudnn-devel-ubuntu24.04
2 |
3 | # Non-interactive apt-get commands
4 | ARG DEBIAN_FRONTEND=noninteractive
5 |
6 | # No GPUs visible during build
7 | ARG CUDA_VISIBLE_DEVICES=none
8 |
9 | # Specify CUDA architectures -> 7.5: Quadro RTX 6000 & T4, 8.0: A100, 8.6: A40, 8.9: L40S, 9.0: H100
10 | ARG TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;8.9;9.0+PTX"
11 |
12 | # Set the Python version
13 | ARG PYTHON_VERSION=3.10.12
14 |
15 | # Install system dependencies
16 | RUN apt-get update && apt-get install -y \
17 | wget build-essential libssl-dev zlib1g-dev libbz2-dev \
18 | libreadline-dev libsqlite3-dev libffi-dev libncursesw5-dev \
19 | xz-utils tk-dev libxml2-dev libxmlsec1-dev liblzma-dev git vim \
20 | && rm -rf /var/lib/apt/lists/*
21 |
22 | # Install Python
23 | RUN wget https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz && \
24 | tar -xzf Python-$PYTHON_VERSION.tgz && \
25 | cd Python-$PYTHON_VERSION && \
26 | ./configure --enable-optimizations && \
27 | make -j$(nproc) && \
28 | make altinstall && \
29 | cd .. && \
30 | rm -rf Python-$PYTHON_VERSION.tgz Python-$PYTHON_VERSION
31 |
32 | # Install pip and core Python tools
33 | RUN wget https://bootstrap.pypa.io/get-pip.py && \
34 | python3.10 get-pip.py && \
35 | rm get-pip.py && \
36 | python3.10 -m pip install --upgrade pip setuptools wheel uv
37 |
38 | # Install RDMA support
39 | RUN apt-get update && apt-get install -y \
40 | libibverbs1 libibverbs-dev ibverbs-utils \
41 | librdmacm1 librdmacm-dev rdmacm-utils \
42 | rdma-core ibverbs-providers infiniband-diags perftest \
43 | && rm -rf /var/lib/apt/lists/*
44 |
45 | # Set up RDMA environment (these will persist in the final container)
46 | ENV LD_LIBRARY_PATH="/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH"
47 | ENV NCCL_IB_DISABLE=0
48 | ENV NCCL_SOCKET_IFNAME="^lo,docker0"
49 | ENV NCCL_NET_GDR_LEVEL=PHB
50 | ENV NCCL_IB_TIMEOUT=22
51 | ENV NCCL_IB_RETRY_CNT=7
52 | ENV NCCL_DEBUG=INFO
53 |
54 | # Set up project
55 | WORKDIR /vec-inf
56 | COPY . /vec-inf
57 |
58 | # Install project dependencies with build requirements
59 | RUN uv pip install --system -e .[dev] --prerelease=allow
60 |
61 | # Install a single, system NCCL (from NVIDIA CUDA repo in base image)
62 | RUN apt-get update && apt-get install -y --allow-change-held-packages \
63 | libnccl2 libnccl-dev \
64 | && rm -rf /var/lib/apt/lists/*
65 |
66 | # Set the default command to start an interactive shell
67 | CMD ["bash"]
68 |
--------------------------------------------------------------------------------
/.github/workflows/unit_tests.yml:
--------------------------------------------------------------------------------
1 | name: unit tests
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | - develop
8 | paths:
9 | - .pre-commit-config.yaml
10 | - .github/workflows/code_checks.yml
11 | - .github/workflows/docs_build.yml
12 | - .github/workflows/docs_deploy.yml
13 | - .github/workflows/unit_tests.yml
14 | - .github/workflows/integration_tests.yml
15 | - '**.py'
16 | - '**.ipynb'
17 | - uv.lock
18 | - pyproject.toml
19 | - '**.rst'
20 | - '**.md'
21 | pull_request:
22 | branches:
23 | - main
24 | - develop
25 | paths:
26 | - .pre-commit-config.yaml
27 | - .github/workflows/code_checks.yml
28 | - .github/workflows/docs_build.yml
29 | - .github/workflows/docs_deploy.yml
30 | - .github/workflows/unit_tests.yml
31 | - .github/workflows/integration_tests.yml
32 | - '**.py'
33 | - '**.ipynb'
34 | - uv.lock
35 | - pyproject.toml
36 | - '**.rst'
37 | - '**.md'
38 |
39 | jobs:
40 | unit-tests:
41 | runs-on: ubuntu-latest
42 | strategy:
43 | matrix:
44 | python-version: ["3.10", "3.11", "3.12"]
45 | steps:
46 | - uses: actions/checkout@v5.0.0
47 |
48 | - name: Install uv
49 | uses: astral-sh/setup-uv@v7
50 | with:
51 | # Install a specific version of uv.
52 | version: "0.5.21"
53 | enable-cache: true
54 |
55 | - name: "Set up Python ${{ matrix.python-version }}"
56 | uses: actions/setup-python@v6
57 | with:
58 | python-version: ${{ matrix.python-version }}
59 |
60 | - name: Install the project
61 | run: uv sync --dev --prerelease=allow
62 |
63 | - name: Run unit tests with coverage
64 | run: |
65 | uv run --frozen pytest -m "not integration_test" --cov vec_inf --cov-report=xml tests
66 |
67 | - name: Install the core package only
68 | run: uv sync --no-dev
69 |
70 | - name: Run package import tests
71 | run: |
72 | uv run --frozen pytest tests/test_imports.py
73 |
74 | - name: Import Codecov GPG public key
75 | run: |
76 | gpg --keyserver keyserver.ubuntu.com --recv-keys 806BB28AED779869
77 |
78 | - name: Upload coverage to Codecov
79 | uses: codecov/codecov-action@v5.5.1
80 | with:
81 | token: ${{ secrets.CODECOV_TOKEN }}
82 | files: ./coverage.xml
83 | name: codecov-umbrella
84 | fail_ci_if_error: true
85 | verbose: true
86 |
--------------------------------------------------------------------------------
/vec_inf/client/_client_vars.py:
--------------------------------------------------------------------------------
1 | """Global variables for Vector Inference.
2 |
3 | This module contains configuration constants and templates used throughout the
4 | Vector Inference package, including model configurations, and metric definitions.
5 |
6 | Constants
7 | ---------
8 | MODEL_READY_SIGNATURE : str
9 | Signature string indicating successful model server startup
10 | SRC_DIR : str
11 | Absolute path to the package source directory
12 | KEY_METRICS : dict
13 | Mapping of vLLM metrics to their human-readable names
14 | SLURM_JOB_CONFIG_ARGS : dict
15 | Mapping of SLURM configuration arguments to their parameter names
16 | VLLM_SHORT_TO_LONG_MAP : dict
17 | Mapping of vLLM short arguments to their long names
18 | """
19 |
20 | from pathlib import Path
21 |
22 |
23 | MODEL_READY_SIGNATURE = "INFO: Application startup complete."
24 | SRC_DIR = str(Path(__file__).parent.parent)
25 |
26 |
27 | # Key production metrics for inference servers
28 | KEY_METRICS = {
29 | "vllm:prompt_tokens_total": "total_prompt_tokens",
30 | "vllm:generation_tokens_total": "total_generation_tokens",
31 | "vllm:e2e_request_latency_seconds_sum": "request_latency_sum",
32 | "vllm:e2e_request_latency_seconds_count": "request_latency_count",
33 | "vllm:request_queue_time_seconds_sum": "queue_time_sum",
34 | "vllm:request_success_total": "successful_requests_total",
35 | "vllm:num_requests_running": "requests_running",
36 | "vllm:num_requests_waiting": "requests_waiting",
37 | "vllm:num_requests_swapped": "requests_swapped",
38 | "vllm:gpu_cache_usage_perc": "gpu_cache_usage",
39 | "vllm:cpu_cache_usage_perc": "cpu_cache_usage",
40 | }
41 |
42 | # Slurm job configuration arguments
43 | SLURM_JOB_CONFIG_ARGS = {
44 | "job-name": "model_name",
45 | "partition": "partition",
46 | "account": "account",
47 | "chdir": "work_dir",
48 | "qos": "qos",
49 | "time": "time",
50 | "nodes": "num_nodes",
51 | "exclude": "exclude",
52 | "nodelist": "node_list",
53 | "gres": "gres",
54 | "cpus-per-task": "cpus_per_task",
55 | "mem": "mem_per_node",
56 | "output": "out_file",
57 | "error": "err_file",
58 | }
59 |
60 | # vLLM engine args mapping between short and long names
61 | VLLM_SHORT_TO_LONG_MAP = {
62 | "-tp": "--tensor-parallel-size",
63 | "-pp": "--pipeline-parallel-size",
64 | "-dp": "--data-parallel-size",
65 | "-dpl": "--data-parallel-size-local",
66 | "-dpa": "--data-parallel-address",
67 | "-dpp": "--data-parallel-rpc-port",
68 | "-O": "--compilation-config",
69 | "-q": "--quantization",
70 | }
71 |
72 | # Required matching arguments for batch mode
73 | BATCH_MODE_REQUIRED_MATCHING_ARGS = ["venv", "log_dir"]
74 |
--------------------------------------------------------------------------------
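
As a usage illustration for the mapping above, here is a minimal sketch of expanding short vLLM flags into their long form; the `normalize_vllm_args` helper is hypothetical and not part of the package.

```python
# Hypothetical helper (not part of vec_inf): expand short vLLM flag names such
# as "-tp" to their long form using VLLM_SHORT_TO_LONG_MAP, leaving flags that
# are already long (or unknown) untouched.
from vec_inf.client._client_vars import VLLM_SHORT_TO_LONG_MAP


def normalize_vllm_args(raw_args: dict[str, str]) -> dict[str, str]:
    """Return raw_args with short vLLM flag names expanded to long names."""
    return {
        VLLM_SHORT_TO_LONG_MAP.get(key, key): value
        for key, value in raw_args.items()
    }


if __name__ == "__main__":
    # {"-tp": "4"} becomes {"--tensor-parallel-size": "4"}
    print(normalize_vllm_args({"-tp": "4", "--max-model-len": "8192"}))
```
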
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pipenv
85 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
86 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
87 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
88 | # install all needed dependencies.
89 | #Pipfile.lock
90 |
91 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
92 | __pypackages__/
93 |
94 | # Celery stuff
95 | celerybeat-schedule
96 | celerybeat.pid
97 |
98 | # SageMath parsed files
99 | *.sage.py
100 |
101 | # Environments
102 | .env
103 | .venv
104 | env/
105 | venv/
106 | ENV/
107 | env.bak/
108 | venv.bak/
109 |
110 | # Spyder project settings
111 | .spyderproject
112 | .spyproject
113 |
114 | # Rope project settings
115 | .ropeproject
116 |
117 | # mkdocs documentation
118 | /site
119 |
120 | # mypy
121 | .mypy_cache/
122 | .dmypy.json
123 | dmypy.json
124 |
125 | # Pyre type checker
126 | .pyre/
127 |
128 | # pycharm
129 | .idea/
130 |
131 | # VS Code
132 | .vscode/
133 |
134 | # MacOS
135 | .DS_Store
136 |
137 | # Slurm logs
138 | *.out
139 | *.err
140 |
141 | # Server url files
142 | *_url
143 |
144 | logs/
145 |
146 | local/
147 | slurm/
148 | scripts/
149 |
150 | # vLLM bug reporting files
151 | collect_env.py
152 |
153 | # build files
154 | dist/
155 |
156 | # type stubs
157 | stubs/
158 | mypy.ini
159 |
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | extra_css:
2 | - stylesheets/extra.css
3 | extra:
4 | generator: false
5 | social:
6 | - icon: fontawesome/brands/discord
7 | link: 404.html
8 | - icon: fontawesome/brands/github
9 | link: https://github.com/VectorInstitute/vector-inference
10 | version:
11 | provider: mike
12 | default: latest
13 | markdown_extensions:
14 | - attr_list
15 | - admonition
16 | - md_in_html
17 | - pymdownx.highlight:
18 | anchor_linenums: true
19 | line_spans: __span
20 | pygments_lang_class: true
21 | - pymdownx.inlinehilite
22 | - pymdownx.details
23 | - pymdownx.snippets
24 | - pymdownx.superfences
25 | - pymdownx.emoji:
26 | emoji_index: !!python/name:material.extensions.emoji.twemoji
27 | emoji_generator: !!python/name:material.extensions.emoji.to_svg
28 | - toc:
29 | permalink: true
30 | - meta
31 | - footnotes
32 | nav:
33 | - Home: index.md
34 | - User Guide: user_guide.md
35 | - API Reference: api.md
36 | - Contributing: contributing.md
37 | plugins:
38 | - search
39 | - mike:
40 | version_selector: true
41 | css_dir: stylesheets
42 | canonical_version: latest
43 | alias_type: symlink
44 | deploy_prefix: ''
45 | - mkdocstrings:
46 | default_handler: python
47 | handlers:
48 | python:
49 | paths: [../vec_inf]
50 | options:
51 | docstring_style: numpy
52 | members_order: source
53 | separate_signature: true
54 | show_overloads: true
55 | show_submodules: true
56 | show_root_heading: false
57 | show_root_full_path: true
58 | show_root_toc_entry: false
59 | show_symbol_type_heading: true
60 | show_symbol_type_toc: true
61 | repo_url: https://github.com/VectorInstitute/vector-inference
62 | repo_name: VectorInstitute/vector-inference
63 | site_name: Vector Inference
64 | site_url: https://vectorinstitute.github.io/vector-inference/
65 | theme:
66 | name: material
67 | custom_dir: docs/overrides
68 |   favicon: assets/favicon.ico
69 | features:
70 | - content.code.annotate
71 | - content.code.copy
72 | - navigation.footer
73 | - navigation.indexes
74 | - navigation.instant
75 | - navigation.tabs
76 | - navigation.tabs.sticky
77 | - navigation.top
78 | - search.suggest
79 | - search.highlight
80 | - toc.follow
81 | icon:
82 | repo: fontawesome/brands/github
83 | logo: assets/vector-logo.svg
84 | logo_footer: assets/vector-logo.svg
85 | palette:
86 | - media: "(prefers-color-scheme: light)"
87 | scheme: default
88 | primary: vector
89 | accent: vector-teal
90 | toggle:
91 | icon: material/brightness-7
92 | name: Switch to dark mode
93 | - media: "(prefers-color-scheme: dark)"
94 | scheme: slate
95 | primary: black
96 | accent: vector-teal
97 | toggle:
98 | icon: material/brightness-4
99 | name: Switch to light mode
100 |
--------------------------------------------------------------------------------
/vec_inf/client/_slurm_vars.py:
--------------------------------------------------------------------------------
1 | """Slurm cluster configuration variables."""
2 |
3 | import os
4 | import warnings
5 | from pathlib import Path
6 | from typing import Any, TypeAlias
7 |
8 | import yaml
9 | from typing_extensions import Literal
10 |
11 |
12 | CACHED_CONFIG_DIR = Path("/model-weights/vec-inf-shared")
13 |
14 |
15 | def load_env_config() -> dict[str, Any]:
16 | """Load the environment configuration."""
17 |
18 | def load_yaml_config(path: Path) -> dict[str, Any]:
19 | """Load YAML config with error handling."""
20 | try:
21 | with path.open() as f:
22 | return yaml.safe_load(f) or {}
23 | except FileNotFoundError as err:
24 | raise FileNotFoundError(f"Could not find config: {path}") from err
25 | except yaml.YAMLError as err:
26 | raise ValueError(f"Error parsing YAML config at {path}: {err}") from err
27 |
28 | cached_config_path = CACHED_CONFIG_DIR / "environment.yaml"
29 | default_path = (
30 | cached_config_path
31 | if cached_config_path.exists()
32 | else Path(__file__).resolve().parent.parent / "config" / "environment.yaml"
33 | )
34 | config = load_yaml_config(default_path)
35 |
36 | user_path = os.getenv("VEC_INF_CONFIG_DIR")
37 | if user_path:
38 | user_path_obj = Path(user_path, "environment.yaml")
39 | if user_path_obj.exists():
40 | user_config = load_yaml_config(user_path_obj)
41 | config.update(user_config)
42 | else:
43 | warnings.warn(
44 |                 f"Could not find user config directory: {user_path}, reverting to default config located at {default_path}",
45 | UserWarning,
46 | stacklevel=2,
47 | )
48 |
49 | return config
50 |
51 |
52 | _config = load_env_config()
53 |
54 | # Extract path values
55 | IMAGE_PATH = _config["paths"]["image_path"]
56 |
57 | # Extract containerization info
58 | CONTAINER_LOAD_CMD = _config["containerization"]["module_load_cmd"]
59 | CONTAINER_MODULE_NAME = _config["containerization"]["module_name"]
60 |
61 | # Extract limits
62 | MAX_GPUS_PER_NODE = _config["limits"]["max_gpus_per_node"]
63 | MAX_NUM_NODES = _config["limits"]["max_num_nodes"]
64 | MAX_CPUS_PER_TASK = _config["limits"]["max_cpus_per_task"]
65 |
66 |
67 | # Create dynamic Literal types
68 | def create_literal_type(values: list[str], fallback: str = "") -> Any:
69 | """Create a Literal type from a list, with configurable fallback."""
70 | if not values:
71 | return Literal[fallback]
72 | return Literal[tuple(values)]
73 |
74 |
75 | QOS: TypeAlias = create_literal_type(_config["allowed_values"]["qos"]) # type: ignore[valid-type]
76 | PARTITION: TypeAlias = create_literal_type(_config["allowed_values"]["partition"]) # type: ignore[valid-type]
77 | RESOURCE_TYPE: TypeAlias = create_literal_type( # type: ignore[valid-type]
78 | _config["allowed_values"]["resource_type"]
79 | )
80 |
81 | # Extract required launch arguments (those without default values) and their
82 | # corresponding environment variables
83 | REQUIRED_ARGS: dict[str, str] = _config["required_args"]
84 |
85 | # Extract default arguments
86 | DEFAULT_ARGS: dict[str, str] = _config["default_args"]
87 |
--------------------------------------------------------------------------------
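
For reference, the sketch below shows the shape of the dictionary that `load_env_config()` is expected to return and that the module-level constants above read from. Every value is an illustrative placeholder, not an actual Vector cluster setting; the real values come from the packaged, cached, or user-provided `environment.yaml`.

```python
# Illustrative placeholder values only -- not actual cluster settings.
example_environment_config = {
    "paths": {"image_path": "/path/to/vllm-image.sif"},
    "containerization": {
        "module_load_cmd": "module load",
        "module_name": "apptainer",
    },
    "limits": {
        "max_gpus_per_node": 8,
        "max_num_nodes": 4,
        "max_cpus_per_task": 64,
    },
    "allowed_values": {
        "qos": ["normal"],
        "partition": ["gpu"],
        "resource_type": ["a100"],
    },
    # Required launch arguments and the environment variables that supply them.
    "required_args": {"account": "VEC_INF_ACCOUNT", "work_dir": "VEC_INF_WORK_DIR"},
    # Defaults consumed by ModelConfig in vec_inf/client/config.py.
    "default_args": {
        "cpus_per_task": "16",
        "mem_per_node": "64G",
        "qos": "",
        "time": "08:00:00",
        "partition": "",
        "resource_type": "",
        "exclude": "",
        "nodelist": "",
        "bind": "",
        "venv": "/path/to/venv",
        "log_dir": "/path/to/logs",
        "model_weights_parent_dir": "/path/to/model-weights",
    },
}
```
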
/tests/vec_inf/client/test_examples.py:
--------------------------------------------------------------------------------
1 | """Tests to verify the API examples function properly."""
2 |
3 | from pathlib import Path
4 | from unittest.mock import MagicMock, patch
5 |
6 | import pytest
7 |
8 | from vec_inf.client import ModelStatus, ModelType, VecInfClient
9 |
10 |
11 | @pytest.fixture
12 | def mock_client():
13 | """Create a mocked VecInfClient."""
14 | client = MagicMock(spec=VecInfClient)
15 |
16 | # Set up mock responses
17 | mock_model1 = MagicMock()
18 | mock_model1.name = "test-model"
19 | mock_model1.family = "test-family"
20 | mock_model1.type = ModelType.LLM
21 |
22 | mock_model2 = MagicMock()
23 | mock_model2.name = "test-model-2"
24 | mock_model2.family = "test-family-2"
25 | mock_model2.type = ModelType.VLM
26 |
27 | client.list_models.return_value = [mock_model1, mock_model2]
28 |
29 | launch_response = MagicMock()
30 | launch_response.slurm_job_id = "123456"
31 | launch_response.model_name = "Meta-Llama-3.1-8B-Instruct"
32 | client.launch_model.return_value = launch_response
33 |
34 | status_response = MagicMock()
35 | status_response.status = ModelStatus.READY
36 | status_response.base_url = "http://gpu123:8080/v1"
37 | client.wait_until_ready.return_value = status_response
38 |
39 | metrics_response = MagicMock()
40 | metrics_response.metrics = {"throughput": "10.5"}
41 | client.get_metrics.return_value = metrics_response
42 |
43 | return client
44 |
45 |
46 | @pytest.mark.skipif(
47 | not (
48 | Path(__file__).parent.parent.parent.parent
49 | / "examples"
50 | / "api"
51 | / "basic_usage.py"
52 | ).exists(),
53 | reason="Example file not found",
54 | )
55 | def test_api_usage_example():
56 | """Test the basic API usage example."""
57 | example_path = (
58 | Path(__file__).parent.parent.parent.parent
59 | / "examples"
60 | / "api"
61 | / "basic_usage.py"
62 | )
63 |
64 | # Create a mock client
65 | mock_client = MagicMock(spec=VecInfClient)
66 |
67 | # Set up mock responses
68 | mock_model = MagicMock()
69 | mock_model.name = "Meta-Llama-3.1-8B-Instruct"
70 | mock_model.type = ModelType.LLM
71 | mock_client.list_models.return_value = [mock_model]
72 |
73 | launch_response = MagicMock()
74 | launch_response.slurm_job_id = "123456"
75 | mock_client.launch_model.return_value = launch_response
76 |
77 | status_response = MagicMock()
78 | status_response.status = ModelStatus.READY
79 | status_response.base_url = "http://gpu123:8080/v1"
80 | mock_client.wait_until_ready.return_value = status_response
81 |
82 | metrics_response = MagicMock()
83 | metrics_response.metrics = {"throughput": "10.5"}
84 | mock_client.get_metrics.return_value = metrics_response
85 |
86 | # Mock the VecInfClient class
87 | with (
88 | patch("vec_inf.client.VecInfClient", return_value=mock_client),
89 | patch("builtins.print"),
90 | example_path.open() as f,
91 | ):
92 | exec(f.read())
93 |
94 | # Verify the client methods were called
95 | mock_client.list_models.assert_called_once()
96 | mock_client.launch_model.assert_called_once()
97 | mock_client.wait_until_ready.assert_called_once()
98 | mock_client.get_metrics.assert_called_once()
99 | mock_client.shutdown_model.assert_called_once()
100 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "vec-inf"
3 | version = "0.7.3"
4 | description = "Efficient LLM inference on Slurm clusters using vLLM."
5 | readme = "README.md"
6 | authors = [{name = "Marshall Wang", email = "marshall.wang@vectorinstitute.ai"}]
7 | license = "MIT"
8 | requires-python = ">=3.10"
9 | dependencies = [
10 | "requests>=2.31.0",
11 | "click>=8.1.0",
12 | "rich>=13.7.0",
13 | "pydantic>=2.10.6",
14 | "pyyaml>=6.0.2",
15 | ]
16 |
17 | [dependency-groups]
18 | dev = [
19 | "codecov>=2.1.13",
20 | "mypy>=1.15.0",
21 | "nbqa>=1.9.1",
22 | "openai>=1.65.1",
23 | "pip-audit>=2.8.0",
24 | "pre-commit>=4.1.0",
25 | "pytest>=8.3.4",
26 | "pytest-asyncio>=0.25.3",
27 | "pytest-cov>=6.0.0",
28 | "pytest-mock>=3.14.0",
29 | "ruff>=0.9.6",
30 | ]
31 | docs = [
32 | "mkdocs>=1.5.3",
33 | "mkdocs-material>=9.5.12",
34 | "mkdocstrings>=0.24.1",
35 | "mkdocstrings-python>=1.8.0",
36 | "pymdown-extensions>=10.7.1",
37 | "mike>=2.0.0",
38 | ]
39 |
40 | [project.optional-dependencies]
41 | dev = [
42 | "xgrammar>=0.1.11",
43 | "torch>=2.7.0",
44 | "vllm>=0.10.0",
45 | "ray[default]>=2.50.0",
46 | "cupy-cuda12x==12.1.0",
47 | "flashinfer-python>=0.4.0",
48 | "sglang>=0.5.0",
49 | ]
50 |
51 | [project.scripts]
52 | vec-inf = "vec_inf.cli._cli:cli"
53 |
54 | [build-system]
55 | requires = ["hatchling"]
56 | build-backend = "hatchling.build"
57 |
58 | [tool.hatch.build.targets.wheel]
59 | packages = ["vec_inf"]
60 |
61 | [tool.mypy]
62 | ignore_missing_imports = true
63 | install_types = true
64 | pretty = true
65 | namespace_packages = true
66 | explicit_package_bases = true
67 | non_interactive = true
68 | warn_unused_configs = true
69 | allow_any_generics = false
70 | allow_subclassing_any = false
71 | allow_untyped_calls = false
72 | allow_untyped_defs = false
73 | allow_incomplete_defs = false
74 | check_untyped_defs = true
75 | allow_untyped_decorators = false
76 | warn_redundant_casts = true
77 | warn_unused_ignores = true
78 | warn_return_any = true
79 | implicit_reexport = false
80 | strict_equality = true
81 | extra_checks = true
82 |
83 | [tool.ruff]
84 | include = ["*.py", "pyproject.toml", "*.ipynb"]
85 | line-length = 88
86 |
87 | [tool.ruff.format]
88 | quote-style = "double"
89 | indent-style = "space"
90 | docstring-code-format = true
91 |
92 | [tool.ruff.lint]
93 | select = [
94 | "A", # flake8-builtins
95 | "B", # flake8-bugbear
96 | "COM", # flake8-commas
97 | "C4", # flake8-comprehensions
98 | "RET", # flake8-return
99 | "SIM", # flake8-simplify
100 | "ICN", # flake8-import-conventions
101 | "Q", # flake8-quotes
102 | "RSE", # flake8-raise
103 | "D", # pydocstyle
104 | "E", # pycodestyle
105 | "F", # pyflakes
106 | "I", # isort
107 | "W", # pycodestyle
108 | "N", # pep8-naming
109 | "ERA", # eradicate
110 | "PL", # pylint
111 | ]
112 | fixable = ["A", "B", "COM", "C4", "RET", "SIM", "ICN", "Q", "RSE", "D", "E", "F", "I", "W", "N", "ERA", "PL"]
113 | ignore = [
114 | "B905", # `zip()` without an explicit `strict=` parameter
115 | "E501", # line too long
116 | "D203", # 1 blank line required before class docstring
117 | "D213", # Multi-line docstring summary should start at the second line
118 | "PLR2004", # Replace magic number with named constant
119 | "PLR0913", # Too many arguments
120 | "COM812", # Missing trailing comma
121 | ]
122 |
123 | # Ignore import violations in all `__init__.py` files.
124 | [tool.ruff.lint.per-file-ignores]
125 | "__init__.py" = ["E402", "F401", "F403", "F811"]
126 |
127 | [tool.ruff.lint.pep8-naming]
128 | ignore-names = ["X*", "setUp"]
129 |
130 | [tool.ruff.lint.isort]
131 | lines-after-imports = 2
132 |
133 | [tool.ruff.lint.pydocstyle]
134 | convention = "numpy"
135 |
136 | [tool.ruff.lint.pycodestyle]
137 | max-doc-length = 88
138 |
139 | [tool.pytest.ini_options]
140 | markers = [
141 | "integration_test: marks tests as integration tests",
142 | ]
143 |
144 | [tool.coverage]
145 | [tool.coverage.run]
146 | source=["vec_inf"]
147 | omit=["tests/*", "*__init__.py"]
148 |
--------------------------------------------------------------------------------
/profile/gen.py:
--------------------------------------------------------------------------------
1 | """Testing script."""
2 |
3 | import time
4 | from typing import List, Union
5 |
6 | import requests
7 |
8 |
9 | # Change the ENDPOINT and MODEL_PATH to match your setup
10 | ENDPOINT = "http://gpuXXX:XXXX/v1"
11 | MODEL_PATH = "Meta-Llama-3-70B"
12 |
13 | # Configuration
14 | API_KEY = "EMPTY"
15 | HEADERS = {
16 | "Authorization": f"Bearer {API_KEY}",
17 | "Content-Type": "application/json",
18 | }
19 |
20 | # Sample prompts for testing
21 | PROMPTS = [
22 | "Translate the following English text to French: 'Hello, how are you?'",
23 | "What is the square root of 144?",
24 | "Summarize the following paragraph: 'Artificial intelligence refers to the simulation of human intelligence in machines...'",
25 | "Explain the process of photosynthesis in plants.",
26 | "What are the main differences between classical and quantum physics?",
27 | "Summarize the plot of 'To Kill a Mockingbird' by Harper Lee.",
28 | "Describe the economic impacts of climate change on agriculture.",
29 | "Translate the following sentence into Spanish: 'Where is the closest grocery store?'",
30 | "How does a lithium-ion battery work?",
31 | "Provide a brief biography of Marie Curie.",
32 | "What are the key factors that led to the end of the Cold War?",
33 | "Write a poem about the sunset over the ocean.",
34 | "Explain the rules of chess.",
35 | "What is blockchain technology and how does it work?",
36 | "Give a step-by-step guide on how to bake chocolate chip cookies.",
37 | "Describe the human digestive system.",
38 | "What is the theory of relativity?",
39 | "How to perform a basic oil change on a car.",
40 | "What are the symptoms and treatments for type 2 diabetes?",
41 | "Summarize the last episode of 'Game of Thrones'.",
42 | "Explain the role of the United Nations in world peace.",
43 | "Describe the culture and traditions of Japan.",
44 | "Provide a detailed explanation of the stock market.",
45 | "How do solar panels generate electricity?",
46 | "What is machine learning and how is it applied in daily life?",
47 | "Discuss the impact of the internet on modern education.",
48 | "Write a short story about a lost dog finding its way home.",
49 | "What are the benefits of meditation?",
50 | "Explain the process of recycling plastic.",
51 | "What is the significance of the Magna Carta?",
52 | "How does the human immune system fight viruses?",
53 | "Describe the stages of a frog's life cycle.",
54 | "Explain Newton's three laws of motion.",
55 | "What are the best practices for sustainable farming?",
56 | "Give a history of the Olympic Games.",
57 | "What are the causes and effects of global warming?",
58 | "Write an essay on the importance of voting.",
59 | "How is artificial intelligence used in healthcare?",
60 | "What is the function of the Federal Reserve?",
61 | "Describe the geography of South America.",
62 | "Explain how to set up a freshwater aquarium.",
63 | "What are the major works of William Shakespeare?",
64 | "How do antibiotics work against bacterial infections?",
65 | "Discuss the role of art in society.",
66 | "What are the main sources of renewable energy?",
67 | "How to prepare for a job interview.",
68 | "Describe the life cycle of a butterfly.",
69 | "What are the main components of a computer?",
70 | "Write a review of the latest Marvel movie.",
71 | "What are the ethical implications of cloning?",
72 | "Explain the significance of the Pyramids of Giza.",
73 | "Describe the process of making wine.",
74 | "How does the GPS system work?",
75 | ]
76 |
77 |
78 | def send_request(prompt: List[str]) -> Union[float, None]:
79 | """Send a request to the API."""
80 | data = {"model": f"{MODEL_PATH}", "prompt": prompt, "max_tokens": 100}
81 | start_time = time.time()
82 | response = requests.post(f"{ENDPOINT}/completions", headers=HEADERS, json=data)
83 | duration = time.time() - start_time
84 | if response.status_code == 200:
85 | return duration
86 | return None
87 |
88 |
89 | def main() -> None:
90 | """Run main function."""
91 | for _ in range(10):
92 |         print(f"Sending a batch of {len(PROMPTS) * 20} prompts...")
93 | send_request(PROMPTS * 20)
94 | print("Done!")
95 |
96 |
97 | if __name__ == "__main__":
98 | main()
99 |
--------------------------------------------------------------------------------
/docs/contributing.md:
--------------------------------------------------------------------------------
1 | # Contributing to Vector Inference
2 |
3 | Thank you for your interest in contributing to Vector Inference! This guide will help you get started with development, testing, and documentation contributions.
4 |
5 | ## Development Setup
6 |
7 | ### Prerequisites
8 |
9 | - Python 3.10 or newer
10 | - [uv](https://github.com/astral-sh/uv) for dependency management
11 |
12 | ### Setting Up Development Environment
13 |
14 | 1. Clone the repository:
15 | ```bash
16 | git clone https://github.com/VectorInstitute/vector-inference.git
17 | cd vector-inference
18 | ```
19 |
20 | 2. Install development dependencies:
21 | ```bash
22 | uv sync --all-extras --group dev
23 | ```
24 |
25 | 3. Install pre-commit hooks:
26 | ```bash
27 | pre-commit install
28 | ```
29 |
30 | !!! tip "Using Virtual Environments"
31 | If you prefer using virtual environments, you can use `uv venv` to create one:
32 | ```bash
33 | uv venv
34 | source .venv/bin/activate
35 | ```
36 |
37 | ## Development Workflow
38 |
39 | ### Code Style and Linting
40 |
41 | We use several tools to ensure code quality:
42 |
43 | - **ruff** for linting and formatting
44 | - **mypy** for type checking
45 |
46 | You can run these tools with:
47 |
48 | ```bash
49 | # Linting
50 | uv run ruff check .
51 |
52 | # Type checking
53 | uv run mypy
54 |
55 | # Format code
56 | uv run ruff format .
57 | ```
58 |
59 | !!! note "Pre-commit Hooks"
60 | The pre-commit hooks will automatically run these checks before each commit.
61 | If the hooks fail, you will need to fix the issues before you can commit.
62 |
63 | ### Testing
64 |
65 | All new features and bug fixes should include tests. We use pytest for testing:
66 |
67 | ```bash
68 | # Run all tests
69 | uv run pytest
70 |
71 | # Run tests with coverage
72 | uv run pytest --cov=vec_inf
73 | ```
74 |
75 | ## Documentation
76 |
77 | ### Documentation Setup
78 |
79 | Install the documentation dependencies:
80 |
81 | ```bash
82 | uv sync --group docs
83 | ```
84 |
85 | ### Building Documentation
86 |
87 | Build and serve the documentation locally:
88 |
89 | ```bash
90 | # Standard build
91 | mkdocs build
92 |
93 | # Serve locally with hot-reload
94 | mkdocs serve
95 | ```
96 |
97 | ### Versioned Documentation
98 |
99 | Vector Inference uses [mike](https://github.com/jimporter/mike) to manage versioned documentation. This allows users to access documentation for specific versions of the library.
100 |
101 | #### Available Versions
102 |
103 | The documentation is available in multiple versions:
104 |
105 | - `latest` - Always points to the most recent stable release
106 | - Version-specific documentation (e.g., `0.5.0`, `0.4.0`)
107 |
108 | #### Versioning Strategy
109 |
110 | Our versioning strategy follows these rules:
111 |
112 | 1. Each release gets its own version number matching the package version (e.g., `0.5.0`)
113 | 2. The `latest` alias always points to the most recent stable release
114 | 3. Documentation is automatically deployed when changes are pushed to the main branch
115 |
116 | #### Working with Mike Locally
117 |
118 | To preview or work with versioned documentation:
119 |
120 | ```bash
121 | # Build and deploy a specific version to your local gh-pages branch
122 | mike deploy 0.5.0
123 |
124 | # Add an alias for the latest version
125 | mike deploy 0.5.0 latest
126 |
127 | # Set the default version to redirect to
128 | mike set-default latest
129 |
130 | # View the deployed versions
131 | mike list
132 |
133 | # Serve the versioned documentation locally
134 | mike serve
135 | ```
136 |
137 | #### Automatic Documentation Deployment
138 |
139 | Documentation is automatically deployed through GitHub Actions:
140 |
141 | - On pushes to `main`, documentation is deployed with the version from `pyproject.toml` and the `latest` alias
142 | - Through manual trigger in the GitHub Actions workflow, where you can specify the version to deploy
143 |
144 | !!! info "When to Update Documentation"
145 | - When adding new features
146 | - When changing existing APIs
147 | - When fixing bugs that affect user experience
148 | - When improving explanations or examples
149 |
150 | ## Pull Request Process
151 |
152 | 1. **Fork the repository** and create your branch from `main`
153 | 2. **Make your changes** and add appropriate tests
154 | 3. **Ensure tests pass** and code meets style guidelines
155 | 4. **Write clear documentation** for your changes
156 | 5. **Submit a pull request** with a clear description of the changes
157 |
158 | !!! important "Checklist Before Submitting PR"
159 | - [ ] All tests pass
160 | - [ ] Code is formatted with ruff
161 | - [ ] Type annotations are correct
162 | - [ ] Documentation is updated
163 | - [ ] Commit messages are clear and descriptive
164 |
165 | ## Release Process
166 |
167 | 1. Update version in `pyproject.toml`
168 | 2. Update changelogs and documentation as needed
169 | 3. Create a new tag and release on GitHub
170 | 4. Documentation for the new version will be automatically deployed
171 |
172 | ## License
173 |
174 | By contributing to Vector Inference, you agree that your contributions will be licensed under the project's [MIT License](https://github.com/VectorInstitute/vector-inference/blob/main/LICENSE).
175 |
--------------------------------------------------------------------------------
/.github/workflows/docs.yml:
--------------------------------------------------------------------------------
1 | name: docs
2 | permissions:
3 | contents: write
4 | pull-requests: write
5 |
6 | on:
7 | push:
8 | branches:
9 | - main
10 | paths:
11 | - .pre-commit-config.yaml
12 | - .github/workflows/docs.yml
13 | - '**.py'
14 | - '**.ipynb'
15 | - '**.html'
16 | - '**.js'
17 | - '**.md'
18 | - uv.lock
19 | - pyproject.toml
20 | - mkdocs.yml
21 | - '**.png'
22 | - '**.svg'
23 | pull_request:
24 | branches:
25 | - main
26 | paths:
27 | - .pre-commit-config.yaml
28 | - .github/workflows/docs.yml
29 | - '**.py'
30 | - '**.ipynb'
31 | - '**.js'
32 | - '**.html'
33 | - uv.lock
34 | - pyproject.toml
35 | - '**.md'
36 | - mkdocs.yml
37 | - '**.png'
38 | - '**.svg'
39 | release:
40 | types: [published]
41 | # Allow manual trigger
42 | workflow_dispatch:
43 | inputs:
44 | version:
45 | description: 'Version to deploy (e.g., 0.5.0, latest)'
46 | required: true
47 | default: 'latest'
48 |
49 | jobs:
50 | build:
51 | runs-on: ubuntu-latest
52 | steps:
53 | - name: Checkout code
54 | uses: actions/checkout@v5.0.0
55 | with:
56 | fetch-depth: 0 # Fetch all history for proper versioning
57 |
58 | - name: Install uv
59 | uses: astral-sh/setup-uv@v7
60 | with:
61 | version: "0.5.21"
62 | enable-cache: true
63 |
64 | - name: Set up Python
65 | uses: actions/setup-python@v6
66 | with:
67 | python-version-file: ".python-version"
68 |
69 | - name: Install the project
70 | run: uv sync --all-extras --group docs --prerelease=allow
71 |
72 | - name: Build docs
73 | run: uv run --frozen mkdocs build
74 |
75 | - name: Create .nojekyll file
76 | run: touch site/.nojekyll
77 |
78 | - name: Upload artifact
79 | uses: actions/upload-artifact@v5
80 | with:
81 | name: docs-site
82 | path: site/
83 | retention-days: 1
84 |
85 | deploy:
86 | needs: build
87 | if: (github.event_name == 'push' && github.ref == 'refs/heads/main') || github.event_name == 'workflow_dispatch' || github.event_name == 'release'
88 | runs-on: ubuntu-latest
89 | steps:
90 | - name: Checkout code
91 | uses: actions/checkout@v5.0.0
92 | with:
93 | fetch-depth: 0 # Fetch all history for proper versioning
94 |
95 | - name: Install uv
96 | uses: astral-sh/setup-uv@v7
97 | with:
98 | version: "0.5.21"
99 | enable-cache: true
100 |
101 | - name: Set up Python
102 | uses: actions/setup-python@v6
103 | with:
104 | python-version-file: ".python-version"
105 |
106 | - name: Install the project
107 | run: uv sync --all-extras --group docs --frozen
108 |
109 | - name: Configure Git Credentials
110 | run: |
111 | git config user.name github-actions[bot]
112 | git config user.email 41898282+github-actions[bot]@users.noreply.github.com
113 |
114 | - name: Download artifact
115 | uses: actions/download-artifact@v6
116 | with:
117 | name: docs-site
118 | path: site
119 |
120 | - name: Ensure .nojekyll exists
121 | run: touch site/.nojekyll
122 |
123 | - name: Determine version
124 | id: version
125 | run: |
126 | if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
127 | # Use the version provided in the workflow dispatch
128 | echo "VERSION=${{ github.event.inputs.version }}" >> $GITHUB_OUTPUT
129 | echo "VERSION_ALIAS=latest" >> $GITHUB_OUTPUT
130 | elif [[ "${{ github.event_name }}" == "release" ]]; then
131 | # Use the tag from the release
132 | VERSION="${{ github.ref_name }}"
133 | # Remove 'v' prefix if present
134 | VERSION="${VERSION#v}"
135 | echo "VERSION=$VERSION" >> $GITHUB_OUTPUT
136 | echo "VERSION_ALIAS=latest" >> $GITHUB_OUTPUT
137 | elif [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/main" ]]; then
138 | # For pushes to main, tag as "main"
139 | echo "VERSION=main" >> $GITHUB_OUTPUT
140 | # No alias for main
141 | echo "VERSION_ALIAS=" >> $GITHUB_OUTPUT
142 | else
143 | # Get version from pyproject.toml as fallback
144 | VERSION=$(grep -m 1 '^version = ' pyproject.toml | sed 's/^version = "\(.*\)"$/\1/')
145 | echo "VERSION=$VERSION" >> $GITHUB_OUTPUT
146 | echo "VERSION_ALIAS=latest" >> $GITHUB_OUTPUT
147 | fi
148 |
149 | - name: Deploy docs with mike
150 | run: |
151 | VERSION=${{ steps.version.outputs.VERSION }}
152 | ALIAS=${{ steps.version.outputs.VERSION_ALIAS }}
153 |
154 | # Add a temporary remote to fetch gh-pages if it exists
155 | git remote add temp https://github.com/${{ github.repository }}.git || true
156 | git fetch temp gh-pages || true
157 |
158 | DEPLOY_ARGS="--push --update-aliases $VERSION"
159 |
160 | if [[ ! -z "$ALIAS" ]]; then
161 | DEPLOY_ARGS="$DEPLOY_ARGS $ALIAS"
162 | fi
163 |
164 | # Activate the virtual environment
165 | source .venv/bin/activate
166 |
167 | echo "Running: mike deploy $DEPLOY_ARGS"
168 | mike deploy $DEPLOY_ARGS
169 |
170 | # Set default version to latest only if we're deploying a version with the latest alias
171 | if [[ ! -z "$ALIAS" && "$ALIAS" == "latest" ]]; then
172 | mike set-default --push latest
173 | fi
174 |
175 | # Remove the temporary remote
176 | git remote remove temp || true
177 |
--------------------------------------------------------------------------------
/docs/stylesheets/extra.css:
--------------------------------------------------------------------------------
1 | [data-md-color-primary="vector"] {
2 | --md-primary-fg-color: #eb088a;
3 | --md-primary-fg-color--light: #f252a5;
4 | --md-primary-fg-color--dark: #b00068;
5 | --md-primary-bg-color: hsla(0, 0%, 100%, 1);
6 | --md-primary-bg-color--light: hsla(0, 0%, 100%, 0.7);
7 | }
8 |
9 | [data-md-color-primary="black"] {
10 | --md-primary-fg-color: #181818;
11 | --md-primary-fg-color--light: #f252a5;
12 | --md-primary-fg-color--dark: #b00068;
13 | --md-primary-bg-color: #eb088a;
14 | }
15 |
16 | [data-md-color-accent="vector-teal"] {
17 | --md-accent-fg-color: #48c0d9;
18 | --md-accent-fg-color--transparent: #526cfe1a;
19 | --md-accent-bg-color: #fff;
20 | --md-accent-bg-color--light: #ffffffb3;
21 | }
22 |
23 | [data-md-color-scheme="slate"][data-md-color-primary="black"] {
24 | --md-typeset-a-color: #eb088a;
25 | }
26 |
27 | [data-md-color-scheme="default"] {
28 | /* Default light mode styling */
29 | }
30 |
31 | [data-md-color-scheme="slate"] {
32 | --md-typeset-a-color: #eb088a;
33 | /* Dark mode styling */
34 | }
35 |
36 | /* Vector logo CSS styling to match overrides/partials/copyright.html */
37 | .md-footer-vector {
38 | display: flex;
39 | align-items: center;
40 | padding: 0 0.6rem;
41 | }
42 |
43 | .md-footer-vector img {
44 | height: 24px; /* Reduce height to a fixed value */
45 | width: auto; /* Maintain aspect ratio */
46 | transition: opacity 0.25s;
47 | opacity: 0.7;
48 | }
49 |
50 | .md-footer-vector img:hover {
51 | opacity: 1;
52 | }
53 |
54 | /* Make the inner footer grid elements distribute evenly */
55 | .md-footer-meta__inner {
56 | display: flex;
57 | justify-content: space-between;
58 | align-items: center;
59 | }
60 |
61 | /* To make socials and Vector logo not stack when viewing on mobile */
62 | @media screen and (max-width: 76.234375em) {
63 | .md-footer-meta__inner.md-grid {
64 | flex-direction: row;
65 | justify-content: space-between;
66 | align-items: center;
67 | }
68 |
69 | .md-copyright,
70 | .md-social {
71 | width: auto;
72 | max-width: 49%;
73 | }
74 |
75 | /* Prevent margin that causes stacking */
76 | .md-social {
77 | margin: 0;
78 | }
79 | }
80 |
81 | /* Reduce margins for h2 when using grid cards */
82 | .grid.cards h2 {
83 | margin-top: 0; /* Remove top margin completely in cards */
84 | margin-bottom: 0.5rem; /* Smaller bottom margin in cards */
85 | }
86 |
87 | .vector-icon {
88 | color: #eb088a;
89 | opacity: 0.7;
90 | margin-right: 0.2em;
91 | }
92 |
93 | /* Version selector styling - Material theme */
94 |
95 | /* Version selector container */
96 | .md-version {
97 | position: relative;
98 | display: inline-block;
99 | margin-left: 0.25rem;
100 | }
101 |
102 | /* Current version button styling */
103 | .md-version__current {
104 | display: inline-flex;
105 | align-items: center;
106 | font-size: 0.7rem;
107 | font-weight: 600;
108 | color: var(--md-primary-bg-color);
109 | padding: 0.4rem 0.8rem;
110 | margin: 0.4rem 0;
111 | background-color: rgba(255, 255, 255, 0.1);
112 | border-radius: 4px;
113 | border: 1px solid rgba(255, 255, 255, 0.2);
114 | cursor: pointer;
115 | transition: all 0.15s ease-in-out;
116 | }
117 |
118 | /* Hover effect for current version button */
119 | .md-version__current:hover {
120 | background-color: rgba(255, 255, 255, 0.2);
121 | box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1);
122 | }
123 |
124 | /* Down arrow for version dropdown */
125 | .md-version__current:after {
126 | display: inline-block;
127 | margin-left: 0.5rem;
128 | content: "";
129 | vertical-align: middle;
130 | border-top: 0.3em solid;
131 | border-right: 0.3em solid transparent;
132 | border-bottom: 0;
133 | border-left: 0.3em solid transparent;
134 | }
135 |
136 | /* Dropdown menu */
137 | .md-version__list {
138 | position: absolute;
139 | top: 100%;
140 | left: 0;
141 | z-index: 10;
142 | min-width: 125%;
143 | margin: 0.1rem 0 0;
144 | padding: 0;
145 | background-color: var(--md-primary-fg-color);
146 | border-radius: 4px;
147 | box-shadow: 0 4px 16px rgba(0, 0, 0, 0.2);
148 | opacity: 0;
149 | visibility: hidden;
150 | transform: translateY(-8px);
151 | transition: all 0.2s ease;
152 | }
153 |
154 | /* Show dropdown when parent is hovered */
155 | .md-version:hover .md-version__list {
156 | opacity: 1;
157 | visibility: visible;
158 | transform: translateY(0);
159 | }
160 |
161 | /* Version list items */
162 | .md-version__item {
163 | list-style: none;
164 | padding: 0;
165 | }
166 |
167 | /* Version links */
168 | .md-version__link {
169 | display: block;
170 | padding: 0.5rem 1rem;
171 | font-size: 0.75rem;
172 | color: var(--md-primary-bg-color);
173 | transition: background-color 0.15s;
174 | text-decoration: none;
175 | }
176 |
177 | /* Version link hover */
178 | .md-version__link:hover {
179 | background-color: var(--md-primary-fg-color--dark);
180 | text-decoration: none;
181 | }
182 |
183 | /* Active version in dropdown */
184 | .md-version__link--active {
185 | background-color: var(--md-accent-fg-color);
186 | color: var(--md-accent-bg-color);
187 | font-weight: 700;
188 | }
189 |
190 | /* For the Material selector */
191 | .md-header__option {
192 | display: flex;
193 | align-items: center;
194 | }
195 |
196 | /* Version selector in Material 9.x */
197 | .md-select {
198 | position: relative;
199 | margin-left: 0.5rem;
200 | }
201 |
202 | .md-select__label {
203 | font-size: 0.7rem;
204 | font-weight: 600;
205 | color: var(--md-primary-bg-color);
206 | cursor: pointer;
207 | padding: 0.4rem 0.8rem;
208 | background-color: rgba(255, 255, 255, 0.1);
209 | border-radius: 4px;
210 | border: 1px solid rgba(255, 255, 255, 0.2);
211 | transition: all 0.15s ease-in-out;
212 | }
213 |
214 | .md-select__label:hover {
215 | background-color: rgba(255, 255, 255, 0.2);
216 | box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1);
217 | }
218 |
219 | /* Version selector in Material 9.2+ */
220 | .md-header__button.md-select {
221 | display: inline-flex;
222 | align-items: center;
223 | margin: 0 0.8rem;
224 | }
225 |
226 | /* For Material 9.x+ with specific version selector */
227 | .md-typeset .md-version-warn {
228 | padding: 0.6rem 1rem;
229 | margin: 1.5rem 0;
230 | background-color: rgba(235, 8, 138, 0.1);
231 | border-left: 4px solid #eb088a;
232 | border-radius: 0.2rem;
233 | color: var(--md-default-fg-color);
234 | font-size: 0.8rem;
235 | }
236 |
--------------------------------------------------------------------------------
/vec_inf/client/config.py:
--------------------------------------------------------------------------------
1 | """Model configuration.
2 |
3 | This module provides a Pydantic model for validating and managing model deployment
4 | configurations, including hardware requirements and model specifications.
5 | """
6 |
7 | from pathlib import Path
8 | from typing import Any, Optional, Union
9 |
10 | from pydantic import BaseModel, ConfigDict, Field
11 | from typing_extensions import Literal
12 |
13 | from vec_inf.client._slurm_vars import (
14 | DEFAULT_ARGS,
15 | MAX_CPUS_PER_TASK,
16 | MAX_GPUS_PER_NODE,
17 | MAX_NUM_NODES,
18 | PARTITION,
19 | QOS,
20 | RESOURCE_TYPE,
21 | )
22 |
23 |
24 | class ModelConfig(BaseModel):
25 | """Pydantic model for validating and managing model deployment configurations.
26 |
27 | A configuration class that handles validation and management of model deployment
28 | settings, including model specifications, hardware requirements, and runtime
29 | parameters.
30 |
31 | Parameters
32 | ----------
33 | model_name : str
34 | Name of the model, must be alphanumeric with allowed characters: '-', '_', '.'
35 | model_family : str
36 | Family/architecture of the model
37 | model_variant : str, optional
38 | Specific variant or version of the model family
39 | model_type : {'LLM', 'VLM', 'Text_Embedding', 'Reward_Modeling'}
40 | Type of model architecture
41 | gpus_per_node : int
42 | Number of GPUs to use per node (1-MAX_GPUS_PER_NODE)
43 | num_nodes : int
44 | Number of nodes to use for deployment (1-MAX_NUM_NODES)
45 | cpus_per_task : int, optional
46 | Number of CPU cores per task (1-MAX_CPUS_PER_TASK)
47 | mem_per_node : str, optional
48 | Memory allocation per node in GB format (e.g., '32G')
49 | vocab_size : int
50 | Size of the model's vocabulary (1-1,000,000)
51 | account : str, optional
52 | Charge resources used by this job to specified account.
53 | work_dir : str, optional
54 | Set working directory for the batch job
55 | qos : Union[QOS, str], optional
56 | Quality of Service tier for job scheduling
57 | time : str, optional
58 | Time limit for the job in HH:MM:SS format
59 | partition : Union[PARTITION, str], optional
60 | Slurm partition for job scheduling
61 | resource_type : Union[RESOURCE_TYPE, str], optional
62 | Type of resource to request for the job
63 | venv : str, optional
64 | Virtual environment or container system to use
65 | log_dir : Path, optional
66 | Directory path for storing logs
67 | model_weights_parent_dir : Path, optional
68 | Base directory containing model weights
69 | vllm_args : dict[str, Any], optional
70 | Additional arguments for vLLM engine configuration
71 |
72 | Notes
73 | -----
74 | All fields are validated using Pydantic's validation system. The model is
75 | configured to be immutable (frozen) and forbids extra fields.
76 | """
77 |
78 | model_name: str = Field(..., min_length=3, pattern=r"^[a-zA-Z0-9\-_\.]+$")
79 | model_family: str = Field(..., min_length=2)
80 | model_variant: Optional[str] = Field(
81 | default=None, description="Specific variant/version of the model family"
82 | )
83 | model_type: Literal["LLM", "VLM", "Text_Embedding", "Reward_Modeling"] = Field(
84 | ..., description="Type of model architecture"
85 | )
86 | gpus_per_node: int = Field(
87 | ..., gt=0, le=MAX_GPUS_PER_NODE, description="GPUs per node"
88 | )
89 | num_nodes: int = Field(..., gt=0, le=MAX_NUM_NODES, description="Number of nodes")
90 | cpus_per_task: int = Field(
91 | default=int(DEFAULT_ARGS["cpus_per_task"]),
92 | gt=0,
93 | le=MAX_CPUS_PER_TASK,
94 | description="CPUs per task",
95 | )
96 | mem_per_node: str = Field(
97 | default=DEFAULT_ARGS["mem_per_node"],
98 | pattern=r"^\d{1,4}G$",
99 | description="Memory per node",
100 | )
101 | vocab_size: int = Field(..., gt=0, le=1_000_000)
102 | account: Optional[str] = Field(
103 | default=None, description="Account name for job scheduling"
104 | )
105 | work_dir: Optional[str] = Field(
106 | default=None, description="Working directory for the job"
107 | )
108 | qos: Optional[Union[QOS, str]] = Field(
109 | default=DEFAULT_ARGS["qos"] if DEFAULT_ARGS["qos"] != "" else None,
110 | description="Quality of Service tier",
111 | )
112 | time: str = Field(
113 | default=DEFAULT_ARGS["time"],
114 | pattern=r"^\d{2}:\d{2}:\d{2}$",
115 | description="HH:MM:SS time limit",
116 | )
117 | partition: Optional[Union[PARTITION, str]] = Field(
118 | default=DEFAULT_ARGS["partition"] if DEFAULT_ARGS["partition"] != "" else None,
119 | description="GPU partition type",
120 | )
121 | resource_type: Optional[Union[RESOURCE_TYPE, str]] = Field(
122 | default=DEFAULT_ARGS["resource_type"]
123 | if DEFAULT_ARGS["resource_type"] != ""
124 | else None,
125 | description="Resource type",
126 | )
127 | exclude: Optional[str] = Field(
128 | default=DEFAULT_ARGS["exclude"],
129 | description="Exclude certain nodes from the resources granted to the job",
130 | )
131 | nodelist: Optional[str] = Field(
132 | default=DEFAULT_ARGS["nodelist"],
133 | description="Request a specific list of nodes for deployment",
134 | )
135 | bind: Optional[str] = Field(
136 | default=DEFAULT_ARGS["bind"],
137 | description="Additional binds for the container",
138 | )
139 | venv: str = Field(
140 | default=DEFAULT_ARGS["venv"],
141 | description="Virtual environment/container system",
142 | )
143 | log_dir: Path = Field(
144 | default=Path(DEFAULT_ARGS["log_dir"]),
145 | description="Log directory path",
146 | )
147 | model_weights_parent_dir: Path = Field(
148 | default=Path(DEFAULT_ARGS["model_weights_parent_dir"]),
149 | description="Base directory for model weights",
150 | )
151 | vllm_args: Optional[dict[str, Any]] = Field(
152 | default={}, description="vLLM engine arguments"
153 | )
154 | env: Optional[dict[str, Any]] = Field(
155 | default={}, description="Environment variables to be set"
156 | )
157 | model_config = ConfigDict(
158 | extra="forbid", str_strip_whitespace=True, validate_default=True, frozen=True
159 | )
160 |
--------------------------------------------------------------------------------
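
A minimal sketch of constructing a `ModelConfig` directly. The field values are illustrative, and because defaults are filled in from the cluster's `environment.yaml`, this assumes a valid environment configuration is discoverable (packaged, cached, or via `VEC_INF_CONFIG_DIR`).

```python
# Illustrative values; only the required fields are set, the rest come from
# the defaults defined in environment.yaml.
from vec_inf.client.config import ModelConfig

config = ModelConfig(
    model_name="Meta-Llama-3.1-8B-Instruct",
    model_family="Meta-Llama-3.1",
    model_variant="8B-Instruct",
    model_type="LLM",
    gpus_per_node=1,
    num_nodes=1,
    vocab_size=128256,
)
print(config.model_dump(exclude_none=True))
```
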
/vec_inf/client/models.py:
--------------------------------------------------------------------------------
1 | """Data models for Vector Inference API.
2 |
3 | This module contains the data model classes used by the Vector Inference API
4 | for both request parameters and response objects.
5 |
6 | Classes
7 | -------
8 | ModelStatus : Enum
9 | Status states of a model
10 | ModelType : Enum
11 | Types of supported models
12 | LaunchResponse : dataclass
13 |     Response from model launch operation
14 | BatchLaunchResponse : dataclass
15 |     Response from batch model launch operation
16 | StatusResponse : dataclass
17 |     Response from model status check
18 | MetricsResponse : dataclass
19 |     Response from metrics collection
20 | LaunchOptions : dataclass
21 |     Options for model launch
22 | ModelInfo : dataclass
23 |     Information about available models
24 | """
25 |
26 | from dataclasses import dataclass, field
27 | from enum import Enum
28 | from typing import Any, Optional, Union
29 |
30 |
31 | class ModelStatus(str, Enum):
32 | """Enum representing the possible status states of a model.
33 |
34 | Attributes
35 | ----------
36 | PENDING : str
37 | Model is waiting for Slurm to allocate resources
38 | LAUNCHING : str
39 | Model is in the process of starting
40 | READY : str
41 | Model is running and ready to serve requests
42 | FAILED : str
43 | Model failed to start or encountered an error
44 | SHUTDOWN : str
45 | Model was intentionally stopped
46 | UNAVAILABLE : str
47 | Model status cannot be determined
48 | """
49 |
50 | PENDING = "PENDING"
51 | LAUNCHING = "LAUNCHING"
52 | READY = "READY"
53 | FAILED = "FAILED"
54 | SHUTDOWN = "SHUTDOWN"
55 | UNAVAILABLE = "UNAVAILABLE"
56 |
57 |
58 | class ModelType(str, Enum):
59 | """Enum representing the possible model types.
60 |
61 | Attributes
62 | ----------
63 | LLM : str
64 | Large Language Model
65 | VLM : str
66 | Vision Language Model
67 | TEXT_EMBEDDING : str
68 | Text Embedding Model
69 | REWARD_MODELING : str
70 | Reward Modeling Model
71 | """
72 |
73 | LLM = "LLM"
74 | VLM = "VLM"
75 | TEXT_EMBEDDING = "Text_Embedding"
76 | REWARD_MODELING = "Reward_Modeling"
77 |
78 |
79 | @dataclass
80 | class LaunchResponse:
81 | """Response from launching a model.
82 |
83 | Parameters
84 | ----------
85 | slurm_job_id : str
86 | ID of the launched SLURM job
87 | model_name : str
88 | Name of the launched model
89 | config : dict[str, Any]
90 | Configuration used for the launch
91 | raw_output : str
92 | Raw output from the launch command (hidden from repr)
93 | """
94 |
95 | slurm_job_id: str
96 | model_name: str
97 | config: dict[str, Any]
98 | raw_output: str = field(repr=False)
99 |
100 |
101 | @dataclass
102 | class BatchLaunchResponse:
103 | """Response from launching multiple models in batch mode.
104 |
105 | Parameters
106 | ----------
107 | slurm_job_id : str
108 | ID of the launched SLURM job
109 | slurm_job_name : str
110 | Name of the launched SLURM job
111 | model_names : list[str]
112 | Names of the launched models
113 | config : dict[str, Any]
114 | Configuration used for the launch
115 | raw_output : str
116 | Raw output from the launch command (hidden from repr)
117 | """
118 |
119 | slurm_job_id: str
120 | slurm_job_name: str
121 | model_names: list[str]
122 | config: dict[str, Any]
123 | raw_output: str = field(repr=False)
124 |
125 |
126 | @dataclass
127 | class StatusResponse:
128 | """Response from checking a model's status.
129 |
130 | Parameters
131 | ----------
132 | model_name : str
133 | Name of the model
134 | log_dir : str
135 | Path to the SLURM log directory
136 | server_status : ModelStatus
137 | Current status of the server
138 | job_state : Union[str, ModelStatus]
139 | Current state of the SLURM job
140 | raw_output : str
141 | Raw output from status check (hidden from repr)
142 | base_url : str, optional
143 | Base URL of the model server if ready
144 | pending_reason : str, optional
145 | Reason for pending state if applicable
146 | failed_reason : str, optional
147 | Reason for failure if applicable
148 | """
149 |
150 | model_name: str
151 | log_dir: str
152 | server_status: ModelStatus
153 | job_state: Union[str, ModelStatus]
154 | raw_output: str = field(repr=False)
155 | base_url: Optional[str] = None
156 | pending_reason: Optional[str] = None
157 | failed_reason: Optional[str] = None
158 |
159 |
160 | @dataclass
161 | class MetricsResponse:
162 | """Response from retrieving model metrics.
163 |
164 | Parameters
165 | ----------
166 | model_name : str
167 | Name of the model
168 | metrics : Union[dict[str, float], str]
169 | Either a dictionary of metrics or an error message
170 | timestamp : float
171 | Unix timestamp of when metrics were collected
172 | """
173 |
174 | model_name: str
175 | metrics: Union[dict[str, float], str]
176 | timestamp: float
177 |
178 |
179 | @dataclass
180 | class LaunchOptions:
181 | """Options for launching a model.
182 |
183 | Parameters
184 | ----------
185 | model_family : str, optional
186 | Family/architecture of the model
187 | model_variant : str, optional
188 | Specific variant/version of the model
189 | partition : str, optional
190 | SLURM partition to use
191 | resource_type : str, optional
192 | Type of resource to request for the job
193 | num_nodes : int, optional
194 | Number of nodes to allocate
195 | gpus_per_node : int, optional
196 | Number of GPUs per node
197 | cpus_per_task : int, optional
198 | Number of CPUs per task
199 | mem_per_node : str, optional
200 | Memory per node
201 | account : str, optional
202 | Account name for job scheduling
203 | work_dir : str, optional
204 | Set working directory for the batch job
205 | qos : str, optional
206 | Quality of Service level
207 | time : str, optional
208 | Time limit for the job
209 | exclude : str, optional
210 | Exclude certain nodes from the resources granted to the job
211 |     nodelist : str, optional
212 | Request a specific list of nodes for deployment
213 | bind : str, optional
214 | Additional binds for the container as a comma separated list of bind paths
215 | vocab_size : int, optional
216 | Size of model vocabulary
217 | data_type : str, optional
218 | Data type for model weights
219 | venv : str, optional
220 | Virtual environment to use
221 | log_dir : str, optional
222 | Directory for logs
223 | model_weights_parent_dir : str, optional
224 | Parent directory containing model weights
225 | vllm_args : str, optional
226 | Additional arguments for vLLM
227 | env : str, optional
228 | Environment variables to be set
229 | config : str, optional
230 | Path to custom model config yaml
231 | """
232 |
233 | model_family: Optional[str] = None
234 | model_variant: Optional[str] = None
235 | partition: Optional[str] = None
236 | resource_type: Optional[str] = None
237 | num_nodes: Optional[int] = None
238 | gpus_per_node: Optional[int] = None
239 | cpus_per_task: Optional[int] = None
240 | mem_per_node: Optional[str] = None
241 | account: Optional[str] = None
242 | work_dir: Optional[str] = None
243 | qos: Optional[str] = None
244 | exclude: Optional[str] = None
245 | nodelist: Optional[str] = None
246 | bind: Optional[str] = None
247 | time: Optional[str] = None
248 | vocab_size: Optional[int] = None
249 | data_type: Optional[str] = None
250 | venv: Optional[str] = None
251 | log_dir: Optional[str] = None
252 | model_weights_parent_dir: Optional[str] = None
253 | vllm_args: Optional[str] = None
254 | env: Optional[str] = None
255 | config: Optional[str] = None
256 |
257 |
258 | @dataclass
259 | class ModelInfo:
260 | """Information about an available model.
261 |
262 | Parameters
263 | ----------
264 | name : str
265 | Name of the model
266 | family : str
267 | Family/architecture of the model
268 | variant : str, optional
269 | Specific variant/version of the model
270 | model_type : ModelType
271 | Type of the model
272 | config : dict[str, Any]
273 | Additional configuration parameters
274 | """
275 |
276 | name: str
277 | family: str
278 | variant: Optional[str]
279 | model_type: ModelType
280 | config: dict[str, Any]
281 |
--------------------------------------------------------------------------------
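
A short sketch of consuming a `StatusResponse`; the `describe` helper below is hypothetical and not part of the package.

```python
# Hypothetical helper (not part of vec_inf): summarize a StatusResponse.
from vec_inf.client.models import ModelStatus, StatusResponse


def describe(status: StatusResponse) -> str:
    """Return a one-line, human-readable summary of a status response."""
    if status.server_status == ModelStatus.READY:
        return f"{status.model_name} is serving at {status.base_url}"
    if status.server_status == ModelStatus.PENDING:
        return f"{status.model_name} is pending: {status.pending_reason}"
    if status.server_status == ModelStatus.FAILED:
        return f"{status.model_name} failed: {status.failed_reason}"
    return f"{status.model_name} is {status.server_status.value}"
```
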
/README.md:
--------------------------------------------------------------------------------
1 | # Vector Inference: Easy inference on Slurm clusters
2 |
3 | ----------------------------------------------------
4 |
5 | [](https://pypi.org/project/vec-inf)
6 | [](https://pypistats.org/packages/vec-inf)
7 | [](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml)
8 | [](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs.yml)
9 | [](https://app.codecov.io/github/VectorInstitute/vector-inference/tree/main)
10 | [](https://docs.vllm.ai/en/v0.11.0/)
11 | 
12 |
13 | This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **This package runs natively on the Vector Institute cluster environments**. To adapt to other environments, follow the instructions in [Installation](#installation).
14 |
15 | **NOTE**: Supported models on Killarney are tracked [here](./MODEL_TRACKING.md)
16 |
17 | ## Installation
18 | If you are using the Vector cluster environment and you don't need any customization to the inference server environment, run the following to install the package:
19 |
20 | ```bash
21 | pip install vec-inf
22 | ```
23 | Otherwise, we recommend using the provided [`Dockerfile`](Dockerfile) to set up your own environment with the package. The latest image has `vLLM` version `0.11.0`.
24 |
25 | If you'd like to use `vec-inf` on your own Slurm cluster, you will need to update the configuration files. There are 3 ways to do it:
26 | * Clone the repository and update the `environment.yaml` and `models.yaml` files in [`vec_inf/config`](vec_inf/config/), then install from source by running `pip install .`.
27 | * The package looks for cached configuration files in your environment before falling back to the default configuration. The default cached configuration directory is `/model-weights/vec-inf-shared`; create an `environment.yaml` and a `models.yaml` there following the format of the files in [`vec_inf/config`](vec_inf/config/).
28 | * The package also looks for the environment variable `VEC_INF_CONFIG_DIR`. You can put your `environment.yaml` and `models.yaml` in a directory of your choice and set `VEC_INF_CONFIG_DIR` to point to that location.
29 |
30 | ## Usage
31 |
32 | Vector Inference provides two user interfaces: a CLI and an API.
33 |
34 | ### CLI
35 |
36 | The `launch` command allows users to deploy a model as a Slurm job. If the job launches successfully, a URL endpoint is exposed for the user to send inference requests to.
37 |
38 | We will use the Llama 3.1 model as an example; to launch an OpenAI-compatible inference server for Meta-Llama-3.1-8B-Instruct, run:
39 |
40 | ```bash
41 | vec-inf launch Meta-Llama-3.1-8B-Instruct
42 | ```
43 | You should see an output like the following:
44 |
45 |
46 |
47 | **NOTE**: You can set the required fields in the environment configuration (`environment.yaml`); it is a mapping between required arguments and their corresponding environment variables. On the Vector **Killarney** cluster environment, the required fields are:
48 | * `--account`, `-A`: The Slurm account. This argument can be given a default by setting the environment variable `VEC_INF_ACCOUNT`.
49 | * `--work-dir`, `-D`: A working directory other than your home directory. This argument can be given a default by setting the environment variable `VEC_INF_WORK_DIR`.
50 |
51 | Models that are already supported by `vec-inf` are launched using the cached configuration (set in [`_slurm_vars.py`](vec_inf/client/_slurm_vars.py)) or the [default configuration](vec_inf/config/models.yaml). You can override these values by providing additional parameters; use `vec-inf launch --help` to see the full list of parameters that can be overridden. You can also launch your own custom model as long as the model architecture is [supported by vLLM](https://docs.vllm.ai/en/stable/models/supported_models.html). For detailed instructions on how to customize your model launch, check out the [`launch` command section in the User Guide](https://vectorinstitute.github.io/vector-inference/latest/user_guide/#launch-command).
52 |
53 | #### Other commands
54 |
55 | * `batch-launch`: Launch multiple model inference servers at once; currently ONLY single-node models are supported.
56 | * `status`: Check the status of all `vec-inf` jobs, or a specific job by providing its job ID.
57 | * `metrics`: Stream performance metrics to the console.
58 | * `shutdown`: Shut down a model by providing its Slurm job ID.
59 | * `list`: List all available model names, or view the default/cached configuration of a specific model.
60 | * `cleanup`: Remove old log directories; use `--help` to see the supported filters and `--dry-run` to preview what would be deleted.
61 |
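For example, a typical session with these commands might look like the following (the job ID is a placeholder; run each command with `--help` for the exact arguments available):

```bash
vec-inf status 1234567        # check a specific job (omit the ID to see all vec-inf jobs)
vec-inf metrics 1234567       # stream performance metrics for a job
vec-inf list                  # list all available model names
vec-inf shutdown 1234567      # stop the inference server
vec-inf cleanup --dry-run     # preview which log directories would be removed
```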
62 | For more details on the usage of these commands, refer to the [User Guide](https://vectorinstitute.github.io/vector-inference/user_guide/).
63 |
64 | ### API
65 |
66 | Example:
67 |
68 | ```python
69 | >>> from vec_inf.api import VecInfClient
70 | >>> client = VecInfClient()
71 | >>> # Assume VEC_INF_ACCOUNT and VEC_INF_WORK_DIR are set
72 | >>> response = client.launch_model("Meta-Llama-3.1-8B-Instruct")
73 | >>> job_id = response.slurm_job_id
74 | >>> status = client.get_status(job_id)
75 | >>> if status.server_status == ModelStatus.READY:
76 | ...     print(f"Model is ready at {status.base_url}")
77 | >>> # Alternatively, wait_until_ready blocks until it returns a StatusResponse or raises a ServerError
78 | >>> try:
79 | ...     status = client.wait_until_ready(job_id)
80 | ... except ServerError as e:
81 | ...     print(f"Model launch failed: {e}")
82 | >>> client.shutdown_model(job_id)
83 | ```
84 |
85 | For details on the usage of the API, refer to the [API Reference](https://vectorinstitute.github.io/vector-inference/api/).
86 |
87 | ## Check Job Configuration
88 |
89 | With every model launch, a Slurm script is generated dynamically based on the job and model configuration. Once the Slurm job is queued, the generated Slurm script is moved to the log directory for reproducibility, located at `$log_dir/$model_family/$model_name.$slurm_job_id/$model_name.$slurm_job_id.slurm`. In the same directory you can also find a JSON file with the same name that captures the launch configuration and gains a server URL entry once the server is ready.
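For example, with the default log directory (`~/.vec-inf-logs`), the generated files for a job could be inspected as follows (model family, model name, and job ID are placeholders):

```bash
# Generated Slurm script, kept for reproducibility
cat ~/.vec-inf-logs/Meta-Llama-3.1/Meta-Llama-3.1-8B-Instruct.1234567/Meta-Llama-3.1-8B-Instruct.1234567.slurm

# Launch configuration; a "server_address" entry is added once the server is ready
cat ~/.vec-inf-logs/Meta-Llama-3.1/Meta-Llama-3.1-8B-Instruct.1234567/Meta-Llama-3.1-8B-Instruct.1234567.json
```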
90 |
91 | ## Send inference requests
92 |
93 | Once the inference server is ready, you can start sending inference requests. We provide example scripts for sending inference requests in the [`examples`](examples) folder. Make sure to update the model server URL and the model weights location in the scripts. For example, you can run `python examples/inference/llm/chat_completions.py`, and you should expect to see an output like the following:
94 |
95 | ```json
96 | {
97 | "id":"chatcmpl-387c2579231948ffaf66cdda5439d3dc",
98 | "choices": [
99 | {
100 | "finish_reason":"stop",
101 | "index":0,
102 | "logprobs":null,
103 | "message": {
104 | "content":"Arrr, I be Captain Chatbeard, the scurviest chatbot on the seven seas! Ye be wantin' to know me identity, eh? Well, matey, I be a swashbucklin' AI, here to provide ye with answers and swappin' tales, savvy?",
105 | "role":"assistant",
106 | "function_call":null,
107 | "tool_calls":[],
108 | "reasoning_content":null
109 | },
110 | "stop_reason":null
111 | }
112 | ],
113 | "created":1742496683,
114 | "model":"Meta-Llama-3.1-8B-Instruct",
115 | "object":"chat.completion",
116 | "system_fingerprint":null,
117 | "usage": {
118 | "completion_tokens":66,
119 | "prompt_tokens":32,
120 | "total_tokens":98,
121 | "prompt_tokens_details":null
122 | },
123 | "prompt_logprobs":null
124 | }
125 |
126 | ```
127 | **NOTE**: Certain models don't adhere to OpenAI's chat template, e.g. the Mistral family. For these models, you can either change your prompt to follow the model's default chat template or provide your own chat template via `--chat-template TEMPLATE_PATH`.
128 |
129 | ## SSH tunnel from your local device
130 | If you want to run inference from your local device, you can open an SSH tunnel to your cluster environment as follows:
131 | ```bash
132 | ssh -L 8081:10.1.1.29:8081 username@v.vectorinstitute.ai -N
133 | ```
134 | The example provided above is for the Vector Killarney cluster; change the values accordingly for your environment. The IP addresses of the compute nodes on Killarney follow the `10.1.1.XX` pattern, where `XX` is the compute node number (`kn029` -> `29` in this example).
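Once the tunnel is up, requests from your local device go to the forwarded local port; for example (matching the placeholder port above):

```bash
curl http://localhost:8081/v1/models
```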
135 |
136 | ## Reference
137 | If you found Vector Inference useful in your research or applications, please cite using the following BibTeX template:
138 | ```
139 | @software{vector_inference,
140 | title = {Vector Inference: Efficient LLM inference on Slurm clusters using vLLM},
141 | author = {Wang, Marshall},
142 | organization = {Vector Institute},
143 | year = {},
144 | version = {},
145 | url = {https://github.com/VectorInstitute/vector-inference}
146 | }
147 | ```
148 |
--------------------------------------------------------------------------------
/MODEL_TRACKING.md:
--------------------------------------------------------------------------------
1 | # Model Weights Tracking
2 |
3 | This document tracks all model weights available in the `/model-weights` directory on the Killarney cluster and indicates which ones have existing configurations in the cached model config (`/model-weights/vec-inf-shared/models.yaml`). By default, `vec-inf` uses the cached model config. To request new model weights to be downloaded or a model configuration to be added, please open a "Model request" issue.
4 |
5 | **NOTE**: The [`models.yaml`](./vec_inf/config/models.yaml) file in the package is not always up to date with the latest cached model config on the Killarney cluster; new model configs are added to the cached config first, and `models.yaml` is updated to reflect it when a new version of the package is released.
6 |
7 | ## Legend
8 | - ✅ **Configured**: Model has a complete configuration in `models.yaml`
9 | - ❌ **Not Configured**: Model exists in `/model-weights` but lacks configuration
10 |
11 | ---
12 |
13 | ## Text Generation Models (LLM)
14 |
15 | ### Cohere for AI: Command R
16 | | Model | Configuration |
17 | |:------|:-------------|
18 | | `c4ai-command-r-plus-08-2024` | ✅ |
19 | | `c4ai-command-r-08-2024` | ✅ |
20 |
21 | ### Code Llama
22 | | Model | Configuration |
23 | |:------|:-------------|
24 | | `CodeLlama-7b-hf` | ✅ |
25 | | `CodeLlama-7b-Instruct-hf` | ✅ |
26 | | `CodeLlama-13b-hf` | ✅ |
27 | | `CodeLlama-13b-Instruct-hf` | ✅ |
28 | | `CodeLlama-34b-hf` | ✅ |
29 | | `CodeLlama-34b-Instruct-hf` | ✅ |
30 | | `CodeLlama-70b-hf` | ✅ |
31 | | `CodeLlama-70b-Instruct-hf` | ✅ |
32 | | `CodeLlama-7b-Python-hf` | ❌ |
33 | | `CodeLlama-13b-Python-hf` | ❌ |
34 | | `CodeLlama-70b-Python-hf` | ❌ |
35 |
36 | ### Google: Gemma
37 | | Model | Configuration |
38 | |:------|:-------------|
39 | | `gemma-2b` | ❌ |
40 | | `gemma-2b-it` | ❌ |
41 | | `gemma-7b` | ❌ |
42 | | `gemma-7b-it` | ❌ |
43 | | `gemma-2-2b-it` | ✅ |
44 | | `gemma-2-9b` | ✅ |
45 | | `gemma-2-9b-it` | ✅ |
46 | | `gemma-2-27b` | ✅ |
47 | | `gemma-2-27b-it` | ✅ |
48 | | `gemma-3-1b-it` | ❌ |
49 | | `gemma-3-4b-it` | ❌ |
50 | | `gemma-3-12b-it` | ❌ |
51 | | `gemma-3-27b-it` | ❌ |
52 |
53 | ### Meta: Llama 2
54 | | Model | Configuration |
55 | |:------|:-------------|
56 | | `Llama-2-7b-hf` | ✅ |
57 | | `Llama-2-7b-chat-hf` | ✅ |
58 | | `Llama-2-13b-hf` | ✅ |
59 | | `Llama-2-13b-chat-hf` | ✅ |
60 | | `Llama-2-70b-hf` | ✅ |
61 | | `Llama-2-70b-chat-hf` | ✅ |
62 |
63 | ### Meta: Llama 3
64 | | Model | Configuration |
65 | |:------|:-------------|
66 | | `Meta-Llama-3-8B` | ✅ |
67 | | `Meta-Llama-3-8B-Instruct` | ✅ |
68 | | `Meta-Llama-3-70B` | ✅ |
69 | | `Meta-Llama-3-70B-Instruct` | ✅ |
70 |
71 | ### Meta: Llama 3.1
72 | | Model | Configuration |
73 | |:------|:-------------|
74 | | `Meta-Llama-3.1-8B` | ✅ |
75 | | `Meta-Llama-3.1-8B-Instruct` | ✅ |
76 | | `Meta-Llama-3.1-70B` | ✅ |
77 | | `Meta-Llama-3.1-70B-Instruct` | ✅ |
78 | | `Meta-Llama-3.1-405B-Instruct` | ✅ |
79 |
80 | ### Meta: Llama 3.2
81 | | Model | Configuration |
82 | |:------|:-------------|
83 | | `Llama-3.2-1B` | ✅ |
84 | | `Llama-3.2-1B-Instruct` | ✅ |
85 | | `Llama-3.2-3B` | ✅ |
86 | | `Llama-3.2-3B-Instruct` | ✅ |
87 |
88 | ### Meta: Llama 3.3
89 | | Model | Configuration |
90 | |:------|:-------------|
91 | | `Llama-3.3-70B-Instruct` | ✅ |
92 |
93 | ### Meta: Llama 4
94 | | Model | Configuration |
95 | |:------|:-------------|
96 | | `Llama-4-Scout-17B-16E-Instruct` | ❌ |
97 |
98 | ### Mistral AI: Mistral
99 | | Model | Configuration |
100 | |:------|:-------------|
101 | | `Mistral-7B-v0.3` | ✅ |
102 | | `Mistral-7B-Instruct-v0.1` | ✅ |
103 | | `Mistral-7B-Instruct-v0.2` | ✅ |
104 | | `Mistral-7B-Instruct-v0.3` | ✅ |
105 | | `Mistral-Large-Instruct-2407` | ✅ |
106 | | `Mistral-Large-Instruct-2411` | ✅ |
107 |
108 | ### Mistral AI: Mixtral
109 | | Model | Configuration |
110 | |:------|:-------------|
111 | | `Mixtral-8x7B-Instruct-v0.1` | ✅ |
112 | | `Mixtral-8x22B-v0.1` | ✅ |
113 | | `Mixtral-8x22B-Instruct-v0.1` | ✅ |
114 |
115 | ### Microsoft: Phi
116 | | Model | Configuration |
117 | |:------|:-------------|
118 | | `Phi-3-medium-128k-instruct` | ✅ |
119 | | `phi-4` | ❌ |
120 |
121 | ### Nvidia: Llama-3.1-Nemotron
122 | | Model | Configuration |
123 | |:------|:-------------|
124 | | `Llama-3.1-Nemotron-70B-Instruct-HF` | ✅ |
125 |
126 | ### Qwen: Qwen2.5
127 | | Model | Configuration |
128 | |:------|:-------------|
129 | | `Qwen2.5-0.5B-Instruct` | ✅ |
130 | | `Qwen2.5-1.5B-Instruct` | ✅ |
131 | | `Qwen2.5-3B-Instruct` | ✅ |
132 | | `Qwen2.5-7B-Instruct` | ✅ |
133 | | `Qwen2.5-14B-Instruct` | ✅ |
134 | | `Qwen2.5-32B-Instruct` | ✅ |
135 | | `Qwen2.5-72B-Instruct` | ✅ |
136 |
137 | ### Qwen: Qwen2.5-Math
138 | | Model | Configuration |
139 | |:------|:-------------|
140 | | `Qwen2.5-Math-1.5B-Instruct` | ✅ |
141 | | `Qwen2.5-Math-7B-Instruct` | ✅ |
142 | | `Qwen2.5-Math-72B-Instruct` | ✅ |
143 |
144 | ### Qwen: Qwen2.5-Coder
145 | | Model | Configuration |
146 | |:------|:-------------|
147 | | `Qwen2.5-Coder-7B-Instruct` | ✅ |
148 |
149 | ### Qwen: QwQ
150 | | Model | Configuration |
151 | |:------|:-------------|
152 | | `QwQ-32B` | ✅ |
153 |
154 | ### Qwen: Qwen2
155 | | Model | Configuration |
156 | |:------|:-------------|
157 | | `Qwen2-1.5B-Instruct` | ❌ |
158 | | `Qwen2-7B-Instruct` | ❌ |
159 | | `Qwen2-Math-1.5B-Instruct` | ❌ |
160 | | `Qwen2-Math-7B-Instruct` | ❌ |
161 | | `Qwen2-Math-72B` | ❌ |
162 | | `Qwen2-Math-72B-Instruct` | ❌ |
163 | | `Qwen2-VL-7B-Instruct` | ❌ |
164 |
165 | ### Qwen: Qwen3
166 | | Model | Configuration |
167 | |:------|:-------------|
168 | | `Qwen3-14B` | ✅ |
169 | | `Qwen3-8B` | ✅ |
170 | | `Qwen3-32B` | ✅ |
171 | | `Qwen3-235B-A22B` | ❌ |
172 | | `Qwen3-Embedding-8B` | ❌ |
173 |
174 | ### DeepSeek: DeepSeek-R1
175 | | Model | Configuration |
176 | |:------|:-------------|
177 | | `DeepSeek-R1-Distill-Llama-8B` | ✅ |
178 | | `DeepSeek-R1-Distill-Llama-70B` | ✅ |
179 | | `DeepSeek-R1-Distill-Qwen-1.5B` | ✅ |
180 | | `DeepSeek-R1-Distill-Qwen-7B` | ✅ |
181 | | `DeepSeek-R1-Distill-Qwen-14B` | ✅ |
182 | | `DeepSeek-R1-Distill-Qwen-32B` | ✅ |
183 |
184 | ### DeepSeek: Other Models
185 | | Model | Configuration |
186 | |:------|:-------------|
187 | | `DeepSeek-Coder-V2-Lite-Instruct` | ❌ |
188 | | `deepseek-math-7b-instruct` | ❌ |
189 |
190 | ### OpenAI: GPT-OSS
191 | | Model | Configuration |
192 | |:------|:-------------|
193 | | `gpt-oss-120b` | ✅ |
194 |
195 | ### Other LLM Models
196 | | Model | Configuration |
197 | |:------|:-------------|
198 | | `AI21-Jamba-1.5-Mini` | ❌ |
199 | | `aya-expanse-32b` | ✅ (as Aya-Expanse-32B) |
200 | | `gpt2-large` | ❌ |
201 | | `gpt2-xl` | ❌ |
203 | | `instructblip-vicuna-7b` | ❌ |
204 | | `internlm2-math-plus-7b` | ❌ |
205 | | `Janus-Pro-7B` | ❌ |
206 | | `Kimi-K2-Instruct` | ❌ |
207 | | `Ministral-8B-Instruct-2410` | ❌ |
208 | | `Molmo-7B-D-0924` | ✅ |
209 | | `OLMo-1B-hf` | ❌ |
210 | | `OLMo-7B-hf` | ❌ |
211 | | `OLMo-7B-SFT` | ❌ |
212 | | `pythia` | ❌ |
213 | | `Qwen1.5-72B-Chat` | ❌ |
214 | | `ReasonFlux-PRM-7B` | ❌ |
215 | | `t5-large-lm-adapt` | ❌ |
216 | | `t5-xl-lm-adapt` | ❌ |
217 | | `mt5-xl-lm-adapt` | ❌ |
218 |
219 | ---
220 |
221 | ## Vision Language Models (VLM)
222 |
223 | ### LLaVa
224 | | Model | Configuration |
225 | |:------|:-------------|
226 | | `llava-1.5-7b-hf` | ✅ |
227 | | `llava-1.5-13b-hf` | ✅ |
228 | | `llava-v1.6-mistral-7b-hf` | ✅ |
229 | | `llava-v1.6-34b-hf` | ✅ |
230 | | `llava-med-v1.5-mistral-7b` | ❌ |
231 |
232 | ### Microsoft: Phi 3 Vision
233 | | Model | Configuration |
234 | |:------|:-------------|
235 | | `Phi-3-vision-128k-instruct` | ✅ |
236 | | `Phi-3.5-vision-instruct` | ✅ |
237 |
238 | ### Meta: Llama 3.2 Vision
239 | | Model | Configuration |
240 | |:------|:-------------|
241 | | `Llama-3.2-11B-Vision` | ✅ |
242 | | `Llama-3.2-11B-Vision-Instruct` | ✅ |
243 | | `Llama-3.2-90B-Vision` | ✅ |
244 | | `Llama-3.2-90B-Vision-Instruct` | ✅ |
245 |
246 | ### Mistral: Pixtral
247 | | Model | Configuration |
248 | |:------|:-------------|
249 | | `Pixtral-12B-2409` | ✅ |
250 |
251 | ### OpenGVLab: InternVL2.5
252 | | Model | Configuration |
253 | |:------|:-------------|
254 | | `InternVL2_5-8B` | ✅ |
255 | | `InternVL2_5-26B` | ✅ |
256 | | `InternVL2_5-38B` | ✅ |
257 |
258 | ### THUDM: GLM-4
259 | | Model | Configuration |
260 | |:------|:-------------|
261 | | `glm-4v-9b` | ✅ |
262 |
263 | ### DeepSeek: DeepSeek-VL2
264 | | Model | Configuration |
265 | |:------|:-------------|
266 | | `deepseek-vl2` | ✅ |
267 | | `deepseek-vl2-small` | ✅ |
268 |
269 | ### Other VLM Models
270 | | Model | Configuration |
271 | |:------|:-------------|
272 | | `MiniCPM-Llama3-V-2_5` | ❌ |
273 |
274 | ---
275 |
276 | ## Text Embedding Models
277 |
278 | ### Liang Wang: e5
279 | | Model | Configuration |
280 | |:------|:-------------|
281 | | `e5-mistral-7b-instruct` | ✅ |
282 |
283 | ### BAAI: bge
284 | | Model | Configuration |
285 | |:------|:-------------|
286 | | `bge-base-en-v1.5` | ✅ |
287 | | `bge-m3` | ❌ |
288 | | `bge-multilingual-gemma2` | ❌ |
289 |
290 | ### Sentence Transformers: MiniLM
291 | | Model | Configuration |
292 | |:------|:-------------|
293 | | `all-MiniLM-L6-v2` | ✅ |
294 |
295 | ### Other Embedding Models
296 | | Model | Configuration |
297 | |:------|:-------------|
298 | | `data2vec` | ❌ |
299 | | `gte-modernbert-base` | ❌ |
300 | | `gte-Qwen2-7B-instruct` | ❌ |
301 | | `m2-bert-80M-32k-retrieval` | ❌ |
302 | | `m2-bert-80M-8k-retrieval` | ❌ |
303 |
304 | ---
305 |
306 | ## Reward Modeling Models
307 |
308 | ### Qwen: Qwen2.5-Math
309 | | Model | Configuration |
310 | |:------|:-------------|
311 | | `Qwen2.5-Math-RM-72B` | ✅ |
312 | | `Qwen2.5-Math-PRM-7B` | ✅ |
313 |
314 | ---
315 |
316 | ## Multimodal Models
317 |
318 | ### CLIP
319 | | Model | Configuration |
320 | |:------|:-------------|
321 | | `clip-vit-base-patch16` | ❌ |
322 | | `clip-vit-large-patch14-336` | ❌ |
323 |
324 | ### Stable Diffusion
325 | | Model | Configuration |
326 | |:------|:-------------|
327 | | `sd-v1-4-full-ema` | ❌ |
328 | | `stable-diffusion-v1-4` | ❌ |
329 |
330 | ---
331 |
--------------------------------------------------------------------------------
/vec_inf/client/_slurm_templates.py:
--------------------------------------------------------------------------------
1 | """SLURM script templates for Vector Inference.
2 |
3 | This module contains the SLURM script templates for Vector Inference, including
4 | single-node, multi-node, and batch mode templates.
5 | """
6 |
7 | from typing import TypedDict
8 |
9 | from vec_inf.client._slurm_vars import (
10 | CONTAINER_LOAD_CMD,
11 | CONTAINER_MODULE_NAME,
12 | IMAGE_PATH,
13 | )
14 |
15 |
16 | CONTAINER_MODULE_NAME_UPPER = CONTAINER_MODULE_NAME.upper()
17 |
18 |
19 | class ShebangConfig(TypedDict):
20 | """TypedDict for SLURM script shebang configuration.
21 |
22 | Parameters
23 | ----------
24 | base : str
25 | Base shebang line for all SLURM scripts
26 | multinode : list[str]
27 | Additional SLURM directives for multi-node configurations
28 | """
29 |
30 | base: str
31 | multinode: list[str]
32 |
33 |
34 | class ServerSetupConfig(TypedDict):
35 | """TypedDict for server setup configuration.
36 |
37 | Parameters
38 | ----------
39 | single_node : list[str]
40 | Setup commands for single-node deployments
41 | multinode : list[str]
42 | Setup commands for multi-node deployments, including Ray initialization
43 | """
44 |
45 | single_node: list[str]
46 | multinode: list[str]
47 |
48 |
49 | class SlurmScriptTemplate(TypedDict):
50 | """TypedDict for complete SLURM script template configuration.
51 |
52 | Parameters
53 | ----------
54 | shebang : ShebangConfig
55 | Shebang and SLURM directive configuration
56 | container_setup : list[str]
57 | Commands for container setup
58 | imports : str
59 | Import statements and source commands
60 | bind_path : str
61 | Bind path environment variable for the container
62 | container_command : str
63 | Template for container execution command
64 | activate_venv : str
65 | Template for virtual environment activation
66 | server_setup : ServerSetupConfig
67 | Server initialization commands for different deployment modes
68 | find_vllm_port : list[str]
69 | Commands to find available ports for vLLM server
70 | write_to_json : list[str]
71 | Commands to write server configuration to JSON
72 | launch_cmd : list[str]
73 | vLLM server launch commands
74 | """
75 |
76 | shebang: ShebangConfig
77 | container_setup: list[str]
78 | imports: str
79 | bind_path: str
80 | container_command: str
81 | activate_venv: str
82 | server_setup: ServerSetupConfig
83 | find_vllm_port: list[str]
84 | write_to_json: list[str]
85 | launch_cmd: list[str]
86 |
87 |
88 | SLURM_SCRIPT_TEMPLATE: SlurmScriptTemplate = {
89 | "shebang": {
90 | "base": "#!/bin/bash",
91 | "multinode": [
92 | "#SBATCH --exclusive",
93 | "#SBATCH --tasks-per-node=1",
94 | ],
95 | },
96 | "container_setup": [
97 | CONTAINER_LOAD_CMD,
98 | f"{CONTAINER_MODULE_NAME} exec {IMAGE_PATH} ray stop",
99 | ],
100 | "imports": "source {src_dir}/find_port.sh",
101 | "bind_path": f"export {CONTAINER_MODULE_NAME.upper()}_BINDPATH=${CONTAINER_MODULE_NAME.upper()}_BINDPATH,/dev,/tmp,{{model_weights_path}}{{additional_binds}}",
102 | "container_command": f"{CONTAINER_MODULE_NAME} exec --nv {{env_str}} --containall {IMAGE_PATH} \\",
103 | "activate_venv": "source {venv}/bin/activate",
104 | "server_setup": {
105 | "single_node": [
106 | "\n# Find available port",
107 | "head_node_ip=${SLURMD_NODENAME}",
108 | ],
109 | "multinode": [
110 | "\n# Get list of nodes",
111 | 'nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")',
112 | "nodes_array=($nodes)",
113 | "head_node=${{nodes_array[0]}}",
114 | 'head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)',
115 | "\n# Check for RDMA devices and set environment variable accordingly",
116 | "if ! command -v ibv_devices >/dev/null 2>&1; then",
117 | ' echo "ibv_devices not found; forcing TCP. (No RDMA userland on host?)"',
118 | " export NCCL_IB_DISABLE=1",
119 | ' export NCCL_ENV_ARG="--env NCCL_IB_DISABLE=1"',
120 | "else",
121 | " # Pick GID index based on link layer (IB vs RoCE)",
122 | ' if ibv_devinfo 2>/dev/null | grep -q "link_layer:.*Ethernet"; then',
123 | " # RoCEv2 typically needs a nonzero GID index; 3 is common, try 2 if your fabric uses it",
124 | " export NCCL_IB_GID_INDEX={{NCCL_IB_GID_INDEX:-3}}",
125 | ' export NCCL_ENV_ARG="--env NCCL_IB_GID_INDEX={{NCCL_IB_GID_INDEX:-3}}"',
126 | " else",
127 | " # Native InfiniBand => GID 0",
128 | " export NCCL_IB_GID_INDEX={{NCCL_IB_GID_INDEX:-0}}",
129 | ' export NCCL_ENV_ARG="--env NCCL_IB_GID_INDEX={{NCCL_IB_GID_INDEX:-0}}"',
130 | " fi",
131 | "fi",
132 | "\n# Start Ray head node",
133 | "head_node_port=$(find_available_port $head_node_ip 8080 65535)",
134 | "ray_head=$head_node_ip:$head_node_port",
135 | 'echo "Ray Head IP: $ray_head"',
136 | 'echo "Starting HEAD at $head_node"',
137 | 'srun --nodes=1 --ntasks=1 -w "$head_node" \\',
138 | " CONTAINER_PLACEHOLDER",
139 | ' ray start --head --node-ip-address="$head_node_ip" --port=$head_node_port \\',
140 | ' --num-cpus "$SLURM_CPUS_PER_TASK" --num-gpus {gpus_per_node} --block &',
141 | "sleep 10",
142 | "\n# Start Ray worker nodes",
143 | "worker_num=$((SLURM_JOB_NUM_NODES - 1))",
144 | "for ((i = 1; i <= worker_num; i++)); do",
145 | " node_i=${{nodes_array[$i]}}",
146 | ' echo "Starting WORKER $i at $node_i"',
147 | ' srun --nodes=1 --ntasks=1 -w "$node_i" \\',
148 | " CONTAINER_PLACEHOLDER",
149 | ' ray start --address "$ray_head" \\',
150 | ' --num-cpus "$SLURM_CPUS_PER_TASK" --num-gpus {gpus_per_node} --block &',
151 | " sleep 5",
152 | "done",
153 | ],
154 | },
155 | "find_vllm_port": [
156 | "\nvllm_port_number=$(find_available_port $head_node_ip 8080 65535)",
157 | 'server_address="http://${head_node_ip}:${vllm_port_number}/v1"',
158 | ],
159 | "write_to_json": [
160 | '\njson_path="{log_dir}/{model_name}.$SLURM_JOB_ID/{model_name}.$SLURM_JOB_ID.json"',
161 | 'jq --arg server_addr "$server_address" \\',
162 | " '. + {{\"server_address\": $server_addr}}' \\",
163 | ' "$json_path" > temp.json \\',
164 | ' && mv temp.json "$json_path"',
165 | ],
166 | "launch_cmd": [
167 | "vllm serve {model_weights_path} \\",
168 | " --served-model-name {model_name} \\",
169 | ' --host "0.0.0.0" \\',
170 | " --port $vllm_port_number \\",
171 | ],
172 | }
173 |
174 |
175 | class BatchSlurmScriptTemplate(TypedDict):
176 | """TypedDict for batch SLURM script template configuration.
177 |
178 | Parameters
179 | ----------
180 | shebang : str
181 | Shebang line for the script
182 | hetjob : str
183 | SLURM directive for hetjob
184 | permission_update : str
185 | Command to update permissions of the script
186 | launch_model_scripts : list[str]
187 | Commands to launch the vLLM server
188 | """
189 |
190 | shebang: str
191 | hetjob: str
192 | permission_update: str
193 | launch_model_scripts: list[str]
194 |
195 |
196 | BATCH_SLURM_SCRIPT_TEMPLATE: BatchSlurmScriptTemplate = {
197 | "shebang": "#!/bin/bash",
198 | "hetjob": "#SBATCH hetjob\n",
199 | "permission_update": "chmod +x {script_name}",
200 | "launch_model_scripts": [
201 | "\nsrun --het-group={het_group_id} \\",
202 | " --output={out_file} \\",
203 | " --error={err_file} \\",
204 | " {script_name} &\n",
205 | ],
206 | }
207 |
208 |
209 | class BatchModelLaunchScriptTemplate(TypedDict):
210 | """TypedDict for batch model launch script template configuration.
211 |
212 | Parameters
213 | ----------
214 | shebang : str
215 | Shebang line for the script
216 |     container_setup : str
217 |         Command for container setup
218 |     bind_path : str
219 |         Bind path environment variable for the container
220 |     server_address_setup : list[str]
221 |         Commands to set up the server address
222 |     write_to_json : list[str]
223 |         Commands to write server configuration to JSON
224 |     launch_cmd : list[str]
225 |         Commands to launch the vLLM server
226 |     container_command : str
227 |         Template for container execution command
226 | """
227 |
228 | shebang: str
229 | container_setup: str
230 | bind_path: str
231 | server_address_setup: list[str]
232 | write_to_json: list[str]
233 | launch_cmd: list[str]
234 | container_command: str
235 |
236 |
237 | BATCH_MODEL_LAUNCH_SCRIPT_TEMPLATE: BatchModelLaunchScriptTemplate = {
238 | "shebang": "#!/bin/bash\n",
239 | "container_setup": f"{CONTAINER_LOAD_CMD}\n",
240 | "bind_path": f"export {CONTAINER_MODULE_NAME.upper()}_BINDPATH=${CONTAINER_MODULE_NAME.upper()}_BINDPATH,/dev,/tmp,{{model_weights_path}}{{additional_binds}}",
241 | "server_address_setup": [
242 | "source {src_dir}/find_port.sh",
243 | "head_node_ip=${{SLURMD_NODENAME}}",
244 | "vllm_port_number=$(find_available_port $head_node_ip 8080 65535)",
245 | 'server_address="http://${{head_node_ip}}:${{vllm_port_number}}/v1"\n',
246 | "echo $server_address\n",
247 | ],
248 | "write_to_json": [
249 | "het_job_id=$(($SLURM_JOB_ID+{het_group_id}))",
250 | 'json_path="{log_dir}/{slurm_job_name}.$het_job_id/{model_name}.$het_job_id.json"',
251 | 'jq --arg server_addr "$server_address" \\',
252 | " '. + {{\"server_address\": $server_addr}}' \\",
253 | ' "$json_path" > temp_{model_name}.json \\',
254 | ' && mv temp_{model_name}.json "$json_path"\n',
255 | ],
256 | "container_command": f"{CONTAINER_MODULE_NAME} exec --nv --containall {IMAGE_PATH} \\",
257 | "launch_cmd": [
258 | "vllm serve {model_weights_path} \\",
259 | " --served-model-name {model_name} \\",
260 | ' --host "0.0.0.0" \\',
261 | " --port $vllm_port_number \\",
262 | ],
263 | }
264 |
--------------------------------------------------------------------------------
/docs/assets/vector-logo.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
173 |
--------------------------------------------------------------------------------
/tests/vec_inf/client/test_utils.py:
--------------------------------------------------------------------------------
1 | """Tests for the utility functions in the vec-inf client."""
2 |
3 | import os
4 | from unittest.mock import MagicMock, patch
5 |
6 | import pytest
7 | import requests
8 |
9 | from vec_inf.client._utils import (
10 | MODEL_READY_SIGNATURE,
11 | find_matching_dirs,
12 | get_base_url,
13 | is_server_running,
14 | load_config,
15 | model_health_check,
16 | read_slurm_log,
17 | run_bash_command,
18 | )
19 |
20 |
21 | @pytest.fixture
22 | def mock_log_dir(tmp_path):
23 | """Create a temporary directory for log files."""
24 | log_dir = tmp_path / "logs"
25 | log_dir.mkdir()
26 | return log_dir
27 |
28 |
29 | def test_run_bash_command_success():
30 | """Test that run_bash_command returns the output of the command."""
31 | with patch("subprocess.Popen") as mock_popen:
32 | mock_process = MagicMock()
33 | mock_process.communicate.return_value = ("test output", "")
34 | mock_popen.return_value = mock_process
35 | result, stderr = run_bash_command("echo test")
36 | assert result == "test output"
37 | assert stderr == ""
38 |
39 |
40 | def test_run_bash_command_error():
41 | """Test run_bash_command with error output."""
42 | with patch("subprocess.Popen") as mock_popen:
43 | mock_process = MagicMock()
44 | mock_process.communicate.return_value = ("", "error output")
45 | mock_popen.return_value = mock_process
46 | result, stderr = run_bash_command("invalid_command")
47 | assert result == ""
48 | assert stderr == "error output"
49 |
50 |
51 | def test_read_slurm_log_found(mock_log_dir):
52 | """Test that read_slurm_log reads the content of a log file."""
53 | test_content = ["line1\n", "line2\n"]
54 | log_file = mock_log_dir / "test_job.123.err"
55 | log_file.parent.mkdir(parents=True, exist_ok=True)
56 | log_file.write_text("".join(test_content))
57 | result = read_slurm_log("test_job", "123", "err", mock_log_dir)
58 | assert result == test_content
59 |
60 |
61 | def test_read_slurm_log_not_found():
62 | """Test read_slurm_log, return an error message if the log file is not found."""
63 | result = read_slurm_log("missing_job", "456", "err", "/nonexistent")
64 | assert result == "LOG FILE NOT FOUND: /nonexistent/missing_job.456.err"
65 |
66 |
67 | @pytest.mark.parametrize(
68 | "log_content,expected",
69 | [
70 | ([MODEL_READY_SIGNATURE], "RUNNING"),
71 | (["ERROR: something wrong"], ("FAILED", "ERROR: something wrong")),
72 | ([], "LAUNCHING"),
73 | (["some other content"], "LAUNCHING"),
74 | ],
75 | )
76 | def test_is_server_running_statuses(log_content, expected):
77 | """Test that is_server_running returns the correct status."""
78 | with patch("vec_inf.client._utils.read_slurm_log") as mock_read:
79 | mock_read.return_value = log_content
80 | result = is_server_running("test_job", "123", None)
81 | assert result == expected
82 |
83 |
84 | def test_get_base_url_found():
85 | """Test that get_base_url returns the correct base URL."""
86 | test_dict = {"server_address": "http://localhost:8000"}
87 | with patch("vec_inf.client._utils.read_slurm_log") as mock_read:
88 | mock_read.return_value = test_dict
89 | result = get_base_url("test_job", "123", None)
90 | assert result == "http://localhost:8000"
91 |
92 |
93 | def test_get_base_url_not_found():
94 | """Test get_base_url when URL is not found in logs."""
95 | with patch("vec_inf.client._utils.read_slurm_log") as mock_read:
96 | mock_read.return_value = {"random_key": "123"}
97 | result = get_base_url("test_job", "123", None)
98 | assert result == "URL NOT FOUND"
99 |
100 |
101 | @pytest.mark.parametrize(
102 | "url,status_code,expected",
103 | [
104 | ("http://localhost:8000", 200, ("READY", 200)),
105 | ("http://localhost:8000", 500, ("FAILED", 500)),
106 | ("not_a_url", None, ("FAILED", "not_a_url")),
107 | ],
108 | )
109 | def test_model_health_check(url, status_code, expected):
110 | """Test model_health_check with various scenarios."""
111 | with patch("vec_inf.client._utils.get_base_url") as mock_url:
112 | mock_url.return_value = url
113 | if url.startswith("http"):
114 | with patch("requests.get") as mock_get:
115 | mock_get.return_value.status_code = status_code
116 | result = model_health_check("test_job", "123", None)
117 | assert result == expected
118 | else:
119 | result = model_health_check("test_job", "123", None)
120 | assert result == expected
121 |
122 |
123 | def test_model_health_check_request_exception():
124 | """Test model_health_check when request raises an exception."""
125 | with (
126 | patch("vec_inf.client._utils.get_base_url") as mock_url,
127 | patch("requests.get") as mock_get,
128 | ):
129 | mock_url.return_value = "http://localhost:8000"
130 | mock_get.side_effect = requests.exceptions.RequestException("Connection error")
131 | result = model_health_check("test_job", "123", None)
132 | assert result == ("FAILED", "Connection error")
133 |
134 |
135 | def test_load_config_default_only():
136 | """Test loading the actual default configuration file from the filesystem."""
137 | configs = load_config()
138 |
139 | # Verify at least one known model exists
140 | model_names = {m.model_name for m in configs}
141 | assert "c4ai-command-r-plus-08-2024" in model_names
142 |
143 | # Verify full configuration of a sample model
144 | model = next(m for m in configs if m.model_name == "c4ai-command-r-plus-08-2024")
145 | assert model.model_family == "c4ai-command-r"
146 | assert model.model_type == "LLM"
147 | assert model.gpus_per_node == 4
148 | assert model.num_nodes == 2
149 | assert model.vllm_args["--max-model-len"] == 65536
150 |
151 |
152 | def test_load_config_with_user_override(tmp_path, monkeypatch):
153 | """Test user config overriding default values."""
154 | # Create user config directory and file
155 | user_config_dir = tmp_path / "user_config_dir"
156 | user_config_dir.mkdir()
157 | user_config_file = user_config_dir / "models.yaml"
158 | user_config_file.write_text("""\
159 | models:
160 | c4ai-command-r-plus-08-2024:
161 | gpus_per_node: 8
162 | new-model:
163 | model_family: new-family
164 | model_type: VLM
165 | gpus_per_node: 4
166 | num_nodes: 1
167 | vocab_size: 256000
168 | vllm_args:
169 | --max-model-len: 4096
170 | """)
171 |
172 | with monkeypatch.context() as m:
173 | m.setenv("VEC_INF_CONFIG_DIR", str(user_config_dir))
174 | configs = load_config()
175 | config_map = {m.model_name: m for m in configs}
176 |
177 | # Verify override (merged with defaults)
178 | assert config_map["c4ai-command-r-plus-08-2024"].gpus_per_node == 8
179 | assert config_map["c4ai-command-r-plus-08-2024"].num_nodes == 2
180 | assert config_map["c4ai-command-r-plus-08-2024"].vocab_size == 256000
181 |
182 | # Verify new model
183 | new_model = config_map["new-model"]
184 | assert new_model.model_family == "new-family"
185 | assert new_model.model_type == "VLM"
186 | assert new_model.gpus_per_node == 4
187 | assert new_model.vocab_size == 256000
188 | assert new_model.vllm_args["--max-model-len"] == 4096
189 |
190 |
191 | def test_load_config_invalid_user_model(tmp_path):
192 | """Test validation of user-provided model configurations."""
193 | # Create user config directory and file
194 | invalid_config_dir = tmp_path / "bad_config_dir"
195 | invalid_config_dir.mkdir()
196 | invalid_config_file = invalid_config_dir / "models.yaml"
197 | invalid_config_file.write_text("""\
198 | models:
199 | invalid-model:
200 | model_family: ""
201 | model_type: INVALID_TYPE
202 | num_gpus: 0
203 | num_nodes: -1
204 | """)
205 |
206 | with (
207 | pytest.raises(ValueError) as excinfo,
208 | patch.dict(os.environ, {"VEC_INF_CONFIG_DIR": str(invalid_config_dir)}),
209 | ):
210 | load_config()
211 |
212 | assert "validation error" in str(excinfo.value).lower()
213 | assert "model_type" in str(excinfo.value)
214 | assert "num_gpus" in str(excinfo.value)
215 |
216 |
217 | def test_find_matching_dirs_only_model_family(tmp_path):
218 | """Return model_family directory when only model_family is provided."""
219 | fam_dir = tmp_path / "fam_a"
220 | fam_dir.mkdir()
221 | (fam_dir / "model_a.1").mkdir()
222 | (fam_dir / "model_b.2").mkdir()
223 |
224 | other_dir = tmp_path / "fam_b"
225 | other_dir.mkdir()
226 | (other_dir / "model_c.3").mkdir()
227 |
228 | matches = find_matching_dirs(log_dir=tmp_path, model_family="fam_a")
229 | assert len(matches) == 1
230 | assert matches[0].name == "fam_a"
231 |
232 |
233 | def test_find_matching_dirs_only_model_name(tmp_path):
234 | """Return directories matching when only model_name is provided."""
235 | fam_a = tmp_path / "fam_a"
236 | fam_a.mkdir()
237 | (fam_a / "target.1").mkdir()
238 | (fam_a / "other.2").mkdir()
239 |
240 | fam_b = tmp_path / "fam_b"
241 | fam_b.mkdir()
242 | (fam_b / "different.3").mkdir()
243 |
244 | matches = find_matching_dirs(log_dir=tmp_path, model_name="target")
245 | result_names = [p.name for p in matches]
246 |
247 | assert "target.1" in result_names
248 | assert "other.2" not in result_names
249 | assert "different.3" not in result_names
250 |
251 |
252 | def test_find_matching_dirs_only_job_id(tmp_path):
253 | """Return directories matching exact job_id."""
254 | fam_dir = tmp_path / "fam"
255 | fam_dir.mkdir()
256 | (fam_dir / "model_a.10").mkdir()
257 | (fam_dir / "model_b.20").mkdir()
258 | (fam_dir / "model_c.30").mkdir()
259 |
260 | matches = find_matching_dirs(log_dir=tmp_path, job_id=10)
261 | result_names = [p.name for p in matches]
262 |
263 | assert "model_a.10" in result_names
264 | assert "model_b.20" not in result_names
265 | assert "model_c.30" not in result_names
266 |
267 |
268 | def test_find_matching_dirs_only_before_job_id(tmp_path):
269 | """Return directories with job_id < before_job_id."""
270 | fam_dir = tmp_path / "fam_a"
271 | fam_dir.mkdir()
272 | (fam_dir / "model_a.1").mkdir()
273 | (fam_dir / "model_a.5").mkdir()
274 | (fam_dir / "model_a.100").mkdir()
275 |
276 | fam_dir = tmp_path / "fam_b"
277 | fam_dir.mkdir()
278 | (fam_dir / "model_b.30").mkdir()
279 |
280 | matches = find_matching_dirs(log_dir=tmp_path, before_job_id=50)
281 | result_names = [p.name for p in matches]
282 |
283 | assert "model_a.1" in result_names
284 | assert "model_a.5" in result_names
285 | assert "model_a.100" not in result_names
286 | assert "model_b.30" in result_names
287 |
288 |
289 | def test_find_matching_dirs_family_and_before_job_id(tmp_path):
290 | """Return directories under a given family with job IDs less than before_job_id."""
291 | fam_dir = tmp_path / "targetfam"
292 | fam_dir.mkdir()
293 | (fam_dir / "model_a.10").mkdir()
294 | (fam_dir / "model_a.20").mkdir()
295 | (fam_dir / "model_a.99").mkdir()
296 | (fam_dir / "model_a.150").mkdir()
297 |
298 | other_fam = tmp_path / "otherfam"
299 | other_fam.mkdir()
300 | (other_fam / "model_b.5").mkdir()
301 | (other_fam / "model_b.10").mkdir()
302 | (other_fam / "model_b.100").mkdir()
303 |
304 | matches = find_matching_dirs(
305 | log_dir=tmp_path,
306 | model_family="targetfam",
307 | before_job_id=100,
308 | )
309 |
310 | result_names = [p.name for p in matches]
311 |
312 | assert "model_a.10" in result_names
313 | assert "model_a.20" in result_names
314 | assert "model_a.99" in result_names
315 | assert "model_a.150" not in result_names
316 | assert all("otherfam" not in str(p) for p in matches)
317 |
318 |
319 | def test_find_matching_dirs_with_family_model_name_and_before_job_id(tmp_path):
320 | """Return matching dirs with model_family, model_name, and before_job_id filters."""
321 | fam_dir = tmp_path / "targetfam"
322 | fam_dir.mkdir()
323 | (fam_dir / "model_a.1").mkdir()
324 | (fam_dir / "model_a.50").mkdir()
325 | (fam_dir / "model_a.150").mkdir()
326 | (fam_dir / "model_b.40").mkdir()
327 |
328 | other_fam = tmp_path / "otherfam"
329 | other_fam.mkdir()
330 | (other_fam / "model_c.20").mkdir()
331 |
332 | matches = find_matching_dirs(
333 | log_dir=tmp_path,
334 | model_family="targetfam",
335 | model_name="model_a",
336 | before_job_id=100,
337 | )
338 |
339 | result_names = [p.name for p in matches]
340 |
341 | assert "model_a.1" in result_names
342 | assert "model_a.50" in result_names
343 | assert "model_a.150" not in result_names
344 | assert "model_b.40" not in result_names
345 | assert all("model_b" not in p for p in result_names)
346 | assert all("otherfam" not in str(p) for p in matches)
347 |
--------------------------------------------------------------------------------
/vec_inf/client/api.py:
--------------------------------------------------------------------------------
1 | """Vector Inference client for programmatic access.
2 |
3 | This module provides the main client class for interacting with Vector Inference
4 | services programmatically. It includes functionality for launching models, monitoring
5 | their status, collecting metrics, and managing their lifecycle.
6 |
7 | See Also
8 | --------
9 | vec_inf.client._helper : Helper classes for model inference server management
10 | vec_inf.client.models : Data models for API responses
11 | """
12 |
13 | import re
14 | import shutil
15 | import subprocess
16 | import time
17 | import warnings
18 | from pathlib import Path
19 | from typing import Any, Optional, Union
20 |
21 | from vec_inf.client._exceptions import (
22 | ServerError,
23 | SlurmJobError,
24 | )
25 | from vec_inf.client._helper import (
26 | BatchModelLauncher,
27 | ModelLauncher,
28 | ModelRegistry,
29 | ModelStatusMonitor,
30 | PerformanceMetricsCollector,
31 | )
32 | from vec_inf.client._utils import find_matching_dirs, run_bash_command
33 | from vec_inf.client.config import ModelConfig
34 | from vec_inf.client.models import (
35 | BatchLaunchResponse,
36 | LaunchOptions,
37 | LaunchResponse,
38 | MetricsResponse,
39 | ModelInfo,
40 | ModelStatus,
41 | StatusResponse,
42 | )
43 |
44 |
45 | class VecInfClient:
46 | """Client for interacting with Vector Inference programmatically.
47 |
48 | This class provides methods for launching models, checking their status,
49 | retrieving metrics, and shutting down models using the Vector Inference
50 | infrastructure.
51 |
52 | Methods
53 | -------
54 | list_models()
55 | List all available models
56 | get_model_config(model_name)
57 | Get configuration for a specific model
58 | launch_model(model_name, options)
59 | Launch a model on the cluster
60 | get_status(slurm_job_id, log_dir)
61 | Get status of a running model
62 | get_metrics(slurm_job_id, log_dir)
63 | Get performance metrics of a running model
64 | shutdown_model(slurm_job_id)
65 | Shutdown a running model
66 | wait_until_ready(slurm_job_id, timeout_seconds, poll_interval_seconds, log_dir)
67 | Wait for a model to become ready
68 |
69 | cleanup_logs(log_dir, model_name, model_family, job_id, dry_run)
70 | Remove logs from the log directory.
71 |
72 | Examples
73 | --------
74 | >>> from vec_inf.api import VecInfClient
75 | >>> client = VecInfClient()
76 | >>> response = client.launch_model("Meta-Llama-3.1-8B-Instruct")
77 | >>> job_id = response.slurm_job_id
78 | >>> status = client.get_status(job_id)
79 |     >>> if status.server_status == ModelStatus.READY:
80 | ... print(f"Model is ready at {status.base_url}")
81 | >>> client.shutdown_model(job_id)
82 | """
83 |
84 | def __init__(self) -> None:
85 | """Initialize the Vector Inference client."""
86 | self._metrics_collectors: dict[str, PerformanceMetricsCollector] = {}
87 |
88 | def list_models(self) -> list[ModelInfo]:
89 | """List all available models.
90 |
91 | Returns
92 | -------
93 | list[ModelInfo]
94 | List of ModelInfo objects containing information about available models,
95 | including their configurations and specifications.
96 | """
97 | model_registry = ModelRegistry()
98 | return model_registry.get_all_models()
99 |
100 | def get_model_config(self, model_name: str) -> ModelConfig:
101 | """Get the configuration for a specific model.
102 |
103 | Parameters
104 | ----------
105 | model_name : str
106 | Name of the model to get configuration for
107 |
108 | Returns
109 | -------
110 | ModelConfig
111 | Complete configuration for the specified model
112 |
113 | Raises
114 | ------
115 | ModelNotFoundError
116 | If the specified model is not found in the configuration
117 | """
118 | model_registry = ModelRegistry()
119 | return model_registry.get_single_model_config(model_name)
120 |
121 | def launch_model(
122 | self, model_name: str, options: Optional[LaunchOptions] = None
123 | ) -> LaunchResponse:
124 | """Launch a model on the cluster.
125 |
126 | Parameters
127 | ----------
128 | model_name : str
129 | Name of the model to launch
130 | options : LaunchOptions, optional
131 | Launch options to override default configuration
132 |
133 | Returns
134 | -------
135 | LaunchResponse
136 | Response containing launch details including:
137 | - SLURM job ID
138 | - Model configuration
139 | - Launch status
140 |
141 | Raises
142 | ------
143 | ModelConfigurationError
144 | If the model configuration is invalid
145 | SlurmJobError
146 | If there's an error launching the SLURM job
147 | """
148 | # Convert LaunchOptions to dictionary if provided
149 | options_dict: dict[str, Any] = {}
150 | if options:
151 | options_dict = {k: v for k, v in vars(options).items() if v is not None}
152 |
153 | # Create and use the API Launch Helper
154 | model_launcher = ModelLauncher(model_name, options_dict)
155 | return model_launcher.launch()
156 |
157 | def batch_launch_models(
158 | self,
159 | model_names: list[str],
160 | batch_config: Optional[str] = None,
161 | account: Optional[str] = None,
162 | work_dir: Optional[str] = None,
163 | ) -> BatchLaunchResponse:
164 | """Launch multiple models on the cluster.
165 |
166 | Parameters
167 | ----------
168 |         model_names : list[str]
169 |             List of model names to launch
170 |         batch_config : str, optional
171 |             Optional batch launch configuration
172 |         account : str, optional
173 |             Slurm account to use for the batch job
174 |         work_dir : str, optional
175 |             Working directory to use for the batch job
170 |
171 | Returns
172 | -------
173 | BatchLaunchResponse
174 | Response containing launch details for each model
175 |
176 | Raises
177 | ------
178 | ModelConfigurationError
179 | If the model configuration is invalid
180 | """
181 | model_launcher = BatchModelLauncher(
182 | model_names, batch_config, account, work_dir
183 | )
184 | return model_launcher.launch()
185 |
186 | def fetch_running_jobs(self) -> list[str]:
187 | """
188 | Fetch the list of running vec-inf job IDs for the current user.
189 |
190 | Returns
191 | -------
192 | list[str]
193 | List of matching job names; empty list if squeue unavailable.
194 | """
195 | try:
196 | res = subprocess.run(
197 | ["squeue", "--me", "--noheader"],
198 | capture_output=True,
199 | text=True,
200 | check=True,
201 | )
202 | job_ids = [
203 | ln.strip().split()[0] for ln in res.stdout.splitlines() if ln.strip()
204 | ]
205 |
206 | if not job_ids:
207 | return []
208 |
209 | # For each job, fetch the full JobName and filter by suffix
210 | matching_ids = []
211 | for jid in job_ids:
212 | try:
213 | sctl = subprocess.run(
214 | ["scontrol", "show", "job", "-o", jid],
215 | capture_output=True,
216 | text=True,
217 | check=True,
218 | )
219 | m = re.search(r"\bJobName=([^\s]+)", sctl.stdout)
220 | if m and m.group(1).endswith("-vec-inf"):
221 | matching_ids.append(jid)
222 | except subprocess.CalledProcessError:
223 | # Job might have finished between squeue and scontrol; skip
224 | continue
225 |
226 | return matching_ids
227 |
228 | except subprocess.CalledProcessError as e:
229 | raise SlurmJobError(f"Error running slurm command: {e}") from e
230 |
231 | def get_status(self, slurm_job_id: str) -> StatusResponse:
232 | """Get the status of a running model.
233 |
234 | Parameters
235 | ----------
236 | slurm_job_id : str
237 | The SLURM job ID to check
238 |
239 | Returns
240 | -------
241 | StatusResponse
242 | Status information including:
243 | - Model name
244 | - Server status
245 | - Job state
246 | - Base URL (if ready)
247 | - Error information (if failed)
248 | """
249 | model_status_monitor = ModelStatusMonitor(slurm_job_id)
250 | return model_status_monitor.process_model_status()
251 |
252 | def get_metrics(self, slurm_job_id: str) -> MetricsResponse:
253 | """Get the performance metrics of a running model.
254 |
255 | Parameters
256 | ----------
257 | slurm_job_id : str
258 | The SLURM job ID to get metrics for
259 |
260 | Returns
261 | -------
262 | MetricsResponse
263 | Response containing:
264 | - Model name
265 | - Performance metrics or error message
266 | - Timestamp of collection
267 | """
268 | # Use cached collector to preserve state between calls to compute throughput
269 | if slurm_job_id not in self._metrics_collectors:
270 | self._metrics_collectors[slurm_job_id] = PerformanceMetricsCollector(
271 | slurm_job_id
272 | )
273 |
274 | performance_metrics_collector = self._metrics_collectors[slurm_job_id]
275 |
276 | metrics: Union[dict[str, float], str]
277 | if not performance_metrics_collector.metrics_url.startswith("http"):
278 | metrics = performance_metrics_collector.metrics_url
279 | else:
280 | metrics = performance_metrics_collector.fetch_metrics()
281 |
282 | return MetricsResponse(
283 | model_name=performance_metrics_collector.status_info.model_name,
284 | metrics=metrics,
285 | timestamp=time.time(),
286 | )
287 |
288 | def shutdown_model(self, slurm_job_id: str) -> bool:
289 | """Shutdown a running model.
290 |
291 | Parameters
292 | ----------
293 | slurm_job_id : str
294 | The SLURM job ID to shut down
295 |
296 | Returns
297 | -------
298 | bool
299 | True if the model was successfully shutdown
300 |
301 | Raises
302 | ------
303 | SlurmJobError
304 | If there was an error shutting down the model
305 | """
306 | shutdown_cmd = f"scancel {slurm_job_id}"
307 | _, stderr = run_bash_command(shutdown_cmd)
308 | if stderr:
309 | raise SlurmJobError(f"Failed to shutdown model: {stderr}")
310 | return True
311 |
312 | def wait_until_ready(
313 | self,
314 | slurm_job_id: str,
315 | timeout_seconds: int = 1800,
316 | poll_interval_seconds: int = 10,
317 | ) -> StatusResponse:
318 | """Wait until a model is ready or fails.
319 |
320 | Parameters
321 | ----------
322 | slurm_job_id : str
323 | The SLURM job ID to wait for
324 | timeout_seconds : int, optional
325 | Maximum time to wait in seconds, by default 1800 (30 mins)
326 | poll_interval_seconds : int, optional
327 | How often to check status in seconds, by default 10
328 |
329 | Returns
330 | -------
331 | StatusResponse
332 | Status information when the model becomes ready
333 |
334 | Raises
335 | ------
336 | SlurmJobError
337 | If the specified job is not found or there's an error with the job
338 | ServerError
339 | If the server fails to start within the timeout period
340 | APIError
341 | If there was an error checking the status
342 |
343 | Notes
344 | -----
345 | The timeout is reset if the model is still in PENDING state after the
346 | initial timeout period. This allows for longer queue times in the SLURM
347 | scheduler.
348 | """
349 | start_time = time.time()
350 |
351 | while True:
352 | status_info = self.get_status(slurm_job_id)
353 |
354 | if status_info.server_status == ModelStatus.READY:
355 | return status_info
356 |
357 | if status_info.server_status == ModelStatus.FAILED:
358 | error_message = status_info.failed_reason or "Unknown error"
359 | raise ServerError(f"Model failed to start: {error_message}")
360 |
361 | if status_info.server_status == ModelStatus.SHUTDOWN:
362 | raise ServerError("Model was shutdown before it became ready")
363 |
364 | # Check timeout
365 |             if time.time() - start_time > timeout_seconds:
366 |                 if status_info.server_status == ModelStatus.PENDING:
367 |                     warnings.warn(
368 |                         f"Model is still pending after {timeout_seconds} seconds, resetting timer...",
369 |                         UserWarning,
370 |                         stacklevel=2,
371 |                     )
372 |                     start_time = time.time()
373 |                 else:
374 |                     raise ServerError(
375 |                         f"Timed out waiting for model to become ready after {timeout_seconds} seconds"
376 |                     )
376 |
377 | # Wait before checking again
378 | time.sleep(poll_interval_seconds)
379 |
380 | def cleanup_logs(
381 | self,
382 | log_dir: Optional[Union[str, Path]] = None,
383 | model_family: Optional[str] = None,
384 | model_name: Optional[str] = None,
385 | job_id: Optional[int] = None,
386 | before_job_id: Optional[int] = None,
387 | dry_run: bool = False,
388 | ) -> list[Path]:
389 | """Remove logs from the log directory.
390 |
391 | Parameters
392 | ----------
393 | log_dir : str or Path, optional
394 | Root directory containing log files. Defaults to ~/.vec-inf-logs.
395 | model_family : str, optional
396 | Only delete logs for this model family.
397 | model_name : str, optional
398 | Only delete logs for this model name.
399 | job_id : int, optional
400 | If provided, only match directories with this exact SLURM job ID.
401 | before_job_id : int, optional
402 | If provided, only delete logs with job ID less than this value.
403 | dry_run : bool
404 | If True, return matching files without deleting them.
405 |
406 | Returns
407 | -------
408 | list[Path]
409 | List of deleted (or matched if dry_run) log file paths.
410 | """
411 | log_root = Path(log_dir) if log_dir else Path.home() / ".vec-inf-logs"
412 | matched = find_matching_dirs(
413 | log_dir=log_root,
414 | model_family=model_family,
415 | model_name=model_name,
416 | job_id=job_id,
417 | before_job_id=before_job_id,
418 | )
419 |
420 | if dry_run:
421 | return matched
422 |
423 | for path in matched:
424 | shutil.rmtree(path)
425 |
426 | return matched
427 |
--------------------------------------------------------------------------------
/vec_inf/client/_slurm_script_generator.py:
--------------------------------------------------------------------------------
1 | """Class for generating Slurm scripts to run vLLM servers.
2 |
3 | This module provides functionality to generate Slurm scripts for running vLLM servers
4 | in both single-node and multi-node configurations.
5 | """
6 |
7 | from datetime import datetime
8 | from pathlib import Path
9 | from typing import Any
10 |
11 | from vec_inf.client._client_vars import SLURM_JOB_CONFIG_ARGS
12 | from vec_inf.client._slurm_templates import (
13 | BATCH_MODEL_LAUNCH_SCRIPT_TEMPLATE,
14 | BATCH_SLURM_SCRIPT_TEMPLATE,
15 | SLURM_SCRIPT_TEMPLATE,
16 | )
17 | from vec_inf.client._slurm_vars import CONTAINER_MODULE_NAME
18 |
19 |
20 | class SlurmScriptGenerator:
21 | """A class to generate Slurm scripts for running vLLM servers.
22 |
23 | This class handles the generation of Slurm scripts for both single-node and
24 | multi-node configurations, supporting different virtualization environments
25 | (venv or singularity/apptainer).
26 |
27 | Parameters
28 | ----------
29 | params : dict[str, Any]
30 | Configuration parameters for the Slurm script.
31 | """
32 |
33 | def __init__(self, params: dict[str, Any]):
34 | self.params = params
35 | self.is_multinode = int(self.params["num_nodes"]) > 1
36 | self.use_container = self.params["venv"] == CONTAINER_MODULE_NAME
37 | self.additional_binds = (
38 | f",{self.params['bind']}" if self.params.get("bind") else ""
39 | )
40 | self.model_weights_path = str(
41 | Path(self.params["model_weights_parent_dir"], self.params["model_name"])
42 | )
43 | self.env_str = self._generate_env_str()
44 |
45 | def _generate_env_str(self) -> str:
46 | """Generate the environment variables string for the Slurm script.
47 |
48 | Returns
49 | -------
50 | str
51 | Formatted env vars string for container or shell export commands.
52 | """
53 | env_dict: dict[str, str] = self.params.get("env", {})
54 |
55 | if not env_dict:
56 | return ""
57 |
58 | if self.use_container:
59 | # Format for container: --env KEY1=VAL1,KEY2=VAL2
60 | env_pairs = [f"{key}={val}" for key, val in env_dict.items()]
61 | return f"--env {','.join(env_pairs)}"
62 | # Format for shell: export KEY1=VAL1\nexport KEY2=VAL2
63 | export_lines = [f"export {key}={val}" for key, val in env_dict.items()]
64 | return "\n".join(export_lines)
65 |
66 | def _generate_script_content(self) -> str:
67 | """Generate the complete Slurm script content.
68 |
69 | Returns
70 | -------
71 | str
72 | The complete Slurm script as a string.
73 | """
74 | script_content = []
75 | script_content.append(self._generate_shebang())
76 | script_content.append(self._generate_server_setup())
77 | script_content.append(self._generate_launch_cmd())
78 | return "\n".join(script_content)
79 |
80 | def _generate_shebang(self) -> str:
81 | """Generate the Slurm script shebang with job specifications.
82 |
83 | Returns
84 | -------
85 | str
86 | Slurm shebang containing job specifications.
87 | """
88 | shebang = [SLURM_SCRIPT_TEMPLATE["shebang"]["base"]]
89 | for arg, value in SLURM_JOB_CONFIG_ARGS.items():
90 | if self.params.get(value):
91 | shebang.append(f"#SBATCH --{arg}={self.params[value]}")
92 | if value == "model_name":
93 | shebang[-1] += "-vec-inf"
94 | if self.is_multinode:
95 | shebang += SLURM_SCRIPT_TEMPLATE["shebang"]["multinode"]
96 | return "\n".join(shebang)
97 |
98 | def _generate_server_setup(self) -> str:
99 | """Generate the server initialization script.
100 |
101 | Creates the script section that handles server setup, including Ray
102 | initialization for multi-node setups and port configuration.
103 |
104 | Returns
105 | -------
106 | str
107 | Server initialization script content.
108 | """
109 | server_script = ["\n"]
110 | if self.use_container:
111 | server_script.append("\n".join(SLURM_SCRIPT_TEMPLATE["container_setup"]))
112 | server_script.append(
113 | SLURM_SCRIPT_TEMPLATE["bind_path"].format(
114 | model_weights_path=self.model_weights_path,
115 | additional_binds=self.additional_binds,
116 | )
117 | )
118 | else:
119 | server_script.append(
120 | SLURM_SCRIPT_TEMPLATE["activate_venv"].format(venv=self.params["venv"])
121 | )
122 | server_script.append(self.env_str)
123 | server_script.append(
124 | SLURM_SCRIPT_TEMPLATE["imports"].format(src_dir=self.params["src_dir"])
125 | )
126 | if self.is_multinode:
127 | server_setup_str = "\n".join(
128 | SLURM_SCRIPT_TEMPLATE["server_setup"]["multinode"]
129 | ).format(gpus_per_node=self.params["gpus_per_node"])
130 | if self.use_container:
131 | server_setup_str = server_setup_str.replace(
132 | "CONTAINER_PLACEHOLDER",
133 | SLURM_SCRIPT_TEMPLATE["container_command"].format(
134 | model_weights_path=self.model_weights_path,
135 | env_str=self.env_str,
136 | ),
137 | )
138 | else:
139 | server_setup_str = server_setup_str.replace(
140 | "CONTAINER_PLACEHOLDER",
141 | "\\",
142 | )
143 | else:
144 | server_setup_str = "\n".join(
145 | SLURM_SCRIPT_TEMPLATE["server_setup"]["single_node"]
146 | )
147 | server_script.append(server_setup_str)
148 | server_script.append("\n".join(SLURM_SCRIPT_TEMPLATE["find_vllm_port"]))
149 | server_script.append(
150 | "\n".join(SLURM_SCRIPT_TEMPLATE["write_to_json"]).format(
151 | log_dir=self.params["log_dir"], model_name=self.params["model_name"]
152 | )
153 | )
154 | return "\n".join(server_script)
155 |
156 | def _generate_launch_cmd(self) -> str:
157 | """Generate the vLLM server launch command.
158 |
159 | Creates the command to launch the vLLM server, handling different virtualization
160 | environments (venv or singularity/apptainer).
161 |
162 | Returns
163 | -------
164 | str
165 | Server launch command.
166 | """
167 | launcher_script = ["\n"]
168 | if self.use_container:
169 | launcher_script.append(
170 | SLURM_SCRIPT_TEMPLATE["container_command"].format(
171 | model_weights_path=self.model_weights_path,
172 | env_str=self.env_str,
173 | )
174 | )
175 |
176 | launcher_script.append(
177 | "\n".join(SLURM_SCRIPT_TEMPLATE["launch_cmd"]).format(
178 | model_weights_path=self.model_weights_path,
179 | model_name=self.params["model_name"],
180 | )
181 | )
182 |
183 | for arg, value in self.params["vllm_args"].items():
184 | if isinstance(value, bool):
185 | launcher_script.append(f" {arg} \\")
186 | else:
187 | launcher_script.append(f" {arg} {value} \\")
188 | return "\n".join(launcher_script)
189 |
190 | def write_to_log_dir(self) -> Path:
191 | """Write the generated Slurm script to the log directory.
192 |
193 | Creates a timestamped script file in the configured log directory.
194 |
195 | Returns
196 | -------
197 | Path
198 | Path to the generated Slurm script file.
199 | """
200 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
201 | script_path: Path = (
202 | Path(self.params["log_dir"])
203 | / f"launch_{self.params['model_name']}_{timestamp}.sbatch"
204 | )
205 |
206 | content = self._generate_script_content()
207 | script_path.write_text(content)
208 | return script_path
209 |
210 |
211 | class BatchSlurmScriptGenerator:
212 | """A class to generate Slurm scripts for batch mode.
213 |
214 | This class handles the generation of Slurm scripts for batch mode, which
215 | launches multiple vLLM servers with different configurations in parallel.
216 | """
217 |
218 | def __init__(self, params: dict[str, Any]):
219 | self.params = params
220 | self.script_paths: list[Path] = []
221 | self.use_container = self.params["venv"] == CONTAINER_MODULE_NAME
222 | for model_name in self.params["models"]:
223 | self.params["models"][model_name]["additional_binds"] = (
224 | f",{self.params['models'][model_name]['bind']}"
225 | if self.params["models"][model_name].get("bind")
226 | else ""
227 | )
228 | self.params["models"][model_name]["model_weights_path"] = str(
229 | Path(
230 | self.params["models"][model_name]["model_weights_parent_dir"],
231 | model_name,
232 | )
233 | )
234 |
235 | def _write_to_log_dir(self, script_content: list[str], script_name: str) -> Path:
236 | """Write the generated Slurm script to the log directory.
237 |
238 | Returns
239 | -------
240 | Path
241 | The Path object to the generated Slurm script file.
242 | """
243 | script_path = Path(self.params["log_dir"]) / script_name
244 | script_path.touch(exist_ok=True)
245 | script_path.write_text("\n".join(script_content))
246 | return script_path
247 |
248 | def _generate_model_launch_script(self, model_name: str) -> Path:
249 | """Generate the bash script for launching individual vLLM servers.
250 |
251 | Parameters
252 | ----------
253 | model_name : str
254 | The name of the model to launch.
255 |
256 | Returns
257 | -------
258 | Path
259 | The bash script path for launching the vLLM server.
260 | """
261 | # Generate the bash script content
262 | script_content = []
263 | model_params = self.params["models"][model_name]
264 | script_content.append(BATCH_MODEL_LAUNCH_SCRIPT_TEMPLATE["shebang"])
265 | if self.use_container:
266 | script_content.append(BATCH_MODEL_LAUNCH_SCRIPT_TEMPLATE["container_setup"])
267 | script_content.append(
268 | BATCH_MODEL_LAUNCH_SCRIPT_TEMPLATE["bind_path"].format(
269 | model_weights_path=model_params["model_weights_path"],
270 | additional_binds=model_params["additional_binds"],
271 | )
272 | )
273 | script_content.append(
274 | "\n".join(
275 | BATCH_MODEL_LAUNCH_SCRIPT_TEMPLATE["server_address_setup"]
276 | ).format(src_dir=self.params["src_dir"])
277 | )
278 | script_content.append(
279 | "\n".join(BATCH_MODEL_LAUNCH_SCRIPT_TEMPLATE["write_to_json"]).format(
280 | het_group_id=model_params["het_group_id"],
281 | log_dir=self.params["log_dir"],
282 | slurm_job_name=self.params["slurm_job_name"],
283 | model_name=model_name,
284 | )
285 | )
286 | if self.use_container:
287 | script_content.append(
288 | BATCH_MODEL_LAUNCH_SCRIPT_TEMPLATE["container_command"].format(
289 | model_weights_path=model_params["model_weights_path"],
290 | )
291 | )
292 | script_content.append(
293 | "\n".join(BATCH_MODEL_LAUNCH_SCRIPT_TEMPLATE["launch_cmd"]).format(
294 | model_weights_path=model_params["model_weights_path"],
295 | model_name=model_name,
296 | )
297 | )
298 | for arg, value in model_params["vllm_args"].items():
299 | if isinstance(value, bool):
300 | script_content.append(f" {arg} \\")
301 | else:
302 | script_content.append(f" {arg} {value} \\")
303 | script_content[-1] = script_content[-1].replace("\\", "")
304 | # Write the bash script to the log directory
305 | launch_script_path = self._write_to_log_dir(
306 | script_content, f"launch_{model_name}.sh"
307 | )
308 | self.script_paths.append(launch_script_path)
309 | return launch_script_path
310 |
311 | def _generate_batch_slurm_script_shebang(self) -> str:
312 | """Generate the shebang for batch mode Slurm script.
313 |
314 | Returns
315 | -------
316 | str
317 | The shebang for batch mode Slurm script.
318 | """
319 | shebang = [BATCH_SLURM_SCRIPT_TEMPLATE["shebang"]]
320 |
321 | for arg, value in SLURM_JOB_CONFIG_ARGS.items():
322 | if self.params.get(value):
323 | shebang.append(f"#SBATCH --{arg}={self.params[value]}")
324 | shebang.append("#SBATCH --ntasks=1")
325 | shebang.append("\n")
326 |
327 | for model_name in self.params["models"]:
328 | shebang.append(f"# ===== Resource group for {model_name} =====")
329 | for arg, value in SLURM_JOB_CONFIG_ARGS.items():
330 | model_params = self.params["models"][model_name]
331 | if model_params.get(value) and value not in ["out_file", "err_file"]:
332 | shebang.append(f"#SBATCH --{arg}={model_params[value]}")
333 | if value == "model_name":
334 | shebang[-1] += "-vec-inf"
335 | shebang[-1] += "\n"
336 | shebang.append(BATCH_SLURM_SCRIPT_TEMPLATE["hetjob"])
337 | # Remove the last hetjob line
338 | shebang.pop()
339 | return "\n".join(shebang)
340 |
341 | def generate_batch_slurm_script(self) -> Path:
342 | """Generate the Slurm script for launching multiple vLLM servers in batch mode.
343 |
344 | Returns
345 | -------
346 | Path
347 | The Slurm script for launching multiple vLLM servers in batch mode.
348 | """
349 | script_content = []
350 |
351 | script_content.append(self._generate_batch_slurm_script_shebang())
352 |
353 | for model_name in self.params["models"]:
354 | model_params = self.params["models"][model_name]
355 | script_content.append(f"# ===== Launching {model_name} =====")
356 | launch_script_path = str(self._generate_model_launch_script(model_name))
357 | script_content.append(
358 | BATCH_SLURM_SCRIPT_TEMPLATE["permission_update"].format(
359 | script_name=launch_script_path
360 | )
361 | )
362 | script_content.append(
363 | "\n".join(BATCH_SLURM_SCRIPT_TEMPLATE["launch_model_scripts"]).format(
364 | het_group_id=model_params["het_group_id"],
365 | out_file=model_params["out_file"],
366 | err_file=model_params["err_file"],
367 | script_name=launch_script_path,
368 | )
369 | )
370 | script_content.append("wait")
371 |
372 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
373 | script_name = f"{self.params['slurm_job_name']}_{timestamp}.sbatch"
374 | return self._write_to_log_dir(script_content, script_name)
375 |
--------------------------------------------------------------------------------
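
Usage note: for orientation, here is a minimal sketch of how BatchSlurmScriptGenerator might be driven. Only the params keys dereferenced in this module appear below; the real dictionary is assembled upstream by the launch pipeline (its full key set comes from SLURM_JOB_CONFIG_ARGS), so every concrete value is a placeholder, not a documented default.

# Illustrative sketch only -- all values are placeholders; the real params dict
# is built by the launch pipeline rather than by hand.
from vec_inf.client._slurm_script_generator import BatchSlurmScriptGenerator

params = {
    "venv": "/path/to/venv",          # placeholder; equal to the container module name enables container mode
    "slurm_job_name": "batch-demo",   # placeholder job name
    "log_dir": "/tmp/vec-inf-logs",   # must already exist; generated scripts are written here
    "src_dir": "/path/to/vec_inf",    # placeholder
    "models": {
        "model-a": {                  # hypothetical model name
            "model_weights_parent_dir": "/model-weights",
            "het_group_id": 0,
            "out_file": "/tmp/vec-inf-logs/model-a.out",
            "err_file": "/tmp/vec-inf-logs/model-a.err",
            "vllm_args": {"--max-model-len": 8192},
        },
    },
}

generator = BatchSlurmScriptGenerator(params)
sbatch_path = generator.generate_batch_slurm_script()
# One launch_<model>.sh per model plus the .sbatch file now live under log_dir.
print(sbatch_path, generator.script_paths)
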
/vec_inf/client/_utils.py:
--------------------------------------------------------------------------------
1 | """Utility functions shared between CLI and API.
2 |
3 | This module provides utility functions for managing SLURM jobs, server status checks,
4 | and configuration handling for the vector inference package.
5 | """
6 |
7 | import json
8 | import os
9 | import subprocess
10 | import warnings
11 | from pathlib import Path
12 | from typing import Any, Optional, Union, cast
13 |
14 | import requests
15 | import yaml
16 |
17 | from vec_inf.client._client_vars import MODEL_READY_SIGNATURE
18 | from vec_inf.client._exceptions import MissingRequiredFieldsError
19 | from vec_inf.client._slurm_vars import CACHED_CONFIG_DIR, REQUIRED_ARGS
20 | from vec_inf.client.config import ModelConfig
21 | from vec_inf.client.models import ModelStatus
22 |
23 |
24 | def run_bash_command(command: str) -> tuple[str, str]:
25 | """Run a bash command and return the output.
26 |
27 | Parameters
28 | ----------
29 | command : str
30 | The bash command to execute
31 |
32 | Returns
33 | -------
34 | tuple[str, str]
35 | A tuple containing (stdout, stderr) from the command execution
36 | """
37 | process = subprocess.Popen(
38 | command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
39 | )
40 | return process.communicate()
41 |
42 |
43 | def read_slurm_log(
44 | slurm_job_name: str,
45 | slurm_job_id: str,
46 | slurm_log_type: str,
47 | log_dir: str,
48 | ) -> Union[list[str], str, dict[str, str]]:
49 | """Read the slurm log file.
50 |
51 | Parameters
52 | ----------
53 | slurm_job_name : str
54 | Name of the SLURM job
55 | slurm_job_id : str
56 | ID of the SLURM job
57 | slurm_log_type : str
58 | Type of log file to read ('out', 'err', or 'json')
59 | log_dir : str
60 | Directory containing log files
61 |
62 | Returns
63 | -------
64 | Union[list[str], str, dict[str, str]]
65 | Contents of the log file:
66 | - list[str] for 'out' and 'err' logs
67 | - dict[str, str] for 'json' logs
68 | - str for error messages if file not found
69 | """
70 | try:
71 | if "+" in slurm_job_id:
72 | main_job_id, het_job_id = slurm_job_id.split("+")
73 | slurm_job_id = str(int(main_job_id) + int(het_job_id))
74 | file_path = Path(log_dir, f"{slurm_job_name}.{slurm_job_id}.{slurm_log_type}")
75 | if slurm_log_type == "json":
76 | with file_path.open("r") as file:
77 | json_content: dict[str, str] = json.load(file)
78 | return json_content
79 | else:
80 | with file_path.open("r") as file:
81 | return file.readlines()
82 | except FileNotFoundError:
83 | return f"LOG FILE NOT FOUND: {file_path}"
84 |
85 |
86 | def is_server_running(
87 | slurm_job_name: str, slurm_job_id: str, log_dir: str
88 | ) -> Union[str, ModelStatus, tuple[ModelStatus, str]]:
89 | """Check if a model is ready to serve requests.
90 |
91 | Parameters
92 | ----------
93 | slurm_job_name : str
94 | Name of the SLURM job
95 | slurm_job_id : str
96 | ID of the SLURM job
97 | log_dir : str
98 | Directory containing log files
99 |
100 | Returns
101 | -------
102 | Union[str, ModelStatus, tuple[ModelStatus, str]]
103 | - str: Error message if logs cannot be read
104 | - ModelStatus: Current status of the server
105 | - tuple[ModelStatus, str]: Status and error message if server failed
106 | """
107 | log_content = read_slurm_log(slurm_job_name, slurm_job_id, "err", log_dir)
108 | if isinstance(log_content, str):
109 | return log_content
110 |
111 | # Patterns that indicate fatal errors (not just warnings)
112 | fatal_error_patterns = [
113 | "traceback",
114 | "exception",
115 | "fatal error",
116 | "critical error",
117 | "failed to",
118 | "could not",
119 | "unable to",
120 | "error:",
121 | ]
122 |
123 | # Patterns to ignore (non-fatal warnings/info messages)
124 | ignore_patterns = [
125 | "deprecated",
126 | "futurewarning",
127 | "userwarning",
128 | "deprecationwarning",
129 | "slurmstepd: error:", # SLURM cancellation messages (often after server started)
130 | ]
131 |
132 | ready_signature_found = False
133 | fatal_error_line = None
134 |
135 | for line in log_content:
136 | line_lower = line.lower()
137 |
138 | # Check for ready signature first - if found, server is running
139 | if MODEL_READY_SIGNATURE in line:
140 | ready_signature_found = True
141 | # Continue checking to see if there are errors after startup
142 |
143 | # Check for fatal errors (only if we haven't seen ready signature yet)
144 | if not ready_signature_found:
145 | # Skip lines that match ignore patterns
146 | if any(ignore_pattern in line_lower for ignore_pattern in ignore_patterns):
147 | continue
148 |
149 | # Check for fatal error patterns
150 | for pattern in fatal_error_patterns:
151 | if pattern in line_lower:
152 | # Additional check: skip if it's part of a warning message
153 | # (warnings often contain "error:" but aren't fatal)
154 | if "warning" in line_lower and "error:" in line_lower:
155 | continue
156 | fatal_error_line = line.strip("\n")
157 | break
158 |
159 | # If we found a fatal error, mark as failed
160 | if fatal_error_line:
161 | return (ModelStatus.FAILED, fatal_error_line)
162 |
163 | # If ready signature was found and no fatal errors, server is running
164 | if ready_signature_found:
165 | return "RUNNING"
166 |
167 | # Otherwise, still launching
168 | return ModelStatus.LAUNCHING
169 |
170 |
171 | def get_base_url(slurm_job_name: str, slurm_job_id: str, log_dir: str) -> str:
172 | """Get the base URL of a model.
173 |
174 | Parameters
175 | ----------
176 | slurm_job_name : str
177 | Name of the SLURM job
178 | slurm_job_id : str
179 | ID of the SLURM job
180 | log_dir : str
181 | Directory containing log files
182 |
183 | Returns
184 | -------
185 | str
186 | Base URL of the model server or error message if not found
187 | """
188 | log_content = read_slurm_log(slurm_job_name, slurm_job_id, "json", log_dir)
189 | if isinstance(log_content, str):
190 | return log_content
191 |
192 | server_addr = cast(dict[str, str], log_content).get("server_address")
193 | return server_addr if server_addr else "URL NOT FOUND"
194 |
195 |
196 | def model_health_check(
197 | slurm_job_name: str, slurm_job_id: str, log_dir: str
198 | ) -> tuple[ModelStatus, Union[str, int]]:
199 | """Check the health of a running model on the cluster.
200 |
201 | Parameters
202 | ----------
203 | slurm_job_name : str
204 | Name of the SLURM job
205 | slurm_job_id : str
206 | ID of the SLURM job
207 | log_dir : str
208 | Directory containing log files
209 |
210 | Returns
211 | -------
212 | tuple[ModelStatus, Union[str, int]]
213 | Tuple containing:
214 | - ModelStatus: Current status of the model
215 | - Union[str, int]: Either HTTP status code or error message
216 | """
217 | base_url = get_base_url(slurm_job_name, slurm_job_id, log_dir)
218 | if not base_url.startswith("http"):
219 | return (ModelStatus.FAILED, base_url)
220 | health_check_url = base_url.replace("v1", "health")
221 |
222 | try:
223 | response = requests.get(health_check_url)
224 | # Check if the request was successful
225 | if response.status_code == 200:
226 | return (ModelStatus.READY, response.status_code)
227 | return (ModelStatus.FAILED, response.status_code)
228 | except requests.exceptions.RequestException as e:
229 | return (ModelStatus.FAILED, str(e))
230 |
231 |
232 | def load_config(config_path: Optional[str] = None) -> list[ModelConfig]:
233 | """Load the model configuration.
234 |
235 | Loads configuration from default and user-specified paths, merging them
236 | if both exist. User configuration takes precedence over default values.
237 |
238 | Parameters
239 | ----------
240 | config_path : Optional[str]
241 | Path to the configuration file
242 |
243 | Returns
244 | -------
245 | list[ModelConfig]
246 | List of validated model configurations
247 |
248 | Notes
249 | -----
250 | Configuration is loaded from:
251 | 1. User path: specified by config_path
252 |     2. Default path: CACHED_CONFIG_DIR/models.yaml if it exists, otherwise the package's config/models.yaml
253 | 3. Environment variable: specified by VEC_INF_CONFIG environment variable
254 | and merged with default config
255 |
256 | If user configuration exists, it will be merged with default configuration,
257 | with user values taking precedence for overlapping fields.
258 | """
259 |
260 | def load_yaml_config(path: Path) -> dict[str, Any]:
261 | """Load YAML config with error handling."""
262 | try:
263 | with path.open() as f:
264 | return yaml.safe_load(f) or {}
265 | except FileNotFoundError as err:
266 | raise FileNotFoundError(f"Could not find config: {path}") from err
267 | except yaml.YAMLError as err:
268 | raise ValueError(f"Error parsing YAML config at {path}: {err}") from err
269 |
270 | def process_config(config: dict[str, Any]) -> list[ModelConfig]:
271 | """Process the config based on the config type."""
272 | return [
273 | ModelConfig(model_name=name, **model_data)
274 | for name, model_data in config.get("models", {}).items()
275 | ]
276 |
277 | def resolve_config_path_from_env_var() -> Path | None:
278 | """Resolve the config path from the environment variable."""
279 | config_dir = os.getenv("VEC_INF_CONFIG_DIR")
280 | config_path = os.getenv("VEC_INF_MODEL_CONFIG")
281 | if config_path:
282 | return Path(config_path)
283 | if config_dir:
284 | return Path(config_dir, "models.yaml")
285 | return None
286 |
287 | def update_config(
288 | config: dict[str, Any], user_config: dict[str, Any]
289 | ) -> dict[str, Any]:
290 | """Update the config with the user config."""
291 | for name, data in user_config.get("models", {}).items():
292 | if name in config.get("models", {}):
293 | config["models"][name].update(data)
294 | else:
295 | config.setdefault("models", {})[name] = data
296 |
297 | return config
298 |
299 | # 1. If config_path is given, use only that
300 | if config_path:
301 | config = load_yaml_config(Path(config_path))
302 | return process_config(config)
303 |
304 | # 2. Otherwise, load default config
305 | default_path = (
306 | CACHED_CONFIG_DIR / "models.yaml"
307 | if CACHED_CONFIG_DIR.exists()
308 | else Path(__file__).resolve().parent.parent / "config" / "models.yaml"
309 | )
310 | config = load_yaml_config(default_path)
311 |
312 | # 3. If user config exists, merge it
313 | user_path = resolve_config_path_from_env_var()
314 | if user_path and user_path.exists():
315 | user_config = load_yaml_config(user_path)
316 | config = update_config(config, user_config)
317 | elif user_path:
318 | warnings.warn(
319 |             f"WARNING: Could not find user config: {str(user_path)}, reverting to default config at {default_path}",
320 | UserWarning,
321 | stacklevel=2,
322 | )
323 |
324 | return process_config(config)
325 |
326 |
327 | def parse_launch_output(output: str) -> tuple[str, dict[str, str]]:
328 | """Parse output from model launch command.
329 |
330 | Parameters
331 | ----------
332 | output : str
333 | Raw output from the launch command
334 |
335 | Returns
336 | -------
337 | tuple[str, dict[str, str]]
338 | Tuple containing:
339 | - str: SLURM job ID
340 | - dict[str, str]: Dictionary of parsed configuration parameters
341 |
342 | Notes
343 | -----
344 | Extracts the SLURM job ID and configuration parameters from the launch
345 | command output. Configuration parameters are parsed from key-value pairs
346 | in the output text.
347 | """
348 | slurm_job_id = output.split(" ")[-1].strip().strip("\n")
349 |
350 | # Extract config parameters
351 | config_dict = {}
352 | output_lines = output.split("\n")[:-2]
353 | for line in output_lines:
354 | if ": " in line:
355 | key, value = line.split(": ", 1)
356 | config_dict[key.lower().replace(" ", "_")] = value
357 |
358 | return slurm_job_id, config_dict
359 |
360 |
361 | def is_power_of_two(n: int) -> bool:
362 | """Check if a number is a power of two.
363 |
364 | Parameters
365 | ----------
366 | n : int
367 | The number to check
368 | """
369 | return n > 0 and (n & (n - 1)) == 0
370 |
371 |
372 | def find_matching_dirs(
373 | log_dir: Path,
374 | model_family: Optional[str] = None,
375 | model_name: Optional[str] = None,
376 | job_id: Optional[int] = None,
377 | before_job_id: Optional[int] = None,
378 | ) -> list[Path]:
379 | """
380 | Find log directories based on filtering criteria.
381 |
382 | Parameters
383 | ----------
384 | log_dir : Path
385 | The base directory containing model family directories.
386 | model_family : str, optional
387 | Filter to only search inside this family.
388 | model_name : str, optional
389 | Filter to only match model names.
390 | job_id : int, optional
391 | Filter to only match this exact SLURM job ID.
392 | before_job_id : int, optional
393 | Filter to only include job IDs less than this value.
394 |
395 | Returns
396 | -------
397 | list[Path]
398 | List of directories that match the criteria and can be deleted.
399 | """
400 | matched = []
401 |
402 | if not log_dir.exists() or not log_dir.is_dir():
403 | raise FileNotFoundError(f"Log directory does not exist: {log_dir}")
404 |
405 | if not model_family and not model_name and not job_id and not before_job_id:
406 | return [log_dir]
407 |
408 | for family_dir in log_dir.iterdir():
409 | if not family_dir.is_dir():
410 | continue
411 | if model_family and family_dir.name != model_family:
412 | continue
413 |
414 | if model_family and not model_name and not job_id and not before_job_id:
415 | return [family_dir]
416 |
417 | for job_dir in family_dir.iterdir():
418 | if not job_dir.is_dir():
419 | continue
420 |
421 | try:
422 | name_part, id_part = job_dir.name.rsplit(".", 1)
423 | parsed_id = int(id_part)
424 | except ValueError:
425 | continue
426 |
427 | if model_name and name_part != model_name:
428 | continue
429 | if job_id is not None and parsed_id != job_id:
430 | continue
431 | if before_job_id is not None and parsed_id >= before_job_id:
432 | continue
433 |
434 | matched.append(job_dir)
435 |
436 | return matched
437 |
438 |
439 | def check_required_fields(params: dict[str, Any]) -> dict[str, Any]:
440 |     """Check required fields without defaults, filling missing values from corresponding env vars.
441 |
442 | Parameters
443 | ----------
444 | params : dict[str, Any]
445 | Dictionary of parameters to check.
446 | """
447 | env_overrides = {}
448 | for arg in REQUIRED_ARGS:
449 | if not params.get(arg):
450 | default_value = os.getenv(REQUIRED_ARGS[arg])
451 | if default_value:
452 | params[arg] = default_value
453 | env_overrides[arg] = default_value
454 | else:
455 | raise MissingRequiredFieldsError(
456 | f"{arg} is required, please set it in the command arguments or environment variables"
457 | )
458 | return env_overrides
459 |
--------------------------------------------------------------------------------
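
Usage note: a short, hypothetical illustration of how these helpers compose. The job name, job ID, model names, and log directory below are placeholders; on a real cluster they come from the launch pipeline.

# Hypothetical usage of the helpers above; all names and paths are placeholders.
from vec_inf.client._utils import (
    get_base_url,
    is_server_running,
    load_config,
    model_health_check,
)

# Config resolution: an explicit path wins; otherwise the default config is
# merged with the file pointed to by VEC_INF_MODEL_CONFIG or VEC_INF_CONFIG_DIR.
for model_config in load_config():
    print(model_config.model_name)

# Status checks read the <job_name>.<job_id>.err / .json files under log_dir
# (here assumed to be the per-job log directory).
job_name, job_id, log_dir = "model-a", "123456", "/tmp/vec-inf-logs/family-a/model-a.123456"
status = is_server_running(job_name, job_id, log_dir)
if status == "RUNNING":
    print(get_base_url(job_name, job_id, log_dir))        # server address from the .json log
    print(model_health_check(job_name, job_id, log_dir))  # (ModelStatus, HTTP code or error)
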
/vec_inf/cli/_cli.py:
--------------------------------------------------------------------------------
1 | """Command line interface for Vector Inference.
2 |
3 | This module provides the command-line interface for interacting with Vector
4 | Inference services, including model launching, status checking, metrics
5 | monitoring, and shutdown operations.
6 |
7 | Commands
8 | --------
9 | launch
10 | Launch a model on the cluster
11 | status
12 | Check the status of a running model
13 | shutdown
14 | Stop a running model
15 | list
16 | List available models or get specific model configuration
17 | metrics
18 | Stream real-time performance metrics
19 | """
20 |
21 | import json
22 | import time
23 | from typing import Optional, Union
24 |
25 | import click
26 | from rich.console import Console
27 | from rich.live import Live
28 |
29 | from vec_inf.cli._helper import (
30 | BatchLaunchResponseFormatter,
31 | LaunchResponseFormatter,
32 | ListCmdDisplay,
33 | ListStatusDisplay,
34 | MetricsResponseFormatter,
35 | StatusResponseFormatter,
36 | )
37 | from vec_inf.client import LaunchOptions, VecInfClient
38 |
39 |
40 | CONSOLE = Console()
41 |
42 |
43 | @click.group()
44 | def cli() -> None:
45 | """Vector Inference CLI."""
46 | pass
47 |
48 |
49 | @cli.command("launch", help="Launch a model on the cluster.")
50 | @click.argument("model-name", type=str, nargs=1)
51 | @click.option("--model-family", type=str, help="The model family")
52 | @click.option("--model-variant", type=str, help="The model variant")
53 | @click.option(
54 | "--partition",
55 | type=str,
56 | help="Type of Slurm partition",
57 | )
58 | @click.option(
59 | "--resource-type",
60 | type=str,
61 | help="Type of resource to request for the job",
62 | )
63 | @click.option(
64 | "--num-nodes",
65 | type=int,
66 | help="Number of nodes to use, default to suggested resource allocation for model",
67 | )
68 | @click.option(
69 | "--gpus-per-node",
70 | type=int,
71 | help="Number of GPUs/node to use, default to suggested resource allocation for model",
72 | )
73 | @click.option(
74 | "--cpus-per-task",
75 | type=int,
76 | help="Number of CPU cores per task",
77 | )
78 | @click.option(
79 | "--mem-per-node",
80 | type=str,
81 | help="Memory allocation per node in GB format (e.g., '32G')",
82 | )
83 | @click.option(
84 | "--account",
85 | "-A",
86 | type=str,
87 | help="Charge resources used by this job to specified account.",
88 | )
89 | @click.option(
90 | "--work-dir",
91 | "-D",
92 | type=str,
93 | help="Set working directory for the batch job",
94 | )
95 | @click.option(
96 | "--qos",
97 | type=str,
98 | help="Quality of service",
99 | )
100 | @click.option(
101 | "--exclude",
102 | type=str,
103 | help="Exclude certain nodes from the resources granted to the job",
104 | )
105 | @click.option(
106 | "--nodelist",
107 | type=str,
108 | help="Request a specific list of nodes for deployment",
109 | )
110 | @click.option(
111 | "--bind",
112 | type=str,
113 | help="Additional binds for the container as a comma separated list of bind paths",
114 | )
115 | @click.option(
116 | "--time",
117 | type=str,
118 | help="Time limit for job, this should comply with QoS limits",
119 | )
120 | @click.option(
121 | "--venv",
122 | type=str,
123 | help="Path to virtual environment",
124 | )
125 | @click.option(
126 | "--log-dir",
127 | type=str,
128 | help="Path to slurm log directory",
129 | )
130 | @click.option(
131 | "--model-weights-parent-dir",
132 | type=str,
133 | help="Path to parent directory containing model weights",
134 | )
135 | @click.option(
136 | "--vllm-args",
137 | type=str,
138 | help="vLLM engine arguments to be set, use the format as specified in vLLM documentation and separate arguments with commas, e.g. --vllm-args '--max-model-len=8192,--max-num-seqs=256,--enable-prefix-caching'",
139 | )
140 | @click.option(
141 | "--json-mode",
142 | is_flag=True,
143 | help="Output in JSON string",
144 | )
145 | @click.option(
146 | "--env",
147 | type=str,
148 |     help="Environment variables to be set. Separate variables with commas. Can also include a path to a file containing environment variables separated by newlines. e.g. --env 'TRITON_CACHE_DIR=/scratch/.cache/triton,my_custom_vars_file.env'",
149 | )
150 | @click.option(
151 | "--config",
152 | type=str,
153 | help="Path to a model config yaml file to use in place of the default",
154 | )
155 | def launch(
156 | model_name: str,
157 | **cli_kwargs: Optional[Union[str, int, float, bool]],
158 | ) -> None:
159 | """Launch a model on the cluster.
160 |
161 | Parameters
162 | ----------
163 | model_name : str
164 | Name of the model to launch
165 | **cli_kwargs : dict
166 | Additional launch options including:
167 | - model_family : str, optional
168 | Family/architecture of the model
169 | - model_variant : str, optional
170 | Specific variant of the model
171 | - partition : str, optional
172 | Type of Slurm partition
173 | - resource_type : str, optional
174 | Type of resource to request for the job
175 | - num_nodes : int, optional
176 | Number of nodes to use
177 | - gpus_per_node : int, optional
178 | Number of GPUs per node
179 | - cpus_per_task : int, optional
180 | Number of CPU cores per task
181 | - mem_per_node : str, optional
182 | Memory allocation per node in GB format (e.g., '32G')
183 | - account : str, optional
184 | Charge resources used by this job to specified account
185 | - work_dir : str, optional
186 | Set working directory for the batch job
187 | - qos : str, optional
188 | Quality of service tier
189 | - exclude : str, optional
190 | Exclude certain nodes from the resources granted to the job
191 | - nodelist : str, optional
192 | Request a specific list of nodes for deployment
193 | - bind : str, optional
194 | Additional binds for the container as a comma separated list of bind paths
195 | - time : str, optional
196 | Time limit for job
197 | - venv : str, optional
198 | Path to virtual environment
199 | - log_dir : str, optional
200 | Path to SLURM log directory
201 | - model_weights_parent_dir : str, optional
202 | Path to model weights directory
203 | - vllm_args : str, optional
204 | vLLM engine arguments
205 | - env : str, optional
206 | Environment variables
207 | - config : str, optional
208 | Path to custom model config yaml file
209 | - json_mode : bool, optional
210 | Output in JSON format
211 |
212 | Raises
213 | ------
214 | click.ClickException
215 | If launch fails for any reason
216 | """
217 | try:
218 | # Convert cli_kwargs to LaunchOptions
219 | json_mode = cli_kwargs["json_mode"]
220 | del cli_kwargs["json_mode"]
221 |
222 | launch_options = LaunchOptions(**cli_kwargs) # type: ignore
223 |
224 | # Start the client and launch model inference server
225 | client = VecInfClient()
226 | launch_response = client.launch_model(model_name, launch_options)
227 |
228 | # Display launch information
229 | if json_mode:
230 | click.echo(json.dumps(launch_response.config))
231 | else:
232 | launch_formatter = LaunchResponseFormatter(
233 | model_name, launch_response.config
234 | )
235 | launch_info_table = launch_formatter.format_table_output()
236 | CONSOLE.print(launch_info_table)
237 |
238 | except click.ClickException as e:
239 | raise e
240 | except Exception as e:
241 | raise click.ClickException(f"Launch failed: {str(e)}") from e
242 |
243 |
244 | @cli.command(
245 | "batch-launch",
246 |     help="Launch multiple models in a batch; separate model names with spaces.",
247 | )
248 | @click.argument("model-names", type=str, nargs=-1)
249 | @click.option(
250 | "--batch-config",
251 | type=str,
252 | help="Model configuration for batch launch",
253 | )
254 | @click.option(
255 | "--account",
256 | "-A",
257 | type=str,
258 | help="Charge resources used by this job to specified account.",
259 | )
260 | @click.option(
261 | "--work-dir",
262 | "-D",
263 | type=str,
264 | help="Set working directory for the batch job",
265 | )
266 | @click.option(
267 | "--json-mode",
268 | is_flag=True,
269 | help="Output in JSON string",
270 | )
271 | def batch_launch(
272 | model_names: tuple[str, ...],
273 | batch_config: Optional[str] = None,
274 | account: Optional[str] = None,
275 | work_dir: Optional[str] = None,
276 | json_mode: Optional[bool] = False,
277 | ) -> None:
278 | """Launch multiple models in a batch.
279 |
280 | Parameters
281 | ----------
282 | model_names : tuple[str, ...]
283 | Names of the models to launch
284 |     batch_config : str, optional
285 | Model configuration for batch launch
286 | json_mode : bool, default=False
287 | Whether to output in JSON format
288 |
289 | Raises
290 | ------
291 | click.ClickException
292 | If batch launch fails
293 | """
294 | try:
295 | # Start the client and launch models in batch mode
296 | client = VecInfClient()
297 | batch_launch_response = client.batch_launch_models(
298 | list(model_names), batch_config, account, work_dir
299 | )
300 |
301 | # Display batch launch information
302 | if json_mode:
303 | click.echo(json.dumps(batch_launch_response.config, indent=4))
304 | else:
305 | batch_launch_formatter = BatchLaunchResponseFormatter(
306 | batch_launch_response.config
307 | )
308 | batch_launch_info_table = batch_launch_formatter.format_table_output()
309 | CONSOLE.print(batch_launch_info_table)
310 |
311 | except click.ClickException as e:
312 | raise e
313 | except Exception as e:
314 | raise click.ClickException(f"Batch launch failed: {str(e)}") from e
315 |
316 |
317 | @cli.command("status", help="Check the status of running vec-inf jobs on the cluster.")
318 | @click.argument("slurm_job_id", required=False)
319 | @click.option(
320 | "--json-mode",
321 | is_flag=True,
322 | help="Output in JSON string",
323 | )
324 | def status(slurm_job_id: Optional[str] = None, json_mode: bool = False) -> None:
325 | """Get the status of a running model on the cluster.
326 |
327 | Parameters
328 | ----------
329 |     slurm_job_id : str, optional
330 |         ID of the SLURM job to check; if omitted, all running vec-inf jobs are checked
331 | json_mode : bool, default=False
332 | Whether to output in JSON format
333 |
334 | Raises
335 | ------
336 | click.ClickException
337 | If status check fails
338 | """
339 | try:
340 | # Start the client and get model inference server status
341 | client = VecInfClient()
342 | if not slurm_job_id:
343 | slurm_job_ids = client.fetch_running_jobs()
344 | if not slurm_job_ids:
345 | click.echo("No running jobs found.")
346 | return
347 | else:
348 | slurm_job_ids = [slurm_job_id]
349 | responses = []
350 | for job_id in slurm_job_ids:
351 | responses.append(client.get_status(job_id))
352 |
353 | # Display status information
354 | if slurm_job_id:
355 | status_formatter = StatusResponseFormatter(responses[0])
356 | if json_mode:
357 | status_formatter.output_json()
358 | else:
359 | status_info_table = status_formatter.output_table()
360 | CONSOLE.print(status_info_table)
361 | else:
362 | list_status_display = ListStatusDisplay(slurm_job_ids, responses, json_mode)
363 | list_status_display.display_multiple_status_output(CONSOLE)
364 |
365 | except click.ClickException as e:
366 | raise e
367 | except Exception as e:
368 | raise click.ClickException(f"Status check failed: {str(e)}") from e
369 |
370 |
371 | @cli.command("shutdown", help="Shutdown a running model on the cluster.")
372 | @click.argument("slurm_job_id", type=str, nargs=1)
373 | def shutdown(slurm_job_id: str) -> None:
374 | """Shutdown a running model on the cluster.
375 |
376 | Parameters
377 | ----------
378 | slurm_job_id : str
379 | ID of the SLURM job to shut down
380 |
381 | Raises
382 | ------
383 | click.ClickException
384 | If shutdown operation fails
385 | """
386 | try:
387 | client = VecInfClient()
388 | client.shutdown_model(slurm_job_id)
389 | click.echo(f"Shutting down model with Slurm Job ID: {slurm_job_id}")
390 | except Exception as e:
391 | raise click.ClickException(f"Shutdown failed: {str(e)}") from e
392 |
393 |
394 | @cli.command("list", help="List available models or get specific model configuration.")
395 | @click.argument("model-name", required=False)
396 | @click.option(
397 | "--json-mode",
398 | is_flag=True,
399 | help="Output in JSON string",
400 | )
401 | def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> None:
402 | """List all available models, or get default setup of a specific model.
403 |
404 | Parameters
405 | ----------
406 | model_name : str, optional
407 | Name of specific model to get information for
408 | json_mode : bool, default=False
409 | Whether to output in JSON format
410 |
411 | Raises
412 | ------
413 | click.ClickException
414 | If list operation fails
415 | """
416 | try:
417 | # Start the client
418 | client = VecInfClient()
419 | list_display = ListCmdDisplay(CONSOLE, json_mode)
420 | if model_name:
421 | model_config = client.get_model_config(model_name)
422 | list_display.display_single_model_output(model_config)
423 | else:
424 | model_infos = client.list_models()
425 | list_display.display_all_models_output(model_infos)
426 | except click.ClickException as e:
427 | raise e
428 | except Exception as e:
429 | raise click.ClickException(f"List models failed: {str(e)}") from e
430 |
431 |
432 | @cli.command(
433 | "metrics", help="Stream real-time performance metrics from the model endpoint."
434 | )
435 | @click.argument("slurm_job_id", type=str, nargs=1)
436 | def metrics(slurm_job_id: str) -> None:
437 | """Stream real-time performance metrics from the model endpoint.
438 |
439 | Parameters
440 | ----------
441 | slurm_job_id : str
442 | ID of the SLURM job to monitor
443 |
444 | Raises
445 | ------
446 | click.ClickException
447 | If metrics collection fails
448 |
449 | Notes
450 | -----
451 |     This command continuously streams metrics, refreshing roughly once per second
452 | until interrupted. If metrics are not available, it will display status
453 | information instead.
454 | """
455 | try:
456 | # Start the client and get inference server metrics
457 | client = VecInfClient()
458 | metrics_response = client.get_metrics(slurm_job_id)
459 | metrics_formatter = MetricsResponseFormatter(metrics_response.metrics)
460 |
461 | # Check if metrics response is ready
462 | if isinstance(metrics_response.metrics, str):
463 | metrics_formatter.format_failed_metrics(metrics_response.metrics)
464 | CONSOLE.print(metrics_formatter.table)
465 | return
466 |
467 | with Live(refresh_per_second=1, console=CONSOLE) as live:
468 | while True:
469 | metrics_response = client.get_metrics(slurm_job_id)
470 | metrics_formatter = MetricsResponseFormatter(metrics_response.metrics)
471 |
472 | if isinstance(metrics_response.metrics, str):
473 | # Show status information if metrics aren't available
474 | metrics_formatter.format_failed_metrics(metrics_response.metrics)
475 | else:
476 | metrics_formatter.format_metrics()
477 |
478 | live.update(metrics_formatter.table)
479 | time.sleep(1)
480 | except click.ClickException as e:
481 | raise e
482 | except Exception as e:
483 | raise click.ClickException(f"Metrics check failed: {str(e)}") from e
484 |
485 |
486 | @cli.command("cleanup", help="Clean up log files based on optional filters.")
487 | @click.option("--log-dir", type=str, help="Path to SLURM log directory")
488 | @click.option("--model-family", type=str, help="Filter by model family")
489 | @click.option("--model-name", type=str, help="Filter by model name")
490 | @click.option(
491 | "--job-id", type=int, help="Only remove logs with this exact SLURM job ID"
492 | )
493 | @click.option(
494 | "--before-job-id",
495 | type=int,
496 | help="Remove logs with job ID less than this value",
497 | )
498 | @click.option("--dry-run", is_flag=True, help="List matching logs without deleting")
499 | def cleanup_logs_cli(
500 | log_dir: Optional[str],
501 | model_family: Optional[str],
502 | model_name: Optional[str],
503 | job_id: Optional[int],
504 | before_job_id: Optional[int],
505 | dry_run: bool,
506 | ) -> None:
507 | """Clean up log files based on optional filters.
508 |
509 | Parameters
510 | ----------
511 |     log_dir : str, optional
512 | Root directory containing log files. Defaults to ~/.vec-inf-logs.
513 | model_family : str, optional
514 | Only delete logs for this model family.
515 | model_name : str, optional
516 | Only delete logs for this model name.
517 | job_id : int, optional
518 | If provided, only match directories with this exact SLURM job ID.
519 | before_job_id : int, optional
520 | If provided, only delete logs with job ID less than this value.
521 | dry_run : bool
522 |         If True, list matching log directories without deleting them.
523 | """
524 | try:
525 | client = VecInfClient()
526 | matched = client.cleanup_logs(
527 | log_dir=log_dir,
528 | model_family=model_family,
529 | model_name=model_name,
530 | job_id=job_id,
531 | before_job_id=before_job_id,
532 | dry_run=dry_run,
533 | )
534 |
535 | if not matched:
536 | if dry_run:
537 | click.echo("Dry run: no matching log directories found.")
538 | else:
539 | click.echo("No matching log directories were deleted.")
540 | elif dry_run:
541 | click.echo(f"Dry run: {len(matched)} directories would be deleted:")
542 | for f in matched:
543 | click.echo(f" - {f}")
544 | else:
545 | click.echo(f"Deleted {len(matched)} log directory(ies).")
546 | except Exception as e:
547 | raise click.ClickException(f"Cleanup failed: {str(e)}") from e
548 |
549 |
550 | if __name__ == "__main__":
551 | cli()
552 |
--------------------------------------------------------------------------------
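
Usage note: as a closing sketch, the command group above can be exercised in-process with click's test runner. Exit codes and output depend on the cluster environment (Slurm availability, model configs), so this only demonstrates the wiring, not expected results; the model family name is a placeholder.

# Sketch: invoking the CLI defined above via click's CliRunner.
from click.testing import CliRunner

from vec_inf.cli._cli import cli

runner = CliRunner()

# List available models as a JSON string.
result = runner.invoke(cli, ["list", "--json-mode"])
print(result.exit_code)
print(result.output)

# Preview which log directories a cleanup would touch, without deleting anything.
result = runner.invoke(cli, ["cleanup", "--model-family", "llama3", "--dry-run"])
print(result.output)
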