├── e2e
│   ├── __init__.py
│   ├── pytest_kind
│   │   ├── __init__.py
│   │   └── plugin.py
│   ├── conftest.py
│   ├── test_cluster.yaml
│   ├── test_pack_install.py
│   ├── test_pulumi_install.py
│   └── test_kubectl_install.py
├── tests
│   ├── __init__.py
│   ├── cli
│   │   ├── __init__.py
│   │   └── test_function.py
│   ├── k8s
│   │   ├── __init__.py
│   │   ├── function
│   │   │   └── __init__.py
│   │   ├── model_group
│   │   │   ├── __init__.py
│   │   │   └── runtime
│   │   │       ├── __init__.py
│   │   │       ├── test_vllm.py
│   │   │       └── test_llama_cpp.py
│   │   └── test_utils.py
│   ├── model
│   │   ├── __init__.py
│   │   ├── test_progress_bar.py
│   │   ├── test_settings.py
│   │   ├── test_base_model.py
│   │   ├── test_http_model.py
│   │   ├── test_hf_model.py
│   │   └── test_store.py
│   ├── config
│   │   ├── __init__.py
│   │   └── snapshots
│   │       └── test_config
│   │           └── test_aws_yaml
│   │               └── aws_yaml.txt
│   ├── container
│   │   ├── __init__.py
│   │   └── test_ecr.py
│   ├── policy_packs
│   │   ├── __init__.py
│   │   └── aws
│   │       ├── __init__.py
│   │       ├── PulumiPolicy.yaml
│   │       ├── __main__.py
│   │       ├── test_cluster.yaml
│   │       ├── container_registry.py
│   │       ├── eks.py
│   │       └── object_store.py
│   ├── test_examples.py
│   └── test_utils.py
├── paka
│   ├── cli
│   │   ├── __init__.py
│   │   ├── kubeconfig.py
│   │   ├── __main__.py
│   │   ├── build.py
│   │   ├── model_group.py
│   │   ├── run.py
│   │   └── cluster.py
│   ├── model
│   │   ├── __init__.py
│   │   ├── manifest.py
│   │   ├── http_model.py
│   │   ├── settings.py
│   │   ├── hf_model.py
│   │   ├── progress_bar.py
│   │   └── base_model.py
│   ├── cluster
│   │   ├── __init__.py
│   │   ├── aws
│   │   │   ├── __init__.py
│   │   │   ├── container_registry.py
│   │   │   ├── object_store.py
│   │   │   ├── cloudwatch.py
│   │   │   ├── elb.py
│   │   │   ├── ebs_csi_driver.py
│   │   │   ├── utils.py
│   │   │   ├── cluster_autoscaler.py
│   │   │   └── service_account.py
│   │   ├── manager
│   │   │   ├── __init__.py
│   │   │   ├── aws.py
│   │   │   └── base.py
│   │   ├── utils.py
│   │   ├── namespace.py
│   │   ├── zipkin.py
│   │   ├── keda.py
│   │   ├── redis.py
│   │   ├── kubectl.py
│   │   ├── nvidia_device_plugin.py
│   │   ├── pulumi.py
│   │   ├── qdrant.py
│   │   ├── context.py
│   │   ├── prometheus.py
│   │   └── fluentbit.py
│   ├── k8s
│   │   ├── job
│   │   │   ├── __init__.py
│   │   │   └── autoscaler.py
│   │   ├── function
│   │   │   └── __init__.py
│   │   └── model_group
│   │       ├── __init__.py
│   │       ├── runtime
│   │       │   ├── __init__.py
│   │       │   ├── vllm.py
│   │       │   └── llama_cpp.py
│   │       ├── manifest.py
│   │       └── ingress.py
│   ├── __init__.py
│   ├── constants.py
│   ├── logger.py
│   └── container
│       ├── pack.py
│       └── ecr.py
├── examples
│   ├── website_rag
│   │   ├── __init__.py
│   │   ├── runtime.txt
│   │   ├── .gitignore
│   │   ├── .cnignore
│   │   ├── Procfile
│   │   ├── constants.py
│   │   ├── embeddings.py
│   │   ├── requirements.txt
│   │   ├── cluster.yaml
│   │   ├── crawler.py
│   │   ├── serve.py
│   │   ├── ingest.py
│   │   └── README.md
│   ├── invoice_extraction
│   │   ├── __init__.py
│   │   ├── runtime.txt
│   │   ├── .gitignore
│   │   ├── .cnignore
│   │   ├── Procfile
│   │   ├── invoices
│   │   │   ├── invoice-2024-01-01.pdf
│   │   │   ├── invoice-2024-01-31.pdf
│   │   │   ├── invoice-2024-02-29.pdf
│   │   │   └── invoice-2024-03-31.pdf
│   │   ├── cluster_cpu.yaml
│   │   ├── requirements.txt
│   │   ├── cluster.yaml
│   │   ├── output_parser.py
│   │   ├── README.md
│   │   └── serve.py
│   └── templates
│       ├── Llama2_7B_Chat_AWQ.yaml
│       ├── Llama2_7B_Chat_GPTQ.yaml
│       ├── Llama3_70B_Instruct_GPTQ.yaml
│       ├── Mistral_7B_Instruct_GPTQ.yaml
│       ├── Llama3_70B_Instruct.yaml
│       ├── Llama3_8B_Instruct.yaml
│       ├── Phi3_Mini_4K_Instruct.yaml
│       └── Mistral_7B_Instruct.yaml
├── docs
│   ├── img
│   │   ├── architecture.png
│   │   └── tokens_per_sec.png
│   ├── faq.md
│   └── quick_start.md
├── Makefile
├── .github
│   └── workflows
│       ├── publish_release.yml
│       └── pull-request-tests.yml
├── .pre-commit-config.yaml
├── LICENSE
├── pyproject.toml
└── .gitignore

/e2e/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/paka/cli/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/paka/model/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/cli/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/k8s/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/model/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/e2e/pytest_kind/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/paka/cluster/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/paka/cluster/aws/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/paka/k8s/job/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/config/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/container/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/examples/website_rag/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/paka/cluster/manager/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/paka/k8s/function/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/paka/k8s/model_group/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/k8s/function/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/k8s/model_group/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/policy_packs/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/policy_packs/aws/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/examples/invoice_extraction/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/paka/k8s/model_group/runtime/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/k8s/model_group/runtime/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/examples/website_rag/runtime.txt:
--------------------------------------------------------------------------------
1 | python-3.11.*
2 |
--------------------------------------------------------------------------------
/examples/invoice_extraction/runtime.txt:
--------------------------------------------------------------------------------
1 | python-3.11.*
2 |
--------------------------------------------------------------------------------
/examples/website_rag/.gitignore:
--------------------------------------------------------------------------------
1 | .mypy_cache
2 | venv/
--------------------------------------------------------------------------------
/tests/policy_packs/aws/PulumiPolicy.yaml:
--------------------------------------------------------------------------------
1 | runtime: python
2 |
--------------------------------------------------------------------------------
/examples/invoice_extraction/.gitignore:
--------------------------------------------------------------------------------
1 | .mypy_cache
2 | venv/
--------------------------------------------------------------------------------
/e2e/conftest.py:
--------------------------------------------------------------------------------
1 | from .pytest_kind.plugin import *  # noqa: F401, F403
2 |
--------------------------------------------------------------------------------
/examples/website_rag/.cnignore:
--------------------------------------------------------------------------------
1 | venv
2 | __pycache__
3 | .mypy_cache
4 |
--------------------------------------------------------------------------------
/examples/invoice_extraction/.cnignore:
--------------------------------------------------------------------------------
1 | venv
2 | __pycache__
3 | .mypy_cache
4 |
--------------------------------------------------------------------------------
/examples/invoice_extraction/Procfile:
--------------------------------------------------------------------------------
1 | web: python serve.py
2 | serve: python serve.py
3 |
--------------------------------------------------------------------------------
/docs/img/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jjleng/paka/HEAD/docs/img/architecture.png
--------------------------------------------------------------------------------
/docs/img/tokens_per_sec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jjleng/paka/HEAD/docs/img/tokens_per_sec.png
--------------------------------------------------------------------------------
/examples/invoice_extraction/invoices/invoice-2024-01-01.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jjleng/paka/HEAD/examples/invoice_extraction/invoices/invoice-2024-01-01.pdf
--------------------------------------------------------------------------------
/examples/invoice_extraction/invoices/invoice-2024-01-31.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jjleng/paka/HEAD/examples/invoice_extraction/invoices/invoice-2024-01-31.pdf
--------------------------------------------------------------------------------
/examples/invoice_extraction/invoices/invoice-2024-02-29.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jjleng/paka/HEAD/examples/invoice_extraction/invoices/invoice-2024-02-29.pdf
--------------------------------------------------------------------------------
/examples/invoice_extraction/invoices/invoice-2024-03-31.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jjleng/paka/HEAD/examples/invoice_extraction/invoices/invoice-2024-03-31.pdf
--------------------------------------------------------------------------------
/examples/website_rag/Procfile:
--------------------------------------------------------------------------------
1 | ingest: python ingest.py https://python.langchain.com/docs/get_started/introduction
2 | web: python serve.py
3 | serve: python serve.py
4 |
--------------------------------------------------------------------------------
/examples/website_rag/constants.py:
--------------------------------------------------------------------------------
1 | QDRANT_URL = "http://qdrant.qdrant.svc.cluster.local:6333"
2 | LLM_URL = "http://llama2-7b-chat"
3 | EMBEDDING_URL = "http://gte-base"
4 |
--------------------------------------------------------------------------------
/paka/__init__.py:
--------------------------------------------------------------------------------
1 | from importlib.metadata import PackageNotFoundError, version
2 |
3 | try:
4 |     __version__ = version(__name__)
5 | except PackageNotFoundError:
6 |     __version__ = ""
7 |
--------------------------------------------------------------------------------
/paka/k8s/model_group/manifest.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 |
3 |
4 | class Manifest(BaseModel):
5 |     name: str
6 |     url: str
7 |     type: str
8 |     file: str
9 |     sha256: str
10 |
--------------------------------------------------------------------------------
/paka/cluster/utils.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 |
3 | from paka.cluster.context import Context
4 | from paka.model.store import ModelStore, S3ModelStore
5 |
6 |
7 | def get_model_store(ctx: Context, *args: Any, **kwargs: Any) -> ModelStore:
8 |     assert ctx.provider == "aws"
9 |
10 |     return S3ModelStore(ctx.bucket, *args, **kwargs)
11 |
--------------------------------------------------------------------------------
/tests/policy_packs/aws/__main__.py:
--------------------------------------------------------------------------------
1 | from pulumi_policy import EnforcementLevel, PolicyPack
2 |
3 | from tests.policy_packs.aws.container_registry import ecr_policies
4 | from tests.policy_packs.aws.eks import model_group_taints
5 | from tests.policy_packs.aws.object_store import s3_policies
6 |
7 | PolicyPack(
8 |     name="aws",
9 |     enforcement_level=EnforcementLevel.MANDATORY,
10 |     policies=s3_policies + ecr_policies + [model_group_taints],
11 | )
12 |
--------------------------------------------------------------------------------
/tests/container/test_ecr.py:
--------------------------------------------------------------------------------
1 | from unittest.mock import MagicMock, patch
2 |
3 | from moto import mock_aws
4 |
5 | from paka.container.ecr import authenticate_docker_to_ecr
6 |
7 |
8 | @mock_aws
9 | def test_authenticate_docker_to_ecr() -> None:
10 |     with patch("subprocess.Popen") as mock_popen:
11 |         mock_result = MagicMock()
12 |         mock_result.communicate.return_value = (b"", b"")
13 |         mock_result.returncode = 0
14 |         mock_popen.return_value = mock_result
15 |
16 |         authenticate_docker_to_ecr("us-west-2")
17 |
--------------------------------------------------------------------------------
/paka/constants.py:
--------------------------------------------------------------------------------
1 | # The name of the project
2 | PROJECT_NAME = "paka"
3 |
4 | # The service account that has access to all resources
5 | ACCESS_ALL_SA = "access-all-sa"
6 |
7 | # The environment variable for the directory where the paka data is saved
8 | HOME_ENV_VAR = "PAKA_HOME"
9 |
10 | # The environment variable for the buildpack builder
11 | BP_BUILDER_ENV_VAR = "BP_BUILDER"
12 |
13 | # The path where the model files are mounted in the container
14 | MODEL_MOUNT_PATH = "/data"
15 |
16 | # Pulumi stack name
17 | PULUMI_STACK_NAME = "default"
18 |
--------------------------------------------------------------------------------
/tests/policy_packs/aws/test_cluster.yaml:
--------------------------------------------------------------------------------
1 | aws:
2 |   cluster:
3 |     name: test-cluster
4 |     region: us-west-2
5 |     nodeType: t2.micro
6 |     minNodes: 2
7 |     maxNodes: 2
8 |   modelGroups:
9 |     - nodeType: c7a.xlarge
10 |       minInstances: 1
11 |       maxInstances: 1
12 |       name: llama2-7b
13 |       runtime:
14 |         image: ghcr.io/ggerganov/llama.cpp:server
15 |       model:
16 |         hfRepoId: TheBloke/Llama-2-7B-GGUF
17 |         files: ["*.Q4_0.gguf"]
18 |   vectorStore:
19 |     nodeType: t2.small
20 |     replicas: 2
21 |
--------------------------------------------------------------------------------
/e2e/test_cluster.yaml:
--------------------------------------------------------------------------------
1 | # kind-config.yaml
2 | kind: Cluster
3 | apiVersion: kind.x-k8s.io/v1alpha4
4 | nodes:
5 |   - role: control-plane
6 |   - role: worker
7 |   - role: worker
8 |     kubeadmConfigPatches:
9 |       - |
10 |         kind: JoinConfiguration
11 |         nodeRegistration:
12 |           taints:
13 |             - key: "app"
14 |               value: "model-group"
15 |               effect: "NoSchedule"
16 |             - key: "model"
17 |               value: "gte-base"
18 |               effect: "NoSchedule"
19 |           kubeletExtraArgs:
20 |             node-labels: "app=model-group,model=gte-base"
21 |
--------------------------------------------------------------------------------
/tests/config/snapshots/test_config/test_aws_yaml/aws_yaml.txt:
--------------------------------------------------------------------------------
1 | version: '1.0'
2 | aws:
3 |   cluster:
4 |     name: test-cluster
5 |     region: us-east-1
6 |     namespace: default
7 |     nodeType: t2.micro
8 |     minNodes: 2
9 |     maxNodes: 2
10 |     logRetentionDays: 14
11 |   modelGroups:
12 |     - minInstances: 1
13 |       maxInstances: 2
14 |       nodeType: t2.micro
15 |       diskSize: 20
16 |       name: test-model-group
17 |       runtime:
18 |         image: test-image
19 |       resourceRequest:
20 |         cpu: 500m
21 |         memory: 2Gi
22 |       isPublic: false
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: install test lint setup policy-pack type-check check-all e2e
2 |
3 | install:
4 | 	poetry install
5 |
6 | test:
7 | 	poetry run pytest tests
8 |
9 | e2e:
10 | 	poetry run pytest --cluster-config e2e/test_cluster.yaml e2e
11 |
12 | type-check:
13 | 	poetry run mypy paka tests
14 |
15 | policy-pack:
16 | 	poetry run python -m paka.cli cluster preview -f $(shell pwd)/tests/policy_packs/aws/test_cluster.yaml --policy-pack $(shell pwd)/tests/policy_packs/aws
17 |
18 | setup:
19 | 	poetry run pre-commit install
20 |
21 | lint: setup
22 | 	poetry run pre-commit run --all-files --show-diff-on-failure
23 |
24 | check-all: lint type-check test e2e
25 |
--------------------------------------------------------------------------------
/paka/cluster/aws/container_registry.py:
--------------------------------------------------------------------------------
1 | import pulumi_aws as aws
2 |
3 | from paka.cluster.context import Context
4 | from paka.utils import call_once
5 |
6 |
7 | @call_once
8 | def create_container_registry(ctx: Context) -> None:
9 |     """
10 |     Create a container registry in AWS ECR for storing Docker images.
11 |
12 |     Returns:
13 |         None
14 |     """
15 |     repository = aws.ecr.Repository(
16 |         ctx.cluster_name,
17 |         force_delete=True,
18 |         image_tag_mutability="MUTABLE",
19 |     )
20 |
21 |     # Save the repository URL to the cluster data file
22 |     repository.repository_url.apply(lambda url: ctx.set_registry(url))
23 |
--------------------------------------------------------------------------------
/paka/cluster/aws/object_store.py:
--------------------------------------------------------------------------------
1 | import pulumi_aws as aws
2 |
3 | from paka.cluster.context import Context
4 | from paka.utils import call_once
5 |
6 |
7 | @call_once
8 | def create_object_store(ctx: Context) -> None:
9 |     """
10 |     Creates an object store in AWS S3 based on the provided configuration.
11 |
12 |     Returns:
13 |         None
14 |     """
15 |     # `bucket` is the name of the bucket; setting it explicitly avoids Pulumi appending a random string to the name
16 |     # `force_destroy` is needed to delete the bucket when it's not empty
17 |     bucket = aws.s3.Bucket(ctx.cluster_name, force_destroy=True)
18 |     bucket.id.apply(lambda id: ctx.set_bucket(id))
19 |
--------------------------------------------------------------------------------
/.github/workflows/publish_release.yml:
--------------------------------------------------------------------------------
1 | name: Release to PyPI
2 |
3 | on:
4 |   push:
5 |     tags:
6 |       - 'v*'
7 |
8 | jobs:
9 |   build-and-publish:
10 |     runs-on: ubuntu-latest
11 |     environment:
12 |       name: pypi
13 |       url: https://pypi.org/p/paka
14 |     permissions:
15 |       id-token: write
16 |     steps:
17 |       - uses: actions/checkout@v2
18 |       - name: Set up Python
19 |         uses: actions/setup-python@v2
20 |         with:
21 |           python-version: '3.11'
22 |       - name: Install Poetry
23 |         run: pip install poetry
24 |       - name: Build package
25 |         run: poetry build
26 |       - name: Publish to PyPI
27 |         uses: pypa/gh-action-pypi-publish@release/v1
28 |
--------------------------------------------------------------------------------
/.github/workflows/pull-request-tests.yml:
--------------------------------------------------------------------------------
1 | name: 'Run tests on pr'
2 |
3 | run-name: ${{ github.actor }} has created a pull request 💻
4 |
5 | on:
6 |   push:
7 |     branches: ['main']
8 |   pull_request:
9 |     branches: ['main']
10 |
11 | permissions:
12 |   contents: read
13 |
14 | jobs:
15 |   build:
16 |     runs-on: ubuntu-latest
17 |
18 |     steps:
19 |       - uses: actions/checkout@v3
20 |
21 |       - name: Set up Python 3.10
22 |         uses: actions/setup-python@v3
23 |         with:
24 |           python-version: '3.10'
25 |
26 |       - name: Install poetry
27 |         run: pip3 install poetry
28 |
29 |       - name: Install dependencies
30 |         run: poetry install
31 |
32 |       - name: Test with pytest
33 |         run: make check-all
34 |
--------------------------------------------------------------------------------
/tests/test_examples.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import pytest
4 |
5 | from paka.config import parse_yaml
6 |
7 | examples_path = Path(__file__).parent.parent / "examples"
8 |
9 |
10 | @pytest.mark.parametrize(
11 |     "cluster_config",
12 |     [
13 |         examples_path / "website_rag" / "cluster.yaml",
14 |         examples_path / "invoice_extraction" / "cluster.yaml",
15 |         examples_path / "invoice_extraction" / "cluster_cpu.yaml",
16 |     ],
17 | )
18 | def test_example_configs(cluster_config: Path) -> None:
19 |     cluster_config = Path(cluster_config).expanduser().absolute()
20 |
21 |     if not cluster_config.exists():
22 |         raise FileNotFoundError(f"The cluster config file {cluster_config} does not exist")
23 |
24 |     with open(cluster_config, "r") as file:
25 |         parse_yaml(file.read())
26 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 |   - repo: https://github.com/pre-commit/pre-commit-hooks
3 |     rev: 'v4.5.0'
4 |     hooks:
5 |       - id: check-merge-conflict
6 |       - id: check-toml
7 |       - id: check-yaml
8 |       - id: detect-private-key
9 |       - id: end-of-file-fixer
10 |       - id: mixed-line-ending
11 |       - id: trailing-whitespace
12 |   - repo: https://github.com/pre-commit/mirrors-mypy
13 |     rev: 'v1.7.1'
14 |     hooks:
15 |       - id: mypy
16 |         additional_dependencies:
17 |           - types-requests
18 |           - types-tabulate
19 |           - pydantic
20 |           - "pydantic[mypy]"
21 |   - repo: https://github.com/pre-commit/mirrors-isort
22 |     rev: 'v5.10.1'
23 |     hooks:
24 |       - id: isort
25 |   - repo: https://github.com/psf/black
26 |     rev: '24.1a1'
27 |     hooks:
28 |       - id: black
29 |
--------------------------------------------------------------------------------
/examples/invoice_extraction/cluster_cpu.yaml:
--------------------------------------------------------------------------------
1 | version: "1.2"
2 | aws:
3 |   cluster:
4 |     name: invoice-extraction
5 |     region: us-west-2
6 |     namespace: default
7 |     nodeType: t3a.medium
8 |     minNodes: 2
9 |     maxNodes: 4
10 |   prometheus:
11 |     enabled: false
12 |   tracing:
13 |     enabled: false
14 |   mixedModelGroups:
15 |     - nodeType: c7i.large
16 |       baseInstances: 0
17 |       maxOnDemandInstances: 1
18 |       spot:
19 |         minInstances: 1
20 |         maxInstances: 3
21 |       name: llama2-7b-chat
22 |       runtime:
23 |         image: ghcr.io/ggerganov/llama.cpp:server
24 |       model:
25 |         hfRepoId: TheBloke/Llama-2-7B-Chat-GGUF
26 |         files: ["*.Q4_0.gguf"] # Use the q4 quantization
27 |       autoScaleTriggers:
28 |         - type: cpu
29 |           metadata:
30 |             type: Utilization
31 |             value: "50"
32 |
--------------------------------------------------------------------------------
/examples/website_rag/embeddings.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import requests
4 | from constants import EMBEDDING_URL
5 | from langchain_core.embeddings import Embeddings
6 | from langchain_core.pydantic_v1 import BaseModel
7 |
8 | MAX_ATTEMPTS = 10000
9 |
10 |
11 | class LlamaEmbeddings(BaseModel, Embeddings):
12 |     def embed_documents(self, texts: List[str]) -> List[List[float]]:
13 |         url = f"{EMBEDDING_URL}/v1/embeddings"
14 |         headers = {"Content-Type": "application/json", "accept": "application/json"}
15 |         data = {
16 |             "input": texts,
17 |         }
18 |
19 |         response = requests.post(url, headers=headers, json=data, verify=False)
20 |
21 |         return [data["embedding"] for data in response.json()["data"]]
22 |
23 |     def embed_query(self, text: str) -> List[float]:
24 |         return self.embed_documents([text])[0]
25 |
--------------------------------------------------------------------------------
/paka/logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | # Create a logger
4 | logger = logging.getLogger(__name__)
5 |
6 |
7 | # "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
8 | def setup_logger(verbose: bool = False, format: str = "%(message)s") -> None:
9 |     # Set the logging level based on the verbose flag
10 |     logger.setLevel(logging.DEBUG if verbose else logging.INFO)
11 |
12 |     for handler in logger.handlers:
13 |         logger.removeHandler(handler)
14 |
15 |     # Create a console handler
16 |     ch = logging.StreamHandler()
17 |     ch.setLevel(logging.DEBUG if verbose else logging.INFO)
18 |
19 |     # Create a formatter
20 |     formatter = logging.Formatter(format)
21 |
22 |     # Add the formatter to the console handler
23 |     ch.setFormatter(formatter)
24 |
25 |     # Add the console handler to the logger
26 |     logger.addHandler(ch)
27 |
28 |
29 | setup_logger()
30 |
--------------------------------------------------------------------------------
/paka/cluster/manager/aws.py:
--------------------------------------------------------------------------------
1 | from paka.cluster.aws.container_registry import create_container_registry
2 | from paka.cluster.aws.eks import create_k8s_cluster
3 | from paka.cluster.aws.object_store import create_object_store
4 | from paka.cluster.manager.base import ClusterManager
5 | from paka.config import Config
6 |
7 |
8 | class AWSClusterManager(ClusterManager):
9 |     """
10 |     AWS-specific implementation of the ClusterManager abstract base class.
11 |
12 |     The AWSClusterManager class is responsible for managing a cluster of AWS resources.
13 |     It provides methods for creating and managing AWS-specific resources such as EKS clusters,
14 |     node groups, and service accounts. It also handles AWS-specific configuration and setup tasks.
15 |     """
16 |
17 |     def __init__(self, config: Config) -> None:
18 |         super().__init__(config)
19 |
20 |     def provision_k8s(self) -> None:
21 |         create_object_store(self.ctx)
22 |         create_container_registry(self.ctx)
23 |         create_k8s_cluster(self.ctx)
24 |
--------------------------------------------------------------------------------
/examples/invoice_extraction/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp==3.9.3
2 | aiosignal==1.3.1
3 | annotated-types==0.6.0
4 | anyio==4.3.0
5 | attrs==23.2.0
6 | certifi==2024.2.2
7 | charset-normalizer==3.3.2
8 | click==8.1.7
9 | dataclasses-json==0.6.4
10 | fastapi==0.110.1
11 | frozenlist==1.4.1
12 | h11==0.14.0
13 | idna==3.6
14 | install==1.3.5
15 | jsonpatch==1.33
16 | jsonpointer==2.4
17 | langchain==0.1.14
18 | langchain-community==0.0.31
19 | langchain-core==0.1.40
20 | langchain-text-splitters==0.0.1
21 | langsmith==0.1.40
22 | marshmallow==3.21.1
23 | multidict==6.0.5
24 | mypy-extensions==1.0.0
25 | numpy==1.26.4
26 | orjson==3.10.0
27 | packaging==23.2
28 | pydantic==2.6.4
29 | pydantic_core==2.16.3
30 | pypdf==4.1.0
31 | python-multipart==0.0.9
32 | PyYAML==6.0.1
33 | requests==2.31.0
34 | six==1.16.0
35 | sniffio==1.3.1
36 | SQLAlchemy==2.0.29
37 | starlette==0.37.2
38 | sse-starlette==1.8.2
39 | sseclient-py==1.8.0
40 | tenacity==8.2.3
41 | typing-inspect==0.9.0
42 | typing_extensions==4.10.0
43 | urllib3==2.2.1
44 | uvicorn==0.29.0
45 | yarl==1.9.4
46 |
--------------------------------------------------------------------------------
/paka/cluster/namespace.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import pulumi
4 | import pulumi_kubernetes as k8s
5 | from kubernetes import client, config
6 |
7 | from paka.cluster.context import Context
8 |
9 |
10 | def create_namespace(ctx: Context, kubeconfig_json: str) -> None:
11 |     # Pulumi does not support creating the default namespace again, so we need to handle it separately
12 |     if ctx.namespace != "default":
13 |         k8s.core.v1.Namespace(
14 |             "app-ns",
15 |             metadata={
16 |                 "name": ctx.namespace,
17 |                 "labels": {"istio-injection": "enabled"},
18 |             },
19 |             opts=pulumi.ResourceOptions(provider=ctx.k8s_provider),
20 |         )
21 |     else:
22 |         config.load_kube_config_from_dict(json.loads(kubeconfig_json))
23 |         # We are dealing with the default namespace
24 |         api_instance = client.CoreV1Api()
25 |
26 |         body = {"metadata": {"labels": {"istio-injection": "enabled"}}}
27 |
28 |         api_instance.patch_namespace("default", body)
29 |
--------------------------------------------------------------------------------
/paka/cli/kubeconfig.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import json
4 | import os
5 | from typing import Optional
6 |
7 | import typer
8 |
9 | from paka.cli.utils import ensure_cluster_name
10 | from paka.k8s.utils import update_kubeconfig
11 | from paka.logger import logger
12 | from paka.utils import read_pulumi_stack
13 |
14 | kube_app = typer.Typer()
15 |
16 |
17 | @kube_app.command()
18 | def update(
19 |     cluster_name: Optional[str] = typer.Option(
20 |         os.getenv("PAKA_CURRENT_CLUSTER"),
21 |         "--cluster",
22 |         "-c",
23 |         help="The name of the cluster.",
24 |     )
25 | ) -> None:
26 |     """
27 |     Updates the default kubeconfig file (~/.kube/config) to include the connection
28 |     details of the specified cluster.
29 |     """
30 |     logger.info("Updating kubeconfig...")
31 |     cluster_name = ensure_cluster_name(cluster_name)
32 |     kubeconfig = read_pulumi_stack(cluster_name, "kubeconfig")
33 |
34 |     update_kubeconfig(json.loads(kubeconfig))
35 |     logger.info("Successfully updated kubeconfig.")
36 |
--------------------------------------------------------------------------------
/paka/cluster/zipkin.py:
--------------------------------------------------------------------------------
1 | import pulumi
2 | from pulumi_kubernetes.helm.v3 import Chart, ChartOpts, FetchOpts
3 |
4 | from paka.cluster.context import Context
5 | from paka.utils import call_once
6 |
7 |
8 | @call_once
9 | def create_zipkin(ctx: Context) -> None:
10 |     """
11 |     Installs zipkin with a helm chart.
12 |     """
13 |
14 |     config = ctx.cloud_config
15 |
16 |     if not config.tracing or not config.tracing.enabled:
17 |         return
18 |
19 |     autoscaling = (
20 |         {"autoscaling": {"enabled": True}} if config.tracing.autoScalingEnabled else {}
21 |     )
22 |
23 |     Chart(
24 |         "zipkin",
25 |         ChartOpts(
26 |             chart="zipkin",
27 |             version="0.1.2",
28 |             namespace="istio-system",
29 |             fetch_opts=FetchOpts(repo="https://zipkin.io/zipkin-helm"),
30 |             values={
31 |                 **autoscaling,
32 |                 **(config.tracing.zipkinHelmSettings or {}),
33 |             },
34 |         ),
35 |         opts=pulumi.ResourceOptions(provider=ctx.k8s_provider),
36 |     )
37 |
--------------------------------------------------------------------------------
/tests/model/test_progress_bar.py:
--------------------------------------------------------------------------------
1 | from paka.model.progress_bar import NullProgressBar, ProgressBar
2 |
3 |
4 | def test_progress_bar() -> None:
5 |     pb = ProgressBar("Testing")
6 |     fake_pb = NullProgressBar("Testing")
7 |
8 |     pb.create_progress_bar(100)
9 |     fake_pb.create_progress_bar(100)
10 |     assert pb.progress_bar is not None
11 |     assert pb.progress_bar.total == 100
12 |     assert pb.progress_bar.desc == "Testing"
13 |
14 |     pb.advance_progress_bar("task1", 10)
15 |     fake_pb.advance_progress_bar("task1", 10)
16 |     assert pb.progress_bar.n == 10
17 |
18 |     pb.set_postfix_str("test postfix")
19 |     fake_pb.set_postfix_str("test postfix")
20 |     assert pb.progress_bar.postfix == "test postfix"
21 |
22 |     pb.update_progress_bar("task2", 20)
23 |     fake_pb.update_progress_bar("task2", 20)
24 |     assert pb.progress_bar.total == 120
25 |
26 |     pb.clear_counter()
27 |     fake_pb.clear_counter()
28 |     assert pb.counter == {}
29 |
30 |     pb.close_progress_bar()
31 |     fake_pb.close_progress_bar()
32 |     assert pb.progress_bar is None
33 |
--------------------------------------------------------------------------------
/tests/model/test_settings.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from paka.model.settings import ModelSettings
4 |
5 |
6 | def test_validate_quantization() -> None:
7 |     settings = ModelSettings(
8 |         inference_devices=["cpu"], quantization="GPTQ", runtime="llama.cpp"
9 |     )
10 |     assert settings.quantization == "GPTQ"
11 |
12 |     with pytest.raises(ValueError):
13 |         ModelSettings(
14 |             inference_devices=["cpu"],
15 |             quantization="invalid_quantization",
16 |             runtime="llama.cpp",
17 |         )
18 |
19 |
20 | def test_validate_prompt_template_name() -> None:
21 |     settings = ModelSettings(
22 |         inference_devices=["cpu"],
23 |         quantization="GPTQ",
24 |         runtime="llama.cpp",
25 |         prompt_template_name="chatml",
26 |     )
27 |     assert settings.prompt_template_name == "chatml"
28 |
29 |     with pytest.raises(ValueError):
30 |         ModelSettings(
31 |             inference_devices=["cpu"],
32 |             quantization="GPTQ",
33 |             runtime="llama.cpp",
34 |             prompt_template_name="invalid_template",
35 |         )
36 |
--------------------------------------------------------------------------------
/tests/model/test_base_model.py:
--------------------------------------------------------------------------------
1 | import io
2 | from unittest.mock import MagicMock, patch
3 |
4 | from paka.model.base_model import BaseMLModel
5 |
6 |
7 | class ConcreteMLModel(BaseMLModel):
8 |     def save(self) -> None:
9 |         pass
10 |
11 |
12 | def test_base_ml_model() -> None:
13 |
14 |     progress_bar_mock = MagicMock()
15 |     model_store_mock = MagicMock(progress_bar=progress_bar_mock)
16 |     model = ConcreteMLModel(
17 |         name="TestModel",
18 |         model_store=model_store_mock,
19 |         quantization="GPTQ",
20 |         prompt_template_name=None,
21 |         prompt_template_str=None,
22 |     )
23 |
24 |     model.save_manifest_yml()
25 |     model_store_mock.save.assert_called_once()
26 |
27 |     stream = io.BytesIO(b"Test data")
28 |
29 |     model.save_single_stream("test.txt", stream, 9, "test_sha256")
30 |     model_store_mock.save_stream.assert_called_with(
31 |         "test.txt", stream, 9, "test_sha256"
32 |     )
33 |     assert ("test.txt", "test_sha256") in model.completed_files
34 |
35 |     model.finish()
36 |     progress_bar_mock.close_progress_bar.assert_called_once()
37 |
--------------------------------------------------------------------------------
/e2e/test_pack_install.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 | from pathlib import Path
4 | from unittest.mock import patch
5 |
6 | import pytest
7 |
8 | from paka.constants import HOME_ENV_VAR
9 | from paka.container.pack import ensure_pack
10 |
11 |
12 | def test_installation_on_windows() -> None:
13 |     with patch(
14 |         "platform.system", return_value="windows"
15 |     ), tempfile.TemporaryDirectory() as temp_dir:
16 |         os.environ[HOME_ENV_VAR] = temp_dir
17 |
18 |         pack = Path(ensure_pack())
19 |
20 |         assert pack.exists()
21 |         assert str(pack).endswith(".exe")
22 |
23 |
24 | @pytest.mark.parametrize(
25 |     "system, arch",
26 |     [("darwin", "amd64"), ("darwin", "arm64"), ("linux", "amd64"), ("linux", "arm64")],
27 | )
28 | def test_installation_on_other_platforms(system: str, arch: str) -> None:
29 |     with patch("platform.system", return_value=system), patch(
30 |         "platform.machine", return_value=arch
31 |     ), tempfile.TemporaryDirectory() as temp_dir:
32 |         os.environ[HOME_ENV_VAR] = temp_dir
33 |
34 |         pack = Path(ensure_pack())
35 |
36 |         assert pack.exists()
37 |
--------------------------------------------------------------------------------
/paka/cluster/aws/cloudwatch.py:
--------------------------------------------------------------------------------
1 | import pulumi_aws as aws
2 |
3 | from paka.cluster.context import Context
4 | from paka.cluster.fluentbit import create_fluentbit
5 | from paka.constants import PROJECT_NAME
6 |
7 | LOG_GROUP = f"EKSContainerLogs/{PROJECT_NAME}"
8 |
9 |
10 | def enable_cloudwatch(ctx: Context) -> None:
11 |     aws.cloudwatch.LogGroup(
12 |         "log-group",
13 |         name=LOG_GROUP,
14 |         retention_in_days=ctx.cloud_config.cluster.logRetentionDays,
15 |     )
16 |
17 |     # Fluent Bit configuration for forwarding logs to CloudWatch
18 |     fluent_bit_config = f"""
19 | [SERVICE]
20 |     Parsers_File /fluent-bit/etc/parsers.conf
21 |
22 | [INPUT]
23 |     Name tail
24 |     Path /var/log/containers/*.log
25 |     Parser docker
26 |     Tag kube.*
27 |     Refresh_Interval 5
28 |
29 | [OUTPUT]
30 |     Name cloudwatch_logs
31 |     Match kube.*
32 |     log_group_name {LOG_GROUP}
33 |     log_stream_prefix eks/
34 |     region {ctx.cloud_config.cluster.region}
35 | """
36 |     create_fluentbit(ctx, fluent_bit_config)
37 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023-2024 Jijun Leng
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining
6 | a copy of this software and associated documentation files (the
7 | "Software"), to deal in the Software without restriction, including
8 | without limitation the rights to use, copy, modify, merge, publish,
9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 |
--------------------------------------------------------------------------------
/tests/cli/test_function.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import typer
3 |
4 | from paka.cli.function import process_traffic_splits, validate_traffic_split
5 |
6 |
7 | def test_validate_traffic_split() -> None:
8 |     # Test valid input
9 |     assert validate_traffic_split("rev1=20") == ("rev1", 20)
10 |
11 |     # Test missing '='
12 |     with pytest.raises(ValueError):
13 |         validate_traffic_split("rev120")
14 |
15 |     # Test non-numeric percentage
16 |     with pytest.raises(ValueError):
17 |         validate_traffic_split("rev1=twenty")
18 |
19 |     # Test percentage out of range
20 |     with pytest.raises(ValueError):
21 |         validate_traffic_split("rev1=101")
22 |
23 |
24 | def test_process_traffic_splits() -> None:
25 |     # Test valid input
26 |     splits, total = process_traffic_splits(["rev1=20", "rev2=30"])
27 |     assert splits == [("rev1", 20), ("rev2", 30)]
28 |     assert total == 50
29 |
30 |     # Test duplicate revisions
31 |     with pytest.raises(typer.Exit):
32 |         process_traffic_splits(["rev1=20", "rev1=30"])
33 |
34 |     # Test invalid split
35 |     with pytest.raises(ValueError):
36 |         process_traffic_splits(["rev1=20", "rev2=thirty"])
37 |
--------------------------------------------------------------------------------
/examples/invoice_extraction/cluster.yaml:
--------------------------------------------------------------------------------
1 | version: "1.2"
2 | aws:
3 |   cluster:
4 |     name: invoice-extraction
5 |     region: us-west-2
6 |     namespace: default
7 |     nodeType: t3a.medium
8 |     minNodes: 2
9 |     maxNodes: 4
10 |   prometheus:
11 |     enabled: true
12 |   tracing:
13 |     enabled: false
14 |   mixedModelGroups:
15 |     - name: llama2-7b-chat
16 |       nodeType: g4dn.xlarge
17 |       gpu:
18 |         enabled: true # This model group runs on GPU-enabled instances
19 |       baseInstances: 0
20 |       maxOnDemandInstances: 1
21 |       spot:
22 |         minInstances: 1
23 |         maxInstances: 2
24 |       runtime:
25 |         image: vllm/vllm-openai:v0.4.2
26 |       model:
27 |         hfRepoId: TheBloke/Llama-2-7B-Chat-GPTQ
28 |       autoScaleTriggers:
29 |         - type: prometheus
30 |           metadata:
31 |             serverAddress: http://kube-prometheus-stack-prometheus.prometheus.svc.cluster.local:9090 # Prometheus endpoint
32 |             metricName: latency_p95
33 |             threshold: '20000' # Set to 20s, tune as needed
34 |             query: | # Trigger scaling if p95 latency exceeds 20s
35 |               histogram_quantile(0.95, sum(rate(istio_request_duration_milliseconds_bucket{destination_service="llama2-7b-chat.default.svc.cluster.local"}[5m])) by (le))
36 |
--------------------------------------------------------------------------------
/examples/website_rag/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp==3.9.3
2 | aiosignal==1.3.1
3 | annotated-types==0.6.0
4 | anyio==4.2.0
5 | attrs==23.2.0
6 | beautifulsoup4==4.12.3
7 | certifi==2024.2.2
8 | charset-normalizer==3.3.2
9 | click==8.1.7
10 | dataclasses-json==0.6.4
11 | fancycompleter==0.9.1
12 | fastapi==0.109.2
13 | frozenlist==1.4.1
14 | grpcio==1.60.1
15 | grpcio-tools==1.60.1
16 | h11==0.14.0
17 | h2==4.1.0
18 | hpack==4.0.0
19 | httpcore==1.0.2
20 | httpx==0.26.0
21 | hyperframe==6.0.1
22 | idna==3.6
23 | jsonpatch==1.33
24 | jsonpointer==2.4
25 | langchain==0.1.4
26 | langchain-community==0.0.16
27 | langchain-core==0.1.18
28 | langdetect==1.0.9
29 | langserve==0.0.41
30 | langsmith==0.0.86
31 | marshmallow==3.20.2
32 | multidict==6.0.5
33 | mypy-extensions==1.0.0
34 | numpy==1.26.4
35 | orjson==3.9.13
36 | packaging==23.2
37 | portalocker==2.8.2
38 | protobuf==4.25.2
39 | pydantic==2.6.1
40 | pydantic_core==2.16.2
41 | Pygments==2.17.2
42 | pyrepl==0.9.0
43 | python-dotenv==1.0.1
44 | PyYAML==6.0.1
45 | qdrant-client==1.7.2
46 | requests==2.31.0
47 | six==1.16.0
48 | sniffio==1.3.0
49 | soupsieve==2.5
50 | SQLAlchemy==2.0.25
51 | sse-starlette==1.8.2
52 | sseclient-py==1.8.0
53 | starlette==0.36.3
54 | tenacity==8.2.3
55 | typing-inspect==0.9.0
56 | typing_extensions==4.9.0
57 | urllib3==2.2.0
58 | uvicorn==0.27.0.post1
59 | wmctrl==0.5
60 | yarl==1.9.4
61 |
--------------------------------------------------------------------------------
/paka/cluster/keda.py:
--------------------------------------------------------------------------------
1 | import pulumi
2 | import pulumi_kubernetes as k8s
3 | from pulumi_kubernetes.helm.v3 import Chart, ChartOpts, FetchOpts
4 |
5 | from paka.cluster.context import Context
6 | from paka.cluster.prometheus import create_prometheus
7 | from paka.utils import call_once
8 |
9 |
10 | @call_once
11 | def create_keda(ctx: Context) -> None:
12 |     """
13 |     Installs a KEDA chart.
14 |     """
15 |     prometheus = create_prometheus(ctx)
16 |
17 |     # Prometheus is a dependency for KEDA to work with the Prometheus metrics.
18 |     # However, Prometheus might not be enabled in the config. In that case,
19 |     # deletion of the KEDA resource will be blocked if Prometheus trigger is used.
20 |     dependencies = [prometheus] if prometheus else []
21 |
22 |     ns = k8s.core.v1.Namespace(
23 |         "keda",
24 |         metadata={"name": "keda"},
25 |         opts=pulumi.ResourceOptions(provider=ctx.k8s_provider),
26 |     )
27 |     Chart(
28 |         "keda",
29 |         ChartOpts(
30 |             chart="keda",
31 |             version="2.12.1",
32 |             namespace="keda",
33 |             fetch_opts=FetchOpts(repo="https://kedacore.github.io/charts"),
34 |             values={},
35 |         ),
36 |         opts=pulumi.ResourceOptions(
37 |             provider=ctx.k8s_provider, depends_on=[ns, *dependencies]
38 |         ),
39 |     )
40 |
--------------------------------------------------------------------------------
/tests/model/test_http_model.py:
--------------------------------------------------------------------------------
1 | from unittest.mock import MagicMock, patch
2 |
3 | import paka.model.http_model
4 | from paka.model.http_model import BaseMLModel, HttpSourceModel
5 |
6 |
7 | def test_http_source_model() -> None:
8 |     with patch.object(
9 |         paka.model.http_model.requests, "get"
10 |     ) as mock_requests_get, patch.object(
11 |         BaseMLModel,
12 |         "finish",
13 |         return_value=MagicMock(),
14 |     ) as finish_mock:
15 |         model_store_mock = MagicMock()
16 |         model = HttpSourceModel(
17 |             name="TestModel",
18 |             urls=["http://example.com/file1", "http://example.com/file2"],
19 |             model_store=model_store_mock,
20 |             quantization="GPTQ",
21 |             prompt_template_name=None,
22 |             prompt_template_str=None,
23 |         )
24 |
25 |         mock_response = MagicMock()
26 |         mock_response.headers.get.return_value = 10
27 |         mock_requests_get.return_value.__enter__.return_value = mock_response
28 |
29 |         model.save()
30 |         mock_requests_get.assert_called()
31 |         model_store_mock.save_stream.assert_called()
32 |         finish_mock.assert_called_once()
33 |
34 |         model._save_single_url("http://example.com/file1")
35 |         mock_requests_get.assert_called_with("http://example.com/file1", stream=True)
36 |         model_store_mock.save_stream.assert_called()
37 |
--------------------------------------------------------------------------------
/e2e/test_pulumi_install.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 | from unittest.mock import patch
4 |
5 | import pytest
6 |
7 | from paka.cluster.pulumi import ensure_pulumi
8 | from paka.constants import HOME_ENV_VAR
9 |
10 |
11 | @pytest.mark.parametrize(
12 |     "system, arch",
13 |     [
14 |         ("darwin", "amd64"),
15 |         ("darwin", "arm64"),
16 |         ("linux", "amd64"),
17 |         ("linux", "arm64"),
18 |         ("windows", "amd64"),
19 |         ("windows", "arm64"),
20 |     ],
21 | )
22 | def test_installation(system: str, arch: str) -> None:
23 |     with patch("platform.system", return_value=system), patch(
24 |         "platform.machine", return_value=arch
25 |     ), tempfile.TemporaryDirectory() as temp_dir:
26 |         os.environ[HOME_ENV_VAR] = temp_dir
27 |         orig_path = os.environ["PATH"]
28 |
29 |         try:
30 |             ensure_pulumi()
31 |             bin = "pulumi"
32 |             if system == "windows":
33 |                 bin += ".exe"
34 |             paths = os.environ["PATH"].split(":")
35 |             list_of_list = [p.split(";") for p in paths if p]
36 |             paths = [item for sublist in list_of_list for item in sublist]
37 |
38 |             for path in paths:
39 |                 if os.path.exists(os.path.join(path, bin)):
40 |                     break
41 |             else:
42 |                 pytest.fail(f"{bin} not found in PATH")
43 |         finally:
44 |             os.environ["PATH"] = orig_path
45 |
--------------------------------------------------------------------------------
/e2e/test_kubectl_install.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 | from unittest.mock import patch
4 |
5 | import pytest
6 |
7 | from paka.cluster.kubectl import ensure_kubectl
8 | from paka.constants import HOME_ENV_VAR
9 |
10 |
11 | @pytest.mark.parametrize(
12 |     "system, arch",
13 |     [
14 |         ("darwin", "amd64"),
15 |         ("darwin", "arm64"),
16 |         ("linux", "amd64"),
17 |         ("linux", "arm64"),
18 |         ("windows", "amd64"),
19 |         ("windows", "arm64"),
20 |     ],
21 | )
22 | def test_installation(system: str, arch: str) -> None:
23 |     with patch("platform.system", return_value=system), patch(
24 |         "platform.machine", return_value=arch
25 |     ), tempfile.TemporaryDirectory() as temp_dir:
26 |         os.environ[HOME_ENV_VAR] = temp_dir
27 |         orig_path = os.environ["PATH"]
28 |
29 |         try:
30 |             ensure_kubectl()
31 |             bin = "kubectl"
32 |             if system == "windows":
33 |                 bin += ".exe"
34 |             paths = os.environ["PATH"].split(":")
35 |             list_of_list = [p.split(";") for p in paths if p]
36 |             paths = [item for sublist in list_of_list for item in sublist]
37 |
38 |             for path in paths:
39 |                 if os.path.exists(os.path.join(path, bin)):
40 |                     break
41 |             else:
42 |                 pytest.fail(f"{bin} not found in PATH")
43 |         finally:
44 |             os.environ["PATH"] = orig_path
45 |
--------------------------------------------------------------------------------
/examples/invoice_extraction/output_parser.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | from langchain.output_parsers import PydanticOutputParser
4 | from langchain_core.pydantic_v1 import BaseModel, Field
5 |
6 |
7 | class Invoice(BaseModel):
8 |     number: str = Field(description="invoice number, e.g. #25470322")
9 |     date: str = Field(description="invoice date, e.g. 2024-01-01T08:29:56")
10 |     company: str = Field(description="remit to company, e.g. Akamai Technologies, Inc.")
11 |     company_address: str = Field(
12 |         description="remit to address, e.g. 249 Arch St. Philadelphia, PA 19106 USA"
13 |     )
14 |     tax_id: str = Field(description="tax ID/EIN number, e.g. 04-3432319")
15 |     customer: str = Field(description="invoice to customer, e.g. John Doe")
16 |     customer_address: str = Field(
17 |         description="invoice to address, e.g. 123 Main St. Springfield, IL 62701 USA"
18 |     )
19 |     amount: str = Field(description="total amount from this invoice, e.g. $5.00")
20 |
21 |     def to_dict(self) -> Dict:
22 |         return {
23 |             "number": self.number,
24 |             "date": self.date,
25 |             "company": self.company,
26 |             "company_address": self.company_address,
27 |             "tax_id": self.tax_id,
28 |             "customer": self.customer,
29 |             "customer_address": self.customer_address,
30 |             "amount": self.amount,
31 |         }
32 |
33 |
34 | invoice_parser = PydanticOutputParser(pydantic_object=Invoice)
35 |
--------------------------------------------------------------------------------
/paka/k8s/model_group/ingress.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import List
4 |
5 | from kubernetes import client
6 |
7 | from paka.k8s.utils import CustomResource, apply_resource
8 | from paka.utils import kubify_name
9 |
10 |
11 | def create_model_vservice(
12 |     namespace: str, model_name: str, hosts: List[str] = ["*"]
13 | ) -> None:
14 |     istio_virtual_service = CustomResource(
15 |         api_version="networking.istio.io/v1beta1",
16 |         kind="VirtualService",
17 |         plural="virtualservices",
18 |         metadata=client.V1ObjectMeta(name=kubify_name(model_name), namespace=namespace),
19 |         spec={
20 |             "hosts": hosts,
21 |             "gateways": ["knative-serving/knative-ingress-gateway"],
22 |             "http": [
23 |                 {
24 |                     "match": [
25 |                         {
26 |                             "authority": {
27 |                                 "prefix": kubify_name(model_name),
28 |                             }
29 |                         }
30 |                     ],
31 |                     "route": [
32 |                         {
33 |                             "destination": {
34 |                                 "host": f"{kubify_name(model_name)}.{namespace}.svc.cluster.local",
35 |                                 "port": {"number": 80},
36 |                             }
37 |                         }
38 |                     ],
39 |                 }
40 |             ],
41 |         },
42 |     )
43 |
44 |     apply_resource(istio_virtual_service)
45 |
--------------------------------------------------------------------------------
/paka/model/manifest.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import List, Optional
4 |
5 | from pydantic import BaseModel
6 |
7 |
8 | class ModelFile(BaseModel):
9 |     name: str
10 |     sha256: str
11 |
12 |
13 | class ModelManifest(BaseModel):
14 |     """
15 |     A manifest for a model. The manifest is stored along with the model files.
16 |
17 |     Attributes:
18 |         name (str): The name of the model.
19 |         files (List[ModelFile]): A list of model files, where each model file contains a file name and a hash.
20 |         quantization (Optional[str]): The quantization method (GPTQ, AWQ, GGUF_Q4_0, etc.) the model uses.
21 |         prompt_template_name (Optional[str]): The prompt template name (chatml, llama-2, gemma, etc.) the model uses. This field is optional.
22 |         prompt_template_str (Optional[str]): The prompt template string the model uses. This field is optional.
23 |         main_model (Optional[str]): The main model file name. This field is optional.
24 |         clip_model (Optional[str]): The clip model file name. This field is optional and is used for multimodal models.
25 |         lora_model (Optional[str]): The lora model file name. This field is optional.
26 | """ 27 | 28 | name: str 29 | files: List[ModelFile] 30 | quantization: Optional[str] = None 31 | prompt_template_str: Optional[str] = None 32 | prompt_template_name: Optional[str] = None 33 | 34 | main_model: Optional[str] = None 35 | # Clip model is used for multimodal models 36 | clip_model: Optional[str] = None 37 | lora_model: Optional[str] = None 38 | -------------------------------------------------------------------------------- /paka/model/http_model.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import concurrent.futures 4 | from typing import List, Optional 5 | 6 | import requests 7 | 8 | from paka.model.base_model import BaseMLModel 9 | from paka.model.store import ModelStore 10 | 11 | 12 | class HttpSourceModel(BaseMLModel): 13 | def __init__( 14 | self, 15 | name: str, 16 | urls: List[str], 17 | model_store: ModelStore, 18 | quantization: Optional[str] = None, 19 | prompt_template_name: Optional[str] = None, 20 | prompt_template_str: Optional[str] = None, 21 | ) -> None: 22 | super().__init__( 23 | name=name, 24 | model_store=model_store, 25 | quantization=quantization, 26 | prompt_template_name=prompt_template_name, 27 | prompt_template_str=prompt_template_str, 28 | ) 29 | self.urls = urls 30 | 31 | def save(self) -> None: 32 | """ 33 | Save the model to a model store. 34 | """ 35 | with concurrent.futures.ThreadPoolExecutor( 36 | max_workers=self.concurrency 37 | ) as executor: 38 | futures = [executor.submit(self._save_single_url, url) for url in self.urls] 39 | concurrent.futures.wait(futures) 40 | self.finish() 41 | 42 | def _save_single_url(self, url: str) -> None: 43 | with requests.get(url, stream=True) as response: 44 | response.raise_for_status() 45 | total_size = int(response.headers.get("content-length", 0)) 46 | fname = url.split("/")[-1] 47 | self.save_single_stream(f"{self.name}/{fname}", response, total_size) 48 | -------------------------------------------------------------------------------- /tests/model/test_hf_model.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock, patch 2 | 3 | import paka.model.hf_model 4 | from paka.model.hf_model import BaseMLModel, HuggingFaceModel 5 | 6 | 7 | def test_hf_model() -> None: 8 | with patch.object( 9 | paka.model.hf_model, "HfFileSystem", autospec=True 10 | ) as mock_hf_file_system, patch.object( 11 | BaseMLModel, 12 | "finish", 13 | return_value=MagicMock(), 14 | ) as finish_mock: 15 | model_store_mock = MagicMock() 16 | model = HuggingFaceModel( 17 | name="TestModel", 18 | repo_id="test-repo", 19 | files=["file1", "file2"], 20 | model_store=model_store_mock, 21 | quantization="GPTQ", 22 | ) 23 | 24 | mock_hf_file_system.return_value.glob.return_value = ["file1", "file2"] 25 | mock_hf_file_system.return_value.stat.return_value = { 26 | "size": 10, 27 | "lfs": {"sha256": "test_sha256"}, 28 | } 29 | mock_hf_file_system.return_value.open.return_value.__enter__.return_value = ( 30 | MagicMock() 31 | ) 32 | 33 | model.save() 34 | mock_hf_file_system.return_value.glob.assert_called() 35 | mock_hf_file_system.return_value.stat.assert_called() 36 | mock_hf_file_system.return_value.open.assert_called() 37 | model_store_mock.save_stream.assert_called() 38 | finish_mock.assert_called_once() 39 | 40 | model._save_single_file("file1") 41 | mock_hf_file_system.return_value.stat.assert_called_with("file1") 42 | 
mock_hf_file_system.return_value.open.assert_called_with("file1", "rb") 43 | model_store_mock.save_stream.assert_called() 44 | -------------------------------------------------------------------------------- /tests/policy_packs/aws/container_registry.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import List 4 | 5 | from pulumi_policy import ( 6 | Policy, 7 | ReportViolation, 8 | ResourceValidationArgs, 9 | ResourceValidationPolicy, 10 | StackValidationArgs, 11 | StackValidationPolicy, 12 | ) 13 | 14 | max_num_ecrs = 1 15 | 16 | 17 | def ecr_count_validator( 18 | stack: StackValidationArgs, report_violation: ReportViolation 19 | ) -> None: 20 | ecr_resources = filter( 21 | (lambda resource: resource.resource_type == "aws:ecr/repository:Repository"), 22 | stack.resources, 23 | ) 24 | 25 | ecrs = list(ecr_resources) 26 | if len(ecrs) > max_num_ecrs: 27 | report_violation( 28 | f"No more than {max_num_ecrs} repository(ies) should be created.", None 29 | ) 30 | 31 | 32 | ecr_count_check = StackValidationPolicy( 33 | name="ecr-count-check", 34 | description="Checks the number of ECR repositories created.", 35 | validate=ecr_count_validator, 36 | ) 37 | 38 | 39 | def ecr_force_delete_validator( 40 | args: ResourceValidationArgs, report_violation: ReportViolation 41 | ) -> None: 42 | if args.resource_type == "aws:ecr/repository:Repository": 43 | force_destroy = args.props["forceDelete"] 44 | if not force_destroy: 45 | report_violation( 46 | "You must set forceDelete to true. ", 47 | None, 48 | ) 49 | 50 | 51 | ecr_force_delete = ResourceValidationPolicy( 52 | name="ecr-force-delete", 53 | description="Requires forceDelete to be set to true.", 54 | validate=ecr_force_delete_validator, 55 | ) 56 | 57 | ecr_policies: List[Policy] = [ecr_count_check, ecr_force_delete] 58 | -------------------------------------------------------------------------------- /examples/website_rag/cluster.yaml: -------------------------------------------------------------------------------- 1 | version: "1.2" 2 | aws: 3 | cluster: 4 | name: website-rag 5 | region: us-west-2 6 | namespace: default 7 | nodeType: t3a.medium 8 | minNodes: 2 9 | maxNodes: 4 10 | vectorStore: 11 | nodeType: t3a.small 12 | replicas: 1 13 | prometheus: 14 | enabled: true 15 | mixedModelGroups: 16 | - name: gte-base 17 | nodeType: c7a.xlarge 18 | baseInstances: 0 19 | maxOnDemandInstances: 1 20 | spot: 21 | minInstances: 1 22 | maxInstances: 3 23 | runtime: 24 | image: ghcr.io/ggerganov/llama.cpp:server 25 | model: 26 | hfRepoId: jjleng/gte-base-gguf 27 | files: ["*.q4_0.gguf"] 28 | autoScaleTriggers: 29 | - type: cpu 30 | metadata: 31 | type: Utilization 32 | value: "50" 33 | - name: llama2-7b-chat 34 | nodeType: g4dn.xlarge 35 | gpu: 36 | enabled: true # This model group runs on GPU-enabled instances 37 | baseInstances: 0 38 | maxOnDemandInstances: 1 39 | spot: 40 | minInstances: 1 41 | maxInstances: 2 42 | runtime: 43 | image: vllm/vllm-openai:v0.4.2 44 | model: 45 | hfRepoId: TheBloke/Llama-2-7B-Chat-GPTQ 46 | autoScaleTriggers: 47 | - type: prometheus 48 | metadata: 49 | serverAddress: http://kube-prometheus-stack-prometheus.prometheus.svc.cluster.local:9090 # Prometheus endpoint 50 | metricName: latency_p95 51 | threshold: '20000' # Set to 20s, tune as needed 52 | query: | # Trigger scaling if p95 latency exceeds 20s 53 | histogram_quantile(0.95, 
sum(rate(istio_request_duration_milliseconds_bucket{destination_service="llama2-7b-chat.default.svc.cluster.local"}[5m])) by (le)) 54 | -------------------------------------------------------------------------------- /paka/cli/__main__.py: -------------------------------------------------------------------------------- 1 | import typer 2 | 3 | from paka import __version__ 4 | from paka.cli.build import build_app 5 | from paka.cli.cluster import cluster_app 6 | from paka.cli.function import function_app 7 | from paka.cli.job import job_app 8 | from paka.cli.kubeconfig import kube_app 9 | from paka.cli.model_group import model_group_app 10 | from paka.cli.run import run_app 11 | from paka.cli.utils import init_pulumi 12 | from paka.logger import setup_logger 13 | 14 | init_pulumi() 15 | 16 | 17 | def version_callback(version: bool) -> None: 18 | if version: 19 | typer.echo(f"Paka CLI Version: {__version__}") 20 | raise typer.Exit() 21 | 22 | 23 | def verbose_option( 24 | verbose: bool = typer.Option( 25 | False, "--verbose", "-v", help="Enable verbose output" 26 | ), 27 | ) -> None: 28 | setup_logger(verbose) 29 | 30 | 31 | cli = typer.Typer(context_settings={"help_option_names": ["-h", "--help"]}) 32 | cli.callback()(verbose_option) 33 | 34 | 35 | @cli.callback() 36 | def version_option( 37 | ctx: typer.Context, 38 | version: bool = typer.Option( 39 | False, "--version", help="Show version and exit", callback=version_callback 40 | ), 41 | ) -> None: 42 | pass 43 | 44 | 45 | cli.add_typer(cluster_app, name="cluster", help="Manage clusters.") 46 | 47 | cli.add_typer(job_app, name="job", help="Manage batch jobs.") 48 | 49 | cli.add_typer(build_app, name="build", help="Build Docker images.") 50 | 51 | cli.add_typer(kube_app, name="kubeconfig", help="Export kubeconfig.") 52 | 53 | cli.add_typer(run_app, name="run", help="Run one-off script.") 54 | 55 | cli.add_typer(function_app, name="function", help="Manage serverless functions.") 56 | 57 | cli.add_typer(model_group_app, name="model-group", help="Manage model groups.") 58 | 59 | 60 | def main() -> None: 61 | cli() 62 | 63 | 64 | if __name__ == "__main__": 65 | main() 66 | -------------------------------------------------------------------------------- /examples/templates/Llama2_7B_Chat_AWQ.yaml: -------------------------------------------------------------------------------- 1 | version: "1.2" 2 | aws: 3 | cluster: 4 | name: llama2-7b-chat-example # Use a name in lowercase letters with hyphens (kebab-case) 5 | region: us-west-2 6 | nodeType: t3a.medium 7 | minNodes: 2 8 | maxNodes: 4 # These nodes will host serverless functions and other essential loads 9 | prometheus: 10 | enabled: true # Enable metrics scraping with Prometheus 11 | mixedModelGroups: # A mixed model group can include both on-demand and spot nodes 12 | - name: llama2-7b-chat # Specify a name for the model group 13 | isPublic: true # Make the model group accessible through a public endpoint 14 | nodeType: g4dn.xlarge 15 | gpu: 16 | enabled: true # This model group runs on GPU-enabled instances 17 | baseInstances: 0 # Fail-safe instances, always run on-demand instances 18 | maxOnDemandInstances: 1 # Maximum number of on-demand instances, used as a fallback if spot instances are not available 19 | spot: 20 | minInstances: 1 21 | maxInstances: 2 # Prefer to run the inference backend on spot instances 22 | runtime: 23 | image: vllm/vllm-openai:v0.4.2 # Use vLLM backend 24 | model: 25 | hfRepoId: TheBloke/Llama-2-7B-Chat-AWQ # Specify the Hugging Face model to run 26 | 
useModelStore: false # Don't save models to s3 27 | autoScaleTriggers: 28 | - type: prometheus 29 | metadata: 30 | serverAddress: http://kube-prometheus-stack-prometheus.prometheus.svc.cluster.local:9090 # Prometheus endpoint 31 | metricName: latency_p95 32 | threshold: '20000' # Set to 20s, tune as needed 33 | query: | # Trigger scaling if p95 latency exceeds 20s 34 | histogram_quantile(0.95, sum(rate(istio_request_duration_milliseconds_bucket{destination_service="llama2-7b-chat.default.svc.cluster.local"}[5m])) by (le)) 35 | -------------------------------------------------------------------------------- /examples/templates/Llama2_7B_Chat_GPTQ.yaml: -------------------------------------------------------------------------------- 1 | version: "1.2" 2 | aws: 3 | cluster: 4 | name: llama2-7b-chat-example # Use a name in lowercase letters with hyphens (kebab-case) 5 | region: us-west-2 6 | nodeType: t3a.medium 7 | minNodes: 2 8 | maxNodes: 4 # These nodes will host serverless functions and other essential loads 9 | prometheus: 10 | enabled: true # Enable metrics scraping with Prometheus 11 | mixedModelGroups: # A mixed model group can include both on-demand and spot nodes 12 | - name: llama2-7b-chat # Specify a name for the model group 13 | isPublic: true # Make the model group accessible through a public endpoint 14 | nodeType: g4dn.xlarge 15 | gpu: 16 | enabled: true # This model group runs on GPU-enabled instances 17 | baseInstances: 0 # Fail-safe instances, always run on-demand instances 18 | maxOnDemandInstances: 1 # Maximum number of on-demand instances, used as a fallback if spot instances are not available 19 | spot: 20 | minInstances: 1 21 | maxInstances: 2 # Prefer to run the inference backend on spot instances 22 | runtime: 23 | image: vllm/vllm-openai:v0.4.2 # Use vLLM backend 24 | model: 25 | hfRepoId: TheBloke/Llama-2-7B-Chat-GPTQ # Specify the Hugging Face model to run 26 | useModelStore: false # Don't save models to s3 27 | autoScaleTriggers: 28 | - type: prometheus 29 | metadata: 30 | serverAddress: http://kube-prometheus-stack-prometheus.prometheus.svc.cluster.local:9090 # Prometheus endpoint 31 | metricName: latency_p95 32 | threshold: '20000' # Set to 20s, tune as needed 33 | query: | # Trigger scaling if p95 latency exceeds 20s 34 | histogram_quantile(0.95, sum(rate(istio_request_duration_milliseconds_bucket{destination_service="llama2-7b-chat.default.svc.cluster.local"}[5m])) by (le)) 35 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "paka" 3 | version = "0.1.11" 4 | description = "LLMOps tool designed to simplify the deployment and management of large language model (LLM) applications" 5 | homepage = "https://github.com/jjleng/paka" 6 | keywords = ["LLMOps", "RAG", "production", "Cloud"] 7 | authors = ["Jijun Leng"] 8 | readme = "README.md" 9 | 10 | [tool.codespell] 11 | check-filenames = true 12 | 13 | [tool.mypy] 14 | ignore_missing_imports = true 15 | disallow_untyped_defs = true 16 | check_untyped_defs = true 17 | plugins = "pydantic.mypy" 18 | warn_unused_configs = true 19 | 20 | [tool.isort] 21 | profile = "black" 22 | 23 | [tool.pytest.ini_options] 24 | filterwarnings = ["ignore::DeprecationWarning"] 25 | 26 | [tool.poetry.scripts] 27 | paka = "paka.cli.__main__:main" 28 | 29 | [tool.poetry.dependencies] 30 | python = "^3.8" 31 | pydantic = "^2.7.0" 32 | ruamel-yaml = "^0.18.6" 33 | pulumi = "3.105.0" 
34 | pulumi-aws = "^6.31.0" 35 | typer = "^0.12.3" 36 | pulumi-eks = "^2.3.0" 37 | pulumi-awsx = "^2.7.0" 38 | pulumi-kubernetes = "^4.8.1" 39 | pathspec = "^0.12.1" 40 | requests = "^2.31.0" 41 | kubernetes = "^29.0.0" 42 | boto3 = "^1.34.86" 43 | tabulate = "^0.9.0" 44 | huggingface-hub = "^0.22.2" 45 | tqdm = "^4.66.2" 46 | typing-extensions = "^4.11.0" 47 | fasteners = "^0.19" 48 | tenacity = "^8.2.3" 49 | 50 | [tool.poetry.group.dev.dependencies] 51 | codespell = "^2.2.6" 52 | mypy = "^1.9.0" 53 | pre-commit = "3.5.0" 54 | pytest = "^8.1.1" 55 | pytest-snapshot = "^0.9.0" 56 | types-requests = "2.31.0.6" 57 | isort = "^5.13.2" 58 | types-tabulate = "^0.9.0.20240106" 59 | pulumi-policy = "^1.11.0" 60 | moto = "^5.0.5" 61 | boto3-stubs = { extras = ["ec2", "ecr", "s3"], version = "^1.34.106" } 62 | types-tqdm = "^4.66.0.20240417" 63 | pytest-order = "^1.2.1" 64 | kubernetes-stubs-elephant-fork = "^29.0.0.post1" 65 | 66 | [build-system] 67 | requires = ["poetry-core"] 68 | build-backend = "poetry.core.masonry.api" 69 | -------------------------------------------------------------------------------- /tests/k8s/model_group/runtime/test_vllm.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock, patch 2 | 3 | import paka.k8s.model_group.runtime.vllm 4 | from paka.cluster.context import Context 5 | from paka.config import AwsModelGroup, Model, Runtime 6 | from paka.k8s.model_group.runtime.vllm import get_runtime_command_vllm, is_vllm_image 7 | 8 | 9 | def test_is_vllm_image() -> None: 10 | assert is_vllm_image("vllm:latest") == True 11 | assert is_vllm_image("notvllm:latest") == False 12 | 13 | 14 | def test_get_runtime_command_vllm() -> None: 15 | mock_store = MagicMock() 16 | with patch.object( 17 | paka.k8s.model_group.runtime.vllm, 18 | "get_model_store", 19 | return_value=mock_store, 20 | ) as mock_get_model_store, patch.object( 21 | paka.k8s.model_group.runtime.vllm, 22 | "validate_repo_id", 23 | return_value=True, 24 | ) as mock_validate_repo_id: 25 | ctx = Context() 26 | model_group = AwsModelGroup( 27 | name="test", 28 | minInstances=1, 29 | maxInstances=2, 30 | nodeType="t2.micro", 31 | runtime=Runtime(image="vllm:latest", command=["python", "app.py"]), 32 | model=Model(useModelStore=True), 33 | resourceRequest={"cpu": "1000", "memory": "1Gi"}, 34 | ) 35 | 36 | command = get_runtime_command_vllm(ctx, model_group) 37 | assert command == ["python", "app.py", "--model", "/data"] 38 | 39 | model_group.runtime.command = None 40 | command = get_runtime_command_vllm(ctx, model_group) 41 | assert command == [ 42 | "python3", 43 | "-O", 44 | "-u", 45 | "-m", 46 | "vllm.entrypoints.openai.api_server", 47 | "--host", 48 | "0.0.0.0", 49 | "--served-model-name", 50 | "test", 51 | "--model", 52 | "/data", 53 | ] 54 | -------------------------------------------------------------------------------- /examples/templates/Llama3_70B_Instruct_GPTQ.yaml: -------------------------------------------------------------------------------- 1 | version: "1.2" 2 | aws: 3 | cluster: 4 | name: llama3-70b-instruct-example # Use a name in lowercase letters with hyphens (kebab-case) 5 | region: us-west-2 6 | nodeType: t3a.medium 7 | minNodes: 2 8 | maxNodes: 4 # These nodes will host serverless functions and other essential loads 9 | prometheus: 10 | enabled: true # Enable metrics scraping with Prometheus 11 | mixedModelGroups: # A mixed model group can include both on-demand and spot nodes 12 | - name: llama3-70b-instruct # Specify a name for 
the model group 13 | isPublic: true # Make the model group accessible through a public endpoint 14 | nodeType: g4dn.12xlarge 15 | gpu: 16 | enabled: true # This model group runs on GPU-enabled instances 17 | diskSize: 80 18 | baseInstances: 0 # Fail-safe instances, always run on-demand instances 19 | maxOnDemandInstances: 1 # Maximum number of on-demand instances, used as a fallback if spot instances are not available 20 | spot: 21 | minInstances: 1 22 | maxInstances: 2 # Prefer to run the inference backend on spot instances 23 | runtime: 24 | image: vllm/vllm-openai:v0.4.2 # Use vLLM backend 25 | model: 26 | hfRepoId: TechxGenus/Meta-Llama-3-70B-Instruct-GPTQ # Specify the Hugging Face model to run 27 | useModelStore: false # Don't save models to s3 28 | autoScaleTriggers: 29 | - type: prometheus 30 | metadata: 31 | serverAddress: http://kube-prometheus-stack-prometheus.prometheus.svc.cluster.local:9090 # Prometheus endpoint 32 | metricName: latency_p95 33 | threshold: '20000' # Set to 20s, tune as needed 34 | query: | # Trigger scaling if p95 latency exceeds 20s 35 | histogram_quantile(0.95, sum(rate(istio_request_duration_milliseconds_bucket{destination_service="llama3-70b-instruct.default.svc.cluster.local"}[5m])) by (le)) 36 | -------------------------------------------------------------------------------- /paka/cluster/aws/elb.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | from typing import Optional 5 | 6 | import boto3 7 | from kubernetes import client, config 8 | 9 | 10 | # Pulumi cannot update the idle timeout of an ELB. This script uses boto3 to 11 | # update the idle timeout of an ELB. 12 | def _update_elb_idle_timeout( 13 | load_balancer_name: str, idle_timeout_seconds: int 14 | ) -> None: 15 | elb_client = boto3.client("elb") 16 | 17 | attributes = { 18 | "LoadBalancerAttributes": { 19 | "ConnectionSettings": {"IdleTimeout": idle_timeout_seconds} 20 | } 21 | } 22 | 23 | elb_client.modify_load_balancer_attributes( 24 | LoadBalancerName=load_balancer_name, 25 | LoadBalancerAttributes=attributes["LoadBalancerAttributes"], 26 | ) 27 | 28 | 29 | def update_elb_idle_timeout(kubeconfig_json: str, idle_timeout_seconds: int) -> None: 30 | elb_name = get_elb_name(kubeconfig_json) 31 | 32 | if elb_name: 33 | _update_elb_idle_timeout(elb_name, idle_timeout_seconds) 34 | 35 | 36 | def get_elb_name(kubeconfig_json: str) -> Optional[str]: 37 | config.load_kube_config_from_dict(json.loads(kubeconfig_json)) 38 | 39 | v1 = client.CoreV1Api() 40 | services = v1.list_service_for_all_namespaces(watch=False) 41 | 42 | for service in services.items: 43 | if service.spec and service.spec.type == "LoadBalancer": 44 | # The name of the ELB is the first part of the hostname of the load balancer 45 | if ( 46 | service.status 47 | and service.status.load_balancer 48 | and service.status.load_balancer.ingress 49 | ): 50 | elb_hostname = service.status.load_balancer.ingress[0].hostname 51 | if not elb_hostname: 52 | continue 53 | elb_name = elb_hostname.split("-")[0] 54 | return elb_name 55 | 56 | return None 57 | -------------------------------------------------------------------------------- /examples/templates/Mistral_7B_Instruct_GPTQ.yaml: -------------------------------------------------------------------------------- 1 | version: "1.2" 2 | aws: 3 | cluster: 4 | name: mistral-7b-instruct-example # Use a name in lowercase letters with hyphens (kebab-case) 5 | region: us-west-2 6 | nodeType: t3a.medium 7 
| minNodes: 2 8 | maxNodes: 4 # These nodes will host serverless functions and other essential loads 9 | prometheus: 10 | enabled: true # Enable metrics scraping with Prometheus 11 | mixedModelGroups: # A mixed model group can include both on-demand and spot nodes 12 | - name: mistral-7b-instruct # Specify a name for the model group 13 | isPublic: true # Make the model group accessible through a public endpoint 14 | nodeType: g5g.xlarge 15 | gpu: # This would enable inference on CUDA devices 16 | enabled: true # This model group runs on GPU-enabled instances 17 | diskSize: 40 # 40GB 18 | baseInstances: 0 # Fail-safe instances, always run on-demand instances 19 | maxOnDemandInstances: 1 # Maximum number of on-demand instances, used as a fallback if spot instances are not available 20 | spot: 21 | minInstances: 1 22 | maxInstances: 2 # Prefer to run the inference backend on spot instances 23 | runtime: 24 | image: vllm/vllm-openai:v0.4.2 # Use vLLM backend 25 | model: 26 | hfRepoId: neuralmagic/Mistral-7B-Instruct-v0.3-GPTQ-4bit # Specify the Hugging Face model to run 27 | useModelStore: false # Don't save models to s3 28 | autoScaleTriggers: 29 | - type: prometheus 30 | metadata: 31 | serverAddress: http://kube-prometheus-stack-prometheus.prometheus.svc.cluster.local:9090 # Prometheus endpoint 32 | metricName: latency_p95 33 | threshold: '20000' # Set to 20s, tune as needed 34 | query: | # Trigger scaling if p95 latency exceeds 20s 35 | histogram_quantile(0.95, sum(rate(istio_request_duration_milliseconds_bucket{destination_service="mistral-7b-instruct.default.svc.cluster.local"}[5m])) by (le)) 36 | -------------------------------------------------------------------------------- /examples/templates/Llama3_70B_Instruct.yaml: -------------------------------------------------------------------------------- 1 | version: "1.2" 2 | aws: 3 | cluster: 4 | name: llama3-70b-instruct-example # Use a name in lowercase letters with hyphens (kebab-case) 5 | region: us-west-2 6 | nodeType: t3a.medium 7 | minNodes: 2 8 | maxNodes: 4 # These nodes will host serverless functions and other essential loads 9 | prometheus: 10 | enabled: true 11 | mixedModelGroups: # A mixed model group can include both on-demand and spot nodes 12 | - name: llama3-70b-instruct # Specify a name for the model group 13 | isPublic: true # Make the model group accessible through a public endpoint 14 | nodeType: g5.48xlarge 15 | gpu: 16 | enabled: true # This model group runs on GPU-enabled instances 17 | diskSize: 200 # 200GB 18 | baseInstances: 0 # Fail-safe instances, always run on-demand instances 19 | maxOnDemandInstances: 1 # Maximum number of on-demand instances, used as a fallback if spot instances are not available 20 | spot: 21 | minInstances: 1 22 | maxInstances: 2 # Prefer to run the inference backend on spot instances 23 | runtime: 24 | image: vllm/vllm-openai:v0.4.2 # Use vLLM backend 25 | env: 26 | - name: HF_TOKEN # Required to download model weights from a gated Hugging Face repo 27 | value: 28 | model: 29 | hfRepoId: meta-llama/Meta-Llama-3-70B-Instruct # Specify the Hugging Face model to run 30 | useModelStore: false # Don't save models to s3 31 | autoScaleTriggers: 32 | - type: prometheus 33 | metadata: 34 | serverAddress: http://kube-prometheus-stack-prometheus.prometheus.svc.cluster.local:9090 # Prometheus endpoint 35 | metricName: latency_p95 36 | threshold: '20000' # Set to 20s, tune as needed 37 | query: | # Trigger scaling if p95 latency exceeds 20s 38 | histogram_quantile(0.95, 
sum(rate(istio_request_duration_milliseconds_bucket{destination_service="llama3-70b-instruct.default.svc.cluster.local"}[5m])) by (le)) 39 | -------------------------------------------------------------------------------- /examples/templates/Llama3_8B_Instruct.yaml: -------------------------------------------------------------------------------- 1 | version: "1.2" 2 | aws: 3 | cluster: 4 | name: llama3-8b-instruct-example # Use a name in lowercase letters with hyphens (kebab-case) 5 | region: us-west-2 6 | nodeType: t3a.medium 7 | minNodes: 2 8 | maxNodes: 4 # These nodes will host serverless functions and other essential loads 9 | prometheus: 10 | enabled: true # Enable metrics scraping with Prometheus 11 | tracing: 12 | enabled: false 13 | mixedModelGroups: # A mixed model group can include both on-demand and spot nodes 14 | - name: llama3-8b-instruct # Specify a name for the model group 15 | isPublic: true # Make the model group accessible through a public endpoint 16 | nodeType: g6.xlarge 17 | gpu: 18 | enabled: true # This model group runs on GPU-enabled instances 19 | baseInstances: 0 # Fail-safe instances, always run on-demand instances 20 | maxOnDemandInstances: 1 # Maximum number of on-demand instances, used as a fallback if spot instances are not available 21 | spot: 22 | minInstances: 1 23 | maxInstances: 2 # Prefer to run the inference backend on spot instances 24 | runtime: 25 | image: vllm/vllm-openai:v0.4.2 # Use vLLM backend 26 | env: 27 | - name: HF_TOKEN # Required to download model weights from a gated Hugging Face repo 28 | value: 29 | model: 30 | hfRepoId: meta-llama/Meta-Llama-3-8B-Instruct # Specify the Hugging Face model to run 31 | useModelStore: false # Don't save models to s3 32 | autoScaleTriggers: 33 | - type: prometheus 34 | metadata: 35 | serverAddress: http://kube-prometheus-stack-prometheus.prometheus.svc.cluster.local:9090 # Prometheus endpoint 36 | metricName: latency_p95 37 | threshold: '20000' # Set to 20s, tune as needed 38 | query: | # Trigger scaling if p95 latency exceeds 20s 39 | histogram_quantile(0.95, sum(rate(istio_request_duration_milliseconds_bucket{destination_service="llama3-8b-instruct.default.svc.cluster.local"}[5m])) by (le)) 40 | -------------------------------------------------------------------------------- /examples/templates/Phi3_Mini_4K_Instruct.yaml: -------------------------------------------------------------------------------- 1 | version: "1.2" 2 | aws: 3 | cluster: 4 | name: phi3-mini-instruct-example # Use a name in lowercase letters with hyphens (kebab-case) 5 | region: us-west-2 6 | nodeType: t3a.medium 7 | minNodes: 2 8 | maxNodes: 4 # These nodes will host serverless functions and other essential loads 9 | prometheus: 10 | enabled: true # Enable metrics scraping with Prometheus 11 | mixedModelGroups: # A mixed model group can include both on-demand and spot nodes 12 | - name: phi3-mini-instruct # Specify a name for the model group 13 | isPublic: true # Make the model group accessible through a public endpoint 14 | nodeType: g5g.xlarge 15 | gpu: # This would enable inference on CUDA devices 16 | enabled: true # This model group runs on GPU-enabled instances 17 | baseInstances: 0 # Fail-safe instances, always run on-demand instances 18 | maxOnDemandInstances: 1 # Maximum number of on-demand instances, used as a fallback if spot instances are not available 19 | spot: 20 | minInstances: 1 21 | maxInstances: 2 # Prefer to run the inference backend on spot instances 22 | runtime: 23 | image: vllm/vllm-openai:v0.4.2 # Use vLLM 
backend 24 | env: 25 | - name: HF_TOKEN # Required to download model weights from a gated Hugging Face repo 26 | value: 27 | model: 28 | hfRepoId: microsoft/Phi-3-mini-4k-instruct # Specify the Hugging Face model to run 29 | useModelStore: false # Don't save models to s3 30 | autoScaleTriggers: 31 | - type: prometheus 32 | metadata: 33 | serverAddress: http://kube-prometheus-stack-prometheus.prometheus.svc.cluster.local:9090 # Prometheus endpoint 34 | metricName: latency_p95 35 | threshold: '20000' # Set to 20s, tune as needed 36 | query: | # Trigger scaling if p95 latency exceeds 20s 37 | histogram_quantile(0.95, sum(rate(istio_request_duration_milliseconds_bucket{destination_service="phi3-mini-instruct.default.svc.cluster.local"}[5m])) by (le)) 38 | -------------------------------------------------------------------------------- /examples/templates/Mistral_7B_Instruct.yaml: -------------------------------------------------------------------------------- 1 | version: "1.2" 2 | aws: 3 | cluster: 4 | name: mistral-7b-instruct-example # Use a name in lowercase letters with hyphens (kebab-case) 5 | region: us-west-2 6 | nodeType: t3a.medium 7 | minNodes: 2 8 | maxNodes: 4 # These nodes will host serverless functions and other essential loads 9 | prometheus: 10 | enabled: true # Enable metrics scraping with Prometheus 11 | mixedModelGroups: # A mixed model group can include both on-demand and spot nodes 12 | - name: mistral-7b-instruct # Specify a name for the model group 13 | isPublic: true # Make the model group accessible through a public endpoint 14 | nodeType: g4dn.xlarge 15 | gpu: # This would enable inference on CUDA devices 16 | enabled: true # This model group runs on GPU-enabled instances 17 | diskSize: 50 18 | baseInstances: 0 # Fail-safe instances, always run on-demand instances 19 | maxOnDemandInstances: 1 # Maximum number of on-demand instances, used as a fallback if spot instances are not available 20 | spot: 21 | minInstances: 1 22 | maxInstances: 2 # Prefer to run the inference backend on spot instances 23 | runtime: 24 | image: vllm/vllm-openai:v0.4.2 # Use vLLM backend 25 | env: 26 | - name: HF_TOKEN # Required to download model weights from a gated Hugging Face repo 27 | value: 28 | model: 29 | hfRepoId: mistralai/Mistral-7B-Instruct-v0.3 # Specify the Hugging Face model to run 30 | useModelStore: false # Don't save models to s3 31 | autoScaleTriggers: 32 | - type: prometheus 33 | metadata: 34 | serverAddress: http://kube-prometheus-stack-prometheus.prometheus.svc.cluster.local:9090 # Prometheus endpoint 35 | metricName: latency_p95 36 | threshold: '20000' # Set to 20s, tune as needed 37 | query: | # Trigger scaling if p95 latency exceeds 20s 38 | histogram_quantile(0.95, sum(rate(istio_request_duration_milliseconds_bucket{destination_service="mistral-7b-instruct.default.svc.cluster.local"}[5m])) by (le)) 39 | -------------------------------------------------------------------------------- /examples/invoice_extraction/README.md: -------------------------------------------------------------------------------- 1 | ## Invoice Extraction 2 | This code provides an example of how to build a RESTful API that converts an invoice PDF into a structured data format (JSON). It extracts text from the PDF and then uses LangChain and Llama2-7B to extract structured data from the text. 3 | 4 | ## Running the Example 5 | 6 | Follow the steps below to run the example: 7 | 8 | 1.
**Install the necessary dependencies:** 9 | ```bash 10 | pip install paka 11 | 12 | # Install AWS CLI and ensure your AWS credentials are correctly configured. 13 | aws configure 14 | ``` 15 | 16 | 2. **Ensure the Docker daemon is running:** 17 | ```bash 18 | docker info 19 | ``` 20 | 21 | 3. **Provision the cluster:** 22 | ```bash 23 | cd examples/invoice_extraction 24 | 25 | # Provision a CPU-only cluster and update ~/.kube/config 26 | paka cluster up -f cluster_cpu.yaml 27 | 28 | # Provision a cluster with Nvidia GPUs 29 | paka cluster up -f cluster.yaml 30 | ``` 31 | 32 | 4. **Deploy the App:** 33 | ```bash 34 | # The command below will build the source and deploy it as a serverless function. 35 | paka function deploy --name invoice-extraction --source . --entrypoint serve 36 | ``` 37 | 38 | 5. **Check the status of the functions:** 39 | ```bash 40 | paka function list 41 | ``` 42 | 43 | If everything is successful, you should see the function in the list with a status of "READY". By default, the function is exposed through a publicly accessible REST API endpoint. 44 | 45 | 6. **Test the App:** 46 | 47 | Submit the PDF invoices by hitting the `/extract_invoice` endpoint of the deployed function. 48 | 49 | ```bash 50 | curl -X POST -H "Content-Type: multipart/form-data" -F "file=@/path/to/invoices/invoice-2024-02-29.pdf" http://invoice-extraction.default.xxxx.sslip.io/extract_invoice 51 | ``` 52 | 53 | If the invoice extraction is successful, you should see the structured data in the response, e.g. 54 | 55 | ```json 56 | {"number":"#25927345","date":"2024-01-31T05:07:53","company":"Akamai Technologies, Inc.","company_address":"249 Arch St. Philadelphia, PA 19106 USA","tax_id":"United States EIN: 04-3432319","customer":"John Doe","customer_address":"1 Hacker Way Menlo Park, CA 94025","amount":"$5.00"} 57 | ``` 58 | -------------------------------------------------------------------------------- /paka/k8s/model_group/runtime/vllm.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import re 4 | import shlex 5 | from typing import List 6 | 7 | from huggingface_hub.utils import validate_repo_id 8 | 9 | from paka.cluster.context import Context 10 | from paka.cluster.utils import get_model_store 11 | from paka.config import CloudModelGroup 12 | from paka.constants import MODEL_MOUNT_PATH 13 | from paka.k8s.utils import get_gpu_count 14 | 15 | 16 | # Heuristic to determine if the image is a vLLM image 17 | def is_vllm_image(image: str) -> bool: 18 | return image.lower().startswith("vllm") 19 | 20 | 21 | def get_runtime_command_vllm(ctx: Context, model_group: CloudModelGroup) -> List[str]: 22 | runtime = model_group.runtime 23 | if runtime.command: 24 | command_str = " ".join(runtime.command) if runtime.command else "" 25 | if re.search(r"(--model)[ \t]*\S+", command_str): 26 | return runtime.command 27 | 28 | if model_group.model: 29 | if model_group.model.useModelStore: 30 | store = get_model_store(ctx, with_progress_bar=False) 31 | if not store.glob(f"{model_group.name}/*"): 32 | raise ValueError( 33 | f"No model named {model_group.name} was found in the model store."
34 | ) 35 | model_to_load = f"{MODEL_MOUNT_PATH}" 36 | elif model_group.model.hfRepoId: 37 | validate_repo_id(model_group.model.hfRepoId) 38 | model_to_load = model_group.model.hfRepoId 39 | else: 40 | raise ValueError("Did not find a model to load.") 41 | 42 | def attach_model_to_command(command: List[str]) -> List[str]: 43 | return command + ["--model", model_to_load] 44 | 45 | if runtime.command: 46 | return attach_model_to_command(runtime.command) 47 | 48 | command = shlex.split( 49 | f"python3 -O -u -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --served-model-name {model_group.name}" 50 | ) 51 | 52 | gpu_count = get_gpu_count(ctx, model_group) 53 | 54 | if gpu_count > 1: 55 | command += ["--tensor-parallel-size", str(gpu_count)] 56 | 57 | return attach_model_to_command(command) 58 | -------------------------------------------------------------------------------- /tests/policy_packs/aws/eks.py: -------------------------------------------------------------------------------- 1 | from pulumi_policy import ( 2 | ReportViolation, 3 | ResourceValidationArgs, 4 | ResourceValidationPolicy, 5 | ) 6 | 7 | 8 | def model_group_validator( 9 | args: ResourceValidationArgs, report_violation: ReportViolation 10 | ) -> None: 11 | if args.resource_type == "aws:eks/nodeGroup:NodeGroup": 12 | instance_type = args.props["instanceTypes"][0] 13 | 14 | if instance_type == "c7a.xlarge": 15 | if "taints" in args.props: 16 | taints = args.props["taints"] 17 | 18 | # Verify that taint {key: app, value: model-group, effect: NoSchedule} exists 19 | exists = False 20 | for i in range(len(taints)): 21 | taint = taints[i] 22 | if ( 23 | taint["key"] == "app" 24 | and taint["value"] == "model-group" 25 | and taint["effect"] == "NO_SCHEDULE" 26 | ): 27 | exists = True 28 | if not exists: 29 | report_violation( 30 | "Taint {key: app, value: model-group, effect: NoSchedule} is not set for model-group node group.", 31 | None, 32 | ) 33 | 34 | # Verify that taint {key: model, value: llama2-7b, effect: NoSchedule} exists 35 | exists = False 36 | for i in range(len(taints)): 37 | taint = taints[i] 38 | if ( 39 | taint["key"] == "model" 40 | and taint["value"] == "llama2-7b" 41 | and taint["effect"] == "NO_SCHEDULE" 42 | ): 43 | exists = True 44 | if not exists: 45 | report_violation( 46 | "Taint {key: model, value: llama2-7b, effect: NoSchedule} is not set for model-group node group.", 47 | None, 48 | ) 49 | 50 | 51 | model_group_taints = ResourceValidationPolicy( 52 | name="model-group-taints", 53 | description="Model group should have taints.", 54 | validate=model_group_validator, 55 | ) 56 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # PyCharm 7 | .idea/ 8 | 9 | # VS Code 10 | .vscode/ 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | cover/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | db.sqlite3-journal 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | docs/_output/ 80 | 81 | # PyBuilder 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # celery beat schedule file 91 | celerybeat-schedule 92 | 93 | # SageMath parsed files 94 | *.sage.py 95 | 96 | # dotenv 97 | .env 98 | .env.* 99 | 100 | # virtualenv 101 | .venv/ 102 | venv/ 103 | ENV/ 104 | env/ 105 | bin/ 106 | pyvenv.cfg 107 | Pipfile.lock 108 | 109 | # Spyder project settings 110 | .spyderproject 111 | .spyproject 112 | 113 | # Rope project settings 114 | .ropeproject 115 | 116 | # mkdocs documentation 117 | /site 118 | 119 | # mypy 120 | .mypy_cache/ 121 | .dmypy.json 122 | dmypy.json 123 | 124 | # Pyre type checker 125 | .pyre/ 126 | 127 | # pytype static type analyzer 128 | .pytype/ 129 | 130 | # Cython debug symbols 131 | cython_debug/ 132 | 133 | 134 | # Poetry specific 135 | .pypoetry-cache 136 | 137 | # Jupyter Notebook 138 | .ipynb_checkpoints 139 | 140 | # pytest-kind 141 | .pytest-kind 142 | -------------------------------------------------------------------------------- /tests/model/test_store.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import io 3 | 4 | import boto3 5 | import pytest 6 | from botocore.exceptions import ClientError 7 | from moto import mock_aws 8 | 9 | from paka.model.store import MODEL_PATH_PREFIX, S3ModelStore 10 | 11 | 12 | @mock_aws 13 | def test_s3_model_store_save() -> None: 14 | conn = boto3.resource("s3", region_name="us-east-1") 15 | conn.create_bucket(Bucket="mybucket") 16 | 17 | store = S3ModelStore("mybucket") 18 | 19 | store.save("test.txt", b"Test data") 20 | 21 | body = conn.Object("mybucket", f"{MODEL_PATH_PREFIX}/test.txt").get()["Body"].read() 22 | assert body == b"Test data" 23 | 24 | 25 | @mock_aws 26 | def test_save_stream() -> None: 27 | conn = boto3.resource("s3", region_name="us-east-1") 28 | conn.create_bucket(Bucket="mybucket") 29 | 30 | store = S3ModelStore("mybucket") 31 | 32 | data = b"Test data" 33 | sha256_hash = hashlib.sha256(data).hexdigest() 34 | stream = io.BytesIO(data) 35 | store.save_stream("test.txt", stream, len(stream.getvalue()), sha256_hash) 36 | 37 | body = conn.Object("mybucket", f"{MODEL_PATH_PREFIX}/test.txt").get()["Body"].read() 38 | assert body == b"Test data" 39 | 40 | with pytest.raises( 41 | Exception, 42 | match="SHA256 hash of the downloaded file does not match the expected value", 43 | ): 44 | stream = io.BytesIO(data) 45 | store.save_stream("test_2.txt", stream, len(stream.getvalue()), "invalid_hash") 46 | 47 | try: 48 | conn.Object("mybucket", f"{MODEL_PATH_PREFIX}/test_2.txt").load() 49 | except ClientError as e: 50 | if e.response["Error"]["Code"] == "404": 51 | file_exists = False 52 | else: 53 | raise 54 | else: 55 | file_exists = True 56 | 57 | assert not file_exists 58 | 59 | 60 | @mock_aws 61 | def test_file_exists() -> None:
conn = boto3.resource("s3", region_name="us-east-1") 63 | conn.create_bucket(Bucket="mybucket") 64 | 65 | store = S3ModelStore("mybucket") 66 | 67 | assert not store.file_exists("test.txt") 68 | 69 | conn.Object("mybucket", f"{MODEL_PATH_PREFIX}/test.txt").put(Body=b"Test data") 70 | 71 | assert store.file_exists("test.txt") 72 | 73 | assert store.file_exists("test", prefix_match=True) 74 | assert not store.file_exists("nonexistent", prefix_match=True) 75 | -------------------------------------------------------------------------------- /paka/model/settings.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Optional 4 | 5 | from pydantic import BaseModel, Field, field_validator 6 | 7 | 8 | class ModelSettings(BaseModel): 9 | quantization: Optional[str] = Field( 10 | None, description="The quantization method (GPTQ, AWQ, GGUF_Q4_0, etc) to use." 11 | ) 12 | prompt_template_name: Optional[str] = Field( 13 | None, description="The prompt template (chatml, llama-2, gemma, etc) to use." 14 | ) 15 | prompt_template_str: Optional[str] = Field( 16 | None, description="The prompt template string to use." 17 | ) 18 | 19 | @field_validator("quantization") 20 | def validate_quantization(cls, v: Optional[str]) -> Optional[str]: 21 | if v is None: 22 | return v 23 | 24 | valid_methods = [ 25 | "GPTQ", 26 | "AWQ", 27 | "GGUF_Q2_K", 28 | "GGUF_Q3_K_L", 29 | "GGUF_Q3_K_M", 30 | "GGUF_Q3_K_S", 31 | "GGUF_Q4_0", 32 | "GGUF_Q4_K_M", 33 | "GGUF_Q4_K_S", 34 | "GGUF_Q5_0", 35 | "GGUF_Q5_K_M", 36 | "GGUF_Q5_K_S", 37 | "GGUF_Q6_K", 38 | "GGUF_Q8_0", 39 | "GGUF_fp16", 40 | "GGUF_fp32", 41 | ] 42 | if v not in valid_methods: 43 | raise ValueError("Invalid quantization method") 44 | return v 45 | 46 | @field_validator("prompt_template_name") 47 | def validate_prompt_template_name(cls, v: Optional[str]) -> Optional[str]: 48 | valid_templates = [ 49 | "chatml", 50 | "llama-2", 51 | "gemma", 52 | "alpaca", 53 | "qwen", 54 | "vicuna", 55 | "oasst_llama", 56 | "baichuan-2", 57 | "baichuan", 58 | "openbuddy", 59 | "redpajama-incite", 60 | "snoozy", 61 | "phind", 62 | "intel", 63 | "open-orca", 64 | "mistrallite", 65 | "zephyr", 66 | "pygmalion", 67 | "mistral-instruct", 68 | "chatglm3", 69 | "openchat", 70 | "saiga", 71 | "codellama", 72 | ] 73 | if v is not None and v not in valid_templates: 74 | raise ValueError("Invalid prompt template name") 75 | return v 76 | -------------------------------------------------------------------------------- /paka/cli/build.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | from typing import Optional 5 | 6 | import typer 7 | 8 | from paka.cli.utils import build_and_push, ensure_cluster_name, push_to_ecr 9 | from paka.utils import read_pulumi_stack 10 | 11 | build_app = typer.Typer() 12 | 13 | 14 | @build_app.command() 15 | def build_image( 16 | cluster_name: Optional[str] = typer.Option( 17 | os.getenv("PAKA_CURRENT_CLUSTER"), 18 | "--cluster", 19 | "-c", 20 | help="The name of the cluster.", 21 | ), 22 | source_dir: str = typer.Argument( 23 | ..., 24 | help="Source directory of the application.", 25 | ), 26 | image_name: str = typer.Option( 27 | "", 28 | "--image-name", 29 | help="Provide a custom name for the Docker image.
If omitted, " 30 | "the base name of the source code directory will be used as the image name.", 31 | ), 32 | ) -> None: 33 | """ 34 | Build a Docker image from the application in the specified source directory. 35 | 36 | The source directory must contain a Procfile and a .cnignore file. The Procfile 37 | defines the commands to run for the application. The .cnignore file defines the 38 | files and directories to exclude from the image. Once the image is built, 39 | it will be pushed to the container repository of the current cluster. 40 | 41 | A Dockerfile is NOT required. The image will be built using Cloud Native Buildpacks. 42 | In-cluster builds are not supported yet; the user's machine must have Docker installed. 43 | """ 44 | build_and_push(cluster_name, source_dir, image_name) 45 | 46 | 47 | @build_app.command() 48 | def push_image( 49 | cluster_name: Optional[str] = typer.Option( 50 | os.getenv("PAKA_CURRENT_CLUSTER"), 51 | "--cluster", 52 | "-c", 53 | help="The name of the cluster.", 54 | ), 55 | image_name: str = typer.Option( 56 | "", 57 | "--image-name", 58 | help="Name of the pre-built Docker image. If image tag is not provided, 'latest' will be used.", 59 | ), 60 | ) -> None: 61 | """ 62 | Push a pre-built Docker image to the container repository of the current cluster. 63 | """ 64 | cluster_name = ensure_cluster_name(cluster_name) 65 | 66 | push_to_ecr( 67 | image_name, 68 | read_pulumi_stack(cluster_name, "registry"), 69 | read_pulumi_stack(cluster_name, "region"), 70 | image_name, 71 | ) 72 | -------------------------------------------------------------------------------- /paka/cluster/redis.py: -------------------------------------------------------------------------------- 1 | import pulumi 2 | import pulumi_kubernetes as k8s 3 | from pulumi_kubernetes.apiextensions import CustomResource 4 | from pulumi_kubernetes.helm.v3 import Chart, ChartOpts, FetchOpts 5 | 6 | from paka.cluster.context import Context 7 | from paka.utils import call_once 8 | 9 | 10 | @call_once 11 | def create_redis(ctx: Context) -> None: 12 | """ 13 | Installs Redis with a Helm chart.
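Redis runs in standalone mode with persistent storage sized by `config.job.brokerStorageSize`; when Prometheus is enabled, a ServiceMonitor is also created so the exported Redis metrics get scraped.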
14 | """ 15 | config = ctx.cloud_config 16 | 17 | if not config.job or not config.job.enabled: 18 | return 19 | 20 | ns = k8s.core.v1.Namespace( 21 | "redis", 22 | metadata={"name": "redis"}, 23 | opts=pulumi.ResourceOptions(provider=ctx.k8s_provider), 24 | ) 25 | 26 | chart = Chart( 27 | "redis", 28 | ChartOpts( 29 | chart="redis", 30 | version="18.6.1", 31 | namespace=ctx.namespace, 32 | fetch_opts=FetchOpts(repo="https://charts.bitnami.com/bitnami"), 33 | values={ 34 | "architecture": "standalone", 35 | "master": { 36 | "persistence": { 37 | "enabled": True, 38 | "size": config.job.brokerStorageSize, 39 | }, 40 | }, 41 | "metrics": {"enabled": True}, # For enabling metrics 42 | }, 43 | ), 44 | opts=pulumi.ResourceOptions(provider=ctx.k8s_provider, depends_on=[ns]), 45 | ) 46 | 47 | if not config.prometheus or not config.prometheus.enabled: 48 | return 49 | 50 | CustomResource( 51 | "redis-metrics-monitor", 52 | api_version="monitoring.coreos.com/v1", 53 | kind="ServiceMonitor", 54 | metadata={ 55 | "name": "redis-metrics-monitor", 56 | "namespace": "redis", 57 | }, 58 | spec={ 59 | "selector": { 60 | "matchLabels": { 61 | "app.kubernetes.io/instance": "redis", 62 | "app.kubernetes.io/name": "redis", 63 | "app.kubernetes.io/component": "metrics", 64 | } 65 | }, 66 | "namespaceSelector": { 67 | "matchNames": ["redis"], 68 | }, 69 | "endpoints": [ 70 | { 71 | "port": "http-metrics", 72 | "interval": "15s", 73 | }, 74 | ], 75 | }, 76 | opts=pulumi.ResourceOptions( 77 | provider=ctx.k8s_provider, 78 | depends_on=[chart], 79 | ), 80 | ) 81 | -------------------------------------------------------------------------------- /paka/k8s/job/autoscaler.py: -------------------------------------------------------------------------------- 1 | from kubernetes import client 2 | 3 | from paka.k8s.utils import ( 4 | CustomResource, 5 | apply_resource, 6 | delete_namespaced_custom_object, 7 | ) 8 | 9 | 10 | def create_autoscaler( 11 | namespace: str, 12 | redis_svc_name: str, 13 | queue_name: str, 14 | trigger_queue_length: int, 15 | job_name: str, 16 | min_replicas: int, 17 | max_replicas: int, 18 | ) -> None: 19 | """ 20 | Creates a KEDA autoscaler for a job with a Redis trigger. 21 | 22 | The autoscaler scales the job based on the length of a Redis list. 23 | The job is scaled up when the list length exceeds the trigger queue length, 24 | and scaled down when the list is empty. 25 | 26 | Args: 27 | namespace (str): The namespace to create the resources in. 28 | redis_svc_name (str): The name of the Redis service. 29 | queue_name (str): The name of the Redis list to monitor. 30 | trigger_queue_length (int): The list length at which to trigger scaling. 31 | job_name (str): The name of the job to scale. 32 | min_replicas (int): The minimum number of job replicas. 33 | max_replicas (int): The maximum number of job replicas. 
34 | 35 | Returns: 36 | None 37 | """ 38 | scaled_object = CustomResource( 39 | api_version="keda.sh/v1alpha1", 40 | kind="ScaledObject", 41 | plural="scaledobjects", 42 | metadata=client.V1ObjectMeta(name=job_name, namespace=namespace), 43 | spec={ 44 | "scaleTargetRef": { 45 | "kind": "Deployment", 46 | "name": job_name, 47 | }, 48 | "minReplicaCount": min_replicas, 49 | "maxReplicaCount": max_replicas, 50 | "triggers": [ 51 | { 52 | "type": "redis", 53 | "metadata": { 54 | "type": "list", 55 | "listName": queue_name, 56 | "listLength": f"{trigger_queue_length}", 57 | "address": f"{redis_svc_name}.redis.svc.cluster.local:6379", 58 | }, 59 | } 60 | ], 61 | }, 62 | ) 63 | apply_resource(scaled_object) 64 | 65 | 66 | def delete_autoscaler(namespace: str, job_name: str) -> None: 67 | scaled_object = CustomResource( 68 | api_version="keda.sh/v1alpha1", 69 | kind="ScaledObject", 70 | plural="scaledobjects", 71 | metadata=client.V1ObjectMeta(name=job_name, namespace=namespace), 72 | spec={}, 73 | ) 74 | delete_namespaced_custom_object(job_name, namespace, scaled_object) 75 | -------------------------------------------------------------------------------- /tests/policy_packs/aws/object_store.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import List 4 | 5 | from pulumi_policy import ( 6 | Policy, 7 | ReportViolation, 8 | ResourceValidationArgs, 9 | ResourceValidationPolicy, 10 | StackValidationArgs, 11 | StackValidationPolicy, 12 | ) 13 | 14 | max_num_buckets = 1 15 | 16 | 17 | def s3_count_validator( 18 | stack: StackValidationArgs, report_violation: ReportViolation 19 | ) -> None: 20 | s3_resources = filter( 21 | (lambda resource: resource.resource_type == "aws:s3/bucket:Bucket"), 22 | stack.resources, 23 | ) 24 | 25 | buckets = list(s3_resources) 26 | if len(buckets) > max_num_buckets: 27 | report_violation( 28 | f"No more than {max_num_buckets} bucket(s) should be created.", None 29 | ) 30 | 31 | 32 | s3_count_check = StackValidationPolicy( 33 | name="s3-count-check", 34 | description="Checks the number of buckets created.", 35 | validate=s3_count_validator, 36 | ) 37 | 38 | 39 | def s3_no_public_read_validator( 40 | args: ResourceValidationArgs, report_violation: ReportViolation 41 | ) -> None: 42 | if args.resource_type == "aws:s3/bucket:Bucket" and "acl" in args.props: 43 | acl = args.props["acl"] 44 | if acl == "public-read" or acl == "public-read-write": 45 | report_violation( 46 | "You cannot set public-read or public-read-write on an S3 bucket. " 47 | + "Read more about ACLs here: https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html", 48 | None, 49 | ) 50 | 51 | 52 | s3_no_public_read = ResourceValidationPolicy( 53 | name="s3-no-public-read", 54 | description="Prohibits setting the publicRead or publicReadWrite permission on AWS S3 buckets.", 55 | validate=s3_no_public_read_validator, 56 | ) 57 | 58 | 59 | def s3_force_destroy_validator( 60 | args: ResourceValidationArgs, report_violation: ReportViolation 61 | ) -> None: 62 | if args.resource_type == "aws:s3/bucket:Bucket" and "forceDestroy" in args.props: 63 | force_destroy = args.props["forceDestroy"] 64 | if not force_destroy: 65 | report_violation( 66 | "You must set forceDestroy to true. 
" 67 | + "Read more about forceDestroy here: https://www.pulumi.com/docs/intro/concepts/resources/#deletion", 68 | None, 69 | ) 70 | 71 | 72 | s3_force_destroy = ResourceValidationPolicy( 73 | name="s3-force-destroy", 74 | description="Requires forceDestroy to be set to true.", 75 | validate=s3_force_destroy_validator, 76 | ) 77 | 78 | s3_policies: List[Policy] = [s3_count_check, s3_no_public_read, s3_force_destroy] 79 | -------------------------------------------------------------------------------- /examples/website_rag/crawler.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Generator, Tuple 3 | from urllib.parse import urljoin, urlparse 4 | 5 | import requests 6 | from bs4 import BeautifulSoup 7 | 8 | 9 | def is_relative_url(url: str) -> bool: 10 | return not bool(urlparse(url).netloc) 11 | 12 | 13 | def get_root_url(url: str) -> str: 14 | parsed_url = urlparse(url) 15 | return f"{parsed_url.scheme}://{parsed_url.netloc}" 16 | 17 | 18 | def get_filename(save_dir: str, href: str) -> str: 19 | parsed_url = urlparse(href) 20 | path = parsed_url.path 21 | if path.endswith("/") or not path: 22 | path = path + "index.html" 23 | else: 24 | if not path.endswith(".html"): 25 | path = path + ".html" 26 | filename = os.path.join(save_dir, parsed_url.netloc, path.lstrip("/")) 27 | return filename 28 | 29 | 30 | def save_html_file(response_text: str, url: str, save_dir: str) -> None: 31 | filename = get_filename(save_dir, url) 32 | os.makedirs(os.path.dirname(filename), exist_ok=True) 33 | with open(filename, "w") as file: 34 | file.write(response_text) 35 | 36 | 37 | def get_html(url: str) -> str: 38 | try: 39 | response = requests.get(url) 40 | content_type = response.headers["content-type"] 41 | if "html" in content_type: 42 | return response.text 43 | except Exception as e: 44 | print(f"Failed to get {url}: {e}") 45 | return "" 46 | 47 | 48 | def crawl(url: str, max_depth: int = 3) -> Generator[Tuple[str, str], None, None]: 49 | visited = set() 50 | 51 | def _crawl(url: str, depth: int = 0) -> Generator[Tuple[str, str], None, None]: 52 | url_without_fragment = url.split("#")[0] 53 | 54 | if ( 55 | depth > max_depth 56 | or not url.startswith("https") 57 | or not url.startswith("http") 58 | ): 59 | return 60 | 61 | root_url = get_root_url(url) 62 | 63 | orig_html_content = get_html(url) 64 | soup = BeautifulSoup(orig_html_content, "html.parser") 65 | 66 | yield url, soup.get_text(separator=" ", strip=True) 67 | 68 | visited.add(url_without_fragment) 69 | 70 | for link in soup.find_all("a"): 71 | href = link.get("href") 72 | if not href: 73 | continue 74 | href_domain = urlparse(href).netloc 75 | url_domain = urlparse(url).netloc 76 | if href_domain and href_domain != url_domain: 77 | continue 78 | 79 | full_url = urljoin(root_url, href) if is_relative_url(href) else href 80 | if full_url not in visited: 81 | yield from _crawl(full_url, depth + 1) 82 | 83 | yield from _crawl(url) 84 | -------------------------------------------------------------------------------- /paka/cluster/aws/ebs_csi_driver.py: -------------------------------------------------------------------------------- 1 | import pulumi 2 | import pulumi_aws as aws 3 | import pulumi_eks as eks 4 | from pulumi_kubernetes.helm.v3 import Chart, ChartOpts, FetchOpts 5 | 6 | from paka.cluster.aws.utils import odic_role_for_sa 7 | from paka.cluster.context import Context 8 | from paka.utils import call_once 9 | 10 | 11 | @call_once 12 | def create_ebs_csi_driver(ctx: 
Context, cluster: eks.Cluster) -> None: 13 | cluster_name = ctx.cluster_name 14 | 15 | csi_driver_policy_doc = aws.iam.get_policy_document( 16 | statements=[ 17 | aws.iam.GetPolicyDocumentStatementArgs( 18 | actions=[ 19 | "ec2:CreateSnapshot", 20 | "ec2:AttachVolume", 21 | "ec2:DetachVolume", 22 | "ec2:ModifyVolume", 23 | "ec2:DescribeAvailabilityZones", 24 | "ec2:DescribeInstances", 25 | "ec2:DescribeSnapshots", 26 | "ec2:DescribeTags", 27 | "ec2:DescribeVolumes", 28 | "ec2:DescribeVolumesModifications", 29 | "ec2:CreateTags", 30 | "ec2:CreateVolume", 31 | "ec2:DeleteVolume", 32 | ], 33 | resources=["*"], 34 | effect="Allow", 35 | ) 36 | ] 37 | ) 38 | 39 | csi_driver_policy = aws.iam.Policy( 40 | f"{cluster_name}-csi-driver-policy", policy=csi_driver_policy_doc.json 41 | ) 42 | 43 | csi_driver_role = odic_role_for_sa( 44 | ctx, cluster, "csi-driver", "kube-system:ebs-csi-controller-sa" 45 | ) 46 | 47 | aws.iam.RolePolicyAttachment( 48 | f"{cluster_name}-csi-driver-role-policy-attachment", 49 | policy_arn=csi_driver_policy.arn, 50 | role=csi_driver_role.name, 51 | ) 52 | 53 | Chart( 54 | "aws-ebs-csi-driver", 55 | ChartOpts( 56 | chart="aws-ebs-csi-driver", 57 | version="2.26.0", 58 | namespace="kube-system", 59 | fetch_opts=FetchOpts( 60 | repo="https://kubernetes-sigs.github.io/aws-ebs-csi-driver" 61 | ), 62 | values={ 63 | "controller": { 64 | "serviceAccount": { 65 | "create": "true", 66 | "name": "ebs-csi-controller-sa", 67 | "annotations": { 68 | "eks.amazonaws.com/role-arn": csi_driver_role.arn 69 | }, 70 | "automountServiceAccountToken": "true", 71 | }, 72 | } 73 | }, 74 | ), 75 | opts=pulumi.ResourceOptions(provider=ctx.k8s_provider), 76 | ) 77 | -------------------------------------------------------------------------------- /examples/website_rag/serve.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | from typing import Annotated, Any 5 | 6 | from constants import LLM_URL, QDRANT_URL 7 | from embeddings import LlamaEmbeddings 8 | from fastapi import Depends, FastAPI, Request, Response 9 | from langchain.chains import RetrievalQA 10 | from langchain_community.vectorstores import Qdrant 11 | from langchain_core.runnables import RunnableLambda 12 | from langserve import APIHandler, add_routes # type: ignore 13 | from qdrant_client import QdrantClient 14 | from vllm import Vllm 15 | 16 | logging.basicConfig( 17 | level=logging.INFO, 18 | format="%(asctime)s [%(levelname)s] %(message)s", 19 | handlers=[logging.StreamHandler()], 20 | ) 21 | 22 | port = int(os.getenv("PORT", 8080)) 23 | 24 | client = QdrantClient( 25 | url=QDRANT_URL, 26 | prefer_grpc=True, 27 | ) 28 | collection_name = "langchain_documents" 29 | 30 | embeddings = LlamaEmbeddings() 31 | qdrant = Qdrant(client, collection_name, embeddings=embeddings) 32 | 33 | retriever = qdrant.as_retriever() 34 | 35 | app = FastAPI( 36 | title="LangChain Docs Server", 37 | version="0.1.0", 38 | description="Spin up a simple api server to retrieve documents from the vector store.", 39 | ) 40 | # Adds routes to the app for using the retriever under: 41 | # /invoke 42 | # /batch 43 | # /stream 44 | add_routes(app, retriever) 45 | 46 | 47 | def run_llm(query: str) -> Any: 48 | start_time = time.time() 49 | logging.info(f"Running LLM with query: {query}") 50 | llm = Vllm( 51 | model="llama2-7b-chat", 52 | model_url=LLM_URL, 53 | temperature=0, 54 | max_tokens=2500, 55 | streaming=False, 56 | ) 57 | 58 | qa = RetrievalQA.from_chain_type( 59 | llm=llm, 
retriever=retriever, chain_type="stuff", return_source_documents=True 60 | ) 61 | 62 | query = f"[INST] <<SYS>><</SYS>>\n\n{query} [/INST]\n" 63 | result = qa.invoke({"query": query}) 64 | logging.info(f"LLM result: {result}") 65 | 66 | end_time = time.time() 67 | logging.info(f"Execution time: {end_time - start_time} seconds") 68 | return result 69 | 70 | 71 | async def _get_api_handler() -> APIHandler: 72 | """Prepare a RunnableLambda.""" 73 | return APIHandler(RunnableLambda(run_llm), path="/v2") 74 | 75 | 76 | @app.post("/v2/invoke") 77 | async def v2_invoke( 78 | request: Request, runnable: Annotated[APIHandler, Depends(_get_api_handler)] 79 | ) -> Response: 80 | """Handle invoke request.""" 81 | # The API Handler validates the parts of the request 82 | # that are used by the runnable (e.g., input, config fields) 83 | return await runnable.invoke(request) 84 | 85 | 86 | if __name__ == "__main__": 87 | import uvicorn 88 | 89 | uvicorn.run(app, host="localhost", port=port) 90 | -------------------------------------------------------------------------------- /paka/model/hf_model.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import concurrent.futures 4 | import os 5 | from typing import Any, Dict, List, Optional 6 | 7 | from huggingface_hub import HfFileSystem 8 | from huggingface_hub.utils import validate_repo_id 9 | 10 | from paka.logger import logger 11 | from paka.model.base_model import BaseMLModel 12 | from paka.model.store import ModelStore 13 | 14 | 15 | class HuggingFaceModel(BaseMLModel): 16 | def __init__( 17 | self, 18 | name: str, 19 | repo_id: str, 20 | files: List[str], 21 | model_store: ModelStore, 22 | quantization: Optional[str] = None, 23 | prompt_template_name: Optional[str] = None, 24 | prompt_template_str: Optional[str] = None, 25 | ) -> None: 26 | super().__init__( 27 | name=name, 28 | model_store=model_store, 29 | quantization=quantization, 30 | prompt_template_name=prompt_template_name, 31 | prompt_template_str=prompt_template_str, 32 | ) 33 | validate_repo_id(repo_id) 34 | self.repo_id: str = repo_id 35 | self.fs = HfFileSystem() 36 | self._files = files 37 | 38 | def save(self) -> None: 39 | """ 40 | Saves the model to a model store. 41 | """ 42 | files: List[str] = [] 43 | for file in self._files: 44 | match_files = self.fs.glob(f"{self.repo_id}/{file}") 45 | 46 | if not match_files: 47 | logger.warning( 48 | f"No matching files found for {file} in HuggingFace repo {self.repo_id}" 49 | ) 50 | 51 | files.extend(match_files) 52 | 53 | with concurrent.futures.ThreadPoolExecutor( 54 | max_workers=self.concurrency 55 | ) as executor: 56 | futures = [executor.submit(self._save_single_file, file) for file in files] 57 | concurrent.futures.wait(futures) 58 | self.finish() 59 | 60 | def _save_single_file(self, hf_file_path: str) -> None: 61 | """ 62 | Saves a HuggingFace model file to the specified model store. 63 | 64 | Args: 65 | hf_file_path (str): The path to the HuggingFace model file.
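
        Example (hypothetical repo path, shown for illustration only):

            self._save_single_file("TheBloke/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q4_0.gguf")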
66 | 67 | Returns: 68 | None 69 | """ 70 | file_info: Dict[str, Any] = self.fs.stat(hf_file_path) 71 | total_size = file_info["size"] 72 | sha256 = ( 73 | file_info["lfs"]["sha256"] 74 | if "lfs" in file_info and file_info["lfs"] 75 | else "" 76 | ) 77 | 78 | fname = os.path.basename(hf_file_path) 79 | with self.fs.open(hf_file_path, "rb") as hf_file: 80 | self.save_single_stream(f"{self.name}/{fname}", hf_file, total_size, sha256) 81 | -------------------------------------------------------------------------------- /paka/cluster/kubectl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import shutil 4 | from pathlib import Path 5 | 6 | import requests 7 | 8 | from paka.logger import logger 9 | from paka.utils import download_url, get_project_data_dir 10 | 11 | KUBECTL_VERSION_URL = "https://cdn.dl.k8s.io/release/stable.txt" 12 | CHUNK_SIZE = 8192 13 | 14 | 15 | def get_latest_kubectl_version() -> str: 16 | """Return the latest version of kubectl available for download.""" 17 | try: 18 | response = requests.get(KUBECTL_VERSION_URL) 19 | response.raise_for_status() 20 | return response.text 21 | except requests.RequestException as e: 22 | logger.error(f"Failed to get latest kubectl version: {e}") 23 | return "v1.30.0" 24 | 25 | 26 | # We are not pinning kubectl to a specific version. 27 | # Fetching the latest version of kubectl should be safe. 28 | KUBECTL_VERSION = os.getenv("KUBECTL_VERSION", get_latest_kubectl_version()) 29 | 30 | 31 | # install_path is a full path to the kubectl binary 32 | # It should be in a format like this /path/to/kubectl-x.xx.x/kubectl 33 | def ensure_kubectl_by_path(install_path: Path) -> None: 34 | """Ensure kubectl is installed at the given path.""" 35 | parent_dir = install_path.parent 36 | os.environ["PATH"] = f"{parent_dir.absolute()}{os.pathsep}{os.environ['PATH']}" 37 | 38 | if install_path.exists(): 39 | return 40 | 41 | system = platform.system().lower() 42 | arch = platform.machine().lower() 43 | 44 | if arch in ["amd64", "x86_64"]: 45 | arch = "amd64" 46 | 47 | if arch not in ["amd64", "arm64"]: 48 | raise Exception(f"Unsupported architecture: {arch}") 49 | 50 | grandparent_dir = parent_dir.parent 51 | 52 | for old_kubectl_dir in grandparent_dir.glob("kubectl-*"): 53 | shutil.rmtree(old_kubectl_dir) 54 | 55 | if not install_path.exists(): 56 | url = os.getenv( 57 | "KUBECTL_DOWNLOAD_URL", 58 | f"https://dl.k8s.io/release/{KUBECTL_VERSION}/bin/{system}/{arch}/kubectl", 59 | ) 60 | if system == "windows" and not url.endswith(".exe"): 61 | url += ".exe" 62 | logger.info(f"Downloading {url}..") 63 | 64 | with download_url(url) as tmp_file: 65 | tmp_file_p = Path(tmp_file) 66 | tmp_file_p.chmod(0o755) 67 | parent_dir.mkdir(parents=True, exist_ok=True) 68 | shutil.copy2(tmp_file_p, install_path) 69 | 70 | 71 | def ensure_kubectl() -> None: 72 | system = platform.system().lower() 73 | kubectl_path = ( 74 | Path(get_project_data_dir()) / "bin" / f"kubectl-{KUBECTL_VERSION}" / "kubectl" 75 | ) 76 | if system == "windows": 77 | kubectl_path = kubectl_path.with_suffix(".exe") 78 | 79 | ensure_kubectl_by_path(kubectl_path) 80 | -------------------------------------------------------------------------------- /paka/model/progress_bar.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from threading import Lock 4 | from typing import Any, Dict, List, Optional, Tuple 5 | 6 | from tqdm import tqdm 7 | 8 | 9 |
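# Usage sketch (illustrative only, not taken from this repo): several download
# threads can share a single ProgressBar, growing its total as file sizes become
# known and reporting per-file progress under a unique key.
#
#   bar = ProgressBar("Downloading")
#   bar.create_progress_bar(total_size=0)      # start with an empty bar
#   bar.update_progress_bar("file-a", 4096)    # grow the total once per file
#   bar.advance_progress_bar("file-a", 1024)   # bytes of file-a completed so far
#   bar.close_progress_bar()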
class ProgressBar: 10 | def __init__(self, message: str = "Downloading") -> None: 11 | self.counter: Dict[str, int] = {} 12 | self.lock = Lock() 13 | self.progress_bar: Optional[tqdm] = None 14 | self.completed_files: List[Tuple[str, str]] = [] 15 | self.message = message 16 | 17 | def __getattr__(self, name: str) -> Any: 18 | return getattr(self.progress_bar, name) 19 | 20 | def set_postfix_str(self, *args: Any, **kwargs: Any) -> None: 21 | if self.progress_bar is None: 22 | return 23 | self.progress_bar.set_postfix_str(*args, **kwargs) 24 | 25 | def clear_counter(self) -> None: 26 | with self.lock: 27 | self.counter = {} 28 | 29 | def create_progress_bar(self, total_size: int) -> None: 30 | with self.lock: 31 | if self.progress_bar is not None: 32 | return 33 | 34 | self.progress_bar = tqdm( 35 | total=total_size, unit="B", unit_scale=True, desc=self.message 36 | ) 37 | 38 | def update_progress_bar(self, key: str, value: int) -> None: 39 | if key in self.counter: 40 | return 41 | 42 | with self.lock: 43 | if self.progress_bar is not None: 44 | # Increase the total count of the progress bar by the provided value 45 | self.progress_bar.total += value 46 | # Refresh the progress bar to reflect the new total 47 | self.progress_bar.refresh() 48 | 49 | def close_progress_bar(self) -> None: 50 | if self.progress_bar is None: 51 | return 52 | with self.lock: 53 | self.counter = {} 54 | self.progress_bar.close() 55 | self.progress_bar = None 56 | 57 | def advance_progress_bar(self, key: str = "", value: int = 0) -> None: 58 | if self.progress_bar is None: 59 | return 60 | with self.lock: 61 | if key: 62 | self.counter[key] = value 63 | # Calculate the total progress by summing the progress of all tasks 64 | total_progress = sum(self.counter.values()) 65 | # Update the progress bar by the amount of progress made since the last update 66 | self.progress_bar.update(total_progress - self.progress_bar.n) 67 | self.progress_bar.refresh() 68 | 69 | 70 | class NullProgressBar: 71 | def __init__(self, *args: Any, **kwargs: Any) -> None: 72 | pass 73 | 74 | def __getattr__(self, name: str) -> Any: 75 | return lambda *args, **kwargs: None 76 | 77 | def __setattr__(self, name: str, value: Any) -> None: 78 | pass 79 | -------------------------------------------------------------------------------- /paka/model/base_model.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | from abc import ABC, abstractmethod 5 | from typing import List, Optional, Tuple 6 | 7 | from paka.logger import logger 8 | from paka.model.manifest import ModelFile, ModelManifest 9 | from paka.model.settings import ModelSettings 10 | from paka.model.store import ModelStore, StreamLike 11 | from paka.utils import to_yaml 12 | 13 | 14 | class BaseMLModel(ABC): 15 | def __init__( 16 | self, 17 | name: str, 18 | model_store: ModelStore, 19 | quantization: Optional[str], 20 | prompt_template_name: Optional[str], 21 | prompt_template_str: Optional[str], 22 | # Max concurrency for saving model streams 23 | concurrency: int = 1, 24 | ) -> None: 25 | self.name = name 26 | self.completed_files: List[Tuple[str, str]] = [] 27 | self.settings = ModelSettings( 28 | quantization=quantization, 29 | prompt_template_name=prompt_template_name, 30 | prompt_template_str=prompt_template_str, 31 | ) 32 | 33 | self.model_store = model_store 34 | self.concurrency = concurrency 35 | 36 | def save_manifest_yml(self, manifest: Optional[ModelManifest] = None) -> None: 37 | if 
manifest is None: 38 | manifest = ModelManifest( 39 | name=self.name, 40 | files=[ 41 | ModelFile(name=name, sha256=sha256) 42 | for (name, sha256) in self.completed_files 43 | ], 44 | quantization=self.settings.quantization, 45 | prompt_template_name=self.settings.prompt_template_name, 46 | prompt_template_str=self.settings.prompt_template_str, 47 | ) 48 | 49 | model_store = self.model_store 50 | 51 | manifest_yml = to_yaml(manifest.model_dump(exclude_none=True)) 52 | 53 | file_path = f"{self.name}/manifest.yml" 54 | if model_store.file_exists(file_path): 55 | logger.info( 56 | f"manifest.yml file already exists at {file_path}. Overwriting..." 57 | ) 58 | model_store.delete_file(file_path) 59 | model_store.save(file_path, manifest_yml.encode("utf-8")) 60 | logger.info(f"manifest.yml file saved to {file_path}") 61 | 62 | @abstractmethod 63 | def save(self) -> None: 64 | pass 65 | 66 | def save_single_stream( 67 | self, path: str, stream: StreamLike, total_size: int, sha256: str = "" 68 | ) -> None: 69 | self.model_store.save_stream(path, stream, total_size, sha256) 70 | fname = os.path.basename(path) 71 | self.completed_files.append((fname, sha256)) 72 | 73 | def finish(self) -> None: 74 | self.try_close_progress_bar() 75 | self.save_manifest_yml() 76 | 77 | def try_close_progress_bar(self) -> None: 78 | pb = getattr(self.model_store, "progress_bar", None) 79 | if pb: 80 | pb.close_progress_bar() 81 | -------------------------------------------------------------------------------- /paka/container/pack.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import tarfile 4 | import tempfile 5 | import zipfile 6 | from pathlib import Path 7 | 8 | import requests 9 | 10 | from paka.logger import logger 11 | from paka.utils import ( 12 | calculate_sha256, 13 | download_url, 14 | get_gh_release_latest_version, 15 | get_project_data_dir, 16 | ) 17 | 18 | 19 | def ensure_pack() -> str: 20 | paka_home = Path(get_project_data_dir()) 21 | 22 | bin_dir = paka_home / "bin" 23 | bin_dir.mkdir(parents=True, exist_ok=True) 24 | 25 | pack_files = list(bin_dir.glob("pack-*")) 26 | if pack_files: 27 | return str(pack_files[0]) 28 | 29 | pack_version = get_gh_release_latest_version("buildpacks/pack") 30 | 31 | new_pack_path = bin_dir / f"pack-{pack_version}" 32 | 33 | system = platform.system().lower() 34 | arch = platform.machine().lower() 35 | 36 | if system == "windows": 37 | new_pack_path = new_pack_path.with_suffix(".exe") 38 | 39 | if new_pack_path.exists(): 40 | return str(new_pack_path) 41 | 42 | for old_pack_path in bin_dir.glob("pack-*"): 43 | if old_pack_path.is_file(): 44 | old_pack_path.unlink() 45 | 46 | if system == "darwin": 47 | system = "macos" 48 | 49 | if system == "windows": 50 | pack_file = f"pack-{pack_version}-windows.zip" 51 | 52 | elif arch in ["amd64", "x86_64"]: 53 | pack_file = f"pack-{pack_version}-{system}.tgz" 54 | elif arch == "arm64": 55 | pack_file = f"pack-{pack_version}-{system}-{arch}.tgz" 56 | else: 57 | raise Exception(f"Unsupported architecture: {arch}") 58 | 59 | url = f"https://github.com/buildpacks/pack/releases/download/{pack_version}/{pack_file}" 60 | 61 | logger.info(f"Downloading {pack_file}...") 62 | 63 | with download_url(url) as archive_file: 64 | archive_file_sha256 = calculate_sha256(archive_file) 65 | 66 | # Now, fetch the sha256 file and compare the hash 67 | sha256_url = f"{url}.sha256" 68 | 69 | response = requests.get(sha256_url) 70 | response.raise_for_status() 71 | expected_sha256, 
expected_filename = response.text.strip().split() 72 | 73 | assert expected_filename == pack_file 74 | 75 | if archive_file_sha256 != expected_sha256: 76 | raise Exception( 77 | f"SHA256 mismatch: {archive_file_sha256} != {expected_sha256}" 78 | ) 79 | 80 | if system == "windows": 81 | with zipfile.ZipFile(archive_file, "r") as zip_ref: 82 | zip_ref.extractall(bin_dir) 83 | else: 84 | with tarfile.open(archive_file, "r:gz") as tar: 85 | tar.extractall(bin_dir) 86 | 87 | pack_path = bin_dir / "pack" 88 | 89 | if system == "windows": 90 | pack_path = pack_path.with_suffix(".exe") 91 | 92 | pack_path.chmod(0o755) 93 | pack_path.rename(new_pack_path) 94 | 95 | logger.info("Pack installed successfully.") 96 | 97 | return str(new_pack_path) 98 | -------------------------------------------------------------------------------- /examples/website_rag/ingest.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from typing import Generator 3 | 4 | from bs4.element import Tag 5 | from constants import QDRANT_URL 6 | from crawler import crawl 7 | from embeddings import LlamaEmbeddings 8 | from langchain.text_splitter import RecursiveCharacterTextSplitter 9 | from langchain_community.vectorstores import Qdrant 10 | from langchain_core.documents import Document 11 | 12 | 13 | def _metadata_extractor(raw_html: str, url: str) -> dict: 14 | """Extract metadata from raw html using BeautifulSoup.""" 15 | metadata = {"source": url} 16 | 17 | try: 18 | from bs4 import BeautifulSoup 19 | except ImportError: 20 | print( 21 | "The bs4 package is required for default metadata extraction. " 22 | "Please install it with `pip install bs4`." 23 | ) 24 | return metadata 25 | soup = BeautifulSoup(raw_html, "html.parser") 26 | if title := soup.find("title"): 27 | metadata["title"] = title.get_text() 28 | if (description := soup.find("meta", attrs={"name": "description"})) and isinstance( 29 | description, Tag 30 | ): 31 | description_content = description.get("content", "") or "" 32 | metadata["description"] = ( 33 | " ".join(description_content) 34 | if isinstance(description_content, list) 35 | else description_content 36 | ) 37 | else: 38 | metadata["description"] = "" 39 | if (html := soup.find("html")) and isinstance(html, Tag): 40 | html_lang = html.get("lang", "") or "" 41 | metadata["language"] = ( 42 | " ".join(html_lang) if isinstance(html_lang, list) else html_lang 43 | ) 44 | else: 45 | metadata["language"] = "" 46 | return metadata 47 | 48 | 49 | def docs_loader(website: str) -> Generator[Document, None, None]: 50 | # We use a custom crawler. LangChain's RecursiveUrlLoader cannot be used as is. 
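# The crawler yields (url, raw_html) pairs lazily, so each page is wrapped in a Document as it is fetched.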
51 | crawler = crawl(website, max_depth=0) 52 | 53 | for url, html_content in crawler: 54 | yield Document( 55 | page_content=html_content, metadata=_metadata_extractor(html_content, url) 56 | ) 57 | 58 | 59 | def embed_website(website: str) -> None: 60 | chunk_size = 400 61 | chunk_overlap = 50 62 | text_splitter = RecursiveCharacterTextSplitter( 63 | separators=["\n\n", "\n", " ", ""], 64 | chunk_size=chunk_size, 65 | chunk_overlap=chunk_overlap, 66 | length_function=len, 67 | ) 68 | docs = text_splitter.split_documents(docs_loader(website)) 69 | embeddings = LlamaEmbeddings() 70 | 71 | print("Embedding documents...") 72 | print("Total number of documents:", len(docs)) 73 | 74 | Qdrant.from_documents( 75 | docs, 76 | embeddings, 77 | url=QDRANT_URL, 78 | prefer_grpc=True, 79 | collection_name="langchain_documents", 80 | ) 81 | print("done") 82 | 83 | 84 | if __name__ == "__main__": 85 | if len(sys.argv) > 1: 86 | embed_website(sys.argv[1]) 87 | else: 88 | print("Please provide a URL as a command-line argument.") 89 | -------------------------------------------------------------------------------- /e2e/pytest_kind/plugin.py: -------------------------------------------------------------------------------- 1 | # Code modified from the original repo: https://codeberg.org/hjacobs/pytest-kind/src/branch/main/pytest_kind 2 | from __future__ import annotations 3 | 4 | from pathlib import Path 5 | from typing import Generator 6 | 7 | import pytest 8 | from _pytest.config.argparsing import Parser 9 | from pytest import FixtureRequest 10 | 11 | from .cluster import KindCluster 12 | 13 | 14 | @pytest.fixture(scope="session") 15 | def kind_cluster(request: FixtureRequest) -> Generator[KindCluster, None, None]: 16 | """Provide a Kubernetes kind cluster as test fixture.""" 17 | print(request.config) 18 | name = request.config.getoption("cluster_name") 19 | keep = request.config.getoption("keep_cluster") 20 | kubeconfig = request.config.getoption("kubeconfig") 21 | image = request.config.getoption("kind_image") 22 | kind_path = request.config.getoption("kind_bin") 23 | kubectl_path = request.config.getoption("kind_kubectl_bin") 24 | cluster_config = request.config.getoption("cluster_config") 25 | cluster = KindCluster( 26 | name, 27 | Path(kubeconfig) if kubeconfig else None, 28 | image=image, 29 | kind_path=Path(kind_path) if kind_path else None, 30 | kubectl_path=Path(kubectl_path) if kubectl_path else None, 31 | ) 32 | cluster.create(cluster_config) 33 | yield cluster 34 | if not keep: 35 | cluster.delete() 36 | 37 | 38 | def pytest_addoption(parser: Parser) -> None: 39 | group = parser.getgroup("kind") 40 | group.addoption( 41 | "--cluster-name", 42 | default="paka-e2e", 43 | help="Name of the Kubernetes kind cluster", 44 | ) 45 | group.addoption( 46 | "--keep-cluster", 47 | default=False, 48 | action="store_true", 49 | help="Keep the Kubernetes kind cluster (do not delete after test run)", 50 | ) 51 | group.addoption( 52 | "--kubeconfig", 53 | default=None, 54 | help=( 55 | "If provided, use the specified kubeconfig " 56 | "instead of the one generated by the cluster" 57 | ), 58 | ) 59 | group.addoption( 60 | "--cluster-config", 61 | default=None, 62 | help=("The cluster configuration file to use to create the Kind cluster."), 63 | ) 64 | group.addoption( 65 | "--kind-image", 66 | default=None, 67 | action="store", 68 | type=str, 69 | help=( 70 | "If provided, use the specified docker image " 71 | "instead of the default one. (e.g. 
kindest/node:v1.20.2)" 72 | ), 73 | ) 74 | group.addoption( 75 | "--kind-bin", 76 | default=None, 77 | action="store", 78 | type=str, 79 | help=( 80 | "If provided, use the specified kind binary instead of " 81 | "downloading one. Takes a filesystem path string." 82 | ), 83 | ) 84 | group.addoption( 85 | "--kind-kubectl-bin", 86 | default=None, 87 | action="store", 88 | type=str, 89 | help=( 90 | "If provided, use the specified kubectl binary instead of " 91 | "downloading one. Takes a filesystem path string." 92 | ), 93 | ) 94 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from pathlib import Path 4 | from unittest.mock import mock_open, patch 5 | 6 | from paka.constants import HOME_ENV_VAR, PROJECT_NAME 7 | from paka.utils import ( 8 | call_once, 9 | camel_to_kebab, 10 | camel_to_snake, 11 | get_cluster_data_dir, 12 | get_project_data_dir, 13 | kubify_name, 14 | save_kubeconfig, 15 | to_yaml, 16 | ) 17 | 18 | 19 | def test_camel_to_kebab() -> None: 20 | assert camel_to_kebab("ExampleProject") == "example-project" 21 | assert camel_to_kebab("AnotherExampleProject") == "another-example-project" 22 | assert camel_to_kebab("YetAnotherExample") == "yet-another-example" 23 | assert camel_to_kebab("lowercase") == "lowercase" 24 | assert camel_to_kebab("UPPERCASE") == "uppercase" 25 | 26 | 27 | def test_kubify_name() -> None: 28 | assert kubify_name("MyName") == "myname" 29 | assert kubify_name("My.Name") == "my-name" 30 | assert kubify_name("My_Name") == "my-name" 31 | assert kubify_name("My-Name") == "my-name" 32 | assert kubify_name("123MyName") == "myname" 33 | assert kubify_name("MyName123") == "myname123" 34 | assert kubify_name("MyName!") == "myname" 35 | 36 | 37 | def test_call_once() -> None: 38 | counter = 0 39 | 40 | @call_once 41 | def increment_counter() -> None: 42 | nonlocal counter 43 | counter += 1 44 | 45 | # Call the function twice 46 | increment_counter() 47 | increment_counter() 48 | 49 | # Check that the counter was only incremented once 50 | assert counter == 1 51 | 52 | 53 | def test_to_yaml() -> None: 54 | obj = {"key": "value"} 55 | yaml_str = to_yaml(obj) 56 | assert yaml_str == "key: value\n" 57 | 58 | obj1 = {"key": {"nested_key": "nested_value"}} 59 | yaml_str = to_yaml(obj1) 60 | assert yaml_str == "key:\n nested_key: nested_value\n" 61 | 62 | obj2 = {"key": ["value1", "value2"]} 63 | yaml_str = to_yaml(obj2) 64 | assert yaml_str == "key:\n - value1\n - value2\n" 65 | 66 | 67 | def test_save_kubeconfig() -> None: 68 | m = mock_open() 69 | # Replace the built-in open function with the mock object 70 | with patch("builtins.open", m): 71 | kubeconfig_json = json.dumps({"apiVersion": "v1"}) 72 | save_kubeconfig("test", kubeconfig_json) 73 | f = os.path.join(get_cluster_data_dir("test"), "kubeconfig.yaml") 74 | m.assert_called_once_with(f, "w") 75 | handle = m() 76 | handle.write.assert_called_once() 77 | 78 | 79 | def test_get_project_data_dir() -> None: 80 | with patch.dict(os.environ, {HOME_ENV_VAR: "/test/home"}): 81 | result = get_project_data_dir() 82 | 83 | assert result == "/test/home" 84 | 85 | with patch.dict(os.environ, {}, clear=True): 86 | result = get_project_data_dir() 87 | 88 | assert result == os.path.join( 89 | str(Path.home()), f".{camel_to_kebab(PROJECT_NAME)}" 90 | ) 91 | 92 | 93 | def test_camel_to_snake() -> None: 94 | assert camel_to_snake("camelCase") == "camel_case" 95 | 
assert camel_to_snake("HTTPRequest") == "http_request" 96 | assert camel_to_snake("IPV6Address") == "ipv6_address" 97 | assert camel_to_snake("noChange") == "no_change" 98 | assert camel_to_snake("") == "" 99 | -------------------------------------------------------------------------------- /paka/cluster/aws/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Sequence 4 | 5 | import pulumi 6 | import pulumi_aws as aws 7 | import pulumi_eks as eks 8 | from pulumi import Input 9 | 10 | from paka.cluster.context import Context 11 | from paka.utils import get_instance_info 12 | 13 | 14 | def odic_role_for_sa( 15 | ctx: Context, 16 | cluster: eks.Cluster, 17 | role_name: str, 18 | ns_service_account: str, 19 | ) -> aws.iam.Role: 20 | """ 21 | Creates an IAM role for a service account in an EKS cluster using OpenID Connect (OIDC) authentication. 22 | 23 | Args: 24 | config (CloudConfig): The cloud configuration. 25 | cluster (eks.Cluster): The EKS cluster. 26 | role_name (str): The name of the role. 27 | ns_service_account (str): The name of the service account. e.g. "default:sa", "kube-system:auto-scaler" 28 | 29 | Returns: 30 | aws.iam.Role: The IAM role for the service account. 31 | """ 32 | oidc_url = cluster.core.oidc_provider.url 33 | oidc_arn = cluster.core.oidc_provider.arn 34 | 35 | assume_role_policy = pulumi.Output.all(oidc_url, oidc_arn).apply( 36 | lambda args: aws.iam.get_policy_document( 37 | statements=[ 38 | aws.iam.GetPolicyDocumentStatementArgs( 39 | effect="Allow", 40 | principals=[ 41 | aws.iam.GetPolicyDocumentStatementPrincipalArgs( 42 | type="Federated", 43 | identifiers=[str(args[1])], 44 | ) 45 | ], 46 | actions=["sts:AssumeRoleWithWebIdentity"], 47 | conditions=[ 48 | aws.iam.GetPolicyDocumentStatementConditionArgs( 49 | test="StringEquals", 50 | variable=f"{args[0]}:sub", 51 | values=[f"system:serviceaccount:{ns_service_account}"], 52 | ) 53 | ], 54 | ) 55 | ], 56 | ).json 57 | ) 58 | 59 | role = aws.iam.Role( 60 | f"{ctx.cluster_name}-{role_name}-role", 61 | assume_role_policy=assume_role_policy, 62 | ) 63 | 64 | return role 65 | 66 | 67 | def get_ami_for_instance(ctx: Context, instance_type: str) -> str: 68 | instance_info = get_instance_info(ctx.provider, ctx.region, instance_type) 69 | gpu_count = instance_info.get("gpu_count", 0) or 0 70 | arch = instance_info.get("arch", "x86_64") 71 | 72 | if gpu_count > 0: 73 | if arch == "x86_64": 74 | return "AL2_x86_64_GPU" 75 | else: 76 | return "BOTTLEROCKET_ARM_64_NVIDIA" 77 | else: 78 | if arch == "arm64": 79 | return "AL2_ARM_64" 80 | return "AL2_x86_64" 81 | 82 | 83 | def create_vpc_endpoint_for_s3( 84 | vpc_id: str, route_table_ids: Input[Sequence[Input[str]]], region: str 85 | ) -> aws.ec2.VpcEndpoint: 86 | s3_service_name = f"com.amazonaws.{region}.s3" 87 | 88 | vpc_endpoint = aws.ec2.VpcEndpoint( 89 | "s3-vpc-endpoint", 90 | vpc_id=vpc_id, 91 | service_name=s3_service_name, 92 | route_table_ids=route_table_ids, 93 | ) 94 | return vpc_endpoint 95 | -------------------------------------------------------------------------------- /examples/invoice_extraction/serve.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import shutil 4 | import time 5 | from typing import Any, Dict, List 6 | from uuid import uuid4 7 | 8 | from fastapi import FastAPI, File, UploadFile 9 | from langchain.callbacks.base import BaseCallbackHandler 10 | from 
langchain_community.document_loaders import PyPDFLoader 11 | from langchain_core.prompts import PromptTemplate 12 | from output_parser import invoice_parser 13 | from vllm import Vllm 14 | 15 | LLM_URL = "http://llama2-7b-chat" 16 | 17 | port = int(os.getenv("PORT", 8080)) 18 | app = FastAPI( 19 | title="Invoice Extraction Server", 20 | ) 21 | logging.basicConfig( 22 | level=logging.INFO, 23 | format="%(asctime)s [%(levelname)s] %(message)s", 24 | handlers=[logging.StreamHandler()], 25 | ) 26 | 27 | 28 | class CustomHandler(BaseCallbackHandler): 29 | def on_llm_start( 30 | self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any 31 | ) -> Any: 32 | formatted_prompts = "\n".join(prompts) 33 | logging.info(f"Prompt:\n{formatted_prompts}") 34 | 35 | 36 | def extract(pdf_path: str) -> Dict[str, Any]: 37 | pdf_loader = PyPDFLoader(pdf_path) 38 | pages = pdf_loader.load_and_split() 39 | page_content = pages[0].page_content 40 | 41 | logging.info(f"Extracting from PDF: {pdf_path}") 42 | 43 | template = """ 44 | Extract all the following values: invoice number, invoice date, remit to company, remit to address, 45 | tax ID, invoice to customer, invoice to address, total amount from this invoice: {invoice_text} 46 | 47 | {format_instructions} 48 | 49 | Only return the extracted JSON object, don't say anything else. 50 | Don't say "Sure, here is the extracted JSON object based" or anything similar. 51 | """ 52 | 53 | chat_template = f"[INST] <<SYS>><</SYS>>\n\n{template} [/INST]\n" 54 | 55 | prompt = PromptTemplate( 56 | template=chat_template, 57 | input_variables=["invoice_text"], 58 | partial_variables={ 59 | "format_instructions": invoice_parser.get_format_instructions() 60 | }, 61 | ) 62 | 63 | llm = Vllm( 64 | model="llama2-7b-chat", 65 | model_url=LLM_URL, 66 | temperature=0, 67 | streaming=False, 68 | ) 69 | 70 | chain = prompt | llm | invoice_parser 71 | 72 | start_time = time.time() 73 | result = chain.invoke( 74 | {"invoice_text": page_content}, config={"callbacks": [CustomHandler()]} 75 | ) 76 | end_time = time.time() 77 | logging.info(f"Execution time: {end_time - start_time} seconds") 78 | return result.to_dict() 79 | 80 | 81 | @app.post("/extract_invoice") 82 | async def upload_file(file: UploadFile = File(...)) -> Any: 83 | unique_filename = str(uuid4()) 84 | tmp_file_path = f"/tmp/{unique_filename}" 85 | 86 | try: 87 | with open(tmp_file_path, "wb") as buffer: 88 | shutil.copyfileobj(file.file, buffer) 89 | 90 | return extract(tmp_file_path) 91 | finally: 92 | if os.path.exists(tmp_file_path): 93 | os.remove(tmp_file_path) 94 | 95 | 96 | if __name__ == "__main__": 97 | import uvicorn 98 | 99 | uvicorn.run(app, host="localhost", port=port) 100 | -------------------------------------------------------------------------------- /examples/website_rag/README.md: -------------------------------------------------------------------------------- 1 | ## website-rag 2 | This code provides an example of Question Answering (QA) over a website by leveraging LangChain. It begins by scraping the website to gather necessary data. The scraped text is then segmented into chunks. These chunks are transformed into embeddings, a numerical representation of the text data, which are then stored in a vector store, specifically Qdrant. 3 | 4 | These embeddings are used to facilitate Question Answering (QA) with the help of Llama2-7b. This allows for interactive querying of the stored data, providing a robust tool for website QA.
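The snippet below is a minimal sketch of the retrieval half of that flow, reusing this example's own `constants` and `embeddings` modules and the `langchain_documents` collection created at ingest time. Treat it as an illustration rather than the app's actual serving code.

```python
from constants import QDRANT_URL          # this example's Qdrant endpoint
from embeddings import LlamaEmbeddings    # the same embeddings used at ingest time
from langchain_community.vectorstores import Qdrant
from qdrant_client import QdrantClient

client = QdrantClient(url=QDRANT_URL, prefer_grpc=True)
store = Qdrant(
    client=client,
    collection_name="langchain_documents",
    embeddings=LlamaEmbeddings(),
)

# Fetch the chunks most similar to a question; the QA chain feeds these to Llama2-7b.
for doc in store.similarity_search("what is langchain", k=4):
    print(doc.metadata.get("source"), doc.page_content[:80])
```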
5 | 6 | ## Running the example 7 | 8 | To run the example, first install the necessary dependencies: 9 | ```bash 10 | pip install paka 11 | 12 | # Install the AWS CLI and ensure your AWS credentials are correctly configured. 13 | aws configure 14 | ``` 15 | 16 | ### Make sure the Docker daemon is running 17 | ```bash 18 | docker info 19 | ``` 20 | 21 | ### Provisioning the cluster 22 | 23 | ```bash 24 | cd examples/website_rag 25 | 26 | # Provision the cluster and update ~/.kube/config 27 | paka cluster up -f cluster.yaml 28 | ``` 29 | 30 | ### Scrape the website and create embeddings 31 | 32 | 33 | ```bash 34 | # Default BP_BUILDER is "paketobuildpacks/builder-jammy-base". 35 | # Here we use "paketobuildpacks/builder-jammy-full" to install sqlite. 36 | # `ingest` is the entrypoint for the container, which is defined in the Procfile. 37 | BP_BUILDER="paketobuildpacks/builder-jammy-full" paka run --entrypoint ingest --source . 38 | ``` 39 | 40 | The command above will scrape https://python.langchain.com/docs, chunk the text, and create embeddings through LangChain. Embeddings are created by a lightweight BERT model that is managed by a paka model group. The embeddings are then stored in a Qdrant cluster provisioned by paka. 41 | 42 | ### Run the serverless LangServe App 43 | 44 | ```bash 45 | # The command below will build the source and deploy it as a serverless function. 46 | BP_BUILDER="paketobuildpacks/builder-jammy-full" paka function deploy --name langchain-docs --source . --entrypoint serve 47 | 48 | # Or, without building from the source, you can deploy the pre-built image 49 | paka function deploy --name langchain-docs --image website_rag-latest --entrypoint serve 50 | ``` 51 | 52 | Check the statuses of the functions: 53 | ```bash 54 | paka function list 55 | ``` 56 | 57 | If everything is successful, you should see the function in the list with a status of "READY". By default, the function is exposed through a publicly accessible REST API endpoint. 58 | 59 | ### Query the website 60 | 61 | Do a similarity search by hitting the `/invoke` endpoint of the deployed function. 62 | 63 | ```bash 64 | curl -X POST -H "Content-Type: application/json" -d '{"input": "what is langchain"}' http://langchain-docs.default.xxxx.sslip.io/invoke 65 | ``` 66 | 67 | Ask a question by hitting the `/v2/invoke` endpoint of the deployed function. This will use the Llama2-7b model to answer the question. 68 | 69 | NOTE: The request may take a while to respond since, by default, we are asking the model to generate answers based on 4 documents. RetrievalQA cannot stream the response; everything has to be processed before the response is sent back. 70 | 71 | ```bash 72 | curl -X POST -H "Content-Type: application/json" -d '{"input": "what is langchain"}' http://langchain-docs.default.xxxx.sslip.io/v2/invoke 73 | ``` 74 | -------------------------------------------------------------------------------- /docs/faq.md: -------------------------------------------------------------------------------- 1 | # Frequently Asked Questions 2 | 3 | ### How to use Paka in a team? 4 | Paka is designed to handle cluster management in a team setting. To activate this feature, you'll need to establish a shared storage backend. This backend will hold the state data for cluster provisioning. 5 | 6 | A practical choice for this shared storage backend is AWS S3. You can set it up by using the `PULUMI_BACKEND_URL` environment variable. The format for this is `PULUMI_BACKEND_URL=s3://<bucket-name>/<path>`.
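For example, with a pre-created bucket (the names below are illustrative), every team member points Paka at the same backend before running cluster commands:

```bash
# Shared Pulumi state backend; replace the bucket and prefix with your own
export PULUMI_BACKEND_URL=s3://my-paka-state/pulumi
paka cluster up -f cluster.yaml
```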
7 | 8 | It's important to note that Paka uses Pulumi for cluster provisioning, hence the use of the Pulumi backend URL. 9 | 10 | ### How to run functions on dedicated nodes? 11 | To run functions on dedicated nodes, you can use the `function` field in the cluster spec. This field lets you define dedicated node groups (instance types, disk size, spot preference, and instance counts) that functions are scheduled on. 12 | 13 | ```yaml 14 | function: 15 | nodeGroups: 16 | - nodeTypes: ["t3a.large"] 17 | diskSize: 20 18 | isSpot: true 19 | minInstances: 1 20 | maxInstances: 3 21 | ``` 22 | 23 | ### How to run jobs on dedicated nodes? 24 | To run jobs on dedicated nodes, you can use the `job` field in the cluster spec. This field lets you define dedicated node groups that jobs are scheduled on. 25 | 26 | ```yaml 27 | job: 28 | enabled: true 29 | brokerStorageSize: 40Gi 30 | nodeGroups: 31 | - nodeTypes: ["t3a.large"] 32 | diskSize: 20 33 | isSpot: true 34 | minInstances: 1 35 | maxInstances: 3 36 | ``` 37 | 38 | ### How to monitor logs? 39 | For AWS deployments, logs are shipped to AWS CloudWatch. You can view the logs by navigating to the CloudWatch console and selecting the log group for the function you want to monitor. Alternatively, you can use the Stern CLI (https://github.com/stern/stern) to view the logs. 40 | 41 | To view the model logs, you can use the following command: 42 | ```bash 43 | stern --selector app=model-group 44 | ``` 45 | 46 | To view the function logs, you can use the following command: 47 | ```bash 48 | stern "my-app*" 49 | ``` 50 | 51 | ### How to scale the cluster? 52 | For model groups, you can scale the cluster by updating the `maxInstances` field in the cluster spec. This field specifies the maximum number of instances that can be created for the model group. Then set up appropriate auto-scaling triggers. 53 | 54 | Scaling by CPU utilization: 55 | ```yaml 56 | modelGroups: 57 | - name: auto-scale-model 58 | minInstances: 1 59 | maxInstances: 3 60 | ... 61 | autoScaleTriggers: 62 | - type: cpu 63 | metadata: 64 | type: Utilization 65 | value: "50" 66 | ``` 67 | 68 | Scaling by Prometheus metrics: 69 | ```yaml 70 | prometheus: 71 | enabled: true 72 | modelGroups: 73 | - name: auto-scale-model 74 | minInstances: 1 75 | maxInstances: 3 76 | ... 77 | autoScaleTriggers: 78 | - type: prometheus 79 | metadata: 80 | serverAddress: http://kube-prometheus-stack-prometheus.prometheus.svc.cluster.local:9090 # Prometheus endpoint 81 | metricName: latency_p95 82 | threshold: '20000' # Set to 20s, tune as needed 83 | query: | # Trigger scaling if p95 latency exceeds 20s 84 | histogram_quantile(0.95, sum(rate(istio_request_duration_milliseconds_bucket{destination_service="llama2-7b-chat.default.svc.cluster.local"}[5m])) by (le)) 85 | ``` 86 | 87 | For functions, you can adjust the scaling parameters as you deploy the function. 88 | 89 | ```bash 90 | paka function deploy --name my-function --source . --entrypoint serve --min-instances 1 --max-instances 3 --scaling-metric concurrency --metric_target 2 91 | ``` 92 | -------------------------------------------------------------------------------- /docs/quick_start.md: -------------------------------------------------------------------------------- 1 | Since Paka currently only supports AWS, the quick start guide will be tailored to AWS. 2 | 3 | ### Install the necessary dependencies 4 | - Install the Docker daemon and CLI. 5 | - Install the AWS CLI and ensure your AWS credentials are correctly configured. 6 | ```bash 7 | aws configure 8 | ``` 9 | - Install Paka.
10 | ```bash 11 | pip install paka 12 | ``` 13 | 14 | ### Request GPU quota increase 15 | Go to the AWS console and request a quota increase. Beware that there are two types of quotas: On-Demand and Spot. The On-Demand quota is the number of instances that are not preemptible, while the Spot quota is the number of instances that can be preempted. Spot instances are cheaper than On-Demand instances. 16 | 17 | Paka supports mixed instance types, so you can use spot instances for cost savings and on-demand instances as a fallback. 18 | 19 | ### Create a cluster config file 20 | Create a `cluster.yaml` file (the name can be anything). See [cluster.yaml](https://github.com/jjleng/paka/blob/main/examples/invoice_extraction/cluster.yaml) as an example. Refer to the [cluster config](https://github.com/jjleng/paka/blob/main/docs/cluster_config.md) for the fields that can be included in the cluster config file. 21 | 22 | ### Provision the cluster 23 | Provision the cluster with the following command: 24 | ```bash 25 | paka cluster up -f cluster.yaml 26 | ``` 27 | 28 | ### Build an LLM-powered application 29 | Create an application skeleton. See [invoice_extraction](https://github.com/jjleng/paka/tree/main/examples/invoice_extraction) as an example. Ensure the following files are included in your application root directory: 30 | 31 | - **Procfile**: Defines the entrypoint for your application. See [Procfile](https://github.com/jjleng/paka/blob/main/examples/invoice_extraction/Procfile). 32 | - **.cnignore file**: Excludes any files that shouldn't be included in the build. See [.cnignore](https://github.com/jjleng/paka/blob/main/examples/invoice_extraction/.cnignore). 33 | - **runtime.txt**: Pins the version of the runtime your application uses. See [runtime.txt](https://github.com/jjleng/paka/blob/main/examples/invoice_extraction/runtime.txt). 34 | - **requirements.txt or package.json**: Lists all necessary packages for your application. 35 | 36 | 37 | ### Deploy the application 38 | ```bash 39 | paka function deploy --name APP_NAME --source . --entrypoint ENTRYPOINT_NAME 40 | ``` 41 | 42 | APP_NAME is the name of the application. The command above will build the source and deploy it as a serverless function. 43 | `--source` specifies the source directory of the application. 44 | `--entrypoint` specifies the entrypoint of the application, which is defined in the Procfile. 45 | 46 | ### Check the logs 47 | For AWS deployments, logs are shipped to AWS CloudWatch. You can view the logs by navigating to the CloudWatch console and selecting the log group for the function you want to monitor. Alternatively, you can use the Stern CLI (https://github.com/stern/stern) to view the logs. 48 | 49 | To view the model logs, you can use the following command: 50 | ```bash 51 | stern --selector app=model-group 52 | ``` 53 | 54 | To view the function logs, you can use the following command: 55 | ```bash 56 | stern "my-app*" 57 | ``` 58 | 59 | ### Continuous Integration/Deployment 60 | You can set up a CI/CD pipeline to automate the deployment process. For example, you can use GitHub Actions to build and deploy the application on every push to the main branch. To deploy local changes to the cloud, you can simply run the deploy command again. 61 | 62 | ```bash 63 | paka function deploy --name APP_NAME --source .
--entrypoint ENTRYPOINT_NAME 64 | ``` 65 | 66 | ### Tear down the cluster 67 | ```bash 68 | paka cluster down -f cluster.yaml -y 69 | ``` 70 | -------------------------------------------------------------------------------- /paka/cluster/nvidia_device_plugin.py: -------------------------------------------------------------------------------- 1 | import pulumi 2 | import pulumi_kubernetes as k8s 3 | 4 | from paka.cluster.context import Context 5 | from paka.utils import call_once 6 | 7 | 8 | @call_once 9 | def install_nvidia_device_plugin(ctx: Context, version: str = "v0.15.0-rc.2") -> None: 10 | """ 11 | Installs the NVIDIA device plugin for GPU support in the cluster. 12 | 13 | This function deploys the NVIDIA device plugin to the cluster using a DaemonSet. 14 | The device plugin allows Kubernetes to discover and manage GPU resources on the nodes. 15 | 16 | Args: 17 | ctx (Context): The cluster context, whose Kubernetes provider is used to deploy the device plugin. 18 | version (str): The image tag of the NVIDIA device plugin to deploy. 19 | Returns: 20 | None 21 | """ 22 | 23 | k8s.apps.v1.DaemonSet( 24 | "nvidia-device-plugin-daemonset", 25 | metadata=k8s.meta.v1.ObjectMetaArgs( 26 | namespace="kube-system", 27 | ), 28 | spec=k8s.apps.v1.DaemonSetSpecArgs( 29 | selector=k8s.meta.v1.LabelSelectorArgs( 30 | match_labels={ 31 | "name": "nvidia-device-plugin-ds", 32 | }, 33 | ), 34 | update_strategy=k8s.apps.v1.DaemonSetUpdateStrategyArgs( 35 | type="RollingUpdate", 36 | ), 37 | template=k8s.core.v1.PodTemplateSpecArgs( 38 | metadata=k8s.meta.v1.ObjectMetaArgs( 39 | labels={ 40 | "name": "nvidia-device-plugin-ds", 41 | }, 42 | ), 43 | spec=k8s.core.v1.PodSpecArgs( 44 | tolerations=[ 45 | k8s.core.v1.TolerationArgs( 46 | key="nvidia.com/gpu", 47 | operator="Exists", 48 | effect="NoSchedule", 49 | ), 50 | k8s.core.v1.TolerationArgs(operator="Exists"), 51 | ], 52 | priority_class_name="system-node-critical", 53 | containers=[ 54 | k8s.core.v1.ContainerArgs( 55 | image=f"nvcr.io/nvidia/k8s-device-plugin:{version}", 56 | name="nvidia-device-plugin-ctr", 57 | env=[ 58 | k8s.core.v1.EnvVarArgs( 59 | name="FAIL_ON_INIT_ERROR", 60 | value="false", 61 | ) 62 | ], 63 | security_context=k8s.core.v1.SecurityContextArgs( 64 | allow_privilege_escalation=False, 65 | capabilities=k8s.core.v1.CapabilitiesArgs( 66 | drop=["ALL"], 67 | ), 68 | ), 69 | volume_mounts=[ 70 | k8s.core.v1.VolumeMountArgs( 71 | name="device-plugin", 72 | mount_path="/var/lib/kubelet/device-plugins", 73 | ) 74 | ], 75 | ) 76 | ], 77 | volumes=[ 78 | k8s.core.v1.VolumeArgs( 79 | name="device-plugin", 80 | host_path=k8s.core.v1.HostPathVolumeSourceArgs( 81 | path="/var/lib/kubelet/device-plugins", 82 | ), 83 | ) 84 | ], 85 | ), 86 | ), 87 | ), 88 | opts=pulumi.ResourceOptions(provider=ctx.k8s_provider), 89 | ) 90 | -------------------------------------------------------------------------------- /paka/cli/model_group.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | from typing import Optional, Set 5 | 6 | import boto3 7 | import typer 8 | from kubernetes import client 9 | from tabulate import tabulate 10 | 11 | from paka.cli.utils import ( 12 | ensure_cluster_name, 13 | get_cluster_namespace, 14 | load_kubeconfig, 15 | read_pulumi_stack, 16 | ) 17 | from paka.k8s.model_group.service import MODEL_PATH_PREFIX, filter_services 18 | from paka.logger import logger 19 | 20 | model_group_app = typer.Typer() 21 | 22 | 23 | @model_group_app.command() 24 | def list_downloaded_models( 25 | cluster_name: Optional[str] =
typer.Option( 26 | os.getenv("PAKA_CURRENT_CLUSTER"), 27 | "--cluster", 28 | "-c", 29 | help="The name of the cluster.", 30 | ), 31 | ) -> None: 32 | """ 33 | List all models that have been downloaded to the object store. 34 | """ 35 | load_kubeconfig(cluster_name) 36 | cluster_name = ensure_cluster_name(cluster_name) 37 | bucket = read_pulumi_stack(cluster_name, "bucket") 38 | 39 | s3 = boto3.client("s3") 40 | response = s3.list_objects_v2(Bucket=bucket, Prefix=MODEL_PATH_PREFIX) 41 | if "Contents" in response: 42 | unique_models: Set[str] = set() 43 | for obj in response["Contents"]: 44 | key = obj["Key"] 45 | if key.startswith(f"{MODEL_PATH_PREFIX}/"): 46 | key = key[len(f"{MODEL_PATH_PREFIX}/") :] 47 | 48 | key = key.split("/")[0] 49 | unique_models.add(key) 50 | 51 | for key in unique_models: 52 | logger.info(key) 53 | else: 54 | logger.info("No models found.") 55 | 56 | 57 | @model_group_app.command() 58 | def list( 59 | cluster_name: Optional[str] = typer.Option( 60 | os.getenv("PAKA_CURRENT_CLUSTER"), 61 | "--cluster", 62 | "-c", 63 | help="The name of the cluster.", 64 | ), 65 | ) -> None: 66 | """ 67 | List all model groups. 68 | """ 69 | load_kubeconfig(cluster_name) 70 | services = filter_services(get_cluster_namespace(cluster_name)) 71 | 72 | # Get public model groups 73 | public_model_groups = [ 74 | service.spec.selector.get("model", "") 75 | for service in services 76 | if service.spec 77 | and service.spec.selector 78 | and "model" in service.spec.selector 79 | and service.metadata 80 | and service.metadata.labels 81 | and (service.metadata.labels.get("is-public", "false") == "true") 82 | ] 83 | 84 | # Get private model groups 85 | private_model_groups = [ 86 | service.spec.selector.get("model", "") 87 | for service in services 88 | if service.spec 89 | and service.spec.selector 90 | and "model" in service.spec.selector 91 | and service.metadata 92 | and service.metadata.labels 93 | and (service.metadata.labels.get("is-public", "false") == "false") 94 | ] 95 | 96 | model_groups = public_model_groups + private_model_groups 97 | 98 | v1 = client.CoreV1Api() 99 | cfg = v1.read_namespaced_config_map("config-domain", "knative-serving") 100 | cfg_data = cfg.data or {} 101 | filtered_keys = [key for key in cfg_data if key.endswith("sslip.io")] 102 | if not filtered_keys: 103 | if not model_groups: 104 | logger.info("No model groups found.") 105 | else: 106 | logger.info("\n".join(model_groups)) 107 | return 108 | domain = filtered_keys[0] 109 | 110 | table = [(group, f"http://{group}.{domain}") for group in public_model_groups] 111 | table.extend([(group, f"private") for group in private_model_groups]) 112 | logger.info(tabulate(table, headers=["Model Group", "Endpoint"])) 113 | -------------------------------------------------------------------------------- /paka/cli/run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shlex 3 | from typing import Optional 4 | 5 | import typer 6 | from kubernetes import client 7 | 8 | from paka.cli.utils import get_cluster_namespace, load_kubeconfig, resolve_image 9 | from paka.k8s.utils import tail_logs 10 | from paka.logger import logger 11 | from paka.utils import kubify_name, random_str 12 | 13 | CLEANUP_TIMEOUT = 600 # 10 minutes 14 | 15 | run_app = typer.Typer() 16 | 17 | 18 | @run_app.callback(invoke_without_command=True) 19 | def one_off_script( 20 | cluster_name: Optional[str] = typer.Option( 21 | os.getenv("PAKA_CURRENT_CLUSTER"), 22 | "--cluster", 23 | "-c", 24 | help="The 
name of the cluster.", 25 | ), 26 | entrypoint: str = typer.Option( 27 | ..., 28 | "--entrypoint", 29 | help="The entrypoint of the application. This refers to the command " 30 | "defined in the Procfile that will be executed.", 31 | ), 32 | source_dir: Optional[str] = typer.Option( 33 | None, 34 | "--source", 35 | help="The directory containing the source code of the application. If " 36 | "specified, a new Docker image will be built using the source code from " 37 | "this directory. A Dockerfile is not required because the build process " 38 | "uses Cloud Native's Buildpacks, which automatically detect and install " 39 | "dependencies.", 40 | ), 41 | image: Optional[str] = typer.Option( 42 | None, 43 | "--image", 44 | help="The name of the Docker image to deploy. If both an image and a " 45 | "source directory are provided, this image will be used and the source " 46 | "directory will be ignored.", 47 | ), 48 | ) -> None: 49 | """ 50 | Runs a one-off script. 51 | 52 | This command creates a new Kubernetes job that runs the specified entrypoint command 53 | in a container with the specified Docker image. If a source directory is provided, a new 54 | Docker image is built using the source code from that directory. 55 | """ 56 | load_kubeconfig(cluster_name) 57 | resolved_image = resolve_image(cluster_name, image, source_dir) 58 | 59 | # Generate a unique job name using a random suffix 60 | job_name = f"run-{kubify_name(random_str(10))}" 61 | 62 | job = client.V1Job( 63 | api_version="batch/v1", 64 | kind="Job", 65 | metadata=client.V1ObjectMeta(name=job_name, labels={"job-name": job_name}), 66 | spec=client.V1JobSpec( 67 | template=client.V1PodTemplateSpec( 68 | spec=client.V1PodSpec( 69 | containers=[ 70 | client.V1Container( 71 | name="one-off-script", 72 | image=resolved_image, 73 | image_pull_policy="Always", 74 | command=shlex.split(entrypoint), 75 | ) 76 | ], 77 | restart_policy="Never", 78 | ) 79 | ), 80 | backoff_limit=0, 81 | ttl_seconds_after_finished=CLEANUP_TIMEOUT, 82 | ), 83 | ) 84 | 85 | namespace = get_cluster_namespace(cluster_name) 86 | 87 | logger.info("Submitting the task...") 88 | batch_api = client.BatchV1Api() 89 | batch_api.create_namespaced_job(namespace=namespace, body=job) 90 | logger.info("Successfully submitted the task.") 91 | 92 | logger.info("Waiting for the task to complete...") 93 | api = client.CoreV1Api() 94 | pods = api.list_namespaced_pod( 95 | namespace=namespace, label_selector=f"job-name={job_name}" 96 | ) 97 | for pod in pods.items: 98 | if pod.metadata and pod.metadata.name: 99 | tail_logs(namespace, pod.metadata.name, "one-off-script") 100 | -------------------------------------------------------------------------------- /paka/cluster/manager/base.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC, abstractmethod 4 | from functools import cached_property 5 | from typing import Any 6 | 7 | from pulumi import automation as auto 8 | 9 | from paka.cluster.context import Context 10 | from paka.cluster.pulumi import ensure_pulumi 11 | from paka.config import CloudConfig, Config 12 | from paka.constants import PULUMI_STACK_NAME 13 | from paka.k8s.model_group.service import ( 14 | cleanup_staled_model_group_services, 15 | create_model_group_service, 16 | ) 17 | from paka.k8s.model_group.service_v1 import ( 18 | create_model_group_service as create_model_group_service_v1, 19 | ) 20 | from paka.logger import logger 21 | 22 | STACK_NAME = "default" 23
| 24 | 25 | class ClusterManager(ABC): 26 | """ 27 | Abstract base class for a cluster manager. 28 | 29 | A ClusterManager is responsible for managing a cluster of compute resources. 30 | 31 | Subclasses must implement the abstract methods defined in this class. 32 | """ 33 | 34 | config: Config 35 | cloud_config: CloudConfig 36 | 37 | def __init__(self, config: Config) -> None: 38 | self.config = config 39 | if not config.aws is None: 40 | self.cloud_config = config.aws 41 | self.ctx = Context() 42 | self.ctx.set_config(config) 43 | 44 | @abstractmethod 45 | def provision_k8s(self) -> None: 46 | pass 47 | 48 | def _stack_for_program(self, program: auto.PulumiFn) -> auto.Stack: 49 | return auto.create_or_select_stack( 50 | stack_name=PULUMI_STACK_NAME, 51 | project_name=self.cloud_config.cluster.name, 52 | program=program, 53 | ) 54 | 55 | @cached_property 56 | def _stack(self) -> auto.Stack: 57 | ensure_pulumi() 58 | 59 | def program() -> None: 60 | self.provision_k8s() 61 | 62 | return self._stack_for_program(program) 63 | 64 | def create(self) -> None: 65 | if self.config.aws is None: 66 | raise ValueError("Only AWS is supported.") 67 | 68 | if self.config.aws: 69 | self._stack.set_config( 70 | "aws:region", auto.ConfigValue(value=self.cloud_config.cluster.region) 71 | ) 72 | 73 | logger.info("Creating resources...") 74 | self._stack.up(on_output=logger.info) 75 | 76 | if ( 77 | self.cloud_config.modelGroups is None 78 | and self.cloud_config.mixedModelGroups is None 79 | ): 80 | return 81 | 82 | namespace = self.cloud_config.cluster.namespace 83 | 84 | # Clean up staled model group resources before creating new ones 85 | model_group_names = [mg.name for mg in self.config.aws.modelGroups or []] 86 | mixed_model_group_names = [ 87 | mg.name for mg in self.config.aws.mixedModelGroups or [] 88 | ] 89 | all_group_names = model_group_names + mixed_model_group_names 90 | 91 | cleanup_staled_model_group_services(namespace, all_group_names) 92 | # TODO: We should clean up deployment as well 93 | 94 | for model_group in self.cloud_config.modelGroups or []: 95 | create_model_group_service(self.ctx, namespace, model_group) 96 | 97 | for mixed_model_group in self.cloud_config.mixedModelGroups or []: 98 | create_model_group_service_v1(self.ctx, namespace, mixed_model_group) 99 | 100 | def destroy(self) -> Any: 101 | logger.info("Destroying resources...") 102 | return self._stack.destroy(on_output=logger.info) 103 | 104 | def refresh(self) -> None: 105 | logger.info("Refreshing the stack...") 106 | self._stack.refresh(on_output=logger.info) 107 | 108 | def preview(self, *args: Any, **kwargs: Any) -> None: 109 | if not "on_output" in kwargs: 110 | kwargs["on_output"] = logger.info 111 | self._stack.preview(*args, **kwargs) 112 | -------------------------------------------------------------------------------- /tests/k8s/test_utils.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock, patch 2 | 3 | from kubernetes.client.exceptions import ApiException 4 | 5 | import paka.k8s.utils 6 | from paka.k8s.utils import KubeconfigMerger, KubernetesResource, apply_resource 7 | 8 | 9 | def test_apply_resource() -> None: 10 | resource = MagicMock(spec=KubernetesResource) 11 | resource.kind = "Deployment" 12 | resource.metadata = MagicMock() 13 | resource.metadata.name = "test" 14 | resource.metadata.namespace = "default" 15 | 16 | with patch("kubernetes.client.AppsV1Api") as mock_api_class: 17 | mock_api = mock_api_class.return_value 18 | 
mock_api.create_namespaced_deployment = MagicMock() 19 | mock_api.replace_namespaced_deployment = MagicMock() 20 | mock_api.read_namespaced_deployment = MagicMock( 21 | side_effect=ApiException(status=404) 22 | ) 23 | 24 | apply_resource(resource) 25 | 26 | mock_api.create_namespaced_deployment.assert_called_once_with( 27 | resource.metadata.namespace, resource 28 | ) 29 | 30 | 31 | def test_apply_resource_existing() -> None: 32 | resource = MagicMock(spec=KubernetesResource) 33 | resource.kind = "Deployment" 34 | resource.metadata = MagicMock() 35 | resource.metadata.name = "test" 36 | resource.metadata.namespace = "default" 37 | 38 | with patch("kubernetes.client.AppsV1Api") as mock_api_class: 39 | mock_api = mock_api_class.return_value 40 | mock_api.create_namespaced_deployment = MagicMock() 41 | mock_api.replace_namespaced_deployment = MagicMock() 42 | mock_api.read_namespaced_deployment = MagicMock() 43 | 44 | apply_resource(resource) 45 | 46 | mock_api.replace_namespaced_deployment.assert_called_once_with( 47 | resource.metadata.name, resource.metadata.namespace, resource 48 | ) 49 | 50 | 51 | def test_apply_resource_scaled_object() -> None: 52 | resource = MagicMock() 53 | resource.kind = "ScaledObject" 54 | resource.metadata = MagicMock() 55 | resource.metadata.name = "test" 56 | resource.metadata.namespace = "default" 57 | 58 | with patch.object( 59 | paka.k8s.utils, "create_namespaced_custom_object" 60 | ) as mock_create, patch.object( 61 | paka.k8s.utils, "read_namespaced_custom_object" 62 | ) as mock_read: 63 | mock_read.side_effect = ApiException(status=404) 64 | 65 | apply_resource(resource) 66 | 67 | mock_create.assert_called_once_with(resource.metadata.namespace, resource) 68 | 69 | 70 | def test_kubeconfig_merger() -> None: 71 | # Initialize a KubeconfigMerger object with some initial config 72 | merger = KubeconfigMerger( 73 | { 74 | "clusters": [{"name": "cluster1", "data": "data1"}], 75 | "users": [{"name": "user1", "data": "data1"}], 76 | "contexts": [{"name": "context1", "data": "data1"}], 77 | "current-context": "context1", 78 | "other-key": "other-value", 79 | } 80 | ) 81 | 82 | # Define a new config to be merged 83 | new_config = { 84 | "clusters": [{"name": "cluster2", "data": "data2"}], 85 | "users": [{"name": "user2", "data": "data2"}], 86 | "contexts": [{"name": "context2", "data": "data2"}], 87 | "current-context": "context2", 88 | "other-key": "other-value2", 89 | } 90 | 91 | merger.merge(new_config) 92 | 93 | assert merger.config == { 94 | "clusters": [ 95 | {"name": "cluster1", "data": "data1"}, 96 | {"name": "cluster2", "data": "data2"}, 97 | ], 98 | "users": [ 99 | {"name": "user1", "data": "data1"}, 100 | {"name": "user2", "data": "data2"}, 101 | ], 102 | "contexts": [ 103 | {"name": "context1", "data": "data1"}, 104 | {"name": "context2", "data": "data2"}, 105 | ], 106 | "current-context": "context2", 107 | "other-key": "other-value2", 108 | } 109 | -------------------------------------------------------------------------------- /paka/cluster/pulumi.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import shutil 4 | import tarfile 5 | import zipfile 6 | from pathlib import Path 7 | 8 | import requests 9 | 10 | from paka.cluster.kubectl import ensure_kubectl 11 | from paka.logger import logger 12 | from paka.utils import calculate_sha256, download_url, get_project_data_dir 13 | 14 | # Pin the Pulumi version to avoid breaking changes 15 | PULUMI_VERSION = "v3.114.0" 16 | 17 | 18 | 
def change_permissions_recursive(path: Path, mode: int) -> None: 19 | for child in path.iterdir(): 20 | if child.is_file(): 21 | child.chmod(mode) 22 | elif child.is_dir(): 23 | child.chmod(mode) 24 | change_permissions_recursive(child, mode) 25 | 26 | 27 | def ensure_pulumi() -> None: 28 | # The Pulumi Kubernetes provider requires kubectl to be installed 29 | ensure_kubectl() 30 | paka_home = Path(get_project_data_dir()) 31 | 32 | bin_dir = paka_home / "bin" 33 | bin_dir.mkdir(parents=True, exist_ok=True) 34 | 35 | system = platform.system().lower() 36 | arch = platform.machine().lower() 37 | 38 | current_path = os.environ.get("PATH", "") 39 | 40 | pulumi_files = list(bin_dir.glob("pulumi-*")) 41 | if pulumi_files: 42 | os.environ["PATH"] = f"{pulumi_files[0]}{os.pathsep}{current_path}" 43 | return 44 | 45 | pulumi_version = PULUMI_VERSION 46 | 47 | new_pulumi_path = bin_dir / f"pulumi-{pulumi_version}" 48 | 49 | if arch in ["amd64", "x86_64"]: 50 | arch = "x64" 51 | elif arch == "arm64": 52 | arch = "arm64" 53 | else: 54 | raise Exception(f"Unsupported architecture: {arch}") 55 | 56 | pulumi_file = f"pulumi-{pulumi_version}-{system}-{arch}" 57 | 58 | if system == "windows": 59 | pulumi_file = f"{pulumi_file}.zip" 60 | else: 61 | pulumi_file = f"{pulumi_file}.tar.gz" 62 | 63 | # First, download the checksum file 64 | checksum_url = f"https://github.com/pulumi/pulumi/releases/download/{pulumi_version}/pulumi-{pulumi_version[1:]}-checksums.txt" 65 | 66 | response = requests.get(checksum_url) 67 | response.raise_for_status() 68 | file_sha256_dict = {} 69 | # Iterate over the lines in the checksum file, splitting each into sha256 and filename 70 | for line in response.text.strip().split("\n"): 71 | expected_sha256, filename = line.strip().split() 72 | file_sha256_dict[filename] = expected_sha256 73 | 74 | url = f"https://github.com/pulumi/pulumi/releases/download/{pulumi_version}/{pulumi_file}" 75 | 76 | logger.info(f"Downloading {pulumi_file}...") 77 | 78 | with download_url(url) as archive_file: 79 | archive_file_sha256 = calculate_sha256(archive_file) 80 | 81 | if pulumi_file not in file_sha256_dict: 82 | raise Exception(f"SHA256 not found for {pulumi_file}") 83 | 84 | expected_sha256 = file_sha256_dict[pulumi_file] 85 | 86 | if archive_file_sha256 != expected_sha256: 87 | raise Exception( 88 | f"SHA256 mismatch: {archive_file_sha256} != {expected_sha256}" 89 | ) 90 | 91 | if system == "windows": 92 | with zipfile.ZipFile(archive_file, "r") as zip_ref: 93 | zip_ref.extractall(bin_dir) 94 | else: 95 | with tarfile.open(archive_file, "r:gz") as tar: 96 | tar.extractall(bin_dir) 97 | 98 | pulumi_path = bin_dir / "pulumi" 99 | change_permissions_recursive(pulumi_path, 0o755) 100 | pulumi_path = pulumi_path.rename(new_pulumi_path) 101 | 102 | # On Windows, the Pulumi binary is under pulumi_path/bin. 103 | # On other platforms, it is directly under pulumi_path. 104 | if system == "windows": 105 | windows_bin_path = pulumi_path / "bin" 106 | for file in windows_bin_path.iterdir(): 107 | if file.is_file(): 108 | shutil.move(str(file), str(pulumi_path)) 109 | 110 | logger.info("Pulumi installed successfully.") 111 | 112 | os.environ["PATH"] = f"{pulumi_path}{os.pathsep}{current_path}" 113 | -------------------------------------------------------------------------------- /paka/container/ecr.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import subprocess 3 | 4 | import boto3 5 | 6 | from paka.logger import logger 7 | from paka.utils
import random_str 8 | 9 | 10 | def authenticate_docker_to_ecr(aws_region: str) -> str: 11 | try: 12 | ecr_client = boto3.client("ecr", region_name=aws_region) 13 | token = ecr_client.get_authorization_token() 14 | username, password = ( 15 | base64.b64decode(token["authorizationData"][0]["authorizationToken"]) 16 | .decode("utf-8") 17 | .split(":") 18 | ) 19 | ecr_url = token["authorizationData"][0]["proxyEndpoint"] 20 | 21 | p = subprocess.Popen( 22 | ["docker", "login", "-u", username, "--password-stdin", ecr_url], 23 | stdin=subprocess.PIPE, 24 | stdout=subprocess.PIPE, 25 | stderr=subprocess.PIPE, 26 | ) 27 | stdout, stderr = p.communicate(input=password.encode()) 28 | if p.returncode != 0: 29 | raise Exception(f"Docker login failed: {stderr.decode()}") 30 | 31 | return ecr_url 32 | except Exception as e: 33 | logger.error(f"An error occurred: {e}") 34 | raise 35 | 36 | 37 | def push_to_ecr( 38 | local_image_name: str, repository_uri: str, aws_region: str, app_name: str 39 | ) -> str: 40 | """ 41 | Pushes a Docker image to an Amazon ECR repository. 42 | 43 | This function tags the Docker image with a version tag and the "latest" tag, 44 | logs in to the ECR repository, and pushes the image to the repository. 45 | The version tag is generated randomly. 46 | 47 | All applications share the same container registry repository. 48 | To differentiate between them, we append the application name to the image tag. 49 | The '-latest' suffix is added to handle cases where applications themselves are tagged. 50 | This ensures that even tagged applications have a unique identifier in the shared repository. 51 | 52 | Args: 53 | local_image_name (str): The name of the Docker image to push. 54 | repository_uri (str): The URI of the ECR repository to push the image to. 55 | aws_region (str): The AWS region where the ECR repository is located. 56 | app_name (str): The name of the application. Used to generate the image tags. 57 | 58 | Raises: 59 | subprocess.CalledProcessError: If an error occurs while executing a subprocess command. 60 | 61 | Returns: 62 | str: The version tag of the image that was pushed.
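Example (illustrative; the repository URI and region below are placeholders): push_to_ecr("my-app", "<account-id>.dkr.ecr.us-west-2.amazonaws.com/my-cluster", "us-west-2", "my-app")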
63 |     """ 64 |     try: 65 |         # Generate a random version number 66 |         version = random_str() 67 | 68 |         # Tag the image with the repository URI and the version tag 69 |         version_tag = f"{app_name}-v{version}" 70 | 71 |         local_image_tagged = ( 72 |             f"{local_image_name}:latest" 73 |             if ":" not in local_image_name 74 |             else local_image_name 75 |         ) 76 | 77 |         # Apply the version tag 78 |         subprocess.run( 79 |             [ 80 |                 "docker", 81 |                 "tag", 82 |                 local_image_tagged, 83 |                 f"{repository_uri}:{version_tag}", 84 |             ], 85 |             check=True, 86 |         ) 87 | 88 |         # Tag the image with the repository URI and the "latest" tag 89 |         latest_tag = f"{app_name}-latest" 90 |         subprocess.run( 91 |             [ 92 |                 "docker", 93 |                 "tag", 94 |                 local_image_tagged, 95 |                 f"{repository_uri}:{latest_tag}", 96 |             ], 97 |             check=True, 98 |         ) 99 | 100 |         # Authenticate Docker to the ECR 101 |         authenticate_docker_to_ecr(aws_region) 102 | 103 |         # Push the image to the ECR repository 104 |         subprocess.run( 105 |             ["docker", "push", f"{repository_uri}:{version_tag}"], check=True 106 |         ) 107 |         subprocess.run(["docker", "push", f"{repository_uri}:{latest_tag}"], check=True) 108 | 109 |         logger.info(f"Successfully pushed {local_image_name} to {repository_uri}") 110 |         return version_tag 111 |     except subprocess.CalledProcessError as e: 112 |         logger.error(f"An error occurred: {e}") 113 |         raise 114 | -------------------------------------------------------------------------------- /paka/cluster/aws/cluster_autoscaler.py: -------------------------------------------------------------------------------- 1 | import pulumi 2 | import pulumi_aws as aws 3 | import pulumi_eks as eks 4 | import pulumi_kubernetes.helm.v3 as helm 5 | from pulumi_kubernetes.core.v1 import ConfigMap 6 | 7 | from paka.cluster.aws.utils import odic_role_for_sa 8 | from paka.cluster.context import Context 9 | from paka.utils import call_once, to_yaml 10 | 11 | 12 | def create_priority_expander(ctx: Context) -> ConfigMap: 13 |     # Create a priority expander to ensure that the cluster autoscaler provisions spot instances first. 14 |     priority_data = {10: [".*spot.*"], 1: [".*"]} 15 |     return ConfigMap( 16 |         "cluster-autoscaler-priority-expander", 17 |         metadata={ 18 |             "name": "cluster-autoscaler-priority-expander", 19 |             "namespace": "kube-system", 20 |         }, 21 |         data={ 22 |             "priorities": to_yaml(priority_data), 23 |         }, 24 |         opts=pulumi.ResourceOptions(provider=ctx.k8s_provider), 25 |     ) 26 | 27 | 28 | @call_once 29 | def create_cluster_autoscaler( 30 |     ctx: Context, 31 |     cluster: eks.Cluster, 32 | ) -> None: 33 |     """ 34 |     Sets up the cluster autoscaler for an EKS cluster. 35 | 36 |     Args: 37 |         ctx (Context): The context, which provides the cluster name, region, 38 |             and the Kubernetes provider. 39 |         cluster (eks.Cluster): The EKS cluster.
40 | 41 | Returns: 42 | None 43 | """ 44 | cluster_name = ctx.cluster_name 45 | 46 | autoscaler_policy_doc = aws.iam.get_policy_document( 47 | statements=[ 48 | aws.iam.GetPolicyDocumentStatementArgs( 49 | actions=[ 50 | "autoscaling:DescribeAutoScalingGroups", 51 | "autoscaling:DescribeAutoScalingInstances", 52 | "autoscaling:DescribeLaunchConfigurations", 53 | "autoscaling:DescribeTags", 54 | "autoscaling:SetDesiredCapacity", 55 | "autoscaling:TerminateInstanceInAutoScalingGroup", 56 | "ec2:DescribeLaunchTemplateVersions", 57 | "eks:DescribeNodegroup", 58 | "ec2:GetInstanceTypesFromInstanceRequirements", 59 | "ec2:DescribeImages", 60 | ], 61 | resources=["*"], 62 | ) 63 | ] 64 | ) 65 | 66 | autoscaler_policy = aws.iam.Policy( 67 | f"{cluster_name}-autoscaler-policy", policy=autoscaler_policy_doc.json 68 | ) 69 | 70 | # The OIDC provider is required because the cluster autoscaler runs within the Kubernetes 71 | # cluster and needs to interact with the AWS API to manage the Auto Scaling Groups (ASGs). 72 | # OIDC provides a secure mechanism for the cluster autoscaler to authenticate with the AWS API. 73 | autoscaler_role = odic_role_for_sa( 74 | ctx, cluster, "autoscaler", "kube-system:cluster-autoscaler" 75 | ) 76 | 77 | aws.iam.RolePolicyAttachment( 78 | f"{cluster_name}-autoscaler-role-policy-attachment", 79 | policy_arn=autoscaler_policy.arn, 80 | role=autoscaler_role.name, 81 | ) 82 | 83 | expander = create_priority_expander(ctx) 84 | 85 | helm.Chart( 86 | "cluster-autoscaler", 87 | helm.ChartOpts( 88 | chart="cluster-autoscaler", 89 | version="9.34.0", 90 | namespace="kube-system", 91 | fetch_opts=helm.FetchOpts(repo="https://kubernetes.github.io/autoscaler"), 92 | values={ 93 | "autoDiscovery": {"clusterName": cluster.eks_cluster.name}, 94 | "awsRegion": ctx.region, 95 | "rbac": { 96 | "create": True, 97 | "serviceAccount": { 98 | "create": True, 99 | "name": "cluster-autoscaler", 100 | "annotations": { 101 | "eks.amazonaws.com/role-arn": autoscaler_role.arn 102 | }, 103 | }, 104 | }, 105 | "serviceMonitor": {"interval": "2s"}, 106 | "image": {"tag": "v1.28.2"}, 107 | "extraArgs": { 108 | "expander": "priority,random", # Use priority expander if possible 109 | }, 110 | }, 111 | ), 112 | opts=pulumi.ResourceOptions(provider=ctx.k8s_provider, depends_on=[expander]), 113 | ) 114 | -------------------------------------------------------------------------------- /paka/cluster/qdrant.py: -------------------------------------------------------------------------------- 1 | import pulumi 2 | import pulumi_kubernetes as k8s 3 | from pulumi_kubernetes.helm.v3 import Chart, ChartOpts, FetchOpts 4 | 5 | from paka.cluster.context import Context 6 | from paka.utils import call_once 7 | 8 | 9 | @call_once 10 | def create_qdrant(ctx: Context) -> None: 11 | """ 12 | Installs the qdrant helm chart. 
13 |     """ 14 |     config = ctx.cloud_config 15 | 16 |     if not config.vectorStore: 17 |         return 18 | 19 |     ns = k8s.core.v1.Namespace( 20 |         "qdrant", 21 |         metadata={"name": "qdrant", "labels": {"istio-injection": "enabled"}}, 22 |         opts=pulumi.ResourceOptions(provider=ctx.k8s_provider), 23 |     ) 24 | 25 |     resource_request = ( 26 |         { 27 |             "resources": { 28 |                 "requests": { 29 |                     "cpu": config.vectorStore.resourceRequest.cpu, 30 |                     "memory": config.vectorStore.resourceRequest.memory, 31 |                 }, 32 |             } 33 |         } 34 |         if config.vectorStore.resourceRequest 35 |         else {} 36 |     ) 37 | 38 |     Chart( 39 |         "qdrant", 40 |         ChartOpts( 41 |             chart="qdrant", 42 |             version="0.7.5", 43 |             namespace="qdrant", 44 |             fetch_opts=FetchOpts(repo="https://qdrant.github.io/qdrant-helm"), 45 |             values={ 46 |                 "metrics": { 47 |                     "serviceMonitor": { 48 |                         "enabled": ( 49 |                             True 50 |                             if config.prometheus and config.prometheus.enabled 51 |                             else False 52 |                         ), 53 |                     }, 54 |                 }, 55 |                 "replicaCount": config.vectorStore.replicas, 56 |                 "persistence": { 57 |                     "size": config.vectorStore.storageSize, 58 |                 }, 59 |                 "livenessProbe": { 60 |                     "enabled": True, 61 |                 }, 62 |                 "tolerations": [ 63 |                     { 64 |                         "key": "app", 65 |                         "operator": "Equal", 66 |                         "value": "qdrant", 67 |                         "effect": "NoSchedule", 68 |                     } 69 |                 ], 70 |                 "affinity": { 71 |                     "nodeAffinity": { 72 |                         "requiredDuringSchedulingIgnoredDuringExecution": { 73 |                             "nodeSelectorTerms": [ 74 |                                 { 75 |                                     "matchExpressions": [ 76 |                                         { 77 |                                             "key": "app", 78 |                                             "operator": "In", 79 |                                             "values": ["qdrant"], 80 |                                         } 81 |                                     ] 82 |                                 } 83 |                             ] 84 |                         } 85 |                     }, 86 |                     "podAntiAffinity": { 87 |                         "requiredDuringSchedulingIgnoredDuringExecution": [ 88 |                             { 89 |                                 "labelSelector": { 90 |                                     "matchExpressions": [ 91 |                                         { 92 |                                             "key": "app", 93 |                                             "operator": "In", 94 |                                             "values": ["qdrant"], 95 |                                         } 96 |                                     ] 97 |                                 }, 98 |                                 "topologyKey": "kubernetes.io/hostname", 99 |                             } 100 |                         ] 101 |                     }, 102 |                 }, 103 |                 "topologySpreadConstraints": [ 104 |                     { 105 |                         "maxSkew": 1, 106 |                         "topologyKey": "topology.kubernetes.io/zone", 107 |                         "whenUnsatisfiable": "ScheduleAnyway", 108 |                         "labelSelector": {"matchLabels": {"app": "qdrant"}}, 109 |                     } 110 |                 ], 111 |                 **resource_request, 112 |             }, 113 |         ), 114 |         opts=pulumi.ResourceOptions(provider=ctx.k8s_provider, depends_on=[ns]), 115 |     ) 116 | -------------------------------------------------------------------------------- /paka/cluster/aws/service_account.py: -------------------------------------------------------------------------------- 1 | import pulumi 2 | import pulumi_aws as aws 3 | import pulumi_eks as eks 4 | import pulumi_kubernetes as k8s 5 | 6 | from paka.cluster.aws.utils import odic_role_for_sa 7 | from paka.cluster.context import Context 8 | from paka.constants import ACCESS_ALL_SA 9 | from paka.utils import call_once 10 | 11 | 12 | @call_once 13 | def create_service_accounts( 14 |     ctx: Context, 15 |     cluster: eks.Cluster, 16 | ) -> None: 17 |     """ 18 |     Creates service accounts with necessary IAM roles and policies. 19 | 20 |     This function creates three IAM policies: one for S3 access, one for ECR access, and one for CloudWatch logs. 21 |     It then creates an IAM role for the service account and attaches these policies to the role. 22 |     Finally, it creates a Kubernetes service account and annotates it with the ARN of the IAM role. 23 | 24 |     The S3 policy allows the service account to get objects and list the bucket. 25 |     The ECR policy allows the service account to pull and inspect ECR images, and the CloudWatch policy allows it to create log groups and streams and put log events. 26 | 27 |     Args: 28 |         ctx (Context): The context, which provides the cluster name, namespace, and bucket.
29 |         cluster (eks.Cluster): The EKS cluster to create the service 30 |             accounts in. 31 | 32 |     Returns: 33 |         None 34 |     """ 35 |     cluster_name = ctx.cluster_name 36 |     bucket = ctx.bucket 37 | 38 |     s3_policy = aws.iam.Policy( 39 |         f"{cluster_name}-s3-access-policy", 40 |         policy=aws.iam.get_policy_document( 41 |             statements=[ 42 |                 aws.iam.GetPolicyDocumentStatementArgs( 43 |                     effect="Allow", 44 |                     actions=["s3:GetObject", "s3:ListBucket"], 45 |                     resources=[ 46 |                         f"arn:aws:s3:::{bucket}/*", 47 |                         f"arn:aws:s3:::{bucket}", 48 |                     ], 49 |                 ) 50 |             ] 51 |         ).json, 52 |     ) 53 | 54 |     ecr_policy = aws.iam.Policy( 55 |         f"{cluster_name}-ecr-access-policy", 56 |         policy=aws.iam.get_policy_document( 57 |             statements=[ 58 |                 aws.iam.GetPolicyDocumentStatementArgs( 59 |                     effect="Allow", 60 |                     actions=[ 61 |                         "ecr:GetDownloadUrlForLayer", 62 |                         "ecr:BatchGetImage", 63 |                         "ecr:BatchCheckLayerAvailability", 64 |                         "ecr:ListImages", 65 |                         "ecr:DescribeImages", 66 |                     ], 67 |                     resources=["*"], 68 |                 ) 69 |             ] 70 |         ).json, 71 |     ) 72 | 73 |     cloudwatch_policy = aws.iam.Policy( 74 |         "cloudwatch-policy", 75 |         policy=aws.iam.get_policy_document( 76 |             statements=[ 77 |                 aws.iam.GetPolicyDocumentStatementArgs( 78 |                     effect="Allow", 79 |                     actions=[ 80 |                         "logs:CreateLogGroup", 81 |                         "logs:CreateLogStream", 82 |                         "logs:PutLogEvents", 83 |                         "logs:DescribeLogStreams", 84 |                     ], 85 |                     resources=["arn:aws:logs:*:*:*"], 86 |                 ) 87 |             ] 88 |         ).json, 89 |     ) 90 | 91 |     namespace = ctx.namespace 92 |     sa_role = odic_role_for_sa( 93 |         ctx, 94 |         cluster, 95 |         "sa", 96 |         f"{namespace}:{ACCESS_ALL_SA}", 97 |     ) 98 | 99 |     aws.iam.RolePolicyAttachment( 100 |         f"{cluster_name}-sa-s3-role-policy-attachment", 101 |         role=sa_role.name, 102 |         policy_arn=s3_policy.arn, 103 |     ) 104 | 105 |     aws.iam.RolePolicyAttachment( 106 |         f"{cluster_name}-sa-ecr-role-policy-attachment", 107 |         role=sa_role.name, 108 |         policy_arn=ecr_policy.arn, 109 |     ) 110 | 111 |     aws.iam.RolePolicyAttachment( 112 |         f"{cluster_name}-sa-cloudwatch-role-policy-attachment", 113 |         role=sa_role.name, 114 |         policy_arn=cloudwatch_policy.arn, 115 |     ) 116 | 117 |     k8s.core.v1.ServiceAccount( 118 |         f"{cluster_name}-service-account", 119 |         metadata={ 120 |             "namespace": ctx.namespace, 121 |             "name": ACCESS_ALL_SA, 122 |             "annotations": {"eks.amazonaws.com/role-arn": sa_role.arn}, 123 |         }, 124 |         opts=pulumi.ResourceOptions(provider=ctx.k8s_provider), 125 |     ) 126 | -------------------------------------------------------------------------------- /paka/cluster/context.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Optional 4 | 5 | import fasteners 6 | import pulumi_kubernetes as k8s 7 | 8 | from paka.config import CloudConfig, Config 9 | 10 | 11 | class Context: 12 |     _k8s_provider: Optional[k8s.Provider] 13 |     _config: Optional[Config] 14 |     # Materialized bucket with a unique name 15 |     _bucket: Optional[str] 16 |     # Materialized container registry url 17 |     _registry: Optional[str] 18 |     # The kubeconfig as a string 19 |     _kubeconfig: Optional[str] 20 | 21 |     # Access to the fields above is guarded by the per-field reader/writer locks created in __init__ 22 |     _should_save_kubeconfig: bool = False 23 | 24 |     def __init__(self) -> None: 25 |         # Ideally, these locks would be created dynamically in __getattr__. 26 |         # However, __getattr__ is not thread safe either, and guarding lock creation with yet 27 |         # another lock would make that lock a bottleneck. Therefore, we pre-create the locks.
28 | # Multiple locks pose a risk of deadlock. We need to be careful when acquiring multiple locks. 29 | self._k8s_provider_lock = fasteners.ReaderWriterLock() 30 | self._config_lock = fasteners.ReaderWriterLock() 31 | self._bucket_lock = fasteners.ReaderWriterLock() 32 | self._registry_lock = fasteners.ReaderWriterLock() 33 | self._kubeconfig_lock = fasteners.ReaderWriterLock() 34 | 35 | @fasteners.write_locked(lock="_k8s_provider_lock") 36 | def set_k8s_provider(self, k8s_provider: k8s.Provider) -> None: 37 | self._k8s_provider = k8s_provider 38 | 39 | @property 40 | @fasteners.read_locked(lock="_k8s_provider_lock") 41 | def k8s_provider(self) -> Optional[k8s.Provider]: 42 | return self._k8s_provider 43 | 44 | @fasteners.write_locked(lock="_config_lock") 45 | def set_config(self, config: Config) -> None: 46 | self._config = config 47 | 48 | @property 49 | @fasteners.read_locked(lock="_config_lock") 50 | def config(self) -> Optional[Config]: 51 | return self._config 52 | 53 | @property 54 | @fasteners.read_locked(lock="_config_lock") 55 | def cloud_config(self) -> Optional[CloudConfig]: 56 | if self._config is None: 57 | raise RuntimeError("Config is not set.") 58 | if self._config.aws is None: 59 | raise RuntimeError("Only AWS is supported.") 60 | 61 | return self._config.aws 62 | 63 | @property 64 | @fasteners.read_locked(lock="_config_lock") 65 | def region(self) -> Optional[str]: 66 | # fasteners's inter thread reader lock is reentrant. We can call other methods that acquire the same lock. 67 | # https://github.com/harlowja/fasteners/blob/06c3f06cab4e135b8d921932019a231c180eb9f4/docs/guide/inter_thread.md#lack-of-features 68 | return self.cloud_config.cluster.region 69 | 70 | @property 71 | @fasteners.read_locked(lock="_config_lock") 72 | def namespace(self) -> Optional[str]: 73 | # reentrant 74 | return self.cloud_config.cluster.namespace 75 | 76 | @property 77 | @fasteners.read_locked(lock="_config_lock") 78 | def provider(self) -> str: 79 | # reentrant 80 | _ = self.cloud_config 81 | return "aws" 82 | 83 | @property 84 | @fasteners.read_locked(lock="_config_lock") 85 | def cluster_name(self) -> str: 86 | # reentrant 87 | return self.cloud_config.cluster.name 88 | 89 | @fasteners.write_locked(lock="_bucket_lock") 90 | def set_bucket(self, bucket: str) -> None: 91 | self._bucket = bucket 92 | 93 | @property 94 | @fasteners.read_locked(lock="_bucket_lock") 95 | def bucket(self) -> Optional[str]: 96 | return self._bucket 97 | 98 | @fasteners.write_locked(lock="_registry_lock") 99 | def set_registry(self, registry: str) -> None: 100 | self._registry = registry 101 | 102 | @property 103 | @fasteners.read_locked(lock="_registry_lock") 104 | def registry(self) -> Optional[str]: 105 | return self._registry 106 | 107 | @fasteners.write_locked(lock="_kubeconfig_lock") 108 | def set_kubeconfig(self, kubeconfig: str) -> None: 109 | self._kubeconfig = kubeconfig 110 | 111 | @property 112 | @fasteners.read_locked(lock="_kubeconfig_lock") 113 | def kubeconfig(self) -> Optional[str]: 114 | return self._kubeconfig 115 | 116 | def set_should_save_kubeconfig(self, should_save_kubeconfig: bool) -> None: 117 | self._should_save_kubeconfig = should_save_kubeconfig 118 | 119 | @property 120 | def should_save_kubeconfig(self) -> bool: 121 | return self._should_save_kubeconfig 122 | -------------------------------------------------------------------------------- /paka/cluster/prometheus.py: -------------------------------------------------------------------------------- 1 | from __future__ import 
annotations 2 | 3 | from typing import Any, Callable, Dict, Optional, Tuple 4 | 5 | import pulumi 6 | import pulumi_kubernetes as k8s 7 | from pulumi_kubernetes.helm.v3 import Chart, ChartOpts, FetchOpts 8 | 9 | from paka.cluster.context import Context 10 | 11 | 12 | def memoize(func: Callable[..., Any]) -> Callable[..., Any]: 13 | cache: Dict[Callable[..., Any], Any] = dict() 14 | 15 | def memoized_func(*args: Tuple[Any, ...], **kwargs: Dict[str, Any]) -> Any: 16 | if func not in cache: 17 | cache[func] = func(*args, **kwargs) 18 | return cache[func] 19 | 20 | return memoized_func 21 | 22 | 23 | @memoize 24 | def create_prometheus(ctx: Context) -> Optional[Chart]: 25 | """ 26 | Installs a Prometheus chart. 27 | """ 28 | config = ctx.cloud_config 29 | if not config.prometheus or not config.prometheus.enabled: 30 | return None 31 | 32 | ns = k8s.core.v1.Namespace( 33 | "prometheus", 34 | metadata={"name": "prometheus"}, 35 | opts=pulumi.ResourceOptions(provider=ctx.k8s_provider), 36 | ) 37 | 38 | return Chart( 39 | "kube-prometheus-stack", 40 | ChartOpts( 41 | chart="kube-prometheus-stack", 42 | version="58.6.0", 43 | namespace="prometheus", 44 | fetch_opts=FetchOpts( 45 | repo="https://prometheus-community.github.io/helm-charts" 46 | ), 47 | values={ 48 | "nodeExporter": { 49 | "enabled": config.prometheus.nodeExporter, 50 | }, 51 | "alertmanager": { 52 | "enabled": config.prometheus.alertmanager, 53 | }, 54 | "grafana": { 55 | "enabled": config.prometheus.grafana, 56 | }, 57 | "kubeApiServer": { 58 | "enabled": config.prometheus.kubeApiServer, 59 | }, 60 | "kubelet": { 61 | "enabled": config.prometheus.kubelet, 62 | }, 63 | "kubeControllerManager": { 64 | "enabled": config.prometheus.kubeControllerManager, 65 | }, 66 | "coreDns": { 67 | "enabled": config.prometheus.coreDns, 68 | }, 69 | "kubeEtcd": { 70 | "enabled": config.prometheus.kubeEtcd, 71 | }, 72 | "kubeScheduler": { 73 | "enabled": config.prometheus.kubeScheduler, 74 | }, 75 | "kubeProxy": { 76 | "enabled": config.prometheus.kubeProxy, 77 | }, 78 | "kubeStateMetrics": { 79 | "enabled": config.prometheus.kubeStateMetrics, 80 | }, 81 | "thanosRuler": { 82 | "enabled": config.prometheus.thanosRuler, 83 | }, 84 | # Disable the Prometheus Operator's admission webhooks, since they don't work with Pulumi. 85 | # This means ill-formatted Prometheus rules may make their way into Prometheus. 
:( 86 | "prometheusOperator": { 87 | "admissionWebhooks": {"enabled": False}, 88 | "tls": {"enabled": False}, 89 | }, 90 | "kube-state-metrics": { 91 | "metricLabelsAllowlist": [ 92 | "pods=[*]", 93 | "deployments=[app.kubernetes.io/name,app.kubernetes.io/component,app.kubernetes.io/instance]", 94 | ] 95 | }, 96 | "prometheus": { 97 | "prometheusSpec": { 98 | "serviceMonitorSelectorNilUsesHelmValues": False, 99 | "podMonitorSelectorNilUsesHelmValues": False, 100 | "storageSpec": { 101 | "volumeClaimTemplate": { 102 | "spec": { 103 | "accessModes": ["ReadWriteOnce"], 104 | "resources": { 105 | "requests": { 106 | "storage": config.prometheus.storageSize, 107 | } 108 | }, 109 | } 110 | } 111 | }, 112 | } 113 | }, 114 | }, 115 | ), 116 | opts=pulumi.ResourceOptions(provider=ctx.k8s_provider, depends_on=[ns]), 117 | ) 118 | -------------------------------------------------------------------------------- /paka/cluster/fluentbit.py: -------------------------------------------------------------------------------- 1 | import pulumi 2 | import pulumi_kubernetes as k8s 3 | 4 | from paka.cluster.context import Context 5 | from paka.constants import ACCESS_ALL_SA 6 | from paka.utils import call_once 7 | 8 | 9 | @call_once 10 | def create_fluentbit(ctx: Context, fluent_bit_config: str) -> None: 11 | """ 12 | Creates a fluentbit daemonset with the given configuration. 13 | """ 14 | 15 | parsers_config = """ 16 | [PARSER] 17 | Name docker 18 | Format json 19 | Time_Key time 20 | Time_Format %Y-%m-%dT%H:%M:%S.%fZ 21 | """ 22 | parsers_config_map = k8s.core.v1.ConfigMap( 23 | "fluent-bit-parsers", 24 | data={"parsers.conf": parsers_config}, 25 | metadata={ 26 | "namespace": ctx.namespace, 27 | "name": "fluent-bit-parsers", 28 | }, 29 | opts=pulumi.ResourceOptions(provider=ctx.k8s_provider), 30 | ) 31 | 32 | fluent_bit_config_map = k8s.core.v1.ConfigMap( 33 | "fluent-bit-config-map", 34 | data={"fluent-bit.conf": fluent_bit_config}, 35 | metadata={ 36 | "namespace": ctx.namespace, 37 | "name": "fluent-bit-config", 38 | }, 39 | opts=pulumi.ResourceOptions(provider=ctx.k8s_provider), 40 | ) 41 | 42 | k8s.apps.v1.DaemonSet( 43 | "fluent-bit-daemonset", 44 | spec=k8s.apps.v1.DaemonSetSpecArgs( 45 | selector=k8s.meta.v1.LabelSelectorArgs( 46 | match_labels={"k8s-app": "fluent-bit-logging"}, 47 | ), 48 | template=k8s.core.v1.PodTemplateSpecArgs( 49 | metadata=k8s.meta.v1.ObjectMetaArgs( 50 | labels={"k8s-app": "fluent-bit-logging"}, 51 | annotations={"sidecar.istio.io/inject": "false"}, 52 | ), 53 | spec=k8s.core.v1.PodSpecArgs( 54 | service_account_name=ACCESS_ALL_SA, 55 | tolerations=[k8s.core.v1.TolerationArgs(operator="Exists")], 56 | containers=[ 57 | k8s.core.v1.ContainerArgs( 58 | name="fluent-bit", 59 | image="fluent/fluent-bit:latest", 60 | volume_mounts=[ 61 | k8s.core.v1.VolumeMountArgs( 62 | name="config", 63 | mount_path="/fluent-bit/etc/fluent-bit.conf", 64 | sub_path="fluent-bit.conf", 65 | ), 66 | k8s.core.v1.VolumeMountArgs( 67 | name="varlog", 68 | mount_path="/var/log", 69 | ), 70 | k8s.core.v1.VolumeMountArgs( 71 | name="varlibdockercontainers", 72 | mount_path="/var/lib/docker/containers", 73 | read_only=True, 74 | ), 75 | k8s.core.v1.VolumeMountArgs( 76 | name="parsers-config", 77 | mount_path="/fluent-bit/etc/parsers.conf", 78 | sub_path="parsers.conf", 79 | ), 80 | ], 81 | ) 82 | ], 83 | volumes=[ 84 | k8s.core.v1.VolumeArgs( 85 | name="config", 86 | config_map=k8s.core.v1.ConfigMapVolumeSourceArgs( 87 | name=fluent_bit_config_map.metadata["name"], 88 | ), 89 | ), 90 | 
k8s.core.v1.VolumeArgs( 91 |                             name="varlog", 92 |                             host_path=k8s.core.v1.HostPathVolumeSourceArgs( 93 |                                 path="/var/log", 94 |                             ), 95 |                         ), 96 |                         k8s.core.v1.VolumeArgs( 97 |                             name="varlibdockercontainers", 98 |                             host_path=k8s.core.v1.HostPathVolumeSourceArgs( 99 |                                 path="/var/lib/docker/containers", 100 |                             ), 101 |                         ), 102 |                         k8s.core.v1.VolumeArgs( 103 |                             name="parsers-config", 104 |                             config_map=k8s.core.v1.ConfigMapVolumeSourceArgs( 105 |                                 name=parsers_config_map.metadata["name"], 106 |                             ), 107 |                         ), 108 |                     ], 109 |                 ), 110 |             ), 111 |         ), 112 |         metadata={"namespace": ctx.namespace}, 113 |         opts=pulumi.ResourceOptions(provider=ctx.k8s_provider), 114 |     ) 115 | -------------------------------------------------------------------------------- /paka/cli/cluster.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import threading 4 | import time 5 | from typing import List 6 | 7 | import typer 8 | 9 | from paka.cli.utils import load_cluster_manager, load_kubeconfig 10 | from paka.k8s.utils import remove_crd_finalizers 11 | 12 | cluster_app = typer.Typer() 13 | 14 | 15 | @cluster_app.command() 16 | def up( 17 |     cluster_config: str = typer.Option( 18 |         "", 19 |         "--file", 20 |         "-f", 21 |         help="Path to the cluster config file. The cluster config file is a " 22 |         "YAML file that contains the configuration of the cluster", 23 |     ), 24 |     no_kubeconfig: bool = typer.Option( 25 |         False, 26 |         "--no-kubeconfig", 27 |         "-n", 28 |         help="By default, the connection details of the newly created Kubernetes " 29 |         "cluster are added to the default kubeconfig file (~/.kube/config). " 30 |         "This allows kubectl to communicate with the new cluster. " 31 |         "Use this option to prevent updating the kubeconfig file.", 32 |     ), 33 | ) -> None: 34 |     """ 35 |     Creates or updates a Kubernetes cluster based on the provided configuration. 36 |     """ 37 |     cluster_manager = load_cluster_manager(cluster_config) 38 |     cluster_manager.ctx.set_should_save_kubeconfig(not no_kubeconfig) 39 |     cluster_manager.create() 40 | 41 | 42 | @cluster_app.command() 43 | def down( 44 |     cluster_config: str = typer.Option( 45 |         "", 46 |         "--file", 47 |         "-f", 48 |         help="Path to the cluster config file. The cluster config file is a " 49 |         "YAML file that contains the configuration of the cluster", 50 |     ), 51 |     yes: bool = typer.Option( 52 |         False, 53 |         "--yes", 54 |         "-y", 55 |         help="Automatic yes to prompts. Use this option to bypass the confirmation " 56 |         "prompt and directly proceed with the operation.", 57 |     ), 58 | ) -> None: 59 |     """ 60 |     Tears down the Kubernetes cluster, removing all associated resources and data. 61 |     """ 62 |     if yes or typer.confirm( 63 |         "Are you sure you want to proceed with the operation? Please note that " 64 |         "all resources and data will be permanently deleted.", 65 |         default=False, 66 |     ): 67 |         cluster_manager = load_cluster_manager(cluster_config) 68 | 69 |         # Sometimes finalizers block CRD deletion, so we force-delete those. 70 |         # This is best effort and might not work in all cases.
71 |         # TODO: better way to handle this 72 |         # https://github.com/kubernetes/kubernetes/issues/60538 73 |         stop_event = threading.Event() 74 | 75 |         def remove_finalizers_forever() -> None: 76 |             try: 77 |                 crds = [ 78 |                     "scaledobjects.keda.sh", 79 |                     "routes.serving.knative.dev", 80 |                     "ingresses.networking.internal.knative.dev", 81 |                 ] 82 | 83 |                 load_kubeconfig(cluster_manager.cloud_config.cluster.name) 84 | 85 |                 while not stop_event.is_set(): 86 |                     for crd in crds: 87 |                         try: 88 |                             remove_crd_finalizers(crd) 89 |                         except Exception: 90 |                             pass 91 |                     time.sleep(1)  # Wait for a second before the next iteration 92 |             except Exception: 93 |                 pass 94 | 95 |         thread = threading.Thread(target=remove_finalizers_forever) 96 |         thread.start() 97 | 98 |         try: 99 |             cluster_manager.destroy() 100 |         finally: 101 |             stop_event.set() 102 |             thread.join() 103 | 104 | 105 | @cluster_app.command() 106 | def preview( 107 |     cluster_config: str = typer.Option( 108 |         "", 109 |         "--file", 110 |         "-f", 111 |         help="Path to the cluster config file. The cluster config file is a " 112 |         "YAML file that contains the configuration of the cluster", 113 |     ), 114 |     policy_packs: List[str] = typer.Option( 115 |         [], 116 |         "--policy-pack", 117 |         "-p", 118 |         help="Path to the policy pack.", 119 |     ), 120 | ) -> None: 121 |     """ 122 |     Previews the changes that will be applied to the cloud resources. 123 |     """ 124 |     cluster_manager = load_cluster_manager(cluster_config) 125 |     if policy_packs: 126 |         cluster_manager.preview(policy_packs=policy_packs) 127 |     else: 128 |         cluster_manager.preview() 129 | 130 | 131 | @cluster_app.command() 132 | def refresh( 133 |     cluster_config: str = typer.Option( 134 |         "", 135 |         "--file", 136 |         "-f", 137 |         help="Path to the cluster config file. The cluster config file is a " 138 |         "YAML file that contains the configuration of the cluster", 139 |     ), 140 | ) -> None: 141 |     """ 142 |     Synchronizes the local cluster state with the state in the cloud.
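    Example (illustrative invocation; assumes the `cluster` sub-command is mounted
    under the standard `paka` entry point):

        paka cluster refresh -f cluster.yaml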
143 | """ 144 | cluster_manager = load_cluster_manager(cluster_config) 145 | cluster_manager.refresh() 146 | -------------------------------------------------------------------------------- /paka/k8s/model_group/runtime/llama_cpp.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import re 5 | from typing import List, Optional 6 | 7 | from huggingface_hub import HfFileSystem 8 | from huggingface_hub.utils import validate_repo_id 9 | 10 | from paka.cluster.context import Context 11 | from paka.cluster.utils import get_model_store 12 | from paka.config import CloudModelGroup 13 | from paka.constants import MODEL_MOUNT_PATH 14 | 15 | 16 | # Heuristic to determine if the image is a llama.cpp image 17 | def is_llama_cpp_image(image: str) -> bool: 18 | return "llama.cpp" in image.lower() 19 | 20 | 21 | def get_model_file_from_model_store( 22 | ctx: Context, 23 | model_group: CloudModelGroup, 24 | ) -> Optional[str]: 25 | if model_group.model and model_group.model.useModelStore: 26 | store = get_model_store(ctx, with_progress_bar=False) 27 | # Find the file that ends with .gguf or .ggml 28 | model_files = [ 29 | file 30 | for file in store.glob(f"{model_group.name}/*") 31 | if re.search(r"\.(gguf|ggml)$", file, re.IGNORECASE) 32 | ] 33 | 34 | if not model_files: 35 | model_files = [ 36 | file 37 | for file in store.glob(f"{model_group.name}/*") 38 | if any( 39 | re.match(file_pattern, file) 40 | for file_pattern in model_group.model.files 41 | ) 42 | ] 43 | 44 | if len(model_files) > 1: 45 | raise ValueError( 46 | f"Multiple model files found in {model_group.name}/ directory." 47 | ) 48 | 49 | if len(model_files) == 1: 50 | return os.path.basename(model_files[0]) 51 | 52 | return None 53 | 54 | 55 | def get_runtime_command_llama_cpp( 56 | ctx: Context, model_group: CloudModelGroup 57 | ) -> List[str]: 58 | runtime = model_group.runtime 59 | if runtime.command: 60 | command_str = " ".join(runtime.command) if runtime.command else "" 61 | # If the command knows where or how to load the model file, we don't need to do anything. 
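        # Illustrative commands that already specify a model source (the paths and
        # repo names below are placeholders):
        #   /server --model /data/model.gguf ...
        #   /server --hf-repo some-org/some-repo --hf-file model.gguf ...
        #   /server --model-url https://example.com/model.gguf ...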
62 |         if ( 63 |             re.search(r"(--model|-m)[ \t]*\S+", command_str) 64 |             or ( 65 |                 re.search(r"--hf-repo|-hfr", command_str) 66 |                 and re.search(r"--hf-file|-hff", command_str) 67 |             ) 68 |             or re.search(r"(--model-url|-mu)[ \t]*\S+", command_str) 69 |         ): 70 |             return runtime.command 71 | 72 |     model_file = get_model_file_from_model_store(ctx, model_group) 73 | 74 |     def attach_model_to_command(command: List[str]) -> List[str]: 75 |         if model_file: 76 |             return command + ["--model", f"{MODEL_MOUNT_PATH}/{model_file}"] 77 |         elif model_group.model and model_group.model.hfRepoId: 78 | 79 |             validate_repo_id(model_group.model.hfRepoId) 80 |             hf_fs = HfFileSystem() 81 |             files = [ 82 |                 file 83 |                 for pattern in model_group.model.files 84 |                 for file in hf_fs.glob(f"{model_group.model.hfRepoId}/{pattern}") 85 |             ] 86 | 87 |             if len(files) > 1: 88 |                 raise ValueError("Multiple model files found in HuggingFace repo.") 89 |             if len(files) == 0: 90 |                 raise ValueError("No model file found in HuggingFace repo.") 91 | 92 |             hf_file = os.path.basename(files[0]) 93 | 94 |             return command + [ 95 |                 "--hf-repo", 96 |                 model_group.model.hfRepoId, 97 |                 "--hf-file", 98 |                 hf_file, 99 |                 "--model", 100 |                 os.path.basename( 101 |                     hf_file 102 |                 ),  # This is the model file name that the HuggingFace model is saved as 103 |             ] 104 |         else: 105 |             raise ValueError("Did not find a model to load.") 106 | 107 |     if runtime.command: 108 |         return attach_model_to_command(runtime.command) 109 | 110 |     # https://github.com/ggerganov/llama.cpp/tree/master/examples/server 111 |     command = [ 112 |         "/server", 113 |         "--host", 114 |         "0.0.0.0", 115 |         "--parallel",  # Number of parallel requests to handle 116 |         "1", 117 |         "--cont-batching",  # Enable continuous batching 118 |         "--ctx-size", 119 |         "4096", 120 |         "--batch-size",  # Maximum number of tokens to decode in a batch 121 |         "512", 122 |         "--ubatch-size",  # Physical batch size 123 |         "512", 124 |         "--n-predict",  # Maximum number of tokens to predict. 125 |         "-1", 126 |         "--embedding", 127 |         "--flash-attn",  # Enable flash attention 128 |         "--metrics",  # Enable metrics 129 |     ] 130 | 131 |     if hasattr(model_group, "gpu") and model_group.gpu and model_group.gpu.enabled: 132 |         # The value 999 is typically sufficient for most models, as it attempts to offload as many layers as possible to the GPU. 133 |         # However, for particularly large models, this may result in exceeding the GPU's memory capacity and cause errors. 134 |         # A more effective approach would be to conduct a series of experiments with varying values for --n-gpu-layers to find the optimal setting.
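        # (Illustrative reference point: a 7B Llama-style model has on the order of 32
        # transformer layers, so any value at or above that offloads the entire model;
        # 999 simply means "offload as many layers as possible".)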
135 |         command.extend(["--n-gpu-layers", "999"]) 136 | 137 |     return attach_model_to_command(command) 138 | -------------------------------------------------------------------------------- /tests/k8s/model_group/runtime/test_llama_cpp.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock, patch 2 | 3 | import pytest 4 | 5 | import paka.cluster 6 | import paka.cluster.utils 7 | import paka.k8s.model_group.runtime.llama_cpp 8 | from paka.cluster.context import Context 9 | from paka.config import AwsModelGroup, Model, Runtime 10 | from paka.constants import MODEL_MOUNT_PATH 11 | from paka.k8s.model_group.runtime.llama_cpp import get_runtime_command_llama_cpp 12 | 13 | 14 | @pytest.fixture 15 | def model_group() -> AwsModelGroup: 16 |     return AwsModelGroup( 17 |         name="test-model-group", 18 |         minInstances=1, 19 |         maxInstances=2, 20 |         nodeType="t2.micro", 21 |         runtime=Runtime( 22 |             image="johndoe/llama.cpp:server", 23 |             command=["/server", "--model", f"{MODEL_MOUNT_PATH}/model.gguf"], 24 |         ), 25 |         resourceRequest={"cpu": "1000", "memory": "1Gi"}, 26 |     ) 27 | 28 | 29 | def test_get_runtime_command_llama_cpp(model_group: AwsModelGroup) -> None: 30 |     mock_store = MagicMock() 31 |     with patch.object( 32 |         paka.k8s.model_group.runtime.llama_cpp, 33 |         "get_model_store", 34 |         return_value=mock_store, 35 |     ) as mock_get_model_store, patch.object( 36 |         paka.k8s.model_group.runtime.llama_cpp, "HfFileSystem" 37 |     ) as mock_hf_fs, patch.object( 38 |         paka.k8s.model_group.runtime.llama_cpp, 39 |         "validate_repo_id", 40 |         return_value=True, 41 |     ) as mock_validate_repo_id: 42 |         # Test case: runtime command is already provided 43 |         assert get_runtime_command_llama_cpp(Context(), model_group) == [ 44 |             "/server", 45 |             "--model", 46 |             f"{MODEL_MOUNT_PATH}/model.gguf", 47 |         ] 48 | 49 |         # Test case: model file is found in model store 50 |         model_group.runtime.command = None 51 |         model_group.model = Model(useModelStore=True) 52 |         # Mock the model store glob to return a single model file 53 |         mock_store.glob.return_value = ["model.gguf"] 54 |         command = get_runtime_command_llama_cpp(Context(), model_group) 55 |         assert "--model" in command, "Expected '--model' to be in command list" 56 |         model_index = command.index("--model") 57 |         assert ( 58 |             command[model_index + 1] == f"{MODEL_MOUNT_PATH}/model.gguf" 59 |         ), f"Expected '--model' to be followed by '{MODEL_MOUNT_PATH}/model.gguf'" 60 | 61 |         # Test case: model file is not found in the model store but found in HuggingFace repo 62 |         model_group.model = Model( 63 |             useModelStore=True, hfRepoId="repoId", files=["model.gguf"] 64 |         ) 65 |         # Mock the model store glob to return an empty list 66 |         mock_store.glob.return_value = [] 67 |         # Mock HfFileSystem.glob to return a specific list of files 68 |         mock_hf_fs.return_value.glob.return_value = ["repoId/model.gguf"] 69 |         command = get_runtime_command_llama_cpp(Context(), model_group) 70 |         assert "--hf-repo" in command, "Expected '--hf-repo' to be in command list" 71 |         repo_index = command.index("--hf-repo") 72 |         assert ( 73 |             command[repo_index + 1] == "repoId" 74 |         ), "Expected '--hf-repo' to be followed by 'repoId'" 75 | 76 |         assert "--hf-file" in command, "Expected '--hf-file' to be in command list" 77 |         file_index = command.index("--hf-file") 78 |         assert ( 79 |             command[file_index + 1] == "model.gguf" 80 |         ), "Expected '--hf-file' to be followed by 'model.gguf'" 81 |         assert "--model" in command, "Expected '--model' to be in command list" 82 |         model_index =
command.index("--model") 83 |         assert ( 84 |             command[model_index + 1] == "model.gguf" 85 |         ), "Expected '--model' to be followed by 'model.gguf'" 86 | 87 |         # Test case: model file is not found in the model store and not found in HuggingFace repo 88 |         model_group.model = Model( 89 |             useModelStore=True, hfRepoId="repoId", files=["model.gguf"] 90 |         ) 91 |         # Mock the model store glob to return an empty list 92 |         mock_store.glob.return_value = [] 93 |         # Mock HfFileSystem.glob to return an empty list 94 |         mock_hf_fs.return_value.glob.return_value = [] 95 |         with pytest.raises( 96 |             ValueError, match="No model file found in HuggingFace repo." 97 |         ): 98 |             get_runtime_command_llama_cpp(Context(), model_group) 99 | 100 |         # Test case: Multiple model files found in the model store 101 |         model_group.model = Model(useModelStore=True) 102 |         # Mock the model store glob to return multiple model files 103 |         mock_store.glob.return_value = ["model1.ggml", "model2.ggml"] 104 |         with pytest.raises( 105 |             ValueError, 106 |             match=f"Multiple model files found in {model_group.name}/ directory.", 107 |         ): 108 |             get_runtime_command_llama_cpp(Context(), model_group) 109 | 110 |         # Test case: No model file found in the model store 111 |         model_group.model = Model(useModelStore=True) 112 |         # Mock the model store glob to return an empty list 113 |         mock_store.glob.return_value = [] 114 |         mock_hf_fs.return_value.glob.return_value = [] 115 |         with pytest.raises(ValueError, match="Did not find a model to load."): 116 |             get_runtime_command_llama_cpp(Context(), model_group) 117 | --------------------------------------------------------------------------------
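# Illustrative usage sketch (not part of the repository): assembling the llama.cpp
# server command for a model group outside the test harness. The image, repo ID, and
# file pattern below are placeholder assumptions; the HuggingFace branch performs a
# network lookup via HfFileSystem, and Context() must carry a valid config for
# model-store lookups.
#
#     from paka.cluster.context import Context
#     from paka.config import AwsModelGroup, Model, Runtime
#     from paka.k8s.model_group.runtime.llama_cpp import get_runtime_command_llama_cpp
#
#     group = AwsModelGroup(
#         name="demo-model-group",
#         minInstances=1,
#         maxInstances=1,
#         nodeType="t2.micro",
#         runtime=Runtime(image="johndoe/llama.cpp:server"),
#         model=Model(useModelStore=False, hfRepoId="some-org/some-repo", files=["*.gguf"]),
#         resourceRequest={"cpu": "1000", "memory": "1Gi"},
#     )
#     print(get_runtime_command_llama_cpp(Context(), group))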