├── e2e
│   ├── __init__.py
│   ├── pytest_kind
│   │   ├── __init__.py
│   │   └── plugin.py
│   ├── conftest.py
│   ├── test_cluster.yaml
│   ├── test_pack_install.py
│   ├── test_pulumi_install.py
│   └── test_kubectl_install.py
├── tests
│   ├── __init__.py
│   ├── cli
│   │   ├── __init__.py
│   │   └── test_function.py
│   ├── k8s
│   │   ├── __init__.py
│   │   ├── function
│   │   │   └── __init__.py
│   │   ├── model_group
│   │   │   ├── __init__.py
│   │   │   └── runtime
│   │   │       ├── __init__.py
│   │   │       ├── test_vllm.py
│   │   │       └── test_llama_cpp.py
│   │   └── test_utils.py
│   ├── model
│   │   ├── __init__.py
│   │   ├── test_progress_bar.py
│   │   ├── test_settings.py
│   │   ├── test_base_model.py
│   │   ├── test_http_model.py
│   │   ├── test_hf_model.py
│   │   └── test_store.py
│   ├── config
│   │   ├── __init__.py
│   │   └── snapshots
│   │       └── test_config
│   │           └── test_aws_yaml
│   │               └── aws_yaml.txt
│   ├── container
│   │   ├── __init__.py
│   │   └── test_ecr.py
│   ├── policy_packs
│   │   ├── __init__.py
│   │   └── aws
│   │       ├── __init__.py
│   │       ├── PulumiPolicy.yaml
│   │       ├── __main__.py
│   │       ├── test_cluster.yaml
│   │       ├── container_registry.py
│   │       ├── eks.py
│   │       └── object_store.py
│   ├── test_examples.py
│   └── test_utils.py
├── paka
│   ├── cli
│   │   ├── __init__.py
│   │   ├── kubeconfig.py
│   │   ├── __main__.py
│   │   ├── build.py
│   │   ├── model_group.py
│   │   ├── run.py
│   │   └── cluster.py
│   ├── model
│   │   ├── __init__.py
│   │   ├── manifest.py
│   │   ├── http_model.py
│   │   ├── settings.py
│   │   ├── hf_model.py
│   │   ├── progress_bar.py
│   │   └── base_model.py
│   ├── cluster
│   │   ├── __init__.py
│   │   ├── aws
│   │   │   ├── __init__.py
│   │   │   ├── container_registry.py
│   │   │   ├── object_store.py
│   │   │   ├── cloudwatch.py
│   │   │   ├── elb.py
│   │   │   ├── ebs_csi_driver.py
│   │   │   ├── utils.py
│   │   │   ├── cluster_autoscaler.py
│   │   │   └── service_account.py
│   │   ├── manager
│   │   │   ├── __init__.py
│   │   │   ├── aws.py
│   │   │   └── base.py
│   │   ├── utils.py
│   │   ├── namespace.py
│   │   ├── zipkin.py
│   │   ├── keda.py
│   │   ├── redis.py
│   │   ├── kubectl.py
│   │   ├── nvidia_device_plugin.py
│   │   ├── pulumi.py
│   │   ├── qdrant.py
│   │   ├── context.py
│   │   ├── prometheus.py
│   │   └── fluentbit.py
│   ├── k8s
│   │   ├── job
│   │   │   ├── __init__.py
│   │   │   └── autoscaler.py
│   │   ├── function
│   │   │   └── __init__.py
│   │   └── model_group
│   │       ├── __init__.py
│   │       ├── runtime
│   │       │   ├── __init__.py
│   │       │   ├── vllm.py
│   │       │   └── llama_cpp.py
│   │       ├── manifest.py
│   │       └── ingress.py
│   ├── __init__.py
│   ├── constants.py
│   ├── logger.py
│   └── container
│       ├── pack.py
│       └── ecr.py
├── examples
│   ├── website_rag
│   │   ├── __init__.py
│   │   ├── runtime.txt
│   │   ├── .gitignore
│   │   ├── .cnignore
│   │   ├── Procfile
│   │   ├── constants.py
│   │   ├── embeddings.py
│   │   ├── requirements.txt
│   │   ├── cluster.yaml
│   │   ├── crawler.py
│   │   ├── serve.py
│   │   ├── ingest.py
│   │   └── README.md
│   ├── invoice_extraction
│   │   ├── __init__.py
│   │   ├── runtime.txt
│   │   ├── .gitignore
│   │   ├── .cnignore
│   │   ├── Procfile
│   │   ├── invoices
│   │   │   ├── invoice-2024-01-01.pdf
│   │   │   ├── invoice-2024-01-31.pdf
│   │   │   ├── invoice-2024-02-29.pdf
│   │   │   └── invoice-2024-03-31.pdf
│   │   ├── cluster_cpu.yaml
│   │   ├── requirements.txt
│   │   ├── cluster.yaml
│   │   ├── output_parser.py
│   │   ├── README.md
│   │   └── serve.py
│   └── templates
│       ├── Llama2_7B_Chat_AWQ.yaml
│       ├── Llama2_7B_Chat_GPTQ.yaml
│       ├── Llama3_70B_Instruct_GPTQ.yaml
│       ├── Mistral_7B_Instruct_GPTQ.yaml
│       ├── Llama3_70B_Instruct.yaml
│       ├── Llama3_8B_Instruct.yaml
│       ├── Phi3_Mini_4K_Instruct.yaml
│       └── Mistral_7B_Instruct.yaml
├── docs
│   ├── img
│   │   ├── architecture.png
│   │   └── tokens_per_sec.png
│   ├── faq.md
│   └── quick_start.md
├── Makefile
├── .github
│   └── workflows
│       ├── publish_release.yml
│       └── pull-request-tests.yml
├── .pre-commit-config.yaml
├── LICENSE
├── pyproject.toml
└── .gitignore

/e2e/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/paka/cli/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/paka/model/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/cli/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/k8s/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/model/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/e2e/pytest_kind/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/paka/cluster/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/paka/cluster/aws/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/paka/k8s/job/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/config/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/container/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/examples/website_rag/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/paka/cluster/manager/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/paka/k8s/function/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/paka/k8s/model_group/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/k8s/function/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/k8s/model_group/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/policy_packs/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/policy_packs/aws/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/examples/invoice_extraction/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/paka/k8s/model_group/runtime/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/k8s/model_group/runtime/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/examples/website_rag/runtime.txt:
--------------------------------------------------------------------------------
1 | python-3.11.*
2 |
--------------------------------------------------------------------------------
/examples/invoice_extraction/runtime.txt:
--------------------------------------------------------------------------------
1 | python-3.11.*
2 |
--------------------------------------------------------------------------------
/examples/website_rag/.gitignore:
--------------------------------------------------------------------------------
1 | .mypy_cache
2 | venv/
--------------------------------------------------------------------------------
/tests/policy_packs/aws/PulumiPolicy.yaml:
--------------------------------------------------------------------------------
1 | runtime: python
2 |
--------------------------------------------------------------------------------
/examples/invoice_extraction/.gitignore:
--------------------------------------------------------------------------------
1 | .mypy_cache
2 | venv/
--------------------------------------------------------------------------------
/e2e/conftest.py:
--------------------------------------------------------------------------------
1 | from .pytest_kind.plugin import *  # noqa: F401, F403
2 |
--------------------------------------------------------------------------------
/examples/website_rag/.cnignore:
--------------------------------------------------------------------------------
1 | venv
2 | __pycache__
3 | .mypy_cache
4 |
--------------------------------------------------------------------------------
/examples/invoice_extraction/.cnignore:
--------------------------------------------------------------------------------
1 | venv
2 | __pycache__
3 | .mypy_cache
4 |
--------------------------------------------------------------------------------
/examples/invoice_extraction/Procfile:
--------------------------------------------------------------------------------
1 | web: python serve.py
2 | serve: python serve.py
3 |
--------------------------------------------------------------------------------
/docs/img/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jjleng/paka/HEAD/docs/img/architecture.png
--------------------------------------------------------------------------------
/docs/img/tokens_per_sec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jjleng/paka/HEAD/docs/img/tokens_per_sec.png
--------------------------------------------------------------------------------
/examples/invoice_extraction/invoices/invoice-2024-01-01.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jjleng/paka/HEAD/examples/invoice_extraction/invoices/invoice-2024-01-01.pdf
--------------------------------------------------------------------------------
/examples/invoice_extraction/invoices/invoice-2024-01-31.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jjleng/paka/HEAD/examples/invoice_extraction/invoices/invoice-2024-01-31.pdf
--------------------------------------------------------------------------------
/examples/invoice_extraction/invoices/invoice-2024-02-29.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jjleng/paka/HEAD/examples/invoice_extraction/invoices/invoice-2024-02-29.pdf
--------------------------------------------------------------------------------
/examples/invoice_extraction/invoices/invoice-2024-03-31.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jjleng/paka/HEAD/examples/invoice_extraction/invoices/invoice-2024-03-31.pdf
--------------------------------------------------------------------------------
/examples/website_rag/Procfile:
--------------------------------------------------------------------------------
1 | ingest: python ingest.py https://python.langchain.com/docs/get_started/introduction
2 | web: python serve.py
3 | serve: python serve.py
4 |
--------------------------------------------------------------------------------
/examples/website_rag/constants.py:
--------------------------------------------------------------------------------
1 | QDRANT_URL = "http://qdrant.qdrant.svc.cluster.local:6333"
2 | LLM_URL = "http://llama2-7b-chat"
3 | EMBEDDING_URL = "http://gte-base"
4 |
--------------------------------------------------------------------------------
/paka/__init__.py:
--------------------------------------------------------------------------------
1 | from importlib.metadata import PackageNotFoundError, version
2 |
3 | try:
4 |     __version__ = version(__name__)
5 | except PackageNotFoundError:
6 |     __version__ = ""
7 |
--------------------------------------------------------------------------------
/paka/k8s/model_group/manifest.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 |
3 |
4 | class Manifest(BaseModel):
5 |     name: str
6 |     url: str
7 |     type: str
8 |     file: str
9 |     sha256: str
10 |
--------------------------------------------------------------------------------
/paka/cluster/utils.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 |
3 | from paka.cluster.context import Context
4 | from paka.model.store import ModelStore, S3ModelStore
5 |
6 |
7 | def get_model_store(ctx: Context, *args: Any, **kwargs: Any) -> ModelStore:
8 |     assert ctx.provider == "aws"
9 |
10 |     return S3ModelStore(ctx.bucket, *args, **kwargs)
11 |
--------------------------------------------------------------------------------
/tests/policy_packs/aws/__main__.py:
--------------------------------------------------------------------------------
1 | from pulumi_policy import EnforcementLevel, PolicyPack
2 |
3 | from tests.policy_packs.aws.container_registry import ecr_policies
4 | from tests.policy_packs.aws.eks import model_group_taints
5 | from tests.policy_packs.aws.object_store import s3_policies
6 |
7 | PolicyPack(
8 |     name="aws",
9 |     enforcement_level=EnforcementLevel.MANDATORY,
10 |     policies=s3_policies + ecr_policies + [model_group_taints],
11 | )
12 |
--------------------------------------------------------------------------------
/tests/container/test_ecr.py:
--------------------------------------------------------------------------------
1 | from unittest.mock import MagicMock, patch
2 |
3 | from moto import mock_aws
4 |
5 | from paka.container.ecr import authenticate_docker_to_ecr
6 |
7 |
8 | @mock_aws
9 | def test_authenticate_docker_to_ecr() -> None:
10 |     with patch("subprocess.Popen") as mock_popen:
11 |         mock_result = MagicMock()
12 |         mock_result.communicate.return_value = (b"", b"")
13 |         mock_result.returncode = 0
14 |         mock_popen.return_value = mock_result
15 |
16 |         authenticate_docker_to_ecr("us-west-2")
17 |
--------------------------------------------------------------------------------
/paka/constants.py:
--------------------------------------------------------------------------------
1 | # The name of the project
2 | PROJECT_NAME = "paka"
3 |
4 | # The service account that has access to all resources
5 | ACCESS_ALL_SA = "access-all-sa"
6 |
7 | # The environment variable for the directory where the paka data is saved
8 | HOME_ENV_VAR = "PAKA_HOME"
9 |
10 | # The environment variable for the buildpack builder
11 | BP_BUILDER_ENV_VAR = "BP_BUILDER"
12 |
13 | # The path where the model files are mounted in the container
14 | MODEL_MOUNT_PATH = "/data"
15 |
16 | # Pulumi stack name
17 | PULUMI_STACK_NAME = "default"
18 |
--------------------------------------------------------------------------------
/tests/policy_packs/aws/test_cluster.yaml:
--------------------------------------------------------------------------------
1 | aws:
2 |   cluster:
3 |     name: test-cluster
4 |     region: us-west-2
5 |     nodeType: t2.micro
6 |     minNodes: 2
7 |     maxNodes: 2
8 |   modelGroups:
9 |     - nodeType: c7a.xlarge
10 |       minInstances: 1
11 |       maxInstances: 1
12 |       name: llama2-7b
13 |       runtime:
14 |         image: ghcr.io/ggerganov/llama.cpp:server
15 |       model:
16 |         hfRepoId: TheBloke/Llama-2-7B-GGUF
17 |         files: ["*.Q4_0.gguf"]
18 |   vectorStore:
19 |     nodeType: t2.small
20 |     replicas: 2
21 |
--------------------------------------------------------------------------------
/e2e/test_cluster.yaml:
--------------------------------------------------------------------------------
1 | # kind-config.yaml
2 | kind: Cluster
3 | apiVersion: kind.x-k8s.io/v1alpha4
4 | nodes:
5 |   - role: control-plane
6 |   - role: worker
7 |   - role: worker
8 |     kubeadmConfigPatches:
9 |       - |
10 |         kind: JoinConfiguration
11 |         nodeRegistration:
12 |           taints:
13 |             - key: "app"
14 |               value: "model-group"
15 |               effect: "NoSchedule"
16 |             - key: "model"
17 |               value: "gte-base"
18 |               effect: "NoSchedule"
19 |           kubeletExtraArgs:
20 |             node-labels: "app=model-group,model=gte-base"
21 |
--------------------------------------------------------------------------------
/tests/config/snapshots/test_config/test_aws_yaml/aws_yaml.txt:
--------------------------------------------------------------------------------
1 | version: '1.0'
2 | aws:
3 |   cluster:
4 |     name: test-cluster
5 |     region: us-east-1
6 |     namespace: default
7 |     nodeType: t2.micro
8 |     minNodes: 2
9 |     maxNodes: 2
10 |     logRetentionDays: 14
11 |   modelGroups:
12 |     - minInstances: 1
13 |       maxInstances: 2
14 |       nodeType: t2.micro
15 |       diskSize: 20
16 |       name: test-model-group
17 |       runtime:
18 |         image: test-image
19 |       resourceRequest:
20 |         cpu: 500m
21 |         memory: 2Gi
22 |       isPublic: false
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: install test lint setup policy-pack type-check check-all e2e
2 |
3 | install:
4 | 	poetry install
5 |
6 | test:
7 | 	poetry run pytest tests
8 |
9 | e2e:
10 | 	poetry run pytest --cluster-config e2e/test_cluster.yaml e2e
11 |
12 | type-check:
13 | 	poetry run mypy paka tests
14 |
15 | policy-pack:
16 | 	poetry run python -m paka.cli cluster preview -f $(shell pwd)/tests/policy_packs/aws/test_cluster.yaml --policy-pack $(shell pwd)/tests/policy_packs/aws
17 |
18 | setup:
19 | 	poetry run pre-commit install
20 |
21 | lint: setup
22 | 	poetry run pre-commit run --all-files --show-diff-on-failure
23 |
24 | check-all: lint type-check test e2e
25 |
--------------------------------------------------------------------------------
/paka/cluster/aws/container_registry.py:
--------------------------------------------------------------------------------
1 | import pulumi_aws as aws
2 |
3 | from paka.cluster.context import Context
4 | from paka.utils import call_once
5 |
6 |
7 | @call_once
8 | def create_container_registry(ctx: Context) -> None:
9 |     """
10 |     Create a container registry in AWS ECR for storing Docker images.
11 |
12 |     Returns:
13 |         None
14 |     """
15 |     repository = aws.ecr.Repository(
16 |         ctx.cluster_name,
17 |         force_delete=True,
18 |         image_tag_mutability="MUTABLE",
19 |     )
20 |
21 |     # Save the repository URL to the cluster data file
22 |     repository.repository_url.apply(lambda url: ctx.set_registry(url))
23 |
--------------------------------------------------------------------------------
/paka/cluster/aws/object_store.py:
--------------------------------------------------------------------------------
1 | import pulumi_aws as aws
2 |
3 | from paka.cluster.context import Context
4 | from paka.utils import call_once
5 |
6 |
7 | @call_once
8 | def create_object_store(ctx: Context) -> None:
9 |     """
10 |     Creates an object store in AWS S3 based on the provided configuration.
11 |
12 |     Returns:
13 |         None
14 |     """
15 |     # `bucket` is the name of the bucket; setting it explicitly avoids Pulumi appending a random string to the name
16 |     # `force_destroy` is needed to delete the bucket when it's not empty
17 |     bucket = aws.s3.Bucket(ctx.cluster_name, force_destroy=True)
18 |     bucket.id.apply(lambda id: ctx.set_bucket(id))
19 |
--------------------------------------------------------------------------------
/.github/workflows/publish_release.yml:
--------------------------------------------------------------------------------
1 | name: Release to PyPI
2 |
3 | on:
4 |   push:
5 |     tags:
6 |       - 'v*'
7 |
8 | jobs:
9 |   build-and-publish:
10 |     runs-on: ubuntu-latest
11 |     environment:
12 |       name: pypi
13 |       url: https://pypi.org/p/paka
14 |     permissions:
15 |       id-token: write
16 |     steps:
17 |       - uses: actions/checkout@v2
18 |       - name: Set up Python
19 |         uses: actions/setup-python@v2
20 |         with:
21 |           python-version: '3.11'
22 |       - name: Install Poetry
23 |         run: pip install poetry
24 |       - name: Build package
25 |         run: poetry build
26 |       - name: Publish to PyPI
27 |         uses: pypa/gh-action-pypi-publish@release/v1
28 |
--------------------------------------------------------------------------------
/.github/workflows/pull-request-tests.yml:
--------------------------------------------------------------------------------
1 | name: 'Run tests on pr'
2 |
3 | run-name: ${{ github.actor }} has created a pull request 💻
4 |
5 | on:
6 |   push:
7 |     branches: ['main']
8 |   pull_request:
9 |     branches: ['main']
10 |
11 | permissions:
12 |   contents: read
13 |
14 | jobs:
15 |   build:
16 |     runs-on: ubuntu-latest
17 |
18 |     steps:
19 |       - uses: actions/checkout@v3
20 |
21 |       - name: Set up Python 3.10
22 |         uses: actions/setup-python@v3
23 |         with:
24 |           python-version: '3.10'
25 |
26 |       - name: Install poetry
27 |         run: pip3 install poetry
28 |
29 |       - name: Install dependencies
30 |         run: poetry install
31 |
32 |       - name: Test with pytest
33 |         run: make check-all
34 |
--------------------------------------------------------------------------------
/tests/test_examples.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import pytest
4 |
5 | from paka.config import parse_yaml
6 |
7 | examples_path = Path(__file__).parent.parent / "examples"
8 |
9 |
10 | @pytest.mark.parametrize(
11 |     "cluster_config",
12 |     [
13 |         examples_path / "website_rag" / "cluster.yaml",
14 |         examples_path / "invoice_extraction" / "cluster.yaml",
15 |         examples_path / "invoice_extraction" / "cluster_cpu.yaml",
16 |     ],
17 | )
18 | def test_example_configs(cluster_config: Path) -> None:
19 |     cluster_config = Path(cluster_config).expanduser().absolute()
20 |
21 |     if not cluster_config.exists():
22 |         raise FileNotFoundError(f"The cluster config file {cluster_config} does not exist")
23 |
24 |     with open(cluster_config, "r") as file:
25 |         parse_yaml(file.read())
26 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 |   - repo: https://github.com/pre-commit/pre-commit-hooks
3 |     rev: 'v4.5.0'
4 |     hooks:
5 |       - id: check-merge-conflict
6 |       - id: check-toml
7 |       - id: check-yaml
8 |       - id: detect-private-key
9 |       - id: end-of-file-fixer
10 |       - id: mixed-line-ending
11 |       - id: trailing-whitespace
12 |   - repo: https://github.com/pre-commit/mirrors-mypy
13 |     rev: 'v1.7.1'
14 |     hooks:
15 |       - id: mypy
16 |         additional_dependencies:
17 |           - types-requests
18 |           - types-tabulate
19 |           - pydantic
20 |           - "pydantic[mypy]"
21 |   - repo: https://github.com/pre-commit/mirrors-isort
22 |     rev: 'v5.10.1'
23 |     hooks:
24 |       - id: isort
25 |   - repo: https://github.com/psf/black
26 |     rev: '24.1a1'
27 |     hooks:
28 |       - id: black
29 |
--------------------------------------------------------------------------------
/examples/invoice_extraction/cluster_cpu.yaml:
--------------------------------------------------------------------------------
1 | version: "1.2"
2 | aws:
3 |   cluster:
4 |     name: invoice-extraction
5 |     region: us-west-2
6 |     namespace: default
7 |     nodeType: t3a.medium
8 |     minNodes: 2
9 |     maxNodes: 4
10 |   prometheus:
11 |     enabled: false
12 |   tracing:
13 |     enabled: false
14 |   mixedModelGroups:
15 |     - nodeType: c7i.large
16 |       baseInstances: 0
17 |       maxOnDemandInstances: 1
18 |       spot:
19 |         minInstances: 1
20 |         maxInstances: 3
21 |       name: llama2-7b-chat
22 |       runtime:
23 |         image: ghcr.io/ggerganov/llama.cpp:server
24 |       model:
25 |         hfRepoId: TheBloke/Llama-2-7B-Chat-GGUF
26 |         files: ["*.Q4_0.gguf"] # Use the q4 quantization
27 |       autoScaleTriggers:
28 |         - type: cpu
29 |           metadata:
30 |             type: Utilization
31 |             value: "50"
32 |
--------------------------------------------------------------------------------
/examples/website_rag/embeddings.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import requests
4 | from constants import EMBEDDING_URL
5 | from langchain_core.embeddings import Embeddings
6 | from langchain_core.pydantic_v1 import BaseModel
7 |
8 | MAX_ATTEMPTS = 10000
9 |
10 |
11 | class LlamaEmbeddings(BaseModel, Embeddings):
12 |     def embed_documents(self, texts: List[str]) -> List[List[float]]:
13 |         url = f"{EMBEDDING_URL}/v1/embeddings"
14 |         headers = {"Content-Type": "application/json", "accept": "application/json"}
15 |         data = {
16 |             "input": texts,
17 |         }
18 |
19 |         response = requests.post(url, headers=headers, json=data, verify=False)
20 |
21 |         return [data["embedding"] for data in response.json()["data"]]
22 |
23 |     def embed_query(self, text: str) -> List[float]:
24 |         return self.embed_documents([text])[0]
25 |
--------------------------------------------------------------------------------
/paka/logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | # Create a logger
4 | logger = logging.getLogger(__name__)
5 |
6 |
7 | # "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
8 | def setup_logger(verbose: bool = False, format: str = "%(message)s") -> None:
9 |     # Set the logging level based on the verbose flag
10 |     logger.setLevel(logging.DEBUG if verbose else logging.INFO)
11 |
12 |     for handler in logger.handlers:
13 |         logger.removeHandler(handler)
14 |
15 |     # Create a console handler
16 |     ch = logging.StreamHandler()
17 |     ch.setLevel(logging.DEBUG if verbose else logging.INFO)
18 |
19 |     # Create a formatter
20 |     formatter = logging.Formatter(format)
21 |
22 |     # Add the formatter to the console handler
23 |     ch.setFormatter(formatter)
24 |
25 |     # Add the console handler to the logger
26 |     logger.addHandler(ch)
27 |
28 |
29 | setup_logger()
30 |
--------------------------------------------------------------------------------
/paka/cluster/manager/aws.py:
--------------------------------------------------------------------------------
1 | from paka.cluster.aws.container_registry import create_container_registry
2 | from paka.cluster.aws.eks import create_k8s_cluster
3 | from paka.cluster.aws.object_store import create_object_store
4 | from paka.cluster.manager.base import ClusterManager
5 | from paka.config import Config
6 |
7 |
8 | class AWSClusterManager(ClusterManager):
9 |     """
10 |     AWS-specific implementation of the ClusterManager abstract base class.
11 |
12 |     The AWSClusterManager class is responsible for managing a cluster of AWS resources.
13 |     It provides methods for creating and managing AWS-specific resources such as EKS clusters,
14 |     node groups, and service accounts. It also handles AWS-specific configuration and setup tasks.
15 |     """
16 |
17 |     def __init__(self, config: Config) -> None:
18 |         super().__init__(config)
19 |
20 |     def provision_k8s(self) -> None:
21 |         create_object_store(self.ctx)
22 |         create_container_registry(self.ctx)
23 |         create_k8s_cluster(self.ctx)
24 |
--------------------------------------------------------------------------------
/examples/invoice_extraction/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp==3.9.3
2 | aiosignal==1.3.1
3 | annotated-types==0.6.0
4 | anyio==4.3.0
5 | attrs==23.2.0
6 | certifi==2024.2.2
7 | charset-normalizer==3.3.2
8 | click==8.1.7
9 | dataclasses-json==0.6.4
10 | fastapi==0.110.1
11 | frozenlist==1.4.1
12 | h11==0.14.0
13 | idna==3.6
14 | install==1.3.5
15 | jsonpatch==1.33
16 | jsonpointer==2.4
17 | langchain==0.1.14
18 | langchain-community==0.0.31
19 | langchain-core==0.1.40
20 | langchain-text-splitters==0.0.1
21 | langsmith==0.1.40
22 | marshmallow==3.21.1
23 | multidict==6.0.5
24 | mypy-extensions==1.0.0
25 | numpy==1.26.4
26 | orjson==3.10.0
27 | packaging==23.2
28 | pydantic==2.6.4
29 | pydantic_core==2.16.3
30 | pypdf==4.1.0
31 | python-multipart==0.0.9
32 | PyYAML==6.0.1
33 | requests==2.31.0
34 | six==1.16.0
35 | sniffio==1.3.1
36 | SQLAlchemy==2.0.29
37 | starlette==0.37.2
38 | sse-starlette==1.8.2
39 | sseclient-py==1.8.0
40 | tenacity==8.2.3
41 | typing-inspect==0.9.0
42 | typing_extensions==4.10.0
43 | urllib3==2.2.1
44 | uvicorn==0.29.0
45 | yarl==1.9.4
46 |
--------------------------------------------------------------------------------
/paka/cluster/namespace.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import pulumi
4 | import pulumi_kubernetes as k8s
5 | from kubernetes import client, config
6 |
7 | from paka.cluster.context import Context
8 |
9 |
10 | def create_namespace(ctx: Context, kubeconfig_json: str) -> None:
11 |     # Pulumi does not support creating the default namespace again, so we need to handle it separately
12 |     if ctx.namespace != "default":
13 |         k8s.core.v1.Namespace(
14 |             "app-ns",
15 |             metadata={
16 |                 "name": ctx.namespace,
17 |                 "labels": {"istio-injection": "enabled"},
18 |             },
19 |             opts=pulumi.ResourceOptions(provider=ctx.k8s_provider),
20 |         )
21 |     else:
22 |         config.load_kube_config_from_dict(json.loads(kubeconfig_json))
23 |         # We are dealing with the default namespace
24 |         api_instance = client.CoreV1Api()
25 |
26 |         body = {"metadata": {"labels": {"istio-injection": "enabled"}}}
27 |
28 |         api_instance.patch_namespace("default", body)
29 |
--------------------------------------------------------------------------------
/paka/cli/kubeconfig.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import json
4 | import os
5 | from typing import Optional
6 |
7 | import typer
8 |
9 | from paka.cli.utils import ensure_cluster_name
10 | from paka.k8s.utils import update_kubeconfig
11 | from paka.logger import logger
12 | from paka.utils import read_pulumi_stack
13 |
14 | kube_app = typer.Typer()
15 |
16 |
17 | @kube_app.command()
18 | def update(
19 |     cluster_name: Optional[str] = typer.Option(
20 |         os.getenv("PAKA_CURRENT_CLUSTER"),
21 |         "--cluster",
22 |         "-c",
23 |         help="The name of the cluster.",
24 |     )
25 | ) -> None:
26 |     """
27 |     Updates the default kubeconfig file (~/.kube/config) to include the connection
28 |     details of the specified cluster.
29 |     """
30 |     logger.info("Updating kubeconfig...")
31 |     cluster_name = ensure_cluster_name(cluster_name)
32 |     kubeconfig = read_pulumi_stack(cluster_name, "kubeconfig")
33 |
34 |     update_kubeconfig(json.loads(kubeconfig))
35 |     logger.info("Successfully updated kubeconfig.")
36 |
--------------------------------------------------------------------------------
/paka/cluster/zipkin.py:
--------------------------------------------------------------------------------
1 | import pulumi
2 | from pulumi_kubernetes.helm.v3 import Chart, ChartOpts, FetchOpts
3 |
4 | from paka.cluster.context import Context
5 | from paka.utils import call_once
6 |
7 |
8 | @call_once
9 | def create_zipkin(ctx: Context) -> None:
10 |     """
11 |     Installs zipkin with a helm chart.
12 |     """
13 |
14 |     config = ctx.cloud_config
15 |
16 |     if not config.tracing or not config.tracing.enabled:
17 |         return
18 |
19 |     autoscaling = (
20 |         {"autoscaling": {"enabled": True}} if config.tracing.autoScalingEnabled else {}
21 |     )
22 |
23 |     Chart(
24 |         "zipkin",
25 |         ChartOpts(
26 |             chart="zipkin",
27 |             version="0.1.2",
28 |             namespace="istio-system",
29 |             fetch_opts=FetchOpts(repo="https://zipkin.io/zipkin-helm"),
30 |             values={
31 |                 **autoscaling,
32 |                 **(config.tracing.zipkinHelmSettings or {}),
33 |             },
34 |         ),
35 |         opts=pulumi.ResourceOptions(provider=ctx.k8s_provider),
36 |     )
37 |
--------------------------------------------------------------------------------
/tests/model/test_progress_bar.py:
--------------------------------------------------------------------------------
1 | from paka.model.progress_bar import NullProgressBar, ProgressBar
2 |
3 |
4 | def test_progress_bar() -> None:
5 |     pb = ProgressBar("Testing")
6 |     fake_pb = NullProgressBar("Testing")
7 |
8 |     pb.create_progress_bar(100)
9 |     fake_pb.create_progress_bar(100)
10 |     assert pb.progress_bar is not None
11 |     assert pb.progress_bar.total == 100
12 |     assert pb.progress_bar.desc == "Testing"
13 |
14 |     pb.advance_progress_bar("task1", 10)
15 |     fake_pb.advance_progress_bar("task1", 10)
16 |     assert pb.progress_bar.n == 10
17 |
18 |     pb.set_postfix_str("test postfix")
19 |     fake_pb.set_postfix_str("test postfix")
20 |     assert pb.progress_bar.postfix == "test postfix"
21 |
22 |     pb.update_progress_bar("task2", 20)
23 |     fake_pb.update_progress_bar("task2", 20)
24 |     assert pb.progress_bar.total == 120
25 |
26 |     pb.clear_counter()
27 |     fake_pb.clear_counter()
28 |     assert pb.counter == {}
29 |
30 |     pb.close_progress_bar()
31 |     fake_pb.close_progress_bar()
32 |     assert pb.progress_bar is None
33 |
--------------------------------------------------------------------------------
/tests/model/test_settings.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from paka.model.settings import ModelSettings
4 |
5 |
6 | def test_validate_quantization() -> None:
7 |     settings = ModelSettings(
8 |         inference_devices=["cpu"], quantization="GPTQ", runtime="llama.cpp"
9 |     )
10 |     assert settings.quantization == "GPTQ"
11 |
12 |     with pytest.raises(ValueError):
13 |         ModelSettings(
14 |             inference_devices=["cpu"],
15 |             quantization="invalid_quantization",
16 |             runtime="llama.cpp",
17 |         )
18 |
19 |
20 | def test_validate_prompt_template_name() -> None:
21 |     settings = ModelSettings(
22 |         inference_devices=["cpu"],
23 |         quantization="GPTQ",
24 |         runtime="llama.cpp",
25 |         prompt_template_name="chatml",
26 |     )
27 |     assert settings.prompt_template_name == "chatml"
28 |
29 |     with pytest.raises(ValueError):
30 |         ModelSettings(
31 |             inference_devices=["cpu"],
32 |             quantization="GPTQ",
33 |             runtime="llama.cpp",
34 |             prompt_template_name="invalid_template",
35 |         )
36 |
--------------------------------------------------------------------------------
/tests/model/test_base_model.py:
--------------------------------------------------------------------------------
1 | import io
2 | from unittest.mock import MagicMock, patch
3 |
4 | from paka.model.base_model import BaseMLModel
5 |
6 |
7 | class ConcreteMLModel(BaseMLModel):
8 |     def save(self) -> None:
9 |         pass
10 |
11 |
12 | def test_base_ml_model() -> None:
13 |
14 |     progress_bar_mock = MagicMock()
15 |     model_store_mock = MagicMock(progress_bar=progress_bar_mock)
16 |     model = ConcreteMLModel(
17 |         name="TestModel",
18 |         model_store=model_store_mock,
19 |         quantization="GPTQ",
20 |         prompt_template_name=None,
21 |         prompt_template_str=None,
22 |     )
23 |
24 |     model.save_manifest_yml()
25 |     model_store_mock.save.assert_called_once()
26 |
27 |     stream = io.BytesIO(b"Test data")
28 |
29 |     model.save_single_stream("test.txt", stream, 9, "test_sha256")
30 |     model_store_mock.save_stream.assert_called_with(
31 |         "test.txt", stream, 9, "test_sha256"
32 |     )
33 |     assert ("test.txt", "test_sha256") in model.completed_files
34 |
35 |     model.finish()
36 |     progress_bar_mock.close_progress_bar.assert_called_once()
37 |
--------------------------------------------------------------------------------
/e2e/test_pack_install.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 | from pathlib import Path
4 | from unittest.mock import patch
5 |
6 | import pytest
7 |
8 | from paka.constants import HOME_ENV_VAR
9 | from paka.container.pack import ensure_pack
10 |
11 |
12 | def test_installation_on_windows() -> None:
13 |     with patch(
14 |         "platform.system", return_value="windows"
15 |     ), tempfile.TemporaryDirectory() as temp_dir:
16 |         os.environ[HOME_ENV_VAR] = temp_dir
17 |
18 |         pack = Path(ensure_pack())
19 |
20 |         assert pack.exists()
21 |         assert str(pack).endswith(".exe")
22 |
23 |
24 | @pytest.mark.parametrize(
25 |     "system, arch",
26 |     [("darwin", "amd64"), ("darwin", "arm64"), ("linux", "amd64"), ("linux", "arm64")],
27 | )
28 | def test_installation_on_other_platforms(system: str, arch: str) -> None:
29 |     with patch("platform.system", return_value=system), patch(
30 |         "platform.machine", return_value=arch
31 |     ), tempfile.TemporaryDirectory() as temp_dir:
32 |         os.environ[HOME_ENV_VAR] = temp_dir
33 |
34 |         pack = Path(ensure_pack())
35 |
36 |         assert pack.exists()
37 |
--------------------------------------------------------------------------------
/paka/cluster/aws/cloudwatch.py:
--------------------------------------------------------------------------------
1 | import pulumi_aws as aws
2 |
3 | from paka.cluster.context import Context
4 | from paka.cluster.fluentbit import create_fluentbit
5 | from paka.constants import PROJECT_NAME
6 |
7 | LOG_GROUP = f"EKSContainerLogs/{PROJECT_NAME}"
8 |
9 |
10 | def enable_cloudwatch(ctx: Context) -> None:
11 |     aws.cloudwatch.LogGroup(
12 |         "log-group",
13 |         name=LOG_GROUP,
14 |         retention_in_days=ctx.cloud_config.cluster.logRetentionDays,
15 |     )
16 |
17 |     # Fluent Bit configuration for forwarding logs to CloudWatch
18 |     fluent_bit_config = f"""
19 | [SERVICE]
20 |     Parsers_File /fluent-bit/etc/parsers.conf
21 |
22 | [INPUT]
23 |     Name tail
24 |     Path /var/log/containers/*.log
25 |     Parser docker
26 |     Tag kube.*
27 |     Refresh_Interval 5
28 |
29 | [OUTPUT]
30 |     Name cloudwatch_logs
31 |     Match kube.*
32 |     log_group_name {LOG_GROUP}
33 |     log_stream_prefix eks/
34 |     region {ctx.cloud_config.cluster.region}
35 | """
36 |     create_fluentbit(ctx, fluent_bit_config)
37 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023-2024 Jijun Leng
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining
6 | a copy of this software and associated documentation files (the
7 | "Software"), to deal in the Software without restriction, including
8 | without limitation the rights to use, copy, modify, merge, publish,
9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 |
--------------------------------------------------------------------------------
/tests/cli/test_function.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import typer
3 |
4 | from paka.cli.function import process_traffic_splits, validate_traffic_split
5 |
6 |
7 | def test_validate_traffic_split() -> None:
8 |     # Test valid input
9 |     assert validate_traffic_split("rev1=20") == ("rev1", 20)
10 |
11 |     # Test missing '='
12 |     with pytest.raises(ValueError):
13 |         validate_traffic_split("rev120")
14 |
15 |     # Test non-numeric percentage
16 |     with pytest.raises(ValueError):
17 |         validate_traffic_split("rev1=twenty")
18 |
19 |     # Test percentage out of range
20 |     with pytest.raises(ValueError):
21 |         validate_traffic_split("rev1=101")
22 |
23 |
24 | def test_process_traffic_splits() -> None:
25 |     # Test valid input
26 |     splits, total = process_traffic_splits(["rev1=20", "rev2=30"])
27 |     assert splits == [("rev1", 20), ("rev2", 30)]
28 |     assert total == 50
29 |
30 |     # Test duplicate revisions
31 |     with pytest.raises(typer.Exit):
32 |         process_traffic_splits(["rev1=20", "rev1=30"])
33 |
34 |     # Test invalid split
35 |     with pytest.raises(ValueError):
36 |         process_traffic_splits(["rev1=20", "rev2=thirty"])
37 |
--------------------------------------------------------------------------------
/examples/invoice_extraction/cluster.yaml:
--------------------------------------------------------------------------------
1 | version: "1.2"
2 | aws:
3 |   cluster:
4 |     name: invoice-extraction
5 |     region: us-west-2
6 |     namespace: default
7 |     nodeType: t3a.medium
8 |     minNodes: 2
9 |     maxNodes: 4
10 |   prometheus:
11 |     enabled: true
12 |   tracing:
13 |     enabled: false
14 |   mixedModelGroups:
15 |     - name: llama2-7b-chat
16 |       nodeType: g4dn.xlarge
17 |       gpu:
18 |         enabled: true # This model group runs on GPU-enabled instances
19 |       baseInstances: 0
20 |       maxOnDemandInstances: 1
21 |       spot:
22 |         minInstances: 1
23 |         maxInstances: 2
24 |       runtime:
25 |         image: vllm/vllm-openai:v0.4.2
26 |       model:
27 |         hfRepoId: TheBloke/Llama-2-7B-Chat-GPTQ
28 |       autoScaleTriggers:
29 |         - type: prometheus
30 |           metadata:
31 |             serverAddress: http://kube-prometheus-stack-prometheus.prometheus.svc.cluster.local:9090 # Prometheus endpoint
32 |             metricName: latency_p95
33 |             threshold: '20000' # Set to 20s, tune as needed
34 |             query: | # Trigger scaling if p95 latency exceeds 20s
35 |               histogram_quantile(0.95, sum(rate(istio_request_duration_milliseconds_bucket{destination_service="llama2-7b-chat.default.svc.cluster.local"}[5m])) by (le))
36 |
--------------------------------------------------------------------------------
/examples/website_rag/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp==3.9.3
2 | aiosignal==1.3.1
3 | annotated-types==0.6.0
4 | anyio==4.2.0
5 | attrs==23.2.0
6 | beautifulsoup4==4.12.3
7 | certifi==2024.2.2
8 | charset-normalizer==3.3.2
9 | click==8.1.7
10 | dataclasses-json==0.6.4
11 | fancycompleter==0.9.1
12 | fastapi==0.109.2
13 | frozenlist==1.4.1
14 | grpcio==1.60.1
15 | grpcio-tools==1.60.1
16 | h11==0.14.0
17 | h2==4.1.0
18 | hpack==4.0.0
19 | httpcore==1.0.2
20 | httpx==0.26.0
21 | hyperframe==6.0.1
22 | idna==3.6
23 | jsonpatch==1.33
24 | jsonpointer==2.4
25 | langchain==0.1.4
26 | langchain-community==0.0.16
27 | langchain-core==0.1.18
28 | langdetect==1.0.9
29 | langserve==0.0.41
30 | langsmith==0.0.86
31 | marshmallow==3.20.2
32 | multidict==6.0.5
33 | mypy-extensions==1.0.0
34 | numpy==1.26.4
35 | orjson==3.9.13
36 | packaging==23.2
37 | portalocker==2.8.2
38 | protobuf==4.25.2
39 | pydantic==2.6.1
40 | pydantic_core==2.16.2
41 | Pygments==2.17.2
42 | pyrepl==0.9.0
43 | python-dotenv==1.0.1
44 | PyYAML==6.0.1
45 | qdrant-client==1.7.2
46 | requests==2.31.0
47 | six==1.16.0
48 | sniffio==1.3.0
49 | soupsieve==2.5
50 | SQLAlchemy==2.0.25
51 | sse-starlette==1.8.2
52 | sseclient-py==1.8.0
53 | starlette==0.36.3
54 | tenacity==8.2.3
55 | typing-inspect==0.9.0
56 | typing_extensions==4.9.0
57 | urllib3==2.2.0
58 | uvicorn==0.27.0.post1
59 | wmctrl==0.5
60 | yarl==1.9.4
61 |
--------------------------------------------------------------------------------
/paka/cluster/keda.py:
--------------------------------------------------------------------------------
1 | import pulumi
2 | import pulumi_kubernetes as k8s
3 | from pulumi_kubernetes.helm.v3 import Chart, ChartOpts, FetchOpts
4 |
5 | from paka.cluster.context import Context
6 | from paka.cluster.prometheus import create_prometheus
7 | from paka.utils import call_once
8 |
9 |
10 | @call_once
11 | def create_keda(ctx: Context) -> None:
12 |     """
13 |     Installs a KEDA chart.
14 |     """
15 |     prometheus = create_prometheus(ctx)
16 |
17 |     # Prometheus is a dependency for KEDA to work with the Prometheus metrics.
18 |     # However, Prometheus might not be enabled in the config. In that case,
19 |     # deletion of the KEDA resource will be blocked if Prometheus trigger is used.
20 |     dependencies = [prometheus] if prometheus else []
21 |
22 |     ns = k8s.core.v1.Namespace(
23 |         "keda",
24 |         metadata={"name": "keda"},
25 |         opts=pulumi.ResourceOptions(provider=ctx.k8s_provider),
26 |     )
27 |     Chart(
28 |         "keda",
29 |         ChartOpts(
30 |             chart="keda",
31 |             version="2.12.1",
32 |             namespace="keda",
33 |             fetch_opts=FetchOpts(repo="https://kedacore.github.io/charts"),
34 |             values={},
35 |         ),
36 |         opts=pulumi.ResourceOptions(
37 |             provider=ctx.k8s_provider, depends_on=[ns, *dependencies]
38 |         ),
39 |     )
40 |
--------------------------------------------------------------------------------
/tests/model/test_http_model.py:
--------------------------------------------------------------------------------
1 | from unittest.mock import MagicMock, patch
2 |
3 | import paka.model.http_model
4 | from paka.model.http_model import BaseMLModel, HttpSourceModel
5 |
6 |
7 | def test_http_source_model() -> None:
8 |     with patch.object(
9 |         paka.model.http_model.requests, "get"
10 |     ) as mock_requests_get, patch.object(
11 |         BaseMLModel,
12 |         "finish",
13 |         return_value=MagicMock(),
14 |     ) as finish_mock:
15 |         model_store_mock = MagicMock()
16 |         model = HttpSourceModel(
17 |             name="TestModel",
18 |             urls=["http://example.com/file1", "http://example.com/file2"],
19 |             model_store=model_store_mock,
20 |             quantization="GPTQ",
21 |             prompt_template_name=None,
22 |             prompt_template_str=None,
23 |         )
24 |
25 |         mock_response = MagicMock()
26 |         mock_response.headers.get.return_value = 10
27 |         mock_requests_get.return_value.__enter__.return_value = mock_response
28 |
29 |         model.save()
30 |         mock_requests_get.assert_called()
31 |         model_store_mock.save_stream.assert_called()
32 |         finish_mock.assert_called_once()
33 |
34 |         model._save_single_url("http://example.com/file1")
35 |         mock_requests_get.assert_called_with("http://example.com/file1", stream=True)
36 |         model_store_mock.save_stream.assert_called()
37 |
--------------------------------------------------------------------------------
/e2e/test_pulumi_install.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 | from unittest.mock import patch
4 |
5 | import pytest
6 |
7 | from paka.cluster.pulumi import ensure_pulumi
8 | from paka.constants import HOME_ENV_VAR
9 |
10 |
11 | @pytest.mark.parametrize(
12 |     "system, arch",
13 |     [
14 |         ("darwin", "amd64"),
15 |         ("darwin", "arm64"),
16 |         ("linux", "amd64"),
17 |         ("linux", "arm64"),
18 |         ("windows", "amd64"),
19 |         ("windows", "arm64"),
20 |     ],
21 | )
22 | def test_installation(system: str, arch: str) -> None:
23 |     with patch("platform.system", return_value=system), patch(
24 |         "platform.machine", return_value=arch
25 |     ), tempfile.TemporaryDirectory() as temp_dir:
26 |         os.environ[HOME_ENV_VAR] = temp_dir
27 |         orig_path = os.environ["PATH"]
28 |
29 |         try:
30 |             ensure_pulumi()
31 |             bin = "pulumi"
32 |             if system == "windows":
33 |                 bin += ".exe"
34 |             paths = os.environ["PATH"].split(":")
35 |             list_of_list = [p.split(";") for p in paths if p]
36 |             paths = [item for sublist in list_of_list for item in sublist]
37 |
38 |             for path in paths:
39 |                 if os.path.exists(os.path.join(path, bin)):
40 |                     break
41 |             else:
42 |                 pytest.fail(f"{bin} not found in PATH")
43 |         finally:
44 |             os.environ["PATH"] = orig_path
45 |
--------------------------------------------------------------------------------
/e2e/test_kubectl_install.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 | from unittest.mock import patch
4 |
5 | import pytest
6 |
7 | from paka.cluster.kubectl import ensure_kubectl
8 | from paka.constants import HOME_ENV_VAR
9 |
10 |
11 | @pytest.mark.parametrize(
12 |     "system, arch",
13 |     [
14 |         ("darwin", "amd64"),
15 |         ("darwin", "arm64"),
16 |         ("linux", "amd64"),
17 |         ("linux", "arm64"),
18 |         ("windows", "amd64"),
19 |         ("windows", "arm64"),
20 |     ],
21 | )
22 | def test_installation(system: str, arch: str) -> None:
23 |     with patch("platform.system", return_value=system), patch(
24 |         "platform.machine", return_value=arch
25 |     ), tempfile.TemporaryDirectory() as temp_dir:
26 |         os.environ[HOME_ENV_VAR] = temp_dir
27 |         orig_path = os.environ["PATH"]
28 |
29 |         try:
30 |             ensure_kubectl()
31 |             bin = "kubectl"
32 |             if system == "windows":
33 |                 bin += ".exe"
34 |             paths = os.environ["PATH"].split(":")
35 |             list_of_list = [p.split(";") for p in paths if p]
36 |             paths = [item for sublist in list_of_list for item in sublist]
37 |
38 |             for path in paths:
39 |                 if os.path.exists(os.path.join(path, bin)):
40 |                     break
41 |             else:
42 |                 pytest.fail(f"{bin} not found in PATH")
43 |         finally:
44 |             os.environ["PATH"] = orig_path
45 |
--------------------------------------------------------------------------------
/examples/invoice_extraction/output_parser.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | from langchain.output_parsers import PydanticOutputParser
4 | from langchain_core.pydantic_v1 import BaseModel, Field
5 |
6 |
7 | class Invoice(BaseModel):
8 |     number: str = Field(description="invoice number, e.g. #25470322")
9 |     date: str = Field(description="invoice date, e.g. 2024-01-01T08:29:56")
10 |     company: str = Field(description="remit to company, e.g. Akamai Technologies, Inc.")
11 |     company_address: str = Field(
12 |         description="remit to address, e.g. 249 Arch St. Philadelphia, PA 19106 USA"
13 |     )
14 |     tax_id: str = Field(description="tax ID/EIN number, e.g. 04-3432319")
15 |     customer: str = Field(description="invoice to customer, e.g. John Doe")
16 |     customer_address: str = Field(
17 |         description="invoice to address, e.g. 123 Main St. Springfield, IL 62701 USA"
18 |     )
19 |     amount: str = Field(description="total amount from this invoice, e.g. $5.00")
20 |
21 |     def to_dict(self) -> Dict:
22 |         return {
23 |             "number": self.number,
24 |             "date": self.date,
25 |             "company": self.company,
26 |             "company_address": self.company_address,
27 |             "tax_id": self.tax_id,
28 |             "customer": self.customer,
29 |             "customer_address": self.customer_address,
30 |             "amount": self.amount,
31 |         }
32 |
33 |
34 | invoice_parser = PydanticOutputParser(pydantic_object=Invoice)
35 |
--------------------------------------------------------------------------------
/paka/k8s/model_group/ingress.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import List
4 |
5 | from kubernetes import client
6 |
7 | from paka.k8s.utils import CustomResource, apply_resource
8 | from paka.utils import kubify_name
9 |
10 |
11 | def create_model_vservice(
12 |     namespace: str, model_name: str, hosts: List[str] = ["*"]
13 | ) -> None:
14 |     istio_virtual_service = CustomResource(
15 |         api_version="networking.istio.io/v1beta1",
16 |         kind="VirtualService",
17 |         plural="virtualservices",
18 |         metadata=client.V1ObjectMeta(name=kubify_name(model_name), namespace=namespace),
19 |         spec={
20 |             "hosts": hosts,
21 |             "gateways": ["knative-serving/knative-ingress-gateway"],
22 |             "http": [
23 |                 {
24 |                     "match": [
25 |                         {
26 |                             "authority": {
27 |                                 "prefix": kubify_name(model_name),
28 |                             }
29 |                         }
30 |                     ],
31 |                     "route": [
32 |                         {
33 |                             "destination": {
34 |                                 "host": f"{kubify_name(model_name)}.{namespace}.svc.cluster.local",
35 |                                 "port": {"number": 80},
36 |                             }
37 |                         }
38 |                     ],
39 |                 }
40 |             ],
41 |         },
42 |     )
43 |
44 |     apply_resource(istio_virtual_service)
45 |
--------------------------------------------------------------------------------
/paka/model/manifest.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import List, Optional
4 |
5 | from pydantic import BaseModel
6 |
7 |
8 | class ModelFile(BaseModel):
9 |     name: str
10 |     sha256: str
11 |
12 |
13 | class ModelManifest(BaseModel):
14 |     """
15 |     A manifest for a model. The manifest is stored along with the model files.
16 |
17 |     Attributes:
18 |         name (str): The name of the model.
19 |         files (List[ModelFile]): A list of model files, where each model file contains a file name and a hash.
20 |         quantization (Optional[str]): The quantization method (GPTQ, AWQ, GGUF_Q4_0, etc.) the model uses.
21 |         prompt_template_name (Optional[str]): The prompt template name (chatml, llama-2, gemma, etc.) the model uses. This field is optional.
22 |         prompt_template_str (Optional[str]): The prompt template string the model uses. This field is optional.
23 |         main_model (Optional[str]): The main model file name. This field is optional.
24 |         clip_model (Optional[str]): The clip model file name. This field is optional and is used for multimodal models.
25 |         lora_model (Optional[str]): The lora model file name. This field is optional.
26 | """ 27 | 28 | name: str 29 | files: List[ModelFile] 30 | quantization: Optional[str] = None 31 | prompt_template_str: Optional[str] = None 32 | prompt_template_name: Optional[str] = None 33 | 34 | main_model: Optional[str] = None 35 | # Clip model is used for multimodal models 36 | clip_model: Optional[str] = None 37 | lora_model: Optional[str] = None 38 | -------------------------------------------------------------------------------- /paka/model/http_model.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import concurrent.futures 4 | from typing import List, Optional 5 | 6 | import requests 7 | 8 | from paka.model.base_model import BaseMLModel 9 | from paka.model.store import ModelStore 10 | 11 | 12 | class HttpSourceModel(BaseMLModel): 13 | def __init__( 14 | self, 15 | name: str, 16 | urls: List[str], 17 | model_store: ModelStore, 18 | quantization: Optional[str] = None, 19 | prompt_template_name: Optional[str] = None, 20 | prompt_template_str: Optional[str] = None, 21 | ) -> None: 22 | super().__init__( 23 | name=name, 24 | model_store=model_store, 25 | quantization=quantization, 26 | prompt_template_name=prompt_template_name, 27 | prompt_template_str=prompt_template_str, 28 | ) 29 | self.urls = urls 30 | 31 | def save(self) -> None: 32 | """ 33 | Save the model to a model store. 34 | """ 35 | with concurrent.futures.ThreadPoolExecutor( 36 | max_workers=self.concurrency 37 | ) as executor: 38 | futures = [executor.submit(self._save_single_url, url) for url in self.urls] 39 | concurrent.futures.wait(futures) 40 | self.finish() 41 | 42 | def _save_single_url(self, url: str) -> None: 43 | with requests.get(url, stream=True) as response: 44 | response.raise_for_status() 45 | total_size = int(response.headers.get("content-length", 0)) 46 | fname = url.split("/")[-1] 47 | self.save_single_stream(f"{self.name}/{fname}", response, total_size) 48 | -------------------------------------------------------------------------------- /tests/model/test_hf_model.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock, patch 2 | 3 | import paka.model.hf_model 4 | from paka.model.hf_model import BaseMLModel, HuggingFaceModel 5 | 6 | 7 | def test_hf_model() -> None: 8 | with patch.object( 9 | paka.model.hf_model, "HfFileSystem", autospec=True 10 | ) as mock_hf_file_system, patch.object( 11 | BaseMLModel, 12 | "finish", 13 | return_value=MagicMock(), 14 | ) as finish_mock: 15 | model_store_mock = MagicMock() 16 | model = HuggingFaceModel( 17 | name="TestModel", 18 | repo_id="test-repo", 19 | files=["file1", "file2"], 20 | model_store=model_store_mock, 21 | quantization="GPTQ", 22 | ) 23 | 24 | mock_hf_file_system.return_value.glob.return_value = ["file1", "file2"] 25 | mock_hf_file_system.return_value.stat.return_value = { 26 | "size": 10, 27 | "lfs": {"sha256": "test_sha256"}, 28 | } 29 | mock_hf_file_system.return_value.open.return_value.__enter__.return_value = ( 30 | MagicMock() 31 | ) 32 | 33 | model.save() 34 | mock_hf_file_system.return_value.glob.assert_called() 35 | mock_hf_file_system.return_value.stat.assert_called() 36 | mock_hf_file_system.return_value.open.assert_called() 37 | model_store_mock.save_stream.assert_called() 38 | finish_mock.assert_called_once() 39 | 40 | model._save_single_file("file1") 41 | mock_hf_file_system.return_value.stat.assert_called_with("file1") 42 | 
mock_hf_file_system.return_value.open.assert_called_with("file1", "rb") 43 | model_store_mock.save_stream.assert_called() 44 | -------------------------------------------------------------------------------- /tests/policy_packs/aws/container_registry.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import List 4 | 5 | from pulumi_policy import ( 6 | Policy, 7 | ReportViolation, 8 | ResourceValidationArgs, 9 | ResourceValidationPolicy, 10 | StackValidationArgs, 11 | StackValidationPolicy, 12 | ) 13 | 14 | max_num_ecrs = 1 15 | 16 | 17 | def ecr_count_validator( 18 | stack: StackValidationArgs, report_violation: ReportViolation 19 | ) -> None: 20 | ecr_resources = filter( 21 | (lambda resource: resource.resource_type == "aws:ecr/repository:Repository"), 22 | stack.resources, 23 | ) 24 | 25 | ecrs = list(ecr_resources) 26 | if len(ecrs) > max_num_ecrs: 27 | report_violation( 28 | f"No more than {max_num_ecrs} repository(ies) should be created.", None 29 | ) 30 | 31 | 32 | ecr_count_check = StackValidationPolicy( 33 | name="ecr-count-check", 34 | description="Checks the number of ECR repositories created.", 35 | validate=ecr_count_validator, 36 | ) 37 | 38 | 39 | def ecr_force_delete_validator( 40 | args: ResourceValidationArgs, report_violation: ReportViolation 41 | ) -> None: 42 | if args.resource_type == "aws:ecr/repository:Repository": 43 | force_destroy = args.props["forceDelete"] 44 | if not force_destroy: 45 | report_violation( 46 | "You must set forceDelete to true. ", 47 | None, 48 | ) 49 | 50 | 51 | ecr_force_delete = ResourceValidationPolicy( 52 | name="ecr-force-delete", 53 | description="Requires forceDelete to be set to true.", 54 | validate=ecr_force_delete_validator, 55 | ) 56 | 57 | ecr_policies: List[Policy] = [ecr_count_check, ecr_force_delete] 58 | -------------------------------------------------------------------------------- /examples/website_rag/cluster.yaml: -------------------------------------------------------------------------------- 1 | version: "1.2" 2 | aws: 3 | cluster: 4 | name: website-rag 5 | region: us-west-2 6 | namespace: default 7 | nodeType: t3a.medium 8 | minNodes: 2 9 | maxNodes: 4 10 | vectorStore: 11 | nodeType: t3a.small 12 | replicas: 1 13 | prometheus: 14 | enabled: true 15 | mixedModelGroups: 16 | - name: gte-base 17 | nodeType: c7a.xlarge 18 | baseInstances: 0 19 | maxOnDemandInstances: 1 20 | spot: 21 | minInstances: 1 22 | maxInstances: 3 23 | runtime: 24 | image: ghcr.io/ggerganov/llama.cpp:server 25 | model: 26 | hfRepoId: jjleng/gte-base-gguf 27 | files: ["*.q4_0.gguf"] 28 | autoScaleTriggers: 29 | - type: cpu 30 | metadata: 31 | type: Utilization 32 | value: "50" 33 | - name: llama2-7b-chat 34 | nodeType: g4dn.xlarge 35 | gpu: 36 | enabled: true # This model group runs on GPU-enabled instances 37 | baseInstances: 0 38 | maxOnDemandInstances: 1 39 | spot: 40 | minInstances: 1 41 | maxInstances: 2 42 | runtime: 43 | image: vllm/vllm-openai:v0.4.2 44 | model: 45 | hfRepoId: TheBloke/Llama-2-7B-Chat-GPTQ 46 | autoScaleTriggers: 47 | - type: prometheus 48 | metadata: 49 | serverAddress: http://kube-prometheus-stack-prometheus.prometheus.svc.cluster.local:9090 # Prometheus endpoint 50 | metricName: latency_p95 51 | threshold: '20000' # Set to 20s, tune as needed 52 | query: | # Trigger scaling if p95 latency exceeds 20s 53 | histogram_quantile(0.95, 
sum(rate(istio_request_duration_milliseconds_bucket{destination_service="llama2-7b-chat.default.svc.cluster.local"}[5m])) by (le)) 54 | -------------------------------------------------------------------------------- /paka/cli/__main__.py: -------------------------------------------------------------------------------- 1 | import typer 2 | 3 | from paka import __version__ 4 | from paka.cli.build import build_app 5 | from paka.cli.cluster import cluster_app 6 | from paka.cli.function import function_app 7 | from paka.cli.job import job_app 8 | from paka.cli.kubeconfig import kube_app 9 | from paka.cli.model_group import model_group_app 10 | from paka.cli.run import run_app 11 | from paka.cli.utils import init_pulumi 12 | from paka.logger import setup_logger 13 | 14 | init_pulumi() 15 | 16 | 17 | def version_callback(version: bool) -> None: 18 | if version: 19 | typer.echo(f"Paka CLI Version: {__version__}") 20 | raise typer.Exit() 21 | 22 | 23 | def verbose_option( 24 | verbose: bool = typer.Option( 25 | False, "--verbose", "-v", help="Enable verbose output" 26 | ), 27 | ) -> None: 28 | setup_logger(verbose) 29 | 30 | 31 | cli = typer.Typer(context_settings={"help_option_names": ["-h", "--help"]}) 32 | cli.callback()(verbose_option) 33 | 34 | 35 | @cli.callback() 36 | def version_option( 37 | ctx: typer.Context, 38 | version: bool = typer.Option( 39 | False, "--version", help="Show version and exit", callback=version_callback 40 | ), 41 | ) -> None: 42 | pass 43 | 44 | 45 | cli.add_typer(cluster_app, name="cluster", help="Manage clusters.") 46 | 47 | cli.add_typer(job_app, name="job", help="Manage batch jobs.") 48 | 49 | cli.add_typer(build_app, name="build", help="Build Docker images.") 50 | 51 | cli.add_typer(kube_app, name="kubeconfig", help="Export kubeconfig.") 52 | 53 | cli.add_typer(run_app, name="run", help="Run one-off script.") 54 | 55 | cli.add_typer(function_app, name="function", help="Manage serverless functions.") 56 | 57 | cli.add_typer(model_group_app, name="model-group", help="Manage model groups.") 58 | 59 | 60 | def main() -> None: 61 | cli() 62 | 63 | 64 | if __name__ == "__main__": 65 | main() 66 | -------------------------------------------------------------------------------- /examples/templates/Llama2_7B_Chat_AWQ.yaml: -------------------------------------------------------------------------------- 1 | version: "1.2" 2 | aws: 3 | cluster: 4 | name: llama2-7b-chat-example # Use a name in lowercase letters with hyphens (kebab-case) 5 | region: us-west-2 6 | nodeType: t3a.medium 7 | minNodes: 2 8 | maxNodes: 4 # These nodes will host serverless functions and other essential loads 9 | prometheus: 10 | enabled: true # Enable metrics scraping with Prometheus 11 | mixedModelGroups: # A mixed model group can include both on-demand and spot nodes 12 | - name: llama2-7b-chat # Specify a name for the model group 13 | isPublic: true # Make the model group accessible through a public endpoint 14 | nodeType: g4dn.xlarge 15 | gpu: 16 | enabled: true # This model group runs on GPU-enabled instances 17 | baseInstances: 0 # Fail-safe instances, always run on-demand instances 18 | maxOnDemandInstances: 1 # Maximum number of on-demand instances, used as a fallback if spot instances are not available 19 | spot: 20 | minInstances: 1 21 | maxInstances: 2 # Prefer to run the inference backend on spot instances 22 | runtime: 23 | image: vllm/vllm-openai:v0.4.2 # Use vLLM backend 24 | model: 25 | hfRepoId: TheBloke/Llama-2-7B-Chat-AWQ # Specify the Hugging Face model to run 26 | 
useModelStore: false # Don't save models to s3 27 | autoScaleTriggers: 28 | - type: prometheus 29 | metadata: 30 | serverAddress: http://kube-prometheus-stack-prometheus.prometheus.svc.cluster.local:9090 # Prometheus endpoint 31 | metricName: latency_p95 32 | threshold: '20000' # Set to 20s, tune as needed 33 | query: | # Trigger scaling if p95 latency exceeds 20s 34 | histogram_quantile(0.95, sum(rate(istio_request_duration_milliseconds_bucket{destination_service="llama2-7b-chat.default.svc.cluster.local"}[5m])) by (le)) 35 | -------------------------------------------------------------------------------- /examples/templates/Llama2_7B_Chat_GPTQ.yaml: -------------------------------------------------------------------------------- 1 | version: "1.2" 2 | aws: 3 | cluster: 4 | name: llama2-7b-chat-example # Use a name in lowercase letters with hyphens (kebab-case) 5 | region: us-west-2 6 | nodeType: t3a.medium 7 | minNodes: 2 8 | maxNodes: 4 # These nodes will host serverless functions and other essential loads 9 | prometheus: 10 | enabled: true # Enable metrics scraping with Prometheus 11 | mixedModelGroups: # A mixed model group can include both on-demand and spot nodes 12 | - name: llama2-7b-chat # Specify a name for the model group 13 | isPublic: true # Make the model group accessible through a public endpoint 14 | nodeType: g4dn.xlarge 15 | gpu: 16 | enabled: true # This model group runs on GPU-enabled instances 17 | baseInstances: 0 # Fail-safe instances, always run on-demand instances 18 | maxOnDemandInstances: 1 # Maximum number of on-demand instances, used as a fallback if spot instances are not available 19 | spot: 20 | minInstances: 1 21 | maxInstances: 2 # Prefer to run the inference backend on spot instances 22 | runtime: 23 | image: vllm/vllm-openai:v0.4.2 # Use vLLM backend 24 | model: 25 | hfRepoId: TheBloke/Llama-2-7B-Chat-GPTQ # Specify the Hugging Face model to run 26 | useModelStore: false # Don't save models to s3 27 | autoScaleTriggers: 28 | - type: prometheus 29 | metadata: 30 | serverAddress: http://kube-prometheus-stack-prometheus.prometheus.svc.cluster.local:9090 # Prometheus endpoint 31 | metricName: latency_p95 32 | threshold: '20000' # Set to 20s, tune as needed 33 | query: | # Trigger scaling if p95 latency exceeds 20s 34 | histogram_quantile(0.95, sum(rate(istio_request_duration_milliseconds_bucket{destination_service="llama2-7b-chat.default.svc.cluster.local"}[5m])) by (le)) 35 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "paka" 3 | version = "0.1.11" 4 | description = "LLMOps tool designed to simplify the deployment and management of large language model (LLM) applications" 5 | homepage = "https://github.com/jjleng/paka" 6 | keywords = ["LLMOps", "RAG", "production", "Cloud"] 7 | authors = ["Jijun Leng"] 8 | readme = "README.md" 9 | 10 | [tool.codespell] 11 | check-filenames = true 12 | 13 | [tool.mypy] 14 | ignore_missing_imports = true 15 | disallow_untyped_defs = true 16 | check_untyped_defs = true 17 | plugins = "pydantic.mypy" 18 | warn_unused_configs = true 19 | 20 | [tool.isort] 21 | profile = "black" 22 | 23 | [tool.pytest.ini_options] 24 | filterwarnings = ["ignore::DeprecationWarning"] 25 | 26 | [tool.poetry.scripts] 27 | paka = "paka.cli.__main__:main" 28 | 29 | [tool.poetry.dependencies] 30 | python = "^3.8" 31 | pydantic = "^2.7.0" 32 | ruamel-yaml = "^0.18.6" 33 | pulumi = "3.105.0" 
34 | pulumi-aws = "^6.31.0" 35 | typer = "^0.12.3" 36 | pulumi-eks = "^2.3.0" 37 | pulumi-awsx = "^2.7.0" 38 | pulumi-kubernetes = "^4.8.1" 39 | pathspec = "^0.12.1" 40 | requests = "^2.31.0" 41 | kubernetes = "^29.0.0" 42 | boto3 = "^1.34.86" 43 | tabulate = "^0.9.0" 44 | huggingface-hub = "^0.22.2" 45 | tqdm = "^4.66.2" 46 | typing-extensions = "^4.11.0" 47 | fasteners = "^0.19" 48 | tenacity = "^8.2.3" 49 | 50 | [tool.poetry.group.dev.dependencies] 51 | codespell = "^2.2.6" 52 | mypy = "^1.9.0" 53 | pre-commit = "3.5.0" 54 | pytest = "^8.1.1" 55 | pytest-snapshot = "^0.9.0" 56 | types-requests = "2.31.0.6" 57 | isort = "^5.13.2" 58 | types-tabulate = "^0.9.0.20240106" 59 | pulumi-policy = "^1.11.0" 60 | moto = "^5.0.5" 61 | boto3-stubs = { extras = ["ec2", "ecr", "s3"], version = "^1.34.106" } 62 | types-tqdm = "^4.66.0.20240417" 63 | pytest-order = "^1.2.1" 64 | kubernetes-stubs-elephant-fork = "^29.0.0.post1" 65 | 66 | [build-system] 67 | requires = ["poetry-core"] 68 | build-backend = "poetry.core.masonry.api" 69 | -------------------------------------------------------------------------------- /tests/k8s/model_group/runtime/test_vllm.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock, patch 2 | 3 | import paka.k8s.model_group.runtime.vllm 4 | from paka.cluster.context import Context 5 | from paka.config import AwsModelGroup, Model, Runtime 6 | from paka.k8s.model_group.runtime.vllm import get_runtime_command_vllm, is_vllm_image 7 | 8 | 9 | def test_is_vllm_image() -> None: 10 | assert is_vllm_image("vllm:latest") == True 11 | assert is_vllm_image("notvllm:latest") == False 12 | 13 | 14 | def test_get_runtime_command_vllm() -> None: 15 | mock_store = MagicMock() 16 | with patch.object( 17 | paka.k8s.model_group.runtime.vllm, 18 | "get_model_store", 19 | return_value=mock_store, 20 | ) as mock_get_model_store, patch.object( 21 | paka.k8s.model_group.runtime.vllm, 22 | "validate_repo_id", 23 | return_value=True, 24 | ) as mock_validate_repo_id: 25 | ctx = Context() 26 | model_group = AwsModelGroup( 27 | name="test", 28 | minInstances=1, 29 | maxInstances=2, 30 | nodeType="t2.micro", 31 | runtime=Runtime(image="vllm:latest", command=["python", "app.py"]), 32 | model=Model(useModelStore=True), 33 | resourceRequest={"cpu": "1000", "memory": "1Gi"}, 34 | ) 35 | 36 | command = get_runtime_command_vllm(ctx, model_group) 37 | assert command == ["python", "app.py", "--model", "/data"] 38 | 39 | model_group.runtime.command = None 40 | command = get_runtime_command_vllm(ctx, model_group) 41 | assert command == [ 42 | "python3", 43 | "-O", 44 | "-u", 45 | "-m", 46 | "vllm.entrypoints.openai.api_server", 47 | "--host", 48 | "0.0.0.0", 49 | "--served-model-name", 50 | "test", 51 | "--model", 52 | "/data", 53 | ] 54 | -------------------------------------------------------------------------------- /examples/templates/Llama3_70B_Instruct_GPTQ.yaml: -------------------------------------------------------------------------------- 1 | version: "1.2" 2 | aws: 3 | cluster: 4 | name: llama3-70b-instruct-example # Use a name in lowercase letters with hyphens (kebab-case) 5 | region: us-west-2 6 | nodeType: t3a.medium 7 | minNodes: 2 8 | maxNodes: 4 # These nodes will host serverless functions and other essential loads 9 | prometheus: 10 | enabled: true # Enable metrics scraping with Prometheus 11 | mixedModelGroups: # A mixed model group can include both on-demand and spot nodes 12 | - name: llama3-70b-instruct # Specify a name for 
the model group 13 | isPublic: true # Make the model group accessible through a public endpoint 14 | nodeType: g4dn.12xlarge 15 | gpu: 16 | enabled: true # This model group runs on GPU-enabled instances 17 | diskSize: 80 18 | baseInstances: 0 # Fail-safe instances, always run on-demand instances 19 | maxOnDemandInstances: 1 # Maximum number of on-demand instances, used as a fallback if spot instances are not available 20 | spot: 21 | minInstances: 1 22 | maxInstances: 2 # Prefer to run the inference backend on spot instances 23 | runtime: 24 | image: vllm/vllm-openai:v0.4.2 # Use vLLM backend 25 | model: 26 | hfRepoId: TechxGenus/Meta-Llama-3-70B-Instruct-GPTQ # Specify the Hugging Face model to run 27 | useModelStore: false # Don't save models to s3 28 | autoScaleTriggers: 29 | - type: prometheus 30 | metadata: 31 | serverAddress: http://kube-prometheus-stack-prometheus.prometheus.svc.cluster.local:9090 # Prometheus endpoint 32 | metricName: latency_p95 33 | threshold: '20000' # Set to 20s, tune as needed 34 | query: | # Trigger scaling if p95 latency exceeds 20s 35 | histogram_quantile(0.95, sum(rate(istio_request_duration_milliseconds_bucket{destination_service="llama3-70b-instruct.default.svc.cluster.local"}[5m])) by (le)) 36 | -------------------------------------------------------------------------------- /paka/cluster/aws/elb.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | from typing import Optional 5 | 6 | import boto3 7 | from kubernetes import client, config 8 | 9 | 10 | # Pulumi cannot update the idle timeout of an ELB. This script uses boto3 to 11 | # update the idle timeout of an ELB. 12 | def _update_elb_idle_timeout( 13 | load_balancer_name: str, idle_timeout_seconds: int 14 | ) -> None: 15 | elb_client = boto3.client("elb") 16 | 17 | attributes = { 18 | "LoadBalancerAttributes": { 19 | "ConnectionSettings": {"IdleTimeout": idle_timeout_seconds} 20 | } 21 | } 22 | 23 | elb_client.modify_load_balancer_attributes( 24 | LoadBalancerName=load_balancer_name, 25 | LoadBalancerAttributes=attributes["LoadBalancerAttributes"], 26 | ) 27 | 28 | 29 | def update_elb_idle_timeout(kubeconfig_json: str, idle_timeout_seconds: int) -> None: 30 | elb_name = get_elb_name(kubeconfig_json) 31 | 32 | if elb_name: 33 | _update_elb_idle_timeout(elb_name, idle_timeout_seconds) 34 | 35 | 36 | def get_elb_name(kubeconfig_json: str) -> Optional[str]: 37 | config.load_kube_config_from_dict(json.loads(kubeconfig_json)) 38 | 39 | v1 = client.CoreV1Api() 40 | services = v1.list_service_for_all_namespaces(watch=False) 41 | 42 | for service in services.items: 43 | if service.spec and service.spec.type == "LoadBalancer": 44 | # The name of the ELB is the first part of the hostname of the load balancer 45 | if ( 46 | service.status 47 | and service.status.load_balancer 48 | and service.status.load_balancer.ingress 49 | ): 50 | elb_hostname = service.status.load_balancer.ingress[0].hostname 51 | if not elb_hostname: 52 | continue 53 | elb_name = elb_hostname.split("-")[0] 54 | return elb_name 55 | 56 | return None 57 | -------------------------------------------------------------------------------- /examples/templates/Mistral_7B_Instruct_GPTQ.yaml: -------------------------------------------------------------------------------- 1 | version: "1.2" 2 | aws: 3 | cluster: 4 | name: mistral-7b-instruct-example # Use a name in lowercase letters with hyphens (kebab-case) 5 | region: us-west-2 6 | nodeType: t3a.medium 7 
| minNodes: 2 8 | maxNodes: 4 # These nodes will host serverless functions and other essential loads 9 | prometheus: 10 | enabled: true # Enable metrics scraping with Prometheus 11 | mixedModelGroups: # A mixed model group can include both on-demand and spot nodes 12 | - name: mistral-7b-instruct # Specify a name for the model group 13 | isPublic: true # Make the model group accessible through a public endpoint 14 | nodeType: g5g.xlarge 15 | gpu: # This would enable inference on CUDA devices 16 | enabled: true # This model group runs on GPU-enabled instances 17 | diskSize: 40 # 40GB 18 | baseInstances: 0 # Fail-safe instances, always run on-demand instances 19 | maxOnDemandInstances: 1 # Maximum number of on-demand instances, used as a fallback if spot instances are not available 20 | spot: 21 | minInstances: 1 22 | maxInstances: 2 # Prefer to run the inference backend on spot instances 23 | runtime: 24 | image: vllm/vllm-openai:v0.4.2 # Use vLLM backend 25 | model: 26 | hfRepoId: neuralmagic/Mistral-7B-Instruct-v0.3-GPTQ-4bit # Specify the Hugging Face model to run 27 | useModelStore: false # Don't save models to s3 28 | autoScaleTriggers: 29 | - type: prometheus 30 | metadata: 31 | serverAddress: http://kube-prometheus-stack-prometheus.prometheus.svc.cluster.local:9090 # Prometheus endpoint 32 | metricName: latency_p95 33 | threshold: '20000' # Set to 20s, tune as needed 34 | query: | # Trigger scaling if p95 latency exceeds 20s 35 | histogram_quantile(0.95, sum(rate(istio_request_duration_milliseconds_bucket{destination_service="mistral-7b-instruct.default.svc.cluster.local"}[5m])) by (le)) 36 | -------------------------------------------------------------------------------- /examples/templates/Llama3_70B_Instruct.yaml: -------------------------------------------------------------------------------- 1 | version: "1.2" 2 | aws: 3 | cluster: 4 | name: llama3-70b-instruct-example # Use a name in lowercase letters with hyphens (kebab-case) 5 | region: us-west-2 6 | nodeType: t3a.medium 7 | minNodes: 2 8 | maxNodes: 4 # These nodes will host serverless functions and other essential loads 9 | prometheus: 10 | enabled: true 11 | mixedModelGroups: # A mixed model group can include both on-demand and spot nodes 12 | - name: llama3-70b-instruct # Specify a name for the model group 13 | isPublic: true # Make the model group accessible through a public endpoint 14 | nodeType: g5.48xlarge 15 | gpu: 16 | enabled: true # This model group runs on GPU-enabled instances 17 | diskSize: 200 # 200GB 18 | baseInstances: 0 # Fail-safe instances, always run on-demand instances 19 | maxOnDemandInstances: 1 # Maximum number of on-demand instances, used as a fallback if spot instances are not available 20 | spot: 21 | minInstances: 1 22 | maxInstances: 2 # Prefer to run the inference backend on spot instances 23 | runtime: 24 | image: vllm/vllm-openai:v0.4.2 # Use vLLM backend 25 | env: 26 | - name: HF_TOKEN # Required to download model weights from a gated Hugging Face repo 27 | value: 28 | model: 29 | hfRepoId: meta-llama/Meta-Llama-3-70B-Instruct # Specify the Hugging Face model to run 30 | useModelStore: false # Don't save models to s3 31 | autoScaleTriggers: 32 | - type: prometheus 33 | metadata: 34 | serverAddress: http://kube-prometheus-stack-prometheus.prometheus.svc.cluster.local:9090 # Prometheus endpoint 35 | metricName: latency_p95 36 | threshold: '20000' # Set to 20s, tune as needed 37 | query: | # Trigger scaling if p95 latency exceeds 20s 38 | histogram_quantile(0.95, 
sum(rate(istio_request_duration_milliseconds_bucket{destination_service="llama3-70b-instruct.default.svc.cluster.local"}[5m])) by (le)) 39 | -------------------------------------------------------------------------------- /examples/templates/Llama3_8B_Instruct.yaml: -------------------------------------------------------------------------------- 1 | version: "1.2" 2 | aws: 3 | cluster: 4 | name: llama3-8b-instruct-example # Use a name in lowercase letters with hyphens (kebab-case) 5 | region: us-west-2 6 | nodeType: t3a.medium 7 | minNodes: 2 8 | maxNodes: 4 # These nodes will host serverless functions and other essential loads 9 | prometheus: 10 | enabled: true # Enable metrics scraping with Prometheus 11 | tracing: 12 | enabled: false 13 | mixedModelGroups: # A mixed model group can include both on-demand and spot nodes 14 | - name: llama3-8b-instruct # Specify a name for the model group 15 | isPublic: true # Make the model group accessible through a public endpoint 16 | nodeType: g6.xlarge 17 | gpu: 18 | enabled: true # This model group runs on GPU-enabled instances 19 | baseInstances: 0 # Fail-safe instances, always run on-demand instances 20 | maxOnDemandInstances: 1 # Maximum number of on-demand instances, used as a fallback if spot instances are not available 21 | spot: 22 | minInstances: 1 23 | maxInstances: 2 # Prefer to run the inference backend on spot instances 24 | runtime: 25 | image: vllm/vllm-openai:v0.4.2 # Use vLLM backend 26 | env: 27 | - name: HF_TOKEN # Required to download model weights from a gated Hugging Face repo 28 | value: 29 | model: 30 | hfRepoId: meta-llama/Meta-Llama-3-8B-Instruct # Specify the Hugging Face model to run 31 | useModelStore: false # Don't save models to s3 32 | autoScaleTriggers: 33 | - type: prometheus 34 | metadata: 35 | serverAddress: http://kube-prometheus-stack-prometheus.prometheus.svc.cluster.local:9090 # Prometheus endpoint 36 | metricName: latency_p95 37 | threshold: '20000' # Set to 20s, tune as needed 38 | query: | # Trigger scaling if p95 latency exceeds 20s 39 | histogram_quantile(0.95, sum(rate(istio_request_duration_milliseconds_bucket{destination_service="llama3-8b-instruct.default.svc.cluster.local"}[5m])) by (le)) 40 | -------------------------------------------------------------------------------- /examples/templates/Phi3_Mini_4K_Instruct.yaml: -------------------------------------------------------------------------------- 1 | version: "1.2" 2 | aws: 3 | cluster: 4 | name: phi3-mini-instruct-example # Use a name in lowercase letters with hyphens (kebab-case) 5 | region: us-west-2 6 | nodeType: t3a.medium 7 | minNodes: 2 8 | maxNodes: 4 # These nodes will host serverless functions and other essential loads 9 | prometheus: 10 | enabled: true # Enable metrics scraping with Prometheus 11 | mixedModelGroups: # A mixed model group can include both on-demand and spot nodes 12 | - name: phi3-mini-instruct # Specify a name for the model group 13 | isPublic: true # Make the model group accessible through a public endpoint 14 | nodeType: g5g.xlarge 15 | gpu: # This would enable inference on CUDA devices 16 | enabled: true # This model group runs on GPU-enabled instances 17 | baseInstances: 0 # Fail-safe instances, always run on-demand instances 18 | maxOnDemandInstances: 1 # Maximum number of on-demand instances, used as a fallback if spot instances are not available 19 | spot: 20 | minInstances: 1 21 | maxInstances: 2 # Prefer to run the inference backend on spot instances 22 | runtime: 23 | image: vllm/vllm-openai:v0.4.2 # Use vLLM 
backend 24 | env: 25 | - name: HF_TOKEN # Required to download model weights from a gated Hugging Face repo 26 | value: 27 | model: 28 | hfRepoId: microsoft/Phi-3-mini-4k-instruct # Specify the Hugging Face model to run 29 | useModelStore: false # Don't save models to s3 30 | autoScaleTriggers: 31 | - type: prometheus 32 | metadata: 33 | serverAddress: http://kube-prometheus-stack-prometheus.prometheus.svc.cluster.local:9090 # Prometheus endpoint 34 | metricName: latency_p95 35 | threshold: '20000' # Set to 20s, tune as needed 36 | query: | # Trigger scaling if p95 latency exceeds 20s 37 | histogram_quantile(0.95, sum(rate(istio_request_duration_milliseconds_bucket{destination_service="phi3-mini-instruct.default.svc.cluster.local"}[5m])) by (le)) 38 | -------------------------------------------------------------------------------- /examples/templates/Mistral_7B_Instruct.yaml: -------------------------------------------------------------------------------- 1 | version: "1.2" 2 | aws: 3 | cluster: 4 | name: mistral-7b-instruct-example # Use a name in lowercase letters with hyphens (kebab-case) 5 | region: us-west-2 6 | nodeType: t3a.medium 7 | minNodes: 2 8 | maxNodes: 4 # These nodes will host serverless functions and other essential loads 9 | prometheus: 10 | enabled: true # Enable metrics scraping with Prometheus 11 | mixedModelGroups: # A mixed model group can include both on-demand and spot nodes 12 | - name: mistral-7b-instruct # Specify a name for the model group 13 | isPublic: true # Make the model group accessible through a public endpoint 14 | nodeType: g4dn.xlarge 15 | gpu: # This would enable inference on CUDA devices 16 | enabled: true # This model group runs on GPU-enabled instances 17 | diskSize: 50 18 | baseInstances: 0 # Fail-safe instances, always run on-demand instances 19 | maxOnDemandInstances: 1 # Maximum number of on-demand instances, used as a fallback if spot instances are not available 20 | spot: 21 | minInstances: 1 22 | maxInstances: 2 # Prefer to run the inference backend on spot instances 23 | runtime: 24 | image: vllm/vllm-openai:v0.4.2 # Use vLLM backend 25 | env: 26 | - name: HF_TOKEN # Required to download model weights from a gated Hugging Face repo 27 | value: 28 | model: 29 | hfRepoId: mistralai/Mistral-7B-Instruct-v0.3 # Specify the Hugging Face model to run 30 | useModelStore: false # Don't save models to s3 31 | autoScaleTriggers: 32 | - type: prometheus 33 | metadata: 34 | serverAddress: http://kube-prometheus-stack-prometheus.prometheus.svc.cluster.local:9090 # Prometheus endpoint 35 | metricName: latency_p95 36 | threshold: '20000' # Set to 20s, tune as needed 37 | query: | # Trigger scaling if p95 latency exceeds 20s 38 | histogram_quantile(0.95, sum(rate(istio_request_duration_milliseconds_bucket{destination_service="mistral-7b-instruct.default.svc.cluster.local"}[5m])) by (le)) 39 | -------------------------------------------------------------------------------- /examples/invoice_extraction/README.md: -------------------------------------------------------------------------------- 1 | ## Invoice Extraction 2 | This code provides an example of how to build a RESTful API that converts an invoice PDF into a structured data format (JSON). It extracts text from the PDF and then uses LangChain and Llama2-7B to extract structured data from the text. 3 | 4 | ## Running the Example 5 | 6 | Follow the steps below to run the example: 7 | 8 | 1.
**Install the necessary dependencies:** 9 | ```bash 10 | pip install paka 11 | 12 | # Install AWS CLI and ensure your AWS credentials are correctly configured. 13 | aws configure 14 | ``` 15 | 16 | 2. **Ensure the Docker daemon is running:** 17 | ```bash 18 | docker info 19 | ``` 20 | 21 | 3. **Provision the cluster:** 22 | ```bash 23 | cd examples/invoice_extraction 24 | 25 | # Provision a CPU-only cluster and update ~/.kube/config 26 | paka cluster up -f cluster_cpu.yaml 27 | 28 | # Provision a cluster with Nvidia GPUs 29 | paka cluster up -f cluster.yaml 30 | ``` 31 | 32 | 4. **Deploy the App:** 33 | ```bash 34 | # The command below will build the source and deploy it as a serverless function. 35 | paka function deploy --name invoice-extraction --source . --entrypoint serve 36 | ``` 37 | 38 | 5. **Check the status of the functions:** 39 | ```bash 40 | paka function list 41 | ``` 42 | 43 | If everything is successful, you should see the function in the list with a status of "READY". By default, the function is exposed through a publicly accessible REST API endpoint. 44 | 45 | 6. **Test the App:** 46 | 47 | Submit the PDF invoices by hitting the `/extract_invoice` endpoint of the deployed function. 48 | 49 | ```bash 50 | curl -X POST -H "Content-Type: multipart/form-data" -F "file=@/path/to/invoices/invoice-2024-02-29.pdf" http://invoice-extraction.default.xxxx.sslip.io/extract_invoice 51 | ``` 52 | 53 | If the invoice extraction is successful, you should see the structured data in the response, e.g. 54 | 55 | ```json 56 | {"number":"#25927345","date":"2024-01-31T05:07:53","company":"Akamai Technologies, Inc.","company_address":"249 Arch St. Philadelphia, PA 19106 USA","tax_id":"United States EIN: 04-3432319","customer":"John Doe","customer_address":"1 Hacker Way Menlo Park, CA 94025","amount":"$5.00"} 57 | ``` 58 | -------------------------------------------------------------------------------- /paka/k8s/model_group/runtime/vllm.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import re 4 | import shlex 5 | from typing import List 6 | 7 | from huggingface_hub.utils import validate_repo_id 8 | 9 | from paka.cluster.context import Context 10 | from paka.cluster.utils import get_model_store 11 | from paka.config import CloudModelGroup 12 | from paka.constants import MODEL_MOUNT_PATH 13 | from paka.k8s.utils import get_gpu_count 14 | 15 | 16 | # Heuristic to determine if the image is a vLLM image 17 | def is_vllm_image(image: str) -> bool: 18 | return image.lower().startswith("vllm") 19 | 20 | 21 | def get_runtime_command_vllm(ctx: Context, model_group: CloudModelGroup) -> List[str]: 22 | runtime = model_group.runtime 23 | if runtime.command: 24 | command_str = " ".join(runtime.command) if runtime.command else "" 25 | if re.search(r"(--model)[ \t]*\S+", command_str): 26 | return runtime.command 27 | 28 | if model_group.model: 29 | if model_group.model.useModelStore: 30 | store = get_model_store(ctx, with_progress_bar=False) 31 | if not store.glob(f"{model_group.name}/*"): 32 | raise ValueError( 33 | f"No model named {model_group.name} was found in the model store."
34 | ) 35 | model_to_load = f"{MODEL_MOUNT_PATH}" 36 | elif model_group.model.hfRepoId: 37 | validate_repo_id(model_group.model.hfRepoId) 38 | model_to_load = model_group.model.hfRepoId 39 | else: 40 | raise ValueError("Did not find a model to load.") 41 | 42 | def attach_model_to_command(command: List[str]) -> List[str]: 43 | return command + ["--model", model_to_load] 44 | 45 | if runtime.command: 46 | return attach_model_to_command(runtime.command) 47 | 48 | command = shlex.split( 49 | f"python3 -O -u -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --served-model-name {model_group.name}" 50 | ) 51 | 52 | gpu_count = get_gpu_count(ctx, model_group) 53 | 54 | if gpu_count > 1: 55 | command += ["--tensor-parallel-size", str(gpu_count)] 56 | 57 | return attach_model_to_command(command) 58 | -------------------------------------------------------------------------------- /tests/policy_packs/aws/eks.py: -------------------------------------------------------------------------------- 1 | from pulumi_policy import ( 2 | ReportViolation, 3 | ResourceValidationArgs, 4 | ResourceValidationPolicy, 5 | ) 6 | 7 | 8 | def model_group_validator( 9 | args: ResourceValidationArgs, report_violation: ReportViolation 10 | ) -> None: 11 | if args.resource_type == "aws:eks/nodeGroup:NodeGroup": 12 | instance_type = args.props["instanceTypes"][0] 13 | 14 | if instance_type == "c7a.xlarge": 15 | if "taints" in args.props: 16 | taints = args.props["taints"] 17 | 18 | # Verify that taint {key: app, value: model-group, effect: NoSchedule} exists 19 | exists = False 20 | for i in range(len(taints)): 21 | taint = taints[i] 22 | if ( 23 | taint["key"] == "app" 24 | and taint["value"] == "model-group" 25 | and taint["effect"] == "NO_SCHEDULE" 26 | ): 27 | exists = True 28 | if not exists: 29 | report_violation( 30 | "Taint {key: app, value: model-group, effect: NoSchedule} is not set for model-group node group.", 31 | None, 32 | ) 33 | 34 | # Verify that taint {key: model, value: llama2-7b, effect: NoSchedule} exists 35 | exists = False 36 | for i in range(len(taints)): 37 | taint = taints[i] 38 | if ( 39 | taint["key"] == "model" 40 | and taint["value"] == "llama2-7b" 41 | and taint["effect"] == "NO_SCHEDULE" 42 | ): 43 | exists = True 44 | if not exists: 45 | report_violation( 46 | "Taint {key: model, value: llama2-7b, effect: NoSchedule} is not set for model-group node group.", 47 | None, 48 | ) 49 | 50 | 51 | model_group_taints = ResourceValidationPolicy( 52 | name="model-group-taints", 53 | description="Model group should have taints.", 54 | validate=model_group_validator, 55 | ) 56 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # PyCharm 7 | .idea/ 8 | 9 | # VS Code 10 | .vscode/ 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | cover/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | db.sqlite3-journal 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | docs/_output/ 80 | 81 | # PyBuilder 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # celery beat schedule file 91 | celerybeat-schedule 92 | 93 | # SageMath parsed files 94 | *.sage.py 95 | 96 | # dotenv 97 | .env 98 | .env.* 99 | 100 | # virtualenv 101 | .venv/ 102 | venv/ 103 | ENV/ 104 | env/ 105 | bin/ 106 | pyvenv.cfg 107 | Pipfile.lock 108 | 109 | # Spyder project settings 110 | .spyderproject 111 | .spyproject 112 | 113 | # Rope project settings 114 | .ropeproject 115 | 116 | # mkdocs documentation 117 | /site 118 | 119 | # mypy 120 | .mypy_cache/ 121 | .dmypy.json 122 | dmypy.json 123 | 124 | # Pyre type checker 125 | .pyre/ 126 | 127 | # pytype static type analyzer 128 | .pytype/ 129 | 130 | # Cython debug symbols 131 | cython_debug/ 132 | 133 | 134 | # Poetry specific 135 | .pypoetry-cache 136 | 137 | # Jupyter Notebook 138 | .ipynb_checkpoints 139 | 140 | # pytest-kind 141 | .pytest-kind 142 | -------------------------------------------------------------------------------- /tests/model/test_store.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import io 3 | 4 | import boto3 5 | import pytest 6 | from botocore.exceptions import ClientError 7 | from moto import mock_aws 8 | 9 | from paka.model.store import MODEL_PATH_PREFIX, S3ModelStore 10 | 11 | 12 | @mock_aws 13 | def test_s3_model_store_save() -> None: 14 | conn = boto3.resource("s3", region_name="us-east-1") 15 | conn.create_bucket(Bucket="mybucket") 16 | 17 | store = S3ModelStore("mybucket") 18 | 19 | store.save("test.txt", b"Test data") 20 | 21 | body = conn.Object("mybucket", f"{MODEL_PATH_PREFIX}/test.txt").get()["Body"].read() 22 | assert body == b"Test data" 23 | 24 | 25 | @mock_aws 26 | def test_save_stream() -> None: 27 | conn = boto3.resource("s3", region_name="us-east-1") 28 | conn.create_bucket(Bucket="mybucket") 29 | 30 | store = S3ModelStore("mybucket") 31 | 32 | data = b"Test data" 33 | sha256_hash = hashlib.sha256(data).hexdigest() 34 | stream = io.BytesIO(data) 35 | store.save_stream("test.txt", stream, len(stream.getvalue()), sha256_hash) 36 | 37 | body = conn.Object("mybucket", f"{MODEL_PATH_PREFIX}/test.txt").get()["Body"].read() 38 | assert body == b"Test data" 39 | 40 | with pytest.raises( 41 | Exception, 42 | match="SHA256 hash of the downloaded file does not match the expected value", 43 | ): 44 | stream = io.BytesIO(data) 45 | store.save_stream("test_2.txt", stream, len(stream.getvalue()), "invalid_hash") 46 | 47 | try: 48 | conn.Object("mybucket", f"{MODEL_PATH_PREFIX}/test_2.txt").load() 49 | except ClientError as e: 50 | if e.response["Error"]["Code"] == "404": 51 | file_exists = False 52 | else: 53 | raise 54 | else: 55 | file_exists = True 56 | 57 | assert not file_exists 58 | 59 | 60 | @mock_aws 61 | def test_file_exists() -> None:
conn = boto3.resource("s3", region_name="us-east-1") 63 | conn.create_bucket(Bucket="mybucket") 64 | 65 | store = S3ModelStore("mybucket") 66 | 67 | assert not store.file_exists("test.txt") 68 | 69 | conn.Object("mybucket", f"{MODEL_PATH_PREFIX}/test.txt").put(Body=b"Test data") 70 | 71 | assert store.file_exists("test.txt") 72 | 73 | assert store.file_exists("test", prefix_match=True) 74 | assert not store.file_exists("nonexistent", prefix_match=True) 75 | -------------------------------------------------------------------------------- /paka/model/settings.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Optional 4 | 5 | from pydantic import BaseModel, Field, field_validator 6 | 7 | 8 | class ModelSettings(BaseModel): 9 | quantization: Optional[str] = Field( 10 | None, description="The quantization method (GPTQ, AWQ, GGUF_Q4_0, etc) to use." 11 | ) 12 | prompt_template_name: Optional[str] = Field( 13 | None, description="The prompt template (chatml, llama-2, gemma, etc) to use." 14 | ) 15 | prompt_template_str: Optional[str] = Field( 16 | None, description="The prompt template string to use." 17 | ) 18 | 19 | @field_validator("quantization") 20 | def validate_quantization(cls, v: Optional[str]) -> Optional[str]: 21 | if v is None: 22 | return v 23 | 24 | valid_methods = [ 25 | "GPTQ", 26 | "AWQ", 27 | "GGUF_Q2_K", 28 | "GGUF_Q3_K_L", 29 | "GGUF_Q3_K_M", 30 | "GGUF_Q3_K_S", 31 | "GGUF_Q4_0", 32 | "GGUF_Q4_K_M", 33 | "GGUF_Q4_K_S", 34 | "GGUF_Q5_0", 35 | "GGUF_Q5_K_M", 36 | "GGUF_Q5_K_S", 37 | "GGUF_Q6_K", 38 | "GGUF_Q8_0", 39 | "GGUF_fp16", 40 | "GGUF_fp32", 41 | ] 42 | if v not in valid_methods: 43 | raise ValueError("Invalid quantization method") 44 | return v 45 | 46 | @field_validator("prompt_template_name") 47 | def validate_prompt_template_name(cls, v: Optional[str]) -> Optional[str]: 48 | valid_templates = [ 49 | "chatml", 50 | "llama-2", 51 | "gemma", 52 | "alpaca", 53 | "qwen", 54 | "vicuna", 55 | "oasst_llama", 56 | "baichuan-2", 57 | "baichuan", 58 | "openbuddy", 59 | "redpajama-incite", 60 | "snoozy", 61 | "phind", 62 | "intel", 63 | "open-orca", 64 | "mistrallite", 65 | "zephyr", 66 | "pygmalion", 67 | "mistral-instruct", 68 | "chatglm3", 69 | "openchat", 70 | "saiga", 71 | "codellama", 72 | ] 73 | if v is not None and v not in valid_templates: 74 | raise ValueError("Invalid prompt template name") 75 | return v 76 | -------------------------------------------------------------------------------- /paka/cli/build.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | from typing import Optional 5 | 6 | import typer 7 | 8 | from paka.cli.utils import build_and_push, ensure_cluster_name, push_to_ecr 9 | from paka.utils import read_pulumi_stack 10 | 11 | build_app = typer.Typer() 12 | 13 | 14 | @build_app.command() 15 | def build_image( 16 | cluster_name: Optional[str] = typer.Option( 17 | os.getenv("PAKA_CURRENT_CLUSTER"), 18 | "--cluster", 19 | "-c", 20 | help="The name of the cluster.", 21 | ), 22 | source_dir: str = typer.Argument( 23 | ..., 24 | help="Source directory of the application.", 25 | ), 26 | image_name: str = typer.Option( 27 | "", 28 | "--image-name", 29 | help="Provide a custom name for the Docker image.
If omitted, " 30 | "the base name of the source code directory will be used as the image name.", 31 | ), 32 | ) -> None: 33 | """ 34 | Build a Docker image from the application in the specified source directory. 35 | 36 | The source directory must contain a Procfile and a .cnignore file. The Procfile 37 | defines the commands to run for the application. The .cnignore file defines the 38 | files and directories to exclude from the image. Once the image is built, 39 | it will be pushed to the container repository of the current cluster. 40 | 41 | A Dockerfile is NOT required. The image will be built using Cloud Native Buildpacks. 42 | In-cluster builds are not supported yet; the user's machine must have Docker installed. 43 | """ 44 | build_and_push(cluster_name, source_dir, image_name) 45 | 46 | 47 | @build_app.command() 48 | def push_image( 49 | cluster_name: Optional[str] = typer.Option( 50 | os.getenv("PAKA_CURRENT_CLUSTER"), 51 | "--cluster", 52 | "-c", 53 | help="The name of the cluster.", 54 | ), 55 | image_name: str = typer.Option( 56 | "", 57 | "--image-name", 58 | help="Name of the pre-built Docker image. If image tag is not provided, 'latest' will be used.", 59 | ), 60 | ) -> None: 61 | """ 62 | Push a pre-built Docker image to the container repository of the current cluster. 63 | """ 64 | cluster_name = ensure_cluster_name(cluster_name) 65 | 66 | push_to_ecr( 67 | image_name, 68 | read_pulumi_stack(cluster_name, "registry"), 69 | read_pulumi_stack(cluster_name, "region"), 70 | image_name, 71 | ) 72 | -------------------------------------------------------------------------------- /paka/cluster/redis.py: -------------------------------------------------------------------------------- 1 | import pulumi 2 | import pulumi_kubernetes as k8s 3 | from pulumi_kubernetes.apiextensions import CustomResource 4 | from pulumi_kubernetes.helm.v3 import Chart, ChartOpts, FetchOpts 5 | 6 | from paka.cluster.context import Context 7 | from paka.utils import call_once 8 | 9 | 10 | @call_once 11 | def create_redis(ctx: Context) -> None: 12 | """ 13 | Installs Redis with a Helm chart.
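Redis runs in standalone mode with persistent storage sized by `config.job.brokerStorageSize`; when Prometheus is enabled, a ServiceMonitor is also created so the exported Redis metrics get scraped.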
14 | """ 15 | config = ctx.cloud_config 16 | 17 | if not config.job or not config.job.enabled: 18 | return 19 | 20 | ns = k8s.core.v1.Namespace( 21 | "redis", 22 | metadata={"name": "redis"}, 23 | opts=pulumi.ResourceOptions(provider=ctx.k8s_provider), 24 | ) 25 | 26 | chart = Chart( 27 | "redis", 28 | ChartOpts( 29 | chart="redis", 30 | version="18.6.1", 31 | namespace=ctx.namespace, 32 | fetch_opts=FetchOpts(repo="https://charts.bitnami.com/bitnami"), 33 | values={ 34 | "architecture": "standalone", 35 | "master": { 36 | "persistence": { 37 | "enabled": True, 38 | "size": config.job.brokerStorageSize, 39 | }, 40 | }, 41 | "metrics": {"enabled": True}, # For enabling metrics 42 | }, 43 | ), 44 | opts=pulumi.ResourceOptions(provider=ctx.k8s_provider, depends_on=[ns]), 45 | ) 46 | 47 | if not config.prometheus or not config.prometheus.enabled: 48 | return 49 | 50 | CustomResource( 51 | "redis-metrics-monitor", 52 | api_version="monitoring.coreos.com/v1", 53 | kind="ServiceMonitor", 54 | metadata={ 55 | "name": "redis-metrics-monitor", 56 | "namespace": "redis", 57 | }, 58 | spec={ 59 | "selector": { 60 | "matchLabels": { 61 | "app.kubernetes.io/instance": "redis", 62 | "app.kubernetes.io/name": "redis", 63 | "app.kubernetes.io/component": "metrics", 64 | } 65 | }, 66 | "namespaceSelector": { 67 | "matchNames": ["redis"], 68 | }, 69 | "endpoints": [ 70 | { 71 | "port": "http-metrics", 72 | "interval": "15s", 73 | }, 74 | ], 75 | }, 76 | opts=pulumi.ResourceOptions( 77 | provider=ctx.k8s_provider, 78 | depends_on=[chart], 79 | ), 80 | ) 81 | -------------------------------------------------------------------------------- /paka/k8s/job/autoscaler.py: -------------------------------------------------------------------------------- 1 | from kubernetes import client 2 | 3 | from paka.k8s.utils import ( 4 | CustomResource, 5 | apply_resource, 6 | delete_namespaced_custom_object, 7 | ) 8 | 9 | 10 | def create_autoscaler( 11 | namespace: str, 12 | redis_svc_name: str, 13 | queue_name: str, 14 | trigger_queue_length: int, 15 | job_name: str, 16 | min_replicas: int, 17 | max_replicas: int, 18 | ) -> None: 19 | """ 20 | Creates a KEDA autoscaler for a job with a Redis trigger. 21 | 22 | The autoscaler scales the job based on the length of a Redis list. 23 | The job is scaled up when the list length exceeds the trigger queue length, 24 | and scaled down when the list is empty. 25 | 26 | Args: 27 | namespace (str): The namespace to create the resources in. 28 | redis_svc_name (str): The name of the Redis service. 29 | queue_name (str): The name of the Redis list to monitor. 30 | trigger_queue_length (int): The list length at which to trigger scaling. 31 | job_name (str): The name of the job to scale. 32 | min_replicas (int): The minimum number of job replicas. 33 | max_replicas (int): The maximum number of job replicas. 
34 | 35 | Returns: 36 | None 37 | """ 38 | scaled_object = CustomResource( 39 | api_version="keda.sh/v1alpha1", 40 | kind="ScaledObject", 41 | plural="scaledobjects", 42 | metadata=client.V1ObjectMeta(name=job_name, namespace=namespace), 43 | spec={ 44 | "scaleTargetRef": { 45 | "kind": "Deployment", 46 | "name": job_name, 47 | }, 48 | "minReplicaCount": min_replicas, 49 | "maxReplicaCount": max_replicas, 50 | "triggers": [ 51 | { 52 | "type": "redis", 53 | "metadata": { 54 | "type": "list", 55 | "listName": queue_name, 56 | "listLength": f"{trigger_queue_length}", 57 | "address": f"{redis_svc_name}.redis.svc.cluster.local:6379", 58 | }, 59 | } 60 | ], 61 | }, 62 | ) 63 | apply_resource(scaled_object) 64 | 65 | 66 | def delete_autoscaler(namespace: str, job_name: str) -> None: 67 | scaled_object = CustomResource( 68 | api_version="keda.sh/v1alpha1", 69 | kind="ScaledObject", 70 | plural="scaledobjects", 71 | metadata=client.V1ObjectMeta(name=job_name, namespace=namespace), 72 | spec={}, 73 | ) 74 | delete_namespaced_custom_object(job_name, namespace, scaled_object) 75 | -------------------------------------------------------------------------------- /tests/policy_packs/aws/object_store.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import List 4 | 5 | from pulumi_policy import ( 6 | Policy, 7 | ReportViolation, 8 | ResourceValidationArgs, 9 | ResourceValidationPolicy, 10 | StackValidationArgs, 11 | StackValidationPolicy, 12 | ) 13 | 14 | max_num_buckets = 1 15 | 16 | 17 | def s3_count_validator( 18 | stack: StackValidationArgs, report_violation: ReportViolation 19 | ) -> None: 20 | s3_resources = filter( 21 | (lambda resource: resource.resource_type == "aws:s3/bucket:Bucket"), 22 | stack.resources, 23 | ) 24 | 25 | buckets = list(s3_resources) 26 | if len(buckets) > max_num_buckets: 27 | report_violation( 28 | f"No more than {max_num_buckets} bucket(s) should be created.", None 29 | ) 30 | 31 | 32 | s3_count_check = StackValidationPolicy( 33 | name="s3-count-check", 34 | description="Checks the number of buckets created.", 35 | validate=s3_count_validator, 36 | ) 37 | 38 | 39 | def s3_no_public_read_validator( 40 | args: ResourceValidationArgs, report_violation: ReportViolation 41 | ) -> None: 42 | if args.resource_type == "aws:s3/bucket:Bucket" and "acl" in args.props: 43 | acl = args.props["acl"] 44 | if acl == "public-read" or acl == "public-read-write": 45 | report_violation( 46 | "You cannot set public-read or public-read-write on an S3 bucket. " 47 | + "Read more about ACLs here: https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html", 48 | None, 49 | ) 50 | 51 | 52 | s3_no_public_read = ResourceValidationPolicy( 53 | name="s3-no-public-read", 54 | description="Prohibits setting the publicRead or publicReadWrite permission on AWS S3 buckets.", 55 | validate=s3_no_public_read_validator, 56 | ) 57 | 58 | 59 | def s3_force_destroy_validator( 60 | args: ResourceValidationArgs, report_violation: ReportViolation 61 | ) -> None: 62 | if args.resource_type == "aws:s3/bucket:Bucket" and "forceDestroy" in args.props: 63 | force_destroy = args.props["forceDestroy"] 64 | if not force_destroy: 65 | report_violation( 66 | "You must set forceDestroy to true. 
" 67 | + "Read more about forceDestroy here: https://www.pulumi.com/docs/intro/concepts/resources/#deletion", 68 | None, 69 | ) 70 | 71 | 72 | s3_force_destroy = ResourceValidationPolicy( 73 | name="s3-force-destroy", 74 | description="Requires forceDestroy to be set to true.", 75 | validate=s3_force_destroy_validator, 76 | ) 77 | 78 | s3_policies: List[Policy] = [s3_count_check, s3_no_public_read, s3_force_destroy] 79 | -------------------------------------------------------------------------------- /examples/website_rag/crawler.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Generator, Tuple 3 | from urllib.parse import urljoin, urlparse 4 | 5 | import requests 6 | from bs4 import BeautifulSoup 7 | 8 | 9 | def is_relative_url(url: str) -> bool: 10 | return not bool(urlparse(url).netloc) 11 | 12 | 13 | def get_root_url(url: str) -> str: 14 | parsed_url = urlparse(url) 15 | return f"{parsed_url.scheme}://{parsed_url.netloc}" 16 | 17 | 18 | def get_filename(save_dir: str, href: str) -> str: 19 | parsed_url = urlparse(href) 20 | path = parsed_url.path 21 | if path.endswith("/") or not path: 22 | path = path + "index.html" 23 | else: 24 | if not path.endswith(".html"): 25 | path = path + ".html" 26 | filename = os.path.join(save_dir, parsed_url.netloc, path.lstrip("/")) 27 | return filename 28 | 29 | 30 | def save_html_file(response_text: str, url: str, save_dir: str) -> None: 31 | filename = get_filename(save_dir, url) 32 | os.makedirs(os.path.dirname(filename), exist_ok=True) 33 | with open(filename, "w") as file: 34 | file.write(response_text) 35 | 36 | 37 | def get_html(url: str) -> str: 38 | try: 39 | response = requests.get(url) 40 | content_type = response.headers["content-type"] 41 | if "html" in content_type: 42 | return response.text 43 | except Exception as e: 44 | print(f"Failed to get {url}: {e}") 45 | return "" 46 | 47 | 48 | def crawl(url: str, max_depth: int = 3) -> Generator[Tuple[str, str], None, None]: 49 | visited = set() 50 | 51 | def _crawl(url: str, depth: int = 0) -> Generator[Tuple[str, str], None, None]: 52 | url_without_fragment = url.split("#")[0] 53 | 54 | if ( 55 | depth > max_depth 56 | or not url.startswith("https") 57 | or not url.startswith("http") 58 | ): 59 | return 60 | 61 | root_url = get_root_url(url) 62 | 63 | orig_html_content = get_html(url) 64 | soup = BeautifulSoup(orig_html_content, "html.parser") 65 | 66 | yield url, soup.get_text(separator=" ", strip=True) 67 | 68 | visited.add(url_without_fragment) 69 | 70 | for link in soup.find_all("a"): 71 | href = link.get("href") 72 | if not href: 73 | continue 74 | href_domain = urlparse(href).netloc 75 | url_domain = urlparse(url).netloc 76 | if href_domain and href_domain != url_domain: 77 | continue 78 | 79 | full_url = urljoin(root_url, href) if is_relative_url(href) else href 80 | if full_url not in visited: 81 | yield from _crawl(full_url, depth + 1) 82 | 83 | yield from _crawl(url) 84 | -------------------------------------------------------------------------------- /paka/cluster/aws/ebs_csi_driver.py: -------------------------------------------------------------------------------- 1 | import pulumi 2 | import pulumi_aws as aws 3 | import pulumi_eks as eks 4 | from pulumi_kubernetes.helm.v3 import Chart, ChartOpts, FetchOpts 5 | 6 | from paka.cluster.aws.utils import odic_role_for_sa 7 | from paka.cluster.context import Context 8 | from paka.utils import call_once 9 | 10 | 11 | @call_once 12 | def create_ebs_csi_driver(ctx: 
Context, cluster: eks.Cluster) -> None: 13 | cluster_name = ctx.cluster_name 14 | 15 | csi_driver_policy_doc = aws.iam.get_policy_document( 16 | statements=[ 17 | aws.iam.GetPolicyDocumentStatementArgs( 18 | actions=[ 19 | "ec2:CreateSnapshot", 20 | "ec2:AttachVolume", 21 | "ec2:DetachVolume", 22 | "ec2:ModifyVolume", 23 | "ec2:DescribeAvailabilityZones", 24 | "ec2:DescribeInstances", 25 | "ec2:DescribeSnapshots", 26 | "ec2:DescribeTags", 27 | "ec2:DescribeVolumes", 28 | "ec2:DescribeVolumesModifications", 29 | "ec2:CreateTags", 30 | "ec2:CreateVolume", 31 | "ec2:DeleteVolume", 32 | ], 33 | resources=["*"], 34 | effect="Allow", 35 | ) 36 | ] 37 | ) 38 | 39 | csi_driver_policy = aws.iam.Policy( 40 | f"{cluster_name}-csi-driver-policy", policy=csi_driver_policy_doc.json 41 | ) 42 | 43 | csi_driver_role = odic_role_for_sa( 44 | ctx, cluster, "csi-driver", "kube-system:ebs-csi-controller-sa" 45 | ) 46 | 47 | aws.iam.RolePolicyAttachment( 48 | f"{cluster_name}-csi-driver-role-policy-attachment", 49 | policy_arn=csi_driver_policy.arn, 50 | role=csi_driver_role.name, 51 | ) 52 | 53 | Chart( 54 | "aws-ebs-csi-driver", 55 | ChartOpts( 56 | chart="aws-ebs-csi-driver", 57 | version="2.26.0", 58 | namespace="kube-system", 59 | fetch_opts=FetchOpts( 60 | repo="https://kubernetes-sigs.github.io/aws-ebs-csi-driver" 61 | ), 62 | values={ 63 | "controller": { 64 | "serviceAccount": { 65 | "create": "true", 66 | "name": "ebs-csi-controller-sa", 67 | "annotations": { 68 | "eks.amazonaws.com/role-arn": csi_driver_role.arn 69 | }, 70 | "automountServiceAccountToken": "true", 71 | }, 72 | } 73 | }, 74 | ), 75 | opts=pulumi.ResourceOptions(provider=ctx.k8s_provider), 76 | ) 77 | -------------------------------------------------------------------------------- /examples/website_rag/serve.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | from typing import Annotated, Any 5 | 6 | from constants import LLM_URL, QDRANT_URL 7 | from embeddings import LlamaEmbeddings 8 | from fastapi import Depends, FastAPI, Request, Response 9 | from langchain.chains import RetrievalQA 10 | from langchain_community.vectorstores import Qdrant 11 | from langchain_core.runnables import RunnableLambda 12 | from langserve import APIHandler, add_routes # type: ignore 13 | from qdrant_client import QdrantClient 14 | from vllm import Vllm 15 | 16 | logging.basicConfig( 17 | level=logging.INFO, 18 | format="%(asctime)s [%(levelname)s] %(message)s", 19 | handlers=[logging.StreamHandler()], 20 | ) 21 | 22 | port = int(os.getenv("PORT", 8080)) 23 | 24 | client = QdrantClient( 25 | url=QDRANT_URL, 26 | prefer_grpc=True, 27 | ) 28 | collection_name = "langchain_documents" 29 | 30 | embeddings = LlamaEmbeddings() 31 | qdrant = Qdrant(client, collection_name, embeddings=embeddings) 32 | 33 | retriever = qdrant.as_retriever() 34 | 35 | app = FastAPI( 36 | title="LangChain Docs Server", 37 | version="0.1.0", 38 | description="Spin up a simple api server to retrieve documents from the vector store.", 39 | ) 40 | # Adds routes to the app for using the retriever under: 41 | # /invoke 42 | # /batch 43 | # /stream 44 | add_routes(app, retriever) 45 | 46 | 47 | def run_llm(query: str) -> Any: 48 | start_time = time.time() 49 | logging.info(f"Running LLM with query: {query}") 50 | llm = Vllm( 51 | model="llama2-7b-chat", 52 | model_url=LLM_URL, 53 | temperature=0, 54 | max_tokens=2500, 55 | streaming=False, 56 | ) 57 | 58 | qa = RetrievalQA.from_chain_type( 59 | llm=llm, 
retriever=retriever, chain_type="stuff", return_source_documents=True 60 | ) 61 | 62 | query = f"[INST] <<SYS>><</SYS>>\n\n{query} [/INST]\n" 63 | result = qa.invoke({"query": query}) 64 | logging.info(f"LLM result: {result}") 65 | 66 | end_time = time.time() 67 | logging.info(f"Execution time: {end_time - start_time} seconds") 68 | return result 69 | 70 | 71 | async def _get_api_handler() -> APIHandler: 72 | """Prepare a RunnableLambda.""" 73 | return APIHandler(RunnableLambda(run_llm), path="/v2") 74 | 75 | 76 | @app.post("/v2/invoke") 77 | async def v2_invoke( 78 | request: Request, runnable: Annotated[APIHandler, Depends(_get_api_handler)] 79 | ) -> Response: 80 | """Handle invoke request.""" 81 | # The API Handler validates the parts of the request 82 | # that are used by the runnable (e.g., input, config fields) 83 | return await runnable.invoke(request) 84 | 85 | 86 | if __name__ == "__main__": 87 | import uvicorn 88 | 89 | uvicorn.run(app, host="localhost", port=port) 90 | -------------------------------------------------------------------------------- /paka/model/hf_model.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import concurrent.futures 4 | import os 5 | from typing import Any, Dict, List, Optional 6 | 7 | from huggingface_hub import HfFileSystem 8 | from huggingface_hub.utils import validate_repo_id 9 | 10 | from paka.logger import logger 11 | from paka.model.base_model import BaseMLModel 12 | from paka.model.store import ModelStore 13 | 14 | 15 | class HuggingFaceModel(BaseMLModel): 16 | def __init__( 17 | self, 18 | name: str, 19 | repo_id: str, 20 | files: List[str], 21 | model_store: ModelStore, 22 | quantization: Optional[str] = None, 23 | prompt_template_name: Optional[str] = None, 24 | prompt_template_str: Optional[str] = None, 25 | ) -> None: 26 | super().__init__( 27 | name=name, 28 | model_store=model_store, 29 | quantization=quantization, 30 | prompt_template_name=prompt_template_name, 31 | prompt_template_str=prompt_template_str, 32 | ) 33 | validate_repo_id(repo_id) 34 | self.repo_id: str = repo_id 35 | self.fs = HfFileSystem() 36 | self._files = files 37 | 38 | def save(self) -> None: 39 | """ 40 | Saves the model to a model store. 41 | """ 42 | files: List[str] = [] 43 | for file in self._files: 44 | match_files = self.fs.glob(f"{self.repo_id}/{file}") 45 | 46 | if not match_files: 47 | logger.warning( 48 | f"No matching files found for {file} in HuggingFace repo {self.repo_id}" 49 | ) 50 | 51 | files.extend(match_files) 52 | 53 | with concurrent.futures.ThreadPoolExecutor( 54 | max_workers=self.concurrency 55 | ) as executor: 56 | futures = [executor.submit(self._save_single_file, file) for file in files] 57 | concurrent.futures.wait(futures) 58 | self.finish() 59 | 60 | def _save_single_file(self, hf_file_path: str) -> None: 61 | """ 62 | Saves a HuggingFace model file to the specified model store. 63 | 64 | Args: 65 | hf_file_path (str): The path to the HuggingFace model file.
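
        Example (hypothetical repo path, shown for illustration only):

            self._save_single_file("TheBloke/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q4_0.gguf")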
66 | 67 | Returns: 68 | None 69 | """ 70 | file_info: Dict[str, Any] = self.fs.stat(hf_file_path) 71 | total_size = file_info["size"] 72 | sha256 = ( 73 | file_info["lfs"]["sha256"] 74 | if "lfs" in file_info and file_info["lfs"] 75 | else "" 76 | ) 77 | 78 | fname = os.path.basename(hf_file_path) 79 | with self.fs.open(hf_file_path, "rb") as hf_file: 80 | self.save_single_stream(f"{self.name}/{fname}", hf_file, total_size, sha256) 81 | -------------------------------------------------------------------------------- /paka/cluster/kubectl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import shutil 4 | from pathlib import Path 5 | 6 | import requests 7 | 8 | from paka.logger import logger 9 | from paka.utils import download_url, get_project_data_dir 10 | 11 | KUBECTL_VERSION_URL = "https://cdn.dl.k8s.io/release/stable.txt" 12 | CHUNK_SIZE = 8192 13 | 14 | 15 | def get_latest_kubectl_version() -> str: 16 | """Return the latest version of kubectl available for download.""" 17 | try: 18 | response = requests.get(KUBECTL_VERSION_URL) 19 | response.raise_for_status() 20 | return response.text 21 | except requests.RequestException as e: 22 | logger.error(f"Failed to get latest kubectl version: {e}") 23 | return "v1.30.0" 24 | 25 | 26 | # We are not pinning kubectl to a specific version. 27 | # Fetching the latest version of kubectl should be safe. 28 | KUBECTL_VERSION = os.getenv("KUBECTL_VERSION", get_latest_kubectl_version()) 29 | 30 | 31 | # install_path is a full path to the kubectl binary 32 | # It should be in a format like this /path/to/kubectl-x.xx.x/kubectl 33 | def ensure_kubectl_by_path(install_path: Path) -> None: 34 | """Ensure kubectl is installed at the given path.""" 35 | parent_dir = install_path.parent 36 | os.environ["PATH"] = f"{parent_dir.absolute()}{os.pathsep}{os.environ['PATH']}" 37 | 38 | if install_path.exists(): 39 | return 40 | 41 | system = platform.system().lower() 42 | arch = platform.machine().lower() 43 | 44 | if arch in ["amd64", "x86_64"]: 45 | arch = "amd64" 46 | 47 | if arch not in ["amd64", "arm64"]: 48 | raise Exception(f"Unsupported architecture: {arch}") 49 | 50 | grandparent_dir = parent_dir.parent 51 | 52 | for old_kubectl_dir in grandparent_dir.glob("kubectl-*"): 53 | shutil.rmtree(old_kubectl_dir) 54 | 55 | if not install_path.exists(): 56 | url = os.getenv( 57 | "KUBECTL_DOWNLOAD_URL", 58 | f"https://dl.k8s.io/release/{KUBECTL_VERSION}/bin/{system}/{arch}/kubectl", 59 | ) 60 | if system == "windows" and not url.endswith(".exe"): 61 | url += ".exe" 62 | logger.info(f"Downloading {url}..") 63 | 64 | with download_url(url) as tmp_file: 65 | tmp_file_p = Path(tmp_file) 66 | tmp_file_p.chmod(0o755) 67 | parent_dir.mkdir(parents=True, exist_ok=True) 68 | shutil.copy2(tmp_file_p, install_path) 69 | 70 | 71 | def ensure_kubectl() -> None: 72 | system = platform.system().lower() 73 | kubectl_path = ( 74 | Path(get_project_data_dir()) / "bin" / f"kubectl-{KUBECTL_VERSION}" / "kubectl" 75 | ) 76 | if system == "windows": 77 | kubectl_path = kubectl_path.with_suffix(".exe") 78 | 79 | ensure_kubectl_by_path(kubectl_path) 80 | -------------------------------------------------------------------------------- /paka/model/progress_bar.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from threading import Lock 4 | from typing import Any, Dict, List, Optional, Tuple 5 | 6 | from tqdm import tqdm 7 | 8 | 9 |
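# Usage sketch (illustrative only, not taken from this repo): several download
# threads can share a single ProgressBar, growing its total as file sizes become
# known and reporting per-file progress under a unique key.
#
#   bar = ProgressBar("Downloading")
#   bar.create_progress_bar(total_size=0)      # start with an empty bar
#   bar.update_progress_bar("file-a", 4096)    # grow the total once per file
#   bar.advance_progress_bar("file-a", 1024)   # bytes of file-a completed so far
#   bar.close_progress_bar()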
class ProgressBar: 10 | def __init__(self, message: str = "Downloading") -> None: 11 | self.counter: Dict[str, int] = {} 12 | self.lock = Lock() 13 | self.progress_bar: Optional[tqdm] = None 14 | self.completed_files: List[Tuple[str, str]] = [] 15 | self.message = message 16 | 17 | def __getattr__(self, name: str) -> Any: 18 | return getattr(self.progress_bar, name) 19 | 20 | def set_postfix_str(self, *args: Any, **kwargs: Any) -> None: 21 | if self.progress_bar is None: 22 | return 23 | self.progress_bar.set_postfix_str(*args, **kwargs) 24 | 25 | def clear_counter(self) -> None: 26 | with self.lock: 27 | self.counter = {} 28 | 29 | def create_progress_bar(self, total_size: int) -> None: 30 | with self.lock: 31 | if self.progress_bar is not None: 32 | return 33 | 34 | self.progress_bar = tqdm( 35 | total=total_size, unit="B", unit_scale=True, desc=self.message 36 | ) 37 | 38 | def update_progress_bar(self, key: str, value: int) -> None: 39 | if key in self.counter: 40 | return 41 | 42 | with self.lock: 43 | if self.progress_bar is not None: 44 | # Increase the total count of the progress bar by the provided value 45 | self.progress_bar.total += value 46 | # Refresh the progress bar to reflect the new total 47 | self.progress_bar.refresh() 48 | 49 | def close_progress_bar(self) -> None: 50 | if self.progress_bar is None: 51 | return 52 | with self.lock: 53 | self.counter = {} 54 | self.progress_bar.close() 55 | self.progress_bar = None 56 | 57 | def advance_progress_bar(self, key: str = "", value: int = 0) -> None: 58 | if self.progress_bar is None: 59 | return 60 | with self.lock: 61 | if key: 62 | self.counter[key] = value 63 | # Calculate the total progress by summing the progress of all tasks 64 | total_progress = sum(self.counter.values()) 65 | # Update the progress bar by the amount of progress made since the last update 66 | self.progress_bar.update(total_progress - self.progress_bar.n) 67 | self.progress_bar.refresh() 68 | 69 | 70 | class NullProgressBar: 71 | def __init__(self, *args: Any, **kwargs: Any) -> None: 72 | pass 73 | 74 | def __getattr__(self, name: str) -> Any: 75 | return lambda *args, **kwargs: None 76 | 77 | def __setattr__(self, name: str, value: Any) -> None: 78 | pass 79 | -------------------------------------------------------------------------------- /paka/model/base_model.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | from abc import ABC, abstractmethod 5 | from typing import List, Optional, Tuple 6 | 7 | from paka.logger import logger 8 | from paka.model.manifest import ModelFile, ModelManifest 9 | from paka.model.settings import ModelSettings 10 | from paka.model.store import ModelStore, StreamLike 11 | from paka.utils import to_yaml 12 | 13 | 14 | class BaseMLModel(ABC): 15 | def __init__( 16 | self, 17 | name: str, 18 | model_store: ModelStore, 19 | quantization: Optional[str], 20 | prompt_template_name: Optional[str], 21 | prompt_template_str: Optional[str], 22 | # Max concurrency for saving model streams 23 | concurrency: int = 1, 24 | ) -> None: 25 | self.name = name 26 | self.completed_files: List[Tuple[str, str]] = [] 27 | self.settings = ModelSettings( 28 | quantization=quantization, 29 | prompt_template_name=prompt_template_name, 30 | prompt_template_str=prompt_template_str, 31 | ) 32 | 33 | self.model_store = model_store 34 | self.concurrency = concurrency 35 | 36 | def save_manifest_yml(self, manifest: Optional[ModelManifest] = None) -> None: 37 | if 
manifest is None: 38 | manifest = ModelManifest( 39 | name=self.name, 40 | files=[ 41 | ModelFile(name=name, sha256=sha256) 42 | for (name, sha256) in self.completed_files 43 | ], 44 | quantization=self.settings.quantization, 45 | prompt_template_name=self.settings.prompt_template_name, 46 | prompt_template_str=self.settings.prompt_template_str, 47 | ) 48 | 49 | model_store = self.model_store 50 | 51 | manifest_yml = to_yaml(manifest.model_dump(exclude_none=True)) 52 | 53 | file_path = f"{self.name}/manifest.yml" 54 | if model_store.file_exists(file_path): 55 | logger.info( 56 | f"manifest.yml file already exists at {file_path}. Overwriting..." 57 | ) 58 | model_store.delete_file(file_path) 59 | model_store.save(file_path, manifest_yml.encode("utf-8")) 60 | logger.info(f"manifest.yml file saved to {file_path}") 61 | 62 | @abstractmethod 63 | def save(self) -> None: 64 | pass 65 | 66 | def save_single_stream( 67 | self, path: str, stream: StreamLike, total_size: int, sha256: str = "" 68 | ) -> None: 69 | self.model_store.save_stream(path, stream, total_size, sha256) 70 | fname = os.path.basename(path) 71 | self.completed_files.append((fname, sha256)) 72 | 73 | def finish(self) -> None: 74 | self.try_close_progress_bar() 75 | self.save_manifest_yml() 76 | 77 | def try_close_progress_bar(self) -> None: 78 | pb = getattr(self.model_store, "progress_bar", None) 79 | if pb: 80 | pb.close_progress_bar() 81 | -------------------------------------------------------------------------------- /paka/container/pack.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import tarfile 4 | import tempfile 5 | import zipfile 6 | from pathlib import Path 7 | 8 | import requests 9 | 10 | from paka.logger import logger 11 | from paka.utils import ( 12 | calculate_sha256, 13 | download_url, 14 | get_gh_release_latest_version, 15 | get_project_data_dir, 16 | ) 17 | 18 | 19 | def ensure_pack() -> str: 20 | paka_home = Path(get_project_data_dir()) 21 | 22 | bin_dir = paka_home / "bin" 23 | bin_dir.mkdir(parents=True, exist_ok=True) 24 | 25 | pack_files = list(bin_dir.glob("pack-*")) 26 | if pack_files: 27 | return str(pack_files[0]) 28 | 29 | pack_version = get_gh_release_latest_version("buildpacks/pack") 30 | 31 | new_pack_path = bin_dir / f"pack-{pack_version}" 32 | 33 | system = platform.system().lower() 34 | arch = platform.machine().lower() 35 | 36 | if system == "windows": 37 | new_pack_path = new_pack_path.with_suffix(".exe") 38 | 39 | if new_pack_path.exists(): 40 | return str(new_pack_path) 41 | 42 | for old_pack_path in bin_dir.glob("pack-*"): 43 | if old_pack_path.is_file(): 44 | old_pack_path.unlink() 45 | 46 | if system == "darwin": 47 | system = "macos" 48 | 49 | if system == "windows": 50 | pack_file = f"pack-{pack_version}-windows.zip" 51 | 52 | elif arch in ["amd64", "x86_64"]: 53 | pack_file = f"pack-{pack_version}-{system}.tgz" 54 | elif arch == "arm64": 55 | pack_file = f"pack-{pack_version}-{system}-{arch}.tgz" 56 | else: 57 | raise Exception(f"Unsupported architecture: {arch}") 58 | 59 | url = f"https://github.com/buildpacks/pack/releases/download/{pack_version}/{pack_file}" 60 | 61 | logger.info(f"Downloading {pack_file}...") 62 | 63 | with download_url(url) as archive_file: 64 | archive_file_sha256 = calculate_sha256(archive_file) 65 | 66 | # Now, fetch the sha256 file and compare the hash 67 | sha256_url = f"{url}.sha256" 68 | 69 | response = requests.get(sha256_url) 70 | response.raise_for_status() 71 | expected_sha256, 
expected_filename = response.text.strip().split() 72 | 73 | assert expected_filename == pack_file 74 | 75 | if archive_file_sha256 != expected_sha256: 76 | raise Exception( 77 | f"SHA256 mismatch: {archive_file_sha256} != {expected_sha256}" 78 | ) 79 | 80 | if system == "windows": 81 | with zipfile.ZipFile(archive_file, "r") as zip_ref: 82 | zip_ref.extractall(bin_dir) 83 | else: 84 | with tarfile.open(archive_file, "r:gz") as tar: 85 | tar.extractall(bin_dir) 86 | 87 | pack_path = bin_dir / "pack" 88 | 89 | if system == "windows": 90 | pack_path = pack_path.with_suffix(".exe") 91 | 92 | pack_path.chmod(0o755) 93 | pack_path.rename(new_pack_path) 94 | 95 | logger.info("Pack installed successfully.") 96 | 97 | return str(new_pack_path) 98 | -------------------------------------------------------------------------------- /examples/website_rag/ingest.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from typing import Generator 3 | 4 | from bs4.element import Tag 5 | from constants import QDRANT_URL 6 | from crawler import crawl 7 | from embeddings import LlamaEmbeddings 8 | from langchain.text_splitter import RecursiveCharacterTextSplitter 9 | from langchain_community.vectorstores import Qdrant 10 | from langchain_core.documents import Document 11 | 12 | 13 | def _metadata_extractor(raw_html: str, url: str) -> dict: 14 | """Extract metadata from raw html using BeautifulSoup.""" 15 | metadata = {"source": url} 16 | 17 | try: 18 | from bs4 import BeautifulSoup 19 | except ImportError: 20 | print( 21 | "The bs4 package is required for default metadata extraction. " 22 | "Please install it with `pip install bs4`." 23 | ) 24 | return metadata 25 | soup = BeautifulSoup(raw_html, "html.parser") 26 | if title := soup.find("title"): 27 | metadata["title"] = title.get_text() 28 | if (description := soup.find("meta", attrs={"name": "description"})) and isinstance( 29 | description, Tag 30 | ): 31 | description_content = description.get("content", "") or "" 32 | metadata["description"] = ( 33 | " ".join(description_content) 34 | if isinstance(description_content, list) 35 | else description_content 36 | ) 37 | else: 38 | metadata["description"] = "" 39 | if (html := soup.find("html")) and isinstance(html, Tag): 40 | html_lang = html.get("lang", "") or "" 41 | metadata["language"] = ( 42 | " ".join(html_lang) if isinstance(html_lang, list) else html_lang 43 | ) 44 | else: 45 | metadata["language"] = "" 46 | return metadata 47 | 48 | 49 | def docs_loader(website: str) -> Generator[Document, None, None]: 50 | # We use a custom crawler. LangChain's RecursiveUrlLoader cannot be used as is. 
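# The crawler yields (url, raw_html) pairs lazily, so each page is wrapped in a Document as it is fetched.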
51 | crawler = crawl(website, max_depth=0) 52 | 53 | for url, html_content in crawler: 54 | yield Document( 55 | page_content=html_content, metadata=_metadata_extractor(html_content, url) 56 | ) 57 | 58 | 59 | def embed_website(website: str) -> None: 60 | chunk_size = 400 61 | chunk_overlap = 50 62 | text_splitter = RecursiveCharacterTextSplitter( 63 | separators=["\n\n", "\n", " ", ""], 64 | chunk_size=chunk_size, 65 | chunk_overlap=chunk_overlap, 66 | length_function=len, 67 | ) 68 | docs = text_splitter.split_documents(docs_loader(website)) 69 | embeddings = LlamaEmbeddings() 70 | 71 | print("Embedding documents...") 72 | print("Total number of documents:", len(docs)) 73 | 74 | Qdrant.from_documents( 75 | docs, 76 | embeddings, 77 | url=QDRANT_URL, 78 | prefer_grpc=True, 79 | collection_name="langchain_documents", 80 | ) 81 | print("done") 82 | 83 | 84 | if __name__ == "__main__": 85 | if len(sys.argv) > 1: 86 | embed_website(sys.argv[1]) 87 | else: 88 | print("Please provide a URL as a command-line argument.") 89 | -------------------------------------------------------------------------------- /e2e/pytest_kind/plugin.py: -------------------------------------------------------------------------------- 1 | # Code modified from the original repo: https://codeberg.org/hjacobs/pytest-kind/src/branch/main/pytest_kind 2 | from __future__ import annotations 3 | 4 | from pathlib import Path 5 | from typing import Generator 6 | 7 | import pytest 8 | from _pytest.config.argparsing import Parser 9 | from pytest import FixtureRequest 10 | 11 | from .cluster import KindCluster 12 | 13 | 14 | @pytest.fixture(scope="session") 15 | def kind_cluster(request: FixtureRequest) -> Generator[KindCluster, None, None]: 16 | """Provide a Kubernetes kind cluster as test fixture.""" 17 | print(request.config) 18 | name = request.config.getoption("cluster_name") 19 | keep = request.config.getoption("keep_cluster") 20 | kubeconfig = request.config.getoption("kubeconfig") 21 | image = request.config.getoption("kind_image") 22 | kind_path = request.config.getoption("kind_bin") 23 | kubectl_path = request.config.getoption("kind_kubectl_bin") 24 | cluster_config = request.config.getoption("cluster_config") 25 | cluster = KindCluster( 26 | name, 27 | Path(kubeconfig) if kubeconfig else None, 28 | image=image, 29 | kind_path=Path(kind_path) if kind_path else None, 30 | kubectl_path=Path(kubectl_path) if kubectl_path else None, 31 | ) 32 | cluster.create(cluster_config) 33 | yield cluster 34 | if not keep: 35 | cluster.delete() 36 | 37 | 38 | def pytest_addoption(parser: Parser) -> None: 39 | group = parser.getgroup("kind") 40 | group.addoption( 41 | "--cluster-name", 42 | default="paka-e2e", 43 | help="Name of the Kubernetes kind cluster", 44 | ) 45 | group.addoption( 46 | "--keep-cluster", 47 | default=False, 48 | action="store_true", 49 | help="Keep the Kubernetes kind cluster (do not delete after test run)", 50 | ) 51 | group.addoption( 52 | "--kubeconfig", 53 | default=None, 54 | help=( 55 | "If provided, use the specified kubeconfig " 56 | "instead of the one generated by the cluster" 57 | ), 58 | ) 59 | group.addoption( 60 | "--cluster-config", 61 | default=None, 62 | help=("The cluster configuration file to use to create the Kind cluster."), 63 | ) 64 | group.addoption( 65 | "--kind-image", 66 | default=None, 67 | action="store", 68 | type=str, 69 | help=( 70 | "If provided, use the specified docker image " 71 | "instead of the default one. (e.g. 
kindest/node:v1.20.2)" 72 | ), 73 | ) 74 | group.addoption( 75 | "--kind-bin", 76 | default=None, 77 | action="store", 78 | type=str, 79 | help=( 80 | "If provided, use the specified kind binary instead of " 81 | "downloading one. Takes a filesystem path string." 82 | ), 83 | ) 84 | group.addoption( 85 | "--kind-kubectl-bin", 86 | default=None, 87 | action="store", 88 | type=str, 89 | help=( 90 | "If provided, use the specified kubectl binary instead of " 91 | "downloading one. Takes a filesystem path string." 92 | ), 93 | ) 94 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from pathlib import Path 4 | from unittest.mock import mock_open, patch 5 | 6 | from paka.constants import HOME_ENV_VAR, PROJECT_NAME 7 | from paka.utils import ( 8 | call_once, 9 | camel_to_kebab, 10 | camel_to_snake, 11 | get_cluster_data_dir, 12 | get_project_data_dir, 13 | kubify_name, 14 | save_kubeconfig, 15 | to_yaml, 16 | ) 17 | 18 | 19 | def test_camel_to_kebab() -> None: 20 | assert camel_to_kebab("ExampleProject") == "example-project" 21 | assert camel_to_kebab("AnotherExampleProject") == "another-example-project" 22 | assert camel_to_kebab("YetAnotherExample") == "yet-another-example" 23 | assert camel_to_kebab("lowercase") == "lowercase" 24 | assert camel_to_kebab("UPPERCASE") == "uppercase" 25 | 26 | 27 | def test_kubify_name() -> None: 28 | assert kubify_name("MyName") == "myname" 29 | assert kubify_name("My.Name") == "my-name" 30 | assert kubify_name("My_Name") == "my-name" 31 | assert kubify_name("My-Name") == "my-name" 32 | assert kubify_name("123MyName") == "myname" 33 | assert kubify_name("MyName123") == "myname123" 34 | assert kubify_name("MyName!") == "myname" 35 | 36 | 37 | def test_call_once() -> None: 38 | counter = 0 39 | 40 | @call_once 41 | def increment_counter() -> None: 42 | nonlocal counter 43 | counter += 1 44 | 45 | # Call the function twice 46 | increment_counter() 47 | increment_counter() 48 | 49 | # Check that the counter was only incremented once 50 | assert counter == 1 51 | 52 | 53 | def test_to_yaml() -> None: 54 | obj = {"key": "value"} 55 | yaml_str = to_yaml(obj) 56 | assert yaml_str == "key: value\n" 57 | 58 | obj1 = {"key": {"nested_key": "nested_value"}} 59 | yaml_str = to_yaml(obj1) 60 | assert yaml_str == "key:\n nested_key: nested_value\n" 61 | 62 | obj2 = {"key": ["value1", "value2"]} 63 | yaml_str = to_yaml(obj2) 64 | assert yaml_str == "key:\n - value1\n - value2\n" 65 | 66 | 67 | def test_save_kubeconfig() -> None: 68 | m = mock_open() 69 | # Replace the built-in open function with the mock object 70 | with patch("builtins.open", m): 71 | kubeconfig_json = json.dumps({"apiVersion": "v1"}) 72 | save_kubeconfig("test", kubeconfig_json) 73 | f = os.path.join(get_cluster_data_dir("test"), "kubeconfig.yaml") 74 | m.assert_called_once_with(f, "w") 75 | handle = m() 76 | handle.write.assert_called_once() 77 | 78 | 79 | def test_get_project_data_dir() -> None: 80 | with patch.dict(os.environ, {HOME_ENV_VAR: "/test/home"}): 81 | result = get_project_data_dir() 82 | 83 | assert result == "/test/home" 84 | 85 | with patch.dict(os.environ, {}, clear=True): 86 | result = get_project_data_dir() 87 | 88 | assert result == os.path.join( 89 | str(Path.home()), f".{camel_to_kebab(PROJECT_NAME)}" 90 | ) 91 | 92 | 93 | def test_camel_to_snake() -> None: 94 | assert camel_to_snake("camelCase") == "camel_case" 95 | 
assert camel_to_snake("HTTPRequest") == "http_request" 96 | assert camel_to_snake("IPV6Address") == "ipv6_address" 97 | assert camel_to_snake("noChange") == "no_change" 98 | assert camel_to_snake("") == "" 99 | -------------------------------------------------------------------------------- /paka/cluster/aws/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Sequence 4 | 5 | import pulumi 6 | import pulumi_aws as aws 7 | import pulumi_eks as eks 8 | from pulumi import Input 9 | 10 | from paka.cluster.context import Context 11 | from paka.utils import get_instance_info 12 | 13 | 14 | def odic_role_for_sa( 15 | ctx: Context, 16 | cluster: eks.Cluster, 17 | role_name: str, 18 | ns_service_account: str, 19 | ) -> aws.iam.Role: 20 | """ 21 | Creates an IAM role for a service account in an EKS cluster using OpenID Connect (OIDC) authentication. 22 | 23 | Args: 24 | config (CloudConfig): The cloud configuration. 25 | cluster (eks.Cluster): The EKS cluster. 26 | role_name (str): The name of the role. 27 | ns_service_account (str): The name of the service account. e.g. "default:sa", "kube-system:auto-scaler" 28 | 29 | Returns: 30 | aws.iam.Role: The IAM role for the service account. 31 | """ 32 | oidc_url = cluster.core.oidc_provider.url 33 | oidc_arn = cluster.core.oidc_provider.arn 34 | 35 | assume_role_policy = pulumi.Output.all(oidc_url, oidc_arn).apply( 36 | lambda args: aws.iam.get_policy_document( 37 | statements=[ 38 | aws.iam.GetPolicyDocumentStatementArgs( 39 | effect="Allow", 40 | principals=[ 41 | aws.iam.GetPolicyDocumentStatementPrincipalArgs( 42 | type="Federated", 43 | identifiers=[str(args[1])], 44 | ) 45 | ], 46 | actions=["sts:AssumeRoleWithWebIdentity"], 47 | conditions=[ 48 | aws.iam.GetPolicyDocumentStatementConditionArgs( 49 | test="StringEquals", 50 | variable=f"{args[0]}:sub", 51 | values=[f"system:serviceaccount:{ns_service_account}"], 52 | ) 53 | ], 54 | ) 55 | ], 56 | ).json 57 | ) 58 | 59 | role = aws.iam.Role( 60 | f"{ctx.cluster_name}-{role_name}-role", 61 | assume_role_policy=assume_role_policy, 62 | ) 63 | 64 | return role 65 | 66 | 67 | def get_ami_for_instance(ctx: Context, instance_type: str) -> str: 68 | instance_info = get_instance_info(ctx.provider, ctx.region, instance_type) 69 | gpu_count = instance_info.get("gpu_count", 0) or 0 70 | arch = instance_info.get("arch", "x86_64") 71 | 72 | if gpu_count > 0: 73 | if arch == "x86_64": 74 | return "AL2_x86_64_GPU" 75 | else: 76 | return "BOTTLEROCKET_ARM_64_NVIDIA" 77 | else: 78 | if arch == "arm64": 79 | return "AL2_ARM_64" 80 | return "AL2_x86_64" 81 | 82 | 83 | def create_vpc_endpoint_for_s3( 84 | vpc_id: str, route_table_ids: Input[Sequence[Input[str]]], region: str 85 | ) -> aws.ec2.VpcEndpoint: 86 | s3_service_name = f"com.amazonaws.{region}.s3" 87 | 88 | vpc_endpoint = aws.ec2.VpcEndpoint( 89 | "s3-vpc-endpoint", 90 | vpc_id=vpc_id, 91 | service_name=s3_service_name, 92 | route_table_ids=route_table_ids, 93 | ) 94 | return vpc_endpoint 95 | -------------------------------------------------------------------------------- /examples/invoice_extraction/serve.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import shutil 4 | import time 5 | from typing import Any, Dict, List 6 | from uuid import uuid4 7 | 8 | from fastapi import FastAPI, File, UploadFile 9 | from langchain.callbacks.base import BaseCallbackHandler 10 | from 
langchain_community.document_loaders import PyPDFLoader 11 | from langchain_core.prompts import PromptTemplate 12 | from output_parser import invoice_parser 13 | from vllm import Vllm 14 | 15 | LLM_URL = "http://llama2-7b-chat" 16 | 17 | port = int(os.getenv("PORT", 8080)) 18 | app = FastAPI( 19 | title="Invoice Extraction Server", 20 | ) 21 | logging.basicConfig( 22 | level=logging.INFO, 23 | format="%(asctime)s [%(levelname)s] %(message)s", 24 | handlers=[logging.StreamHandler()], 25 | ) 26 | 27 | 28 | class CustomHandler(BaseCallbackHandler): 29 | def on_llm_start( 30 | self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any 31 | ) -> Any: 32 | formatted_prompts = "\n".join(prompts) 33 | logging.info(f"Prompt:\n{formatted_prompts}") 34 | 35 | 36 | def extract(pdf_path: str) -> Dict[str, Any]: 37 | pdf_loader = PyPDFLoader(pdf_path) 38 | pages = pdf_loader.load_and_split() 39 | page_content = pages[0].page_content 40 | 41 | logging.info(f"Extracting from PDF: {pdf_path}") 42 | 43 | template = """ 44 | Extract all the following values: invoice number, invoice date, remit to company, remit to address, 45 | tax ID, invoice to customer, invoice to address, total amount from this invoice: {invoice_text} 46 | 47 | {format_instructions} 48 | 49 | Only return the extracted JSON object, don't say anything else. 50 | Don't say "Sure, here is the extracted JSON object based" or anything similar. 51 | """ 52 | 53 | chat_template = f"[INST] <<SYS>><</SYS>>\n\n{template} [/INST]\n" 54 | 55 | prompt = PromptTemplate( 56 | template=chat_template, 57 | input_variables=["invoice_text"], 58 | partial_variables={ 59 | "format_instructions": invoice_parser.get_format_instructions() 60 | }, 61 | ) 62 | 63 | llm = Vllm( 64 | model="llama2-7b-chat", 65 | model_url=LLM_URL, 66 | temperature=0, 67 | streaming=False, 68 | ) 69 | 70 | chain = prompt | llm | invoice_parser 71 | 72 | start_time = time.time() 73 | result = chain.invoke( 74 | {"invoice_text": page_content}, config={"callbacks": [CustomHandler()]} 75 | ) 76 | end_time = time.time() 77 | logging.info(f"Execution time: {end_time - start_time} seconds") 78 | return result.to_dict() 79 | 80 | 81 | @app.post("/extract_invoice") 82 | async def upload_file(file: UploadFile = File(...)) -> Any: 83 | unique_filename = str(uuid4()) 84 | tmp_file_path = f"/tmp/{unique_filename}" 85 | 86 | try: 87 | with open(tmp_file_path, "wb") as buffer: 88 | shutil.copyfileobj(file.file, buffer) 89 | 90 | return extract(tmp_file_path) 91 | finally: 92 | if os.path.exists(tmp_file_path): 93 | os.remove(tmp_file_path) 94 | 95 | 96 | if __name__ == "__main__": 97 | import uvicorn 98 | 99 | uvicorn.run(app, host="localhost", port=port) 100 | -------------------------------------------------------------------------------- /examples/website_rag/README.md: -------------------------------------------------------------------------------- 1 | ## website-rag 2 | This code provides an example of Question Answering (QA) over a website by leveraging LangChain. It begins by scraping the website to gather necessary data. The scraped text is then segmented into chunks. These chunks are transformed into embeddings, a numerical representation of the text data, which are then stored in a vector store, specifically Qdrant. 3 | 4 | These embeddings are used to facilitate Question Answering (QA) with the help of Llama2-7b. This allows for interactive querying of the stored data, providing a robust tool for website QA.
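The snippet below is a minimal sketch of the retrieval half of that flow, reusing this example's own `constants` and `embeddings` modules and the `langchain_documents` collection created at ingest time. Treat it as an illustration rather than the app's actual serving code.

```python
from constants import QDRANT_URL          # this example's Qdrant endpoint
from embeddings import LlamaEmbeddings    # the same embeddings used at ingest time
from langchain_community.vectorstores import Qdrant
from qdrant_client import QdrantClient

client = QdrantClient(url=QDRANT_URL, prefer_grpc=True)
store = Qdrant(
    client=client,
    collection_name="langchain_documents",
    embeddings=LlamaEmbeddings(),
)

# Fetch the chunks most similar to a question; the QA chain feeds these to Llama2-7b.
for doc in store.similarity_search("what is langchain", k=4):
    print(doc.metadata.get("source"), doc.page_content[:80])
```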
5 | 6 | ## Running the example 7 | 8 | To run the example, first install the necessary dependencies: 9 | ```bash 10 | pip install paka 11 | 12 | # Install the AWS CLI and ensure your AWS credentials are correctly configured. 13 | aws configure 14 | ``` 15 | 16 | ### Make sure the Docker daemon is running 17 | ```bash 18 | docker info 19 | ``` 20 | 21 | ### Provisioning the cluster 22 | 23 | ```bash 24 | cd examples/website_rag 25 | 26 | # Provision the cluster and update ~/.kube/config 27 | paka cluster up -f cluster.yaml 28 | ``` 29 | 30 | ### Scrape the website and create embeddings 31 | 32 | 33 | ```bash 34 | # Default BP_BUILDER is "paketobuildpacks/builder-jammy-base". 35 | # Here we use "paketobuildpacks/builder-jammy-full" to install sqlite. 36 | # `ingest` is the entrypoint for the container, which is defined in the Procfile. 37 | BP_BUILDER="paketobuildpacks/builder-jammy-full" paka run --entrypoint ingest --source . 38 | ``` 39 | 40 | The command above will scrape https://python.langchain.com/docs, chunk the text, and create embeddings through LangChain. Embeddings are created by a lightweight BERT model that is managed by a paka model group. The embeddings are then stored in a Qdrant cluster provisioned by paka. 41 | 42 | ### Run the serverless LangServe App 43 | 44 | ```bash 45 | # The command below will build the source and deploy it as a serverless function. 46 | BP_BUILDER="paketobuildpacks/builder-jammy-full" paka function deploy --name langchain-docs --source . --entrypoint serve 47 | 48 | # Or, without building from the source, you can deploy the pre-built image 49 | paka function deploy --name langchain-docs --image website_rag-latest --entrypoint serve 50 | ``` 51 | 52 | Check the statuses of the functions: 53 | ```bash 54 | paka function list 55 | ``` 56 | 57 | If everything is successful, you should see the function in the list with a status of "READY". By default, the function is exposed through a publicly accessible REST API endpoint. 58 | 59 | ### Query the website 60 | 61 | Do a similarity search by hitting the `/invoke` endpoint of the deployed function. 62 | 63 | ```bash 64 | curl -X POST -H "Content-Type: application/json" -d '{"input": "what is langchain"}' http://langchain-docs.default.xxxx.sslip.io/invoke 65 | ``` 66 | 67 | Ask a question by hitting the `/v2/invoke` endpoint of the deployed function. This will use the Llama2-7b model to answer the question. 68 | 69 | NOTE: The request may take a while to respond since, by default, we are asking the model to generate answers based on 4 documents. RetrievalQA cannot stream the response; everything has to be processed before the response is sent back. 70 | 71 | ```bash 72 | curl -X POST -H "Content-Type: application/json" -d '{"input": "what is langchain"}' http://langchain-docs.default.xxxx.sslip.io/v2/invoke 73 | ``` 74 | -------------------------------------------------------------------------------- /docs/faq.md: -------------------------------------------------------------------------------- 1 | # Frequently Asked Questions 2 | 3 | ### How to use Paka in a team? 4 | Paka is designed to handle cluster management in a team setting. To activate this feature, you'll need to establish a shared storage backend. This backend will hold the state data for cluster provisioning. 5 | 6 | A practical choice for this shared storage backend is AWS S3. You can set it up by using the `PULUMI_BACKEND_URL` environment variable. The format for this is `PULUMI_BACKEND_URL=s3://<bucket-name>/<path>`.
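For example, with a pre-created bucket (the names below are illustrative), every team member points Paka at the same backend before running cluster commands:

```bash
# Shared Pulumi state backend; replace the bucket and prefix with your own
export PULUMI_BACKEND_URL=s3://my-paka-state/pulumi
paka cluster up -f cluster.yaml
```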
7 | 8 | It's important to note that Paka uses Pulumi for cluster provisioning, hence the use of the Pulumi backend URL. 9 | 10 | ### How to run functions on dedicated nodes? 11 | To run functions on dedicated nodes, you can use the `function` field in the cluster spec. This field lets you define dedicated node groups (instance types, disk size, spot preference, and instance counts) that functions are scheduled on. 12 | 13 | ```yaml 14 | function: 15 | nodeGroups: 16 | - nodeTypes: ["t3a.large"] 17 | diskSize: 20 18 | isSpot: true 19 | minInstances: 1 20 | maxInstances: 3 21 | ``` 22 | 23 | ### How to run jobs on dedicated nodes? 24 | To run jobs on dedicated nodes, you can use the `job` field in the cluster spec. This field lets you define dedicated node groups that jobs are scheduled on. 25 | 26 | ```yaml 27 | job: 28 | enabled: true 29 | brokerStorageSize: 40Gi 30 | nodeGroups: 31 | - nodeTypes: ["t3a.large"] 32 | diskSize: 20 33 | isSpot: true 34 | minInstances: 1 35 | maxInstances: 3 36 | ``` 37 | 38 | ### How to monitor logs? 39 | For AWS deployments, logs are shipped to AWS CloudWatch. You can view the logs by navigating to the CloudWatch console and selecting the log group for the function you want to monitor. Alternatively, you can use the Stern CLI (https://github.com/stern/stern) to view the logs. 40 | 41 | To view the model logs, you can use the following command: 42 | ```bash 43 | stern --selector app=model-group 44 | ``` 45 | 46 | To view the function logs, you can use the following command: 47 | ```bash 48 | stern "my-app*" 49 | ``` 50 | 51 | ### How to scale the cluster? 52 | For model groups, you can scale the cluster by updating the `maxInstances` field in the cluster spec. This field specifies the maximum number of instances that can be created for the model group. Then set up appropriate auto-scaling triggers. 53 | 54 | Scaling by CPU utilization: 55 | ```yaml 56 | modelGroups: 57 | - name: auto-scale-model 58 | minInstances: 1 59 | maxInstances: 3 60 | ... 61 | autoScaleTriggers: 62 | - type: cpu 63 | metadata: 64 | type: Utilization 65 | value: "50" 66 | ``` 67 | 68 | Scaling by Prometheus metrics: 69 | ```yaml 70 | prometheus: 71 | enabled: true 72 | modelGroups: 73 | - name: auto-scale-model 74 | minInstances: 1 75 | maxInstances: 3 76 | ... 77 | autoScaleTriggers: 78 | - type: prometheus 79 | metadata: 80 | serverAddress: http://kube-prometheus-stack-prometheus.prometheus.svc.cluster.local:9090 # Prometheus endpoint 81 | metricName: latency_p95 82 | threshold: '20000' # Set to 20s, tune as needed 83 | query: | # Trigger scaling if p95 latency exceeds 20s 84 | histogram_quantile(0.95, sum(rate(istio_request_duration_milliseconds_bucket{destination_service="llama2-7b-chat.default.svc.cluster.local"}[5m])) by (le)) 85 | ``` 86 | 87 | For functions, you can adjust the scaling parameters as you deploy the function. 88 | 89 | ```bash 90 | paka function deploy --name my-function --source . --entrypoint serve --min-instances 1 --max-instances 3 --scaling-metric concurrency --metric_target 2 91 | ``` 92 | -------------------------------------------------------------------------------- /docs/quick_start.md: -------------------------------------------------------------------------------- 1 | Since Paka currently only supports AWS, the quick start guide will be tailored to AWS. 2 | 3 | ### Install the necessary dependencies 4 | - Install the Docker daemon and CLI. 5 | - Install the AWS CLI and ensure your AWS credentials are correctly configured. 6 | ```bash 7 | aws configure 8 | ``` 9 | - Install Paka.
10 | ```bash 11 | pip install paka 12 | ``` 13 | 14 | ### Request GPU quota increase 15 | Go to the AWS console and request a quota increase. Beware that there are two types of quotas: On-Demand and Spot. The On-Demand quota is the number of instances that are not preemptible, while the Spot quota is the number of instances that can be preempted. Spot instances are cheaper than On-Demand instances. 16 | 17 | Paka supports mixed instance types, so you can use spot instances for cost savings and on-demand instances as a fallback. 18 | 19 | ### Create a cluster config file 20 | Create a `cluster.yaml` file (the name can be anything). See [cluster.yaml](https://github.com/jjleng/paka/blob/main/examples/invoice_extraction/cluster.yaml) as an example. Refer to the [cluster config](https://github.com/jjleng/paka/blob/main/docs/cluster_config.md) for the fields that can be included in the cluster config file. 21 | 22 | ### Provision the cluster 23 | Provision the cluster with the following command: 24 | ```bash 25 | paka cluster up -f cluster.yaml 26 | ``` 27 | 28 | ### Build an LLM-powered application 29 | Create an application skeleton. See [invoice_extraction](https://github.com/jjleng/paka/tree/main/examples/invoice_extraction) as an example. Ensure the following files are included in your application root directory: 30 | 31 | - **Procfile**: Defines the entrypoint for your application. See [Procfile](https://github.com/jjleng/paka/blob/main/examples/invoice_extraction/Procfile). 32 | - **.cnignore file**: Excludes any files that shouldn't be included in the build. See [.cnignore](https://github.com/jjleng/paka/blob/main/examples/invoice_extraction/.cnignore). 33 | - **runtime.txt**: Pins the version of the runtime your application uses. See [runtime.txt](https://github.com/jjleng/paka/blob/main/examples/invoice_extraction/runtime.txt). 34 | - **requirements.txt or package.json**: Lists all necessary packages for your application. 35 | 36 | 37 | ### Deploy the application 38 | ```bash 39 | paka function deploy --name APP_NAME --source . --entrypoint ENTRYPOINT_NAME 40 | ``` 41 | 42 | APP_NAME is the name of the application. The command above will build the source and deploy it as a serverless function. 43 | `--source` specifies the source directory of the application. 44 | `--entrypoint` specifies the entrypoint of the application, which is defined in the Procfile. 45 | 46 | ### Check the logs 47 | For AWS deployments, logs are shipped to AWS CloudWatch. You can view the logs by navigating to the CloudWatch console and selecting the log group for the function you want to monitor. Alternatively, you can use the Stern CLI (https://github.com/stern/stern) to view the logs. 48 | 49 | To view the model logs, you can use the following command: 50 | ```bash 51 | stern --selector app=model-group 52 | ``` 53 | 54 | To view the function logs, you can use the following command: 55 | ```bash 56 | stern "my-app*" 57 | ``` 58 | 59 | ### Continuous Integration/Deployment 60 | You can set up a CI/CD pipeline to automate the deployment process. For example, you can use GitHub Actions to build and deploy the application on every push to the main branch. To deploy local changes to the cloud, you can simply run the deploy command again. 61 | 62 | ```bash 63 | paka function deploy --name APP_NAME --source .
--entrypoint ENTRYPOINT_NAME 64 | ``` 65 | 66 | ### Tear down the cluster 67 | ```bash 68 | paka cluster down -f cluster.yaml -y 69 | ``` 70 | -------------------------------------------------------------------------------- /paka/cluster/nvidia_device_plugin.py: -------------------------------------------------------------------------------- 1 | import pulumi 2 | import pulumi_kubernetes as k8s 3 | 4 | from paka.cluster.context import Context 5 | from paka.utils import call_once 6 | 7 | 8 | @call_once 9 | def install_nvidia_device_plugin(ctx: Context, version: str = "v0.15.0-rc.2") -> None: 10 | """ 11 | Installs the NVIDIA device plugin for GPU support in the cluster. 12 | 13 | This function deploys the NVIDIA device plugin to the cluster using a DaemonSet. 14 | The device plugin allows Kubernetes to discover and manage GPU resources on the nodes. 15 | 16 | Args: 17 | ctx (Context): The cluster context, whose Kubernetes provider is used to deploy the device plugin. 18 | version (str): The image tag of the NVIDIA device plugin to deploy. 19 | Returns: 20 | None 21 | """ 22 | 23 | k8s.apps.v1.DaemonSet( 24 | "nvidia-device-plugin-daemonset", 25 | metadata=k8s.meta.v1.ObjectMetaArgs( 26 | namespace="kube-system", 27 | ), 28 | spec=k8s.apps.v1.DaemonSetSpecArgs( 29 | selector=k8s.meta.v1.LabelSelectorArgs( 30 | match_labels={ 31 | "name": "nvidia-device-plugin-ds", 32 | }, 33 | ), 34 | update_strategy=k8s.apps.v1.DaemonSetUpdateStrategyArgs( 35 | type="RollingUpdate", 36 | ), 37 | template=k8s.core.v1.PodTemplateSpecArgs( 38 | metadata=k8s.meta.v1.ObjectMetaArgs( 39 | labels={ 40 | "name": "nvidia-device-plugin-ds", 41 | }, 42 | ), 43 | spec=k8s.core.v1.PodSpecArgs( 44 | tolerations=[ 45 | k8s.core.v1.TolerationArgs( 46 | key="nvidia.com/gpu", 47 | operator="Exists", 48 | effect="NoSchedule", 49 | ), 50 | k8s.core.v1.TolerationArgs(operator="Exists"), 51 | ], 52 | priority_class_name="system-node-critical", 53 | containers=[ 54 | k8s.core.v1.ContainerArgs( 55 | image=f"nvcr.io/nvidia/k8s-device-plugin:{version}", 56 | name="nvidia-device-plugin-ctr", 57 | env=[ 58 | k8s.core.v1.EnvVarArgs( 59 | name="FAIL_ON_INIT_ERROR", 60 | value="false", 61 | ) 62 | ], 63 | security_context=k8s.core.v1.SecurityContextArgs( 64 | allow_privilege_escalation=False, 65 | capabilities=k8s.core.v1.CapabilitiesArgs( 66 | drop=["ALL"], 67 | ), 68 | ), 69 | volume_mounts=[ 70 | k8s.core.v1.VolumeMountArgs( 71 | name="device-plugin", 72 | mount_path="/var/lib/kubelet/device-plugins", 73 | ) 74 | ], 75 | ) 76 | ], 77 | volumes=[ 78 | k8s.core.v1.VolumeArgs( 79 | name="device-plugin", 80 | host_path=k8s.core.v1.HostPathVolumeSourceArgs( 81 | path="/var/lib/kubelet/device-plugins", 82 | ), 83 | ) 84 | ], 85 | ), 86 | ), 87 | ), 88 | opts=pulumi.ResourceOptions(provider=ctx.k8s_provider), 89 | ) 90 | -------------------------------------------------------------------------------- /paka/cli/model_group.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | from typing import Optional, Set 5 | 6 | import boto3 7 | import typer 8 | from kubernetes import client 9 | from tabulate import tabulate 10 | 11 | from paka.cli.utils import ( 12 | ensure_cluster_name, 13 | get_cluster_namespace, 14 | load_kubeconfig, 15 | read_pulumi_stack, 16 | ) 17 | from paka.k8s.model_group.service import MODEL_PATH_PREFIX, filter_services 18 | from paka.logger import logger 19 | 20 | model_group_app = typer.Typer() 21 | 22 | 23 | @model_group_app.command() 24 | def list_downloaded_models( 25 | cluster_name: Optional[str] =
typer.Option( 26 | os.getenv("PAKA_CURRENT_CLUSTER"), 27 | "--cluster", 28 | "-c", 29 | help="The name of the cluster.", 30 | ), 31 | ) -> None: 32 | """ 33 | List all models that have been downloaded to the object store. 34 | """ 35 | load_kubeconfig(cluster_name) 36 | cluster_name = ensure_cluster_name(cluster_name) 37 | bucket = read_pulumi_stack(cluster_name, "bucket") 38 | 39 | s3 = boto3.client("s3") 40 | response = s3.list_objects_v2(Bucket=bucket, Prefix=MODEL_PATH_PREFIX) 41 | if "Contents" in response: 42 | unique_models: Set[str] = set() 43 | for obj in response["Contents"]: 44 | key = obj["Key"] 45 | if key.startswith(f"{MODEL_PATH_PREFIX}/"): 46 | key = key[len(f"{MODEL_PATH_PREFIX}/") :] 47 | 48 | key = key.split("/")[0] 49 | unique_models.add(key) 50 | 51 | for key in unique_models: 52 | logger.info(key) 53 | else: 54 | logger.info("No models found.") 55 | 56 | 57 | @model_group_app.command() 58 | def list( 59 | cluster_name: Optional[str] = typer.Option( 60 | os.getenv("PAKA_CURRENT_CLUSTER"), 61 | "--cluster", 62 | "-c", 63 | help="The name of the cluster.", 64 | ), 65 | ) -> None: 66 | """ 67 | List all model groups. 68 | """ 69 | load_kubeconfig(cluster_name) 70 | services = filter_services(get_cluster_namespace(cluster_name)) 71 | 72 | # Get public model groups 73 | public_model_groups = [ 74 | service.spec.selector.get("model", "") 75 | for service in services 76 | if service.spec 77 | and service.spec.selector 78 | and "model" in service.spec.selector 79 | and service.metadata 80 | and service.metadata.labels 81 | and (service.metadata.labels.get("is-public", "false") == "true") 82 | ] 83 | 84 | # Get private model groups 85 | private_model_groups = [ 86 | service.spec.selector.get("model", "") 87 | for service in services 88 | if service.spec 89 | and service.spec.selector 90 | and "model" in service.spec.selector 91 | and service.metadata 92 | and service.metadata.labels 93 | and (service.metadata.labels.get("is-public", "false") == "false") 94 | ] 95 | 96 | model_groups = public_model_groups + private_model_groups 97 | 98 | v1 = client.CoreV1Api() 99 | cfg = v1.read_namespaced_config_map("config-domain", "knative-serving") 100 | cfg_data = cfg.data or {} 101 | filtered_keys = [key for key in cfg_data if key.endswith("sslip.io")] 102 | if not filtered_keys: 103 | if not model_groups: 104 | logger.info("No model groups found.") 105 | else: 106 | logger.info("\n".join(model_groups)) 107 | return 108 | domain = filtered_keys[0] 109 | 110 | table = [(group, f"http://{group}.{domain}") for group in public_model_groups] 111 | table.extend([(group, f"private") for group in private_model_groups]) 112 | logger.info(tabulate(table, headers=["Model Group", "Endpoint"])) 113 | -------------------------------------------------------------------------------- /paka/cli/run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shlex 3 | from typing import Optional 4 | 5 | import typer 6 | from kubernetes import client 7 | 8 | from paka.cli.utils import get_cluster_namespace, load_kubeconfig, resolve_image 9 | from paka.k8s.utils import tail_logs 10 | from paka.logger import logger 11 | from paka.utils import kubify_name, random_str 12 | 13 | CLEANUP_TIMEOUT = 600 # 10 minutes 14 | 15 | run_app = typer.Typer() 16 | 17 | 18 | @run_app.callback(invoke_without_command=True) 19 | def one_off_script( 20 | cluster_name: Optional[str] = typer.Option( 21 | os.getenv("PAKA_CURRENT_CLUSTER"), 22 | "--cluster", 23 | "-c", 24 | help="The 
name of the cluster.", 25 | ), 26 | entrypoint: str = typer.Option( 27 | ..., 28 | "--entrypoint", 29 | help="The entrypoint of the application. This refers to the command " 30 | "defined in the Procfile that will be executed.", 31 | ), 32 | source_dir: Optional[str] = typer.Option( 33 | None, 34 | "--source", 35 | help="The directory containing the source code of the application. If " 36 | "specified, a new Docker image will be built using the source code from " 37 | "this directory. A Dockerfile is not required because the build process " 38 | "uses Cloud Native's Buildpacks, which automatically detect and install " 39 | "dependencies.", 40 | ), 41 | image: Optional[str] = typer.Option( 42 | None, 43 | "--image", 44 | help="The name of the Docker image to deploy. If both an image and a " 45 | "source directory are provided, this image will be used and the source " 46 | "directory will be ignored.", 47 | ), 48 | ) -> None: 49 | """ 50 | Runs a one-off script. 51 | 52 | This command creates a new Kubernetes job that runs the specified entrypoint command 53 | in a container with the specified Docker image. If a source directory is provided, a new 54 | Docker image is built using the source code from that directory. 55 | """ 56 | load_kubeconfig(cluster_name) 57 | resolved_image = resolve_image(cluster_name, image, source_dir) 58 | 59 | # Generate a unique job name using a random suffix 60 | job_name = f"run-{kubify_name(random_str(10))}" 61 | 62 | job = client.V1Job( 63 | api_version="batch/v1", 64 | kind="Job", 65 | metadata=client.V1ObjectMeta(name=job_name, labels={"job-name": job_name}), 66 | spec=client.V1JobSpec( 67 | template=client.V1PodTemplateSpec( 68 | spec=client.V1PodSpec( 69 | containers=[ 70 | client.V1Container( 71 | name="one-off-script", 72 | image=resolved_image, 73 | image_pull_policy="Always", 74 | command=shlex.split(entrypoint), 75 | ) 76 | ], 77 | restart_policy="Never", 78 | ) 79 | ), 80 | backoff_limit=0, 81 | ttl_seconds_after_finished=CLEANUP_TIMEOUT, 82 | ), 83 | ) 84 | 85 | namespace = get_cluster_namespace(cluster_name) 86 | 87 | logger.info("Submitting the task...") 88 | batch_api = client.BatchV1Api() 89 | batch_api.create_namespaced_job(namespace=namespace, body=job) 90 | logger.info("Successfully submitted the task.") 91 | 92 | logger.info("Waiting for the task to complete...") 93 | api = client.CoreV1Api() 94 | pods = api.list_namespaced_pod( 95 | namespace=namespace, label_selector=f"job-name={job_name}" 96 | ) 97 | for pod in pods.items: 98 | if pod.metadata and pod.metadata.name: 99 | tail_logs(namespace, pod.metadata.name, "one-off-script") 100 | -------------------------------------------------------------------------------- /paka/cluster/manager/base.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC, abstractmethod 4 | from functools import cached_property 5 | from typing import Any 6 | 7 | from pulumi import automation as auto 8 | 9 | from paka.cluster.context import Context 10 | from paka.cluster.pulumi import ensure_pulumi 11 | from paka.config import CloudConfig, Config 12 | from paka.constants import PULUMI_STACK_NAME 13 | from paka.k8s.model_group.service import ( 14 | cleanup_staled_model_group_services, 15 | create_model_group_service, 16 | ) 17 | from paka.k8s.model_group.service_v1 import ( 18 | create_model_group_service as create_model_group_service_v1, 19 | ) 20 | from paka.logger import logger 21 | 22 | STACK_NAME = "default" 23
| 24 | 25 | class ClusterManager(ABC): 26 | """ 27 | Abstract base class for a cluster manager. 28 | 29 | A ClusterManager is responsible for managing a cluster of compute resources. 30 | 31 | Subclasses must implement the abstract methods defined in this class. 32 | """ 33 | 34 | config: Config 35 | cloud_config: CloudConfig 36 | 37 | def __init__(self, config: Config) -> None: 38 | self.config = config 39 | if not config.aws is None: 40 | self.cloud_config = config.aws 41 | self.ctx = Context() 42 | self.ctx.set_config(config) 43 | 44 | @abstractmethod 45 | def provision_k8s(self) -> None: 46 | pass 47 | 48 | def _stack_for_program(self, program: auto.PulumiFn) -> auto.Stack: 49 | return auto.create_or_select_stack( 50 | stack_name=PULUMI_STACK_NAME, 51 | project_name=self.cloud_config.cluster.name, 52 | program=program, 53 | ) 54 | 55 | @cached_property 56 | def _stack(self) -> auto.Stack: 57 | ensure_pulumi() 58 | 59 | def program() -> None: 60 | self.provision_k8s() 61 | 62 | return self._stack_for_program(program) 63 | 64 | def create(self) -> None: 65 | if self.config.aws is None: 66 | raise ValueError("Only AWS is supported.") 67 | 68 | if self.config.aws: 69 | self._stack.set_config( 70 | "aws:region", auto.ConfigValue(value=self.cloud_config.cluster.region) 71 | ) 72 | 73 | logger.info("Creating resources...") 74 | self._stack.up(on_output=logger.info) 75 | 76 | if ( 77 | self.cloud_config.modelGroups is None 78 | and self.cloud_config.mixedModelGroups is None 79 | ): 80 | return 81 | 82 | namespace = self.cloud_config.cluster.namespace 83 | 84 | # Clean up staled model group resources before creating new ones 85 | model_group_names = [mg.name for mg in self.config.aws.modelGroups or []] 86 | mixed_model_group_names = [ 87 | mg.name for mg in self.config.aws.mixedModelGroups or [] 88 | ] 89 | all_group_names = model_group_names + mixed_model_group_names 90 | 91 | cleanup_staled_model_group_services(namespace, all_group_names) 92 | # TODO: We should clean up deployment as well 93 | 94 | for model_group in self.cloud_config.modelGroups or []: 95 | create_model_group_service(self.ctx, namespace, model_group) 96 | 97 | for mixed_model_group in self.cloud_config.mixedModelGroups or []: 98 | create_model_group_service_v1(self.ctx, namespace, mixed_model_group) 99 | 100 | def destroy(self) -> Any: 101 | logger.info("Destroying resources...") 102 | return self._stack.destroy(on_output=logger.info) 103 | 104 | def refresh(self) -> None: 105 | logger.info("Refreshing the stack...") 106 | self._stack.refresh(on_output=logger.info) 107 | 108 | def preview(self, *args: Any, **kwargs: Any) -> None: 109 | if not "on_output" in kwargs: 110 | kwargs["on_output"] = logger.info 111 | self._stack.preview(*args, **kwargs) 112 | -------------------------------------------------------------------------------- /tests/k8s/test_utils.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock, patch 2 | 3 | from kubernetes.client.exceptions import ApiException 4 | 5 | import paka.k8s.utils 6 | from paka.k8s.utils import KubeconfigMerger, KubernetesResource, apply_resource 7 | 8 | 9 | def test_apply_resource() -> None: 10 | resource = MagicMock(spec=KubernetesResource) 11 | resource.kind = "Deployment" 12 | resource.metadata = MagicMock() 13 | resource.metadata.name = "test" 14 | resource.metadata.namespace = "default" 15 | 16 | with patch("kubernetes.client.AppsV1Api") as mock_api_class: 17 | mock_api = mock_api_class.return_value 18 | 
mock_api.create_namespaced_deployment = MagicMock() 19 | mock_api.replace_namespaced_deployment = MagicMock() 20 | mock_api.read_namespaced_deployment = MagicMock( 21 | side_effect=ApiException(status=404) 22 | ) 23 | 24 | apply_resource(resource) 25 | 26 | mock_api.create_namespaced_deployment.assert_called_once_with( 27 | resource.metadata.namespace, resource 28 | ) 29 | 30 | 31 | def test_apply_resource_existing() -> None: 32 | resource = MagicMock(spec=KubernetesResource) 33 | resource.kind = "Deployment" 34 | resource.metadata = MagicMock() 35 | resource.metadata.name = "test" 36 | resource.metadata.namespace = "default" 37 | 38 | with patch("kubernetes.client.AppsV1Api") as mock_api_class: 39 | mock_api = mock_api_class.return_value 40 | mock_api.create_namespaced_deployment = MagicMock() 41 | mock_api.replace_namespaced_deployment = MagicMock() 42 | mock_api.read_namespaced_deployment = MagicMock() 43 | 44 | apply_resource(resource) 45 | 46 | mock_api.replace_namespaced_deployment.assert_called_once_with( 47 | resource.metadata.name, resource.metadata.namespace, resource 48 | ) 49 | 50 | 51 | def test_apply_resource_scaled_object() -> None: 52 | resource = MagicMock() 53 | resource.kind = "ScaledObject" 54 | resource.metadata = MagicMock() 55 | resource.metadata.name = "test" 56 | resource.metadata.namespace = "default" 57 | 58 | with patch.object( 59 | paka.k8s.utils, "create_namespaced_custom_object" 60 | ) as mock_create, patch.object( 61 | paka.k8s.utils, "read_namespaced_custom_object" 62 | ) as mock_read: 63 | mock_read.side_effect = ApiException(status=404) 64 | 65 | apply_resource(resource) 66 | 67 | mock_create.assert_called_once_with(resource.metadata.namespace, resource) 68 | 69 | 70 | def test_kubeconfig_merger() -> None: 71 | # Initialize a KubeconfigMerger object with some initial config 72 | merger = KubeconfigMerger( 73 | { 74 | "clusters": [{"name": "cluster1", "data": "data1"}], 75 | "users": [{"name": "user1", "data": "data1"}], 76 | "contexts": [{"name": "context1", "data": "data1"}], 77 | "current-context": "context1", 78 | "other-key": "other-value", 79 | } 80 | ) 81 | 82 | # Define a new config to be merged 83 | new_config = { 84 | "clusters": [{"name": "cluster2", "data": "data2"}], 85 | "users": [{"name": "user2", "data": "data2"}], 86 | "contexts": [{"name": "context2", "data": "data2"}], 87 | "current-context": "context2", 88 | "other-key": "other-value2", 89 | } 90 | 91 | merger.merge(new_config) 92 | 93 | assert merger.config == { 94 | "clusters": [ 95 | {"name": "cluster1", "data": "data1"}, 96 | {"name": "cluster2", "data": "data2"}, 97 | ], 98 | "users": [ 99 | {"name": "user1", "data": "data1"}, 100 | {"name": "user2", "data": "data2"}, 101 | ], 102 | "contexts": [ 103 | {"name": "context1", "data": "data1"}, 104 | {"name": "context2", "data": "data2"}, 105 | ], 106 | "current-context": "context2", 107 | "other-key": "other-value2", 108 | } 109 | -------------------------------------------------------------------------------- /paka/cluster/pulumi.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import shutil 4 | import tarfile 5 | import zipfile 6 | from pathlib import Path 7 | 8 | import requests 9 | 10 | from paka.cluster.kubectl import ensure_kubectl 11 | from paka.logger import logger 12 | from paka.utils import calculate_sha256, download_url, get_project_data_dir 13 | 14 | # Pin the Pulumi version to avoid breaking changes 15 | PULUMI_VERSION = "v3.114.0" 16 | 17 | 18 | 
def change_permissions_recursive(path: Path, mode: int) -> None: 19 | for child in path.iterdir(): 20 | if child.is_file(): 21 | child.chmod(mode) 22 | elif child.is_dir(): 23 | child.chmod(mode) 24 | change_permissions_recursive(child, mode) 25 | 26 | 27 | def ensure_pulumi() -> None: 28 | # The Pulumi Kubernetes provider requires kubectl to be installed 29 | ensure_kubectl() 30 | paka_home = Path(get_project_data_dir()) 31 | 32 | bin_dir = paka_home / "bin" 33 | bin_dir.mkdir(parents=True, exist_ok=True) 34 | 35 | system = platform.system().lower() 36 | arch = platform.machine().lower() 37 | 38 | current_path = os.environ.get("PATH", "") 39 | 40 | pulumi_files = list(bin_dir.glob("pulumi-*")) 41 | if pulumi_files: 42 | os.environ["PATH"] = f"{pulumi_files[0]}{os.pathsep}{current_path}" 43 | return 44 | 45 | pulumi_version = PULUMI_VERSION 46 | 47 | new_pulumi_path = bin_dir / f"pulumi-{pulumi_version}" 48 | 49 | if arch in ["amd64", "x86_64"]: 50 | arch = "x64" 51 | elif arch == "arm64": 52 | arch = "arm64" 53 | else: 54 | raise Exception(f"Unsupported architecture: {arch}") 55 | 56 | pulumi_file = f"pulumi-{pulumi_version}-{system}-{arch}" 57 | 58 | if system == "windows": 59 | pulumi_file = f"{pulumi_file}.zip" 60 | else: 61 | pulumi_file = f"{pulumi_file}.tar.gz" 62 | 63 | # First, download the checksum file 64 | checksum_url = f"https://github.com/pulumi/pulumi/releases/download/{pulumi_version}/pulumi-{pulumi_version[1:]}-checksums.txt" 65 | 66 | response = requests.get(checksum_url) 67 | response.raise_for_status() 68 | file_sha256_dict = {} 69 | # Iterate over the lines in the checksum file, splitting each into sha256 and filename 70 | for line in response.text.strip().split("\n"): 71 | expected_sha256, filename = line.strip().split() 72 | file_sha256_dict[filename] = expected_sha256 73 | 74 | url = f"https://github.com/pulumi/pulumi/releases/download/{pulumi_version}/{pulumi_file}" 75 | 76 | logger.info(f"Downloading {pulumi_file}...") 77 | 78 | with download_url(url) as archive_file: 79 | archive_file_sha256 = calculate_sha256(archive_file) 80 | 81 | if pulumi_file not in file_sha256_dict: 82 | raise Exception(f"SHA256 not found for {pulumi_file}") 83 | 84 | expected_sha256 = file_sha256_dict[pulumi_file] 85 | 86 | if archive_file_sha256 != expected_sha256: 87 | raise Exception( 88 | f"SHA256 mismatch: {archive_file_sha256} != {expected_sha256}" 89 | ) 90 | 91 | if system == "windows": 92 | with zipfile.ZipFile(archive_file, "r") as zip_ref: 93 | zip_ref.extractall(bin_dir) 94 | else: 95 | with tarfile.open(archive_file, "r:gz") as tar: 96 | tar.extractall(bin_dir) 97 | 98 | pulumi_path = bin_dir / "pulumi" 99 | change_permissions_recursive(pulumi_path, 0o755) 100 | pulumi_path = pulumi_path.rename(new_pulumi_path) 101 | 102 | # On Windows, the Pulumi binary is under pulumi_path/bin. 103 | # On other platforms, it is directly under pulumi_path. 104 | if system == "windows": 105 | windows_bin_path = pulumi_path / "bin" 106 | for file in windows_bin_path.iterdir(): 107 | if file.is_file(): 108 | shutil.move(str(file), str(pulumi_path)) 109 | 110 | logger.info("Pulumi installed successfully.") 111 | 112 | os.environ["PATH"] = f"{pulumi_path}{os.pathsep}{current_path}" 113 | -------------------------------------------------------------------------------- /paka/container/ecr.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import subprocess 3 | 4 | import boto3 5 | 6 | from paka.logger import logger 7 | from paka.utils
import random_str 8 | 9 | 10 | def authenticate_docker_to_ecr(aws_region: str) -> str: 11 | try: 12 | ecr_client = boto3.client("ecr", region_name=aws_region) 13 | token = ecr_client.get_authorization_token() 14 | username, password = ( 15 | base64.b64decode(token["authorizationData"][0]["authorizationToken"]) 16 | .decode("utf-8") 17 | .split(":") 18 | ) 19 | ecr_url = token["authorizationData"][0]["proxyEndpoint"] 20 | 21 | p = subprocess.Popen( 22 | ["docker", "login", "-u", username, "--password-stdin", ecr_url], 23 | stdin=subprocess.PIPE, 24 | stdout=subprocess.PIPE, 25 | stderr=subprocess.PIPE, 26 | ) 27 | stdout, stderr = p.communicate(input=password.encode()) 28 | if p.returncode != 0: 29 | raise Exception(f"Docker login failed: {stderr.decode()}") 30 | 31 | return ecr_url 32 | except Exception as e: 33 | logger.error(f"An error occurred: {e}") 34 | raise 35 | 36 | 37 | def push_to_ecr( 38 | local_image_name: str, repository_uri: str, aws_region: str, app_name: str 39 | ) -> str: 40 | """ 41 | Pushes a Docker image to an Amazon ECR repository. 42 | 43 | This function tags the Docker image with a version tag and the "latest" tag, 44 | logs in to the ECR repository, and pushes the image to the repository. 45 | The version tag is generated randomly. 46 | 47 | All applications share the same container registry repository. 48 | To differentiate between them, we append the application name to the image tag. 49 | The '-latest' suffix is added to handle cases where applications themselves are tagged. 50 | This ensures that even tagged applications have a unique identifier in the shared repository. 51 | 52 | Args: 53 | local_image_name (str): The name of the Docker image to push. 54 | repository_uri (str): The URI of the ECR repository to push the image to. 55 | aws_region (str): The AWS region where the ECR repository is located. 56 | app_name (str): The name of the application. Used to generate the image tags. 57 | 58 | Raises: 59 | subprocess.CalledProcessError: If an error occurs while executing a subprocess command. 60 | 61 | Returns: 62 | str: The version tag of the image that was pushed.
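Example (illustrative; the repository URI and region below are placeholders): push_to_ecr("my-app", "<account-id>.dkr.ecr.us-west-2.amazonaws.com/my-cluster", "us-west-2", "my-app")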
63 |     """ 64 |     try: 65 |         # Generate a random version number 66 |         version = random_str() 67 | 68 |         # Tag the image with the repository URI and the version tag 69 |         version_tag = f"{app_name}-v{version}" 70 | 71 |         local_image_tagged = ( 72 |             f"{local_image_name}:latest" 73 |             if ":" not in local_image_name 74 |             else local_image_name 75 |         ) 76 | 77 |         # Apply the version tag 78 |         subprocess.run( 79 |             [ 80 |                 "docker", 81 |                 "tag", 82 |                 local_image_tagged, 83 |                 f"{repository_uri}:{version_tag}", 84 |             ], 85 |             check=True, 86 |         ) 87 | 88 |         # Tag the image with the repository URI and the "latest" tag 89 |         latest_tag = f"{app_name}-latest" 90 |         subprocess.run( 91 |             [ 92 |                 "docker", 93 |                 "tag", 94 |                 local_image_tagged, 95 |                 f"{repository_uri}:{latest_tag}", 96 |             ], 97 |             check=True, 98 |         ) 99 | 100 |         # Authenticate Docker to the ECR 101 |         authenticate_docker_to_ecr(aws_region) 102 | 103 |         # Push the image to the ECR repository 104 |         subprocess.run( 105 |             ["docker", "push", f"{repository_uri}:{version_tag}"], check=True 106 |         ) 107 |         subprocess.run(["docker", "push", f"{repository_uri}:{latest_tag}"], check=True) 108 | 109 |         logger.info(f"Successfully pushed {local_image_name} to {repository_uri}") 110 |         return version_tag 111 |     except subprocess.CalledProcessError as e: 112 |         logger.error(f"An error occurred: {e}") 113 |         raise 114 | -------------------------------------------------------------------------------- /paka/cluster/aws/cluster_autoscaler.py: -------------------------------------------------------------------------------- 1 | import pulumi 2 | import pulumi_aws as aws 3 | import pulumi_eks as eks 4 | import pulumi_kubernetes.helm.v3 as helm 5 | from pulumi_kubernetes.core.v1 import ConfigMap 6 | 7 | from paka.cluster.aws.utils import odic_role_for_sa 8 | from paka.cluster.context import Context 9 | from paka.utils import call_once, to_yaml 10 | 11 | 12 | def create_priority_expander(ctx: Context) -> ConfigMap: 13 |     # Create a priority expander to ensure that the cluster autoscaler provisions spot instances first. 14 |     priority_data = {10: [".*spot.*"], 1: [".*"]} 15 |     return ConfigMap( 16 |         "cluster-autoscaler-priority-expander", 17 |         metadata={ 18 |             "name": "cluster-autoscaler-priority-expander", 19 |             "namespace": "kube-system", 20 |         }, 21 |         data={ 22 |             "priorities": to_yaml(priority_data), 23 |         }, 24 |         opts=pulumi.ResourceOptions(provider=ctx.k8s_provider), 25 |     ) 26 | 27 | 28 | @call_once 29 | def create_cluster_autoscaler( 30 |     ctx: Context, 31 |     cluster: eks.Cluster, 32 | ) -> None: 33 |     """ 34 |     Sets up the cluster autoscaler for an EKS cluster. 35 | 36 |     Args: 37 |         ctx (Context): The context, which provides the cluster name, region, 38 |             and the Kubernetes provider. 39 |         cluster (eks.Cluster): The EKS cluster.
40 | 41 | Returns: 42 | None 43 | """ 44 | cluster_name = ctx.cluster_name 45 | 46 | autoscaler_policy_doc = aws.iam.get_policy_document( 47 | statements=[ 48 | aws.iam.GetPolicyDocumentStatementArgs( 49 | actions=[ 50 | "autoscaling:DescribeAutoScalingGroups", 51 | "autoscaling:DescribeAutoScalingInstances", 52 | "autoscaling:DescribeLaunchConfigurations", 53 | "autoscaling:DescribeTags", 54 | "autoscaling:SetDesiredCapacity", 55 | "autoscaling:TerminateInstanceInAutoScalingGroup", 56 | "ec2:DescribeLaunchTemplateVersions", 57 | "eks:DescribeNodegroup", 58 | "ec2:GetInstanceTypesFromInstanceRequirements", 59 | "ec2:DescribeImages", 60 | ], 61 | resources=["*"], 62 | ) 63 | ] 64 | ) 65 | 66 | autoscaler_policy = aws.iam.Policy( 67 | f"{cluster_name}-autoscaler-policy", policy=autoscaler_policy_doc.json 68 | ) 69 | 70 | # The OIDC provider is required because the cluster autoscaler runs within the Kubernetes 71 | # cluster and needs to interact with the AWS API to manage the Auto Scaling Groups (ASGs). 72 | # OIDC provides a secure mechanism for the cluster autoscaler to authenticate with the AWS API. 73 | autoscaler_role = odic_role_for_sa( 74 | ctx, cluster, "autoscaler", "kube-system:cluster-autoscaler" 75 | ) 76 | 77 | aws.iam.RolePolicyAttachment( 78 | f"{cluster_name}-autoscaler-role-policy-attachment", 79 | policy_arn=autoscaler_policy.arn, 80 | role=autoscaler_role.name, 81 | ) 82 | 83 | expander = create_priority_expander(ctx) 84 | 85 | helm.Chart( 86 | "cluster-autoscaler", 87 | helm.ChartOpts( 88 | chart="cluster-autoscaler", 89 | version="9.34.0", 90 | namespace="kube-system", 91 | fetch_opts=helm.FetchOpts(repo="https://kubernetes.github.io/autoscaler"), 92 | values={ 93 | "autoDiscovery": {"clusterName": cluster.eks_cluster.name}, 94 | "awsRegion": ctx.region, 95 | "rbac": { 96 | "create": True, 97 | "serviceAccount": { 98 | "create": True, 99 | "name": "cluster-autoscaler", 100 | "annotations": { 101 | "eks.amazonaws.com/role-arn": autoscaler_role.arn 102 | }, 103 | }, 104 | }, 105 | "serviceMonitor": {"interval": "2s"}, 106 | "image": {"tag": "v1.28.2"}, 107 | "extraArgs": { 108 | "expander": "priority,random", # Use priority expander if possible 109 | }, 110 | }, 111 | ), 112 | opts=pulumi.ResourceOptions(provider=ctx.k8s_provider, depends_on=[expander]), 113 | ) 114 | -------------------------------------------------------------------------------- /paka/cluster/qdrant.py: -------------------------------------------------------------------------------- 1 | import pulumi 2 | import pulumi_kubernetes as k8s 3 | from pulumi_kubernetes.helm.v3 import Chart, ChartOpts, FetchOpts 4 | 5 | from paka.cluster.context import Context 6 | from paka.utils import call_once 7 | 8 | 9 | @call_once 10 | def create_qdrant(ctx: Context) -> None: 11 | """ 12 | Installs the qdrant helm chart. 
13 |     """ 14 |     config = ctx.cloud_config 15 | 16 |     if not config.vectorStore: 17 |         return 18 | 19 |     ns = k8s.core.v1.Namespace( 20 |         "qdrant", 21 |         metadata={"name": "qdrant", "labels": {"istio-injection": "enabled"}}, 22 |         opts=pulumi.ResourceOptions(provider=ctx.k8s_provider), 23 |     ) 24 | 25 |     resource_request = ( 26 |         { 27 |             "resources": { 28 |                 "requests": { 29 |                     "cpu": config.vectorStore.resourceRequest.cpu, 30 |                     "memory": config.vectorStore.resourceRequest.memory, 31 |                 }, 32 |             } 33 |         } 34 |         if config.vectorStore.resourceRequest 35 |         else {} 36 |     ) 37 | 38 |     Chart( 39 |         "qdrant", 40 |         ChartOpts( 41 |             chart="qdrant", 42 |             version="0.7.5", 43 |             namespace="qdrant", 44 |             fetch_opts=FetchOpts(repo="https://qdrant.github.io/qdrant-helm"), 45 |             values={ 46 |                 "metrics": { 47 |                     "serviceMonitor": { 48 |                         "enabled": ( 49 |                             True 50 |                             if config.prometheus and config.prometheus.enabled 51 |                             else False 52 |                         ), 53 |                     }, 54 |                 }, 55 |                 "replicaCount": config.vectorStore.replicas, 56 |                 "persistence": { 57 |                     "size": config.vectorStore.storageSize, 58 |                 }, 59 |                 "livenessProbe": { 60 |                     "enabled": True, 61 |                 }, 62 |                 "tolerations": [ 63 |                     { 64 |                         "key": "app", 65 |                         "operator": "Equal", 66 |                         "value": "qdrant", 67 |                         "effect": "NoSchedule", 68 |                     } 69 |                 ], 70 |                 "affinity": { 71 |                     "nodeAffinity": { 72 |                         "requiredDuringSchedulingIgnoredDuringExecution": { 73 |                             "nodeSelectorTerms": [ 74 |                                 { 75 |                                     "matchExpressions": [ 76 |                                         { 77 |                                             "key": "app", 78 |                                             "operator": "In", 79 |                                             "values": ["qdrant"], 80 |                                         } 81 |                                     ] 82 |                                 } 83 |                             ] 84 |                         } 85 |                     }, 86 |                     "podAntiAffinity": { 87 |                         "requiredDuringSchedulingIgnoredDuringExecution": [ 88 |                             { 89 |                                 "labelSelector": { 90 |                                     "matchExpressions": [ 91 |                                         { 92 |                                             "key": "app", 93 |                                             "operator": "In", 94 |                                             "values": ["qdrant"], 95 |                                         } 96 |                                     ] 97 |                                 }, 98 |                                 "topologyKey": "kubernetes.io/hostname", 99 |                             } 100 |                         ] 101 |                     }, 102 |                 }, 103 |                 "topologySpreadConstraints": [ 104 |                     { 105 |                         "maxSkew": 1, 106 |                         "topologyKey": "topology.kubernetes.io/zone", 107 |                         "whenUnsatisfiable": "ScheduleAnyway", 108 |                         "labelSelector": {"matchLabels": {"app": "qdrant"}}, 109 |                     } 110 |                 ], 111 |                 **resource_request, 112 |             }, 113 |         ), 114 |         opts=pulumi.ResourceOptions(provider=ctx.k8s_provider, depends_on=[ns]), 115 |     ) 116 | -------------------------------------------------------------------------------- /paka/cluster/aws/service_account.py: -------------------------------------------------------------------------------- 1 | import pulumi 2 | import pulumi_aws as aws 3 | import pulumi_eks as eks 4 | import pulumi_kubernetes as k8s 5 | 6 | from paka.cluster.aws.utils import odic_role_for_sa 7 | from paka.cluster.context import Context 8 | from paka.constants import ACCESS_ALL_SA 9 | from paka.utils import call_once 10 | 11 | 12 | @call_once 13 | def create_service_accounts( 14 |     ctx: Context, 15 |     cluster: eks.Cluster, 16 | ) -> None: 17 |     """ 18 |     Creates service accounts with necessary IAM roles and policies. 19 | 20 |     This function creates three IAM policies: one for S3 access, one for ECR access, and one for CloudWatch logs. 21 |     It then creates an IAM role for the service account and attaches these policies to the role. 22 |     Finally, it creates a Kubernetes service account and annotates it with the ARN of the IAM role. 23 | 24 |     The S3 policy allows the service account to get objects and list the bucket. 25 |     The ECR policy allows the service account to pull and inspect ECR images, and the CloudWatch policy allows it to create log groups and streams and put log events. 26 | 27 |     Args: 28 |         ctx (Context): The context, which provides the cluster name, namespace, and bucket.
29 |         cluster (eks.Cluster): The EKS cluster to create the service 30 |             accounts in. 31 | 32 |     Returns: 33 |         None 34 |     """ 35 |     cluster_name = ctx.cluster_name 36 |     bucket = ctx.bucket 37 | 38 |     s3_policy = aws.iam.Policy( 39 |         f"{cluster_name}-s3-access-policy", 40 |         policy=aws.iam.get_policy_document( 41 |             statements=[ 42 |                 aws.iam.GetPolicyDocumentStatementArgs( 43 |                     effect="Allow", 44 |                     actions=["s3:GetObject", "s3:ListBucket"], 45 |                     resources=[ 46 |                         f"arn:aws:s3:::{bucket}/*", 47 |                         f"arn:aws:s3:::{bucket}", 48 |                     ], 49 |                 ) 50 |             ] 51 |         ).json, 52 |     ) 53 | 54 |     ecr_policy = aws.iam.Policy( 55 |         f"{cluster_name}-ecr-access-policy", 56 |         policy=aws.iam.get_policy_document( 57 |             statements=[ 58 |                 aws.iam.GetPolicyDocumentStatementArgs( 59 |                     effect="Allow", 60 |                     actions=[ 61 |                         "ecr:GetDownloadUrlForLayer", 62 |                         "ecr:BatchGetImage", 63 |                         "ecr:BatchCheckLayerAvailability", 64 |                         "ecr:ListImages", 65 |                         "ecr:DescribeImages", 66 |                     ], 67 |                     resources=["*"], 68 |                 ) 69 |             ] 70 |         ).json, 71 |     ) 72 | 73 |     cloudwatch_policy = aws.iam.Policy( 74 |         "cloudwatch-policy", 75 |         policy=aws.iam.get_policy_document( 76 |             statements=[ 77 |                 aws.iam.GetPolicyDocumentStatementArgs( 78 |                     effect="Allow", 79 |                     actions=[ 80 |                         "logs:CreateLogGroup", 81 |                         "logs:CreateLogStream", 82 |                         "logs:PutLogEvents", 83 |                         "logs:DescribeLogStreams", 84 |                     ], 85 |                     resources=["arn:aws:logs:*:*:*"], 86 |                 ) 87 |             ] 88 |         ).json, 89 |     ) 90 | 91 |     namespace = ctx.namespace 92 |     sa_role = odic_role_for_sa( 93 |         ctx, 94 |         cluster, 95 |         "sa", 96 |         f"{namespace}:{ACCESS_ALL_SA}", 97 |     ) 98 | 99 |     aws.iam.RolePolicyAttachment( 100 |         f"{cluster_name}-sa-s3-role-policy-attachment", 101 |         role=sa_role.name, 102 |         policy_arn=s3_policy.arn, 103 |     ) 104 | 105 |     aws.iam.RolePolicyAttachment( 106 |         f"{cluster_name}-sa-ecr-role-policy-attachment", 107 |         role=sa_role.name, 108 |         policy_arn=ecr_policy.arn, 109 |     ) 110 | 111 |     aws.iam.RolePolicyAttachment( 112 |         f"{cluster_name}-sa-cloudwatch-role-policy-attachment", 113 |         role=sa_role.name, 114 |         policy_arn=cloudwatch_policy.arn, 115 |     ) 116 | 117 |     k8s.core.v1.ServiceAccount( 118 |         f"{cluster_name}-service-account", 119 |         metadata={ 120 |             "namespace": ctx.namespace, 121 |             "name": ACCESS_ALL_SA, 122 |             "annotations": {"eks.amazonaws.com/role-arn": sa_role.arn}, 123 |         }, 124 |         opts=pulumi.ResourceOptions(provider=ctx.k8s_provider), 125 |     ) 126 | -------------------------------------------------------------------------------- /paka/cluster/context.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Optional 4 | 5 | import fasteners 6 | import pulumi_kubernetes as k8s 7 | 8 | from paka.config import CloudConfig, Config 9 | 10 | 11 | class Context: 12 |     _k8s_provider: Optional[k8s.Provider] 13 |     _config: Optional[Config] 14 |     # Materialized bucket with a unique name 15 |     _bucket: Optional[str] 16 |     # Materialized container registry url 17 |     _registry: Optional[str] 18 |     # The kubeconfig as a string 19 |     _kubeconfig: Optional[str] 20 | 21 |     # Access to the fields above is guarded by the per-field reader/writer locks created in __init__ 22 |     _should_save_kubeconfig: bool = False 23 | 24 |     def __init__(self) -> None: 25 |         # Ideally, these locks would be created dynamically in __getattr__. 26 |         # However, __getattr__ is not thread safe either, and guarding lock creation with yet 27 |         # another lock would make that lock a bottleneck. Therefore, we pre-create the locks.
28 | # Multiple locks pose a risk of deadlock. We need to be careful when acquiring multiple locks. 29 | self._k8s_provider_lock = fasteners.ReaderWriterLock() 30 | self._config_lock = fasteners.ReaderWriterLock() 31 | self._bucket_lock = fasteners.ReaderWriterLock() 32 | self._registry_lock = fasteners.ReaderWriterLock() 33 | self._kubeconfig_lock = fasteners.ReaderWriterLock() 34 | 35 | @fasteners.write_locked(lock="_k8s_provider_lock") 36 | def set_k8s_provider(self, k8s_provider: k8s.Provider) -> None: 37 | self._k8s_provider = k8s_provider 38 | 39 | @property 40 | @fasteners.read_locked(lock="_k8s_provider_lock") 41 | def k8s_provider(self) -> Optional[k8s.Provider]: 42 | return self._k8s_provider 43 | 44 | @fasteners.write_locked(lock="_config_lock") 45 | def set_config(self, config: Config) -> None: 46 | self._config = config 47 | 48 | @property 49 | @fasteners.read_locked(lock="_config_lock") 50 | def config(self) -> Optional[Config]: 51 | return self._config 52 | 53 | @property 54 | @fasteners.read_locked(lock="_config_lock") 55 | def cloud_config(self) -> Optional[CloudConfig]: 56 | if self._config is None: 57 | raise RuntimeError("Config is not set.") 58 | if self._config.aws is None: 59 | raise RuntimeError("Only AWS is supported.") 60 | 61 | return self._config.aws 62 | 63 | @property 64 | @fasteners.read_locked(lock="_config_lock") 65 | def region(self) -> Optional[str]: 66 | # fasteners's inter thread reader lock is reentrant. We can call other methods that acquire the same lock. 67 | # https://github.com/harlowja/fasteners/blob/06c3f06cab4e135b8d921932019a231c180eb9f4/docs/guide/inter_thread.md#lack-of-features 68 | return self.cloud_config.cluster.region 69 | 70 | @property 71 | @fasteners.read_locked(lock="_config_lock") 72 | def namespace(self) -> Optional[str]: 73 | # reentrant 74 | return self.cloud_config.cluster.namespace 75 | 76 | @property 77 | @fasteners.read_locked(lock="_config_lock") 78 | def provider(self) -> str: 79 | # reentrant 80 | _ = self.cloud_config 81 | return "aws" 82 | 83 | @property 84 | @fasteners.read_locked(lock="_config_lock") 85 | def cluster_name(self) -> str: 86 | # reentrant 87 | return self.cloud_config.cluster.name 88 | 89 | @fasteners.write_locked(lock="_bucket_lock") 90 | def set_bucket(self, bucket: str) -> None: 91 | self._bucket = bucket 92 | 93 | @property 94 | @fasteners.read_locked(lock="_bucket_lock") 95 | def bucket(self) -> Optional[str]: 96 | return self._bucket 97 | 98 | @fasteners.write_locked(lock="_registry_lock") 99 | def set_registry(self, registry: str) -> None: 100 | self._registry = registry 101 | 102 | @property 103 | @fasteners.read_locked(lock="_registry_lock") 104 | def registry(self) -> Optional[str]: 105 | return self._registry 106 | 107 | @fasteners.write_locked(lock="_kubeconfig_lock") 108 | def set_kubeconfig(self, kubeconfig: str) -> None: 109 | self._kubeconfig = kubeconfig 110 | 111 | @property 112 | @fasteners.read_locked(lock="_kubeconfig_lock") 113 | def kubeconfig(self) -> Optional[str]: 114 | return self._kubeconfig 115 | 116 | def set_should_save_kubeconfig(self, should_save_kubeconfig: bool) -> None: 117 | self._should_save_kubeconfig = should_save_kubeconfig 118 | 119 | @property 120 | def should_save_kubeconfig(self) -> bool: 121 | return self._should_save_kubeconfig 122 | -------------------------------------------------------------------------------- /paka/cluster/prometheus.py: -------------------------------------------------------------------------------- 1 | from __future__ import 
annotations 2 | 3 | from typing import Any, Callable, Dict, Optional, Tuple 4 | 5 | import pulumi 6 | import pulumi_kubernetes as k8s 7 | from pulumi_kubernetes.helm.v3 import Chart, ChartOpts, FetchOpts 8 | 9 | from paka.cluster.context import Context 10 | 11 | 12 | def memoize(func: Callable[..., Any]) -> Callable[..., Any]: 13 | cache: Dict[Callable[..., Any], Any] = dict() 14 | 15 | def memoized_func(*args: Tuple[Any, ...], **kwargs: Dict[str, Any]) -> Any: 16 | if func not in cache: 17 | cache[func] = func(*args, **kwargs) 18 | return cache[func] 19 | 20 | return memoized_func 21 | 22 | 23 | @memoize 24 | def create_prometheus(ctx: Context) -> Optional[Chart]: 25 | """ 26 | Installs a Prometheus chart. 27 | """ 28 | config = ctx.cloud_config 29 | if not config.prometheus or not config.prometheus.enabled: 30 | return None 31 | 32 | ns = k8s.core.v1.Namespace( 33 | "prometheus", 34 | metadata={"name": "prometheus"}, 35 | opts=pulumi.ResourceOptions(provider=ctx.k8s_provider), 36 | ) 37 | 38 | return Chart( 39 | "kube-prometheus-stack", 40 | ChartOpts( 41 | chart="kube-prometheus-stack", 42 | version="58.6.0", 43 | namespace="prometheus", 44 | fetch_opts=FetchOpts( 45 | repo="https://prometheus-community.github.io/helm-charts" 46 | ), 47 | values={ 48 | "nodeExporter": { 49 | "enabled": config.prometheus.nodeExporter, 50 | }, 51 | "alertmanager": { 52 | "enabled": config.prometheus.alertmanager, 53 | }, 54 | "grafana": { 55 | "enabled": config.prometheus.grafana, 56 | }, 57 | "kubeApiServer": { 58 | "enabled": config.prometheus.kubeApiServer, 59 | }, 60 | "kubelet": { 61 | "enabled": config.prometheus.kubelet, 62 | }, 63 | "kubeControllerManager": { 64 | "enabled": config.prometheus.kubeControllerManager, 65 | }, 66 | "coreDns": { 67 | "enabled": config.prometheus.coreDns, 68 | }, 69 | "kubeEtcd": { 70 | "enabled": config.prometheus.kubeEtcd, 71 | }, 72 | "kubeScheduler": { 73 | "enabled": config.prometheus.kubeScheduler, 74 | }, 75 | "kubeProxy": { 76 | "enabled": config.prometheus.kubeProxy, 77 | }, 78 | "kubeStateMetrics": { 79 | "enabled": config.prometheus.kubeStateMetrics, 80 | }, 81 | "thanosRuler": { 82 | "enabled": config.prometheus.thanosRuler, 83 | }, 84 | # Disable the Prometheus Operator's admission webhooks, since they don't work with Pulumi. 85 | # This means ill-formatted Prometheus rules may make their way into Prometheus. 
:( 86 | "prometheusOperator": { 87 | "admissionWebhooks": {"enabled": False}, 88 | "tls": {"enabled": False}, 89 | }, 90 | "kube-state-metrics": { 91 | "metricLabelsAllowlist": [ 92 | "pods=[*]", 93 | "deployments=[app.kubernetes.io/name,app.kubernetes.io/component,app.kubernetes.io/instance]", 94 | ] 95 | }, 96 | "prometheus": { 97 | "prometheusSpec": { 98 | "serviceMonitorSelectorNilUsesHelmValues": False, 99 | "podMonitorSelectorNilUsesHelmValues": False, 100 | "storageSpec": { 101 | "volumeClaimTemplate": { 102 | "spec": { 103 | "accessModes": ["ReadWriteOnce"], 104 | "resources": { 105 | "requests": { 106 | "storage": config.prometheus.storageSize, 107 | } 108 | }, 109 | } 110 | } 111 | }, 112 | } 113 | }, 114 | }, 115 | ), 116 | opts=pulumi.ResourceOptions(provider=ctx.k8s_provider, depends_on=[ns]), 117 | ) 118 | -------------------------------------------------------------------------------- /paka/cluster/fluentbit.py: -------------------------------------------------------------------------------- 1 | import pulumi 2 | import pulumi_kubernetes as k8s 3 | 4 | from paka.cluster.context import Context 5 | from paka.constants import ACCESS_ALL_SA 6 | from paka.utils import call_once 7 | 8 | 9 | @call_once 10 | def create_fluentbit(ctx: Context, fluent_bit_config: str) -> None: 11 | """ 12 | Creates a fluentbit daemonset with the given configuration. 13 | """ 14 | 15 | parsers_config = """ 16 | [PARSER] 17 | Name docker 18 | Format json 19 | Time_Key time 20 | Time_Format %Y-%m-%dT%H:%M:%S.%fZ 21 | """ 22 | parsers_config_map = k8s.core.v1.ConfigMap( 23 | "fluent-bit-parsers", 24 | data={"parsers.conf": parsers_config}, 25 | metadata={ 26 | "namespace": ctx.namespace, 27 | "name": "fluent-bit-parsers", 28 | }, 29 | opts=pulumi.ResourceOptions(provider=ctx.k8s_provider), 30 | ) 31 | 32 | fluent_bit_config_map = k8s.core.v1.ConfigMap( 33 | "fluent-bit-config-map", 34 | data={"fluent-bit.conf": fluent_bit_config}, 35 | metadata={ 36 | "namespace": ctx.namespace, 37 | "name": "fluent-bit-config", 38 | }, 39 | opts=pulumi.ResourceOptions(provider=ctx.k8s_provider), 40 | ) 41 | 42 | k8s.apps.v1.DaemonSet( 43 | "fluent-bit-daemonset", 44 | spec=k8s.apps.v1.DaemonSetSpecArgs( 45 | selector=k8s.meta.v1.LabelSelectorArgs( 46 | match_labels={"k8s-app": "fluent-bit-logging"}, 47 | ), 48 | template=k8s.core.v1.PodTemplateSpecArgs( 49 | metadata=k8s.meta.v1.ObjectMetaArgs( 50 | labels={"k8s-app": "fluent-bit-logging"}, 51 | annotations={"sidecar.istio.io/inject": "false"}, 52 | ), 53 | spec=k8s.core.v1.PodSpecArgs( 54 | service_account_name=ACCESS_ALL_SA, 55 | tolerations=[k8s.core.v1.TolerationArgs(operator="Exists")], 56 | containers=[ 57 | k8s.core.v1.ContainerArgs( 58 | name="fluent-bit", 59 | image="fluent/fluent-bit:latest", 60 | volume_mounts=[ 61 | k8s.core.v1.VolumeMountArgs( 62 | name="config", 63 | mount_path="/fluent-bit/etc/fluent-bit.conf", 64 | sub_path="fluent-bit.conf", 65 | ), 66 | k8s.core.v1.VolumeMountArgs( 67 | name="varlog", 68 | mount_path="/var/log", 69 | ), 70 | k8s.core.v1.VolumeMountArgs( 71 | name="varlibdockercontainers", 72 | mount_path="/var/lib/docker/containers", 73 | read_only=True, 74 | ), 75 | k8s.core.v1.VolumeMountArgs( 76 | name="parsers-config", 77 | mount_path="/fluent-bit/etc/parsers.conf", 78 | sub_path="parsers.conf", 79 | ), 80 | ], 81 | ) 82 | ], 83 | volumes=[ 84 | k8s.core.v1.VolumeArgs( 85 | name="config", 86 | config_map=k8s.core.v1.ConfigMapVolumeSourceArgs( 87 | name=fluent_bit_config_map.metadata["name"], 88 | ), 89 | ), 90 | 
k8s.core.v1.VolumeArgs( 91 |                             name="varlog", 92 |                             host_path=k8s.core.v1.HostPathVolumeSourceArgs( 93 |                                 path="/var/log", 94 |                             ), 95 |                         ), 96 |                         k8s.core.v1.VolumeArgs( 97 |                             name="varlibdockercontainers", 98 |                             host_path=k8s.core.v1.HostPathVolumeSourceArgs( 99 |                                 path="/var/lib/docker/containers", 100 |                             ), 101 |                         ), 102 |                         k8s.core.v1.VolumeArgs( 103 |                             name="parsers-config", 104 |                             config_map=k8s.core.v1.ConfigMapVolumeSourceArgs( 105 |                                 name=parsers_config_map.metadata["name"], 106 |                             ), 107 |                         ), 108 |                     ], 109 |                 ), 110 |             ), 111 |         ), 112 |         metadata={"namespace": ctx.namespace}, 113 |         opts=pulumi.ResourceOptions(provider=ctx.k8s_provider), 114 |     ) 115 | -------------------------------------------------------------------------------- /paka/cli/cluster.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import threading 4 | import time 5 | from typing import List 6 | 7 | import typer 8 | 9 | from paka.cli.utils import load_cluster_manager, load_kubeconfig 10 | from paka.k8s.utils import remove_crd_finalizers 11 | 12 | cluster_app = typer.Typer() 13 | 14 | 15 | @cluster_app.command() 16 | def up( 17 |     cluster_config: str = typer.Option( 18 |         "", 19 |         "--file", 20 |         "-f", 21 |         help="Path to the cluster config file. The cluster config file is a " 22 |         "YAML file that contains the configuration of the cluster", 23 |     ), 24 |     no_kubeconfig: bool = typer.Option( 25 |         False, 26 |         "--no-kubeconfig", 27 |         "-n", 28 |         help="By default, the connection details of the newly created Kubernetes " 29 |         "cluster are added to the default kubeconfig file (~/.kube/config). " 30 |         "This allows kubectl to communicate with the new cluster. " 31 |         "Use this option to prevent updating the kubeconfig file.", 32 |     ), 33 | ) -> None: 34 |     """ 35 |     Creates or updates a Kubernetes cluster based on the provided configuration. 36 |     """ 37 |     cluster_manager = load_cluster_manager(cluster_config) 38 |     cluster_manager.ctx.set_should_save_kubeconfig(not no_kubeconfig) 39 |     cluster_manager.create() 40 | 41 | 42 | @cluster_app.command() 43 | def down( 44 |     cluster_config: str = typer.Option( 45 |         "", 46 |         "--file", 47 |         "-f", 48 |         help="Path to the cluster config file. The cluster config file is a " 49 |         "YAML file that contains the configuration of the cluster", 50 |     ), 51 |     yes: bool = typer.Option( 52 |         False, 53 |         "--yes", 54 |         "-y", 55 |         help="Automatic yes to prompts. Use this option to bypass the confirmation " 56 |         "prompt and directly proceed with the operation.", 57 |     ), 58 | ) -> None: 59 |     """ 60 |     Tears down the Kubernetes cluster, removing all associated resources and data. 61 |     """ 62 |     if yes or typer.confirm( 63 |         "Are you sure you want to proceed with the operation? Please note that " 64 |         "all resources and data will be permanently deleted.", 65 |         default=False, 66 |     ): 67 |         cluster_manager = load_cluster_manager(cluster_config) 68 | 69 |         # Sometimes finalizers block CRD deletion, so we force-delete those. 70 |         # This is best effort and might not work in all cases.
71 |         # TODO: better way to handle this 72 |         # https://github.com/kubernetes/kubernetes/issues/60538 73 |         stop_event = threading.Event() 74 | 75 |         def remove_finalizers_forever() -> None: 76 |             try: 77 |                 crds = [ 78 |                     "scaledobjects.keda.sh", 79 |                     "routes.serving.knative.dev", 80 |                     "ingresses.networking.internal.knative.dev", 81 |                 ] 82 | 83 |                 load_kubeconfig(cluster_manager.cloud_config.cluster.name) 84 | 85 |                 while not stop_event.is_set(): 86 |                     for crd in crds: 87 |                         try: 88 |                             remove_crd_finalizers(crd) 89 |                         except Exception: 90 |                             pass 91 |                     time.sleep(1)  # Wait for a second before the next iteration 92 |             except Exception: 93 |                 pass 94 | 95 |         thread = threading.Thread(target=remove_finalizers_forever) 96 |         thread.start() 97 | 98 |         try: 99 |             cluster_manager.destroy() 100 |         finally: 101 |             stop_event.set() 102 |             thread.join() 103 | 104 | 105 | @cluster_app.command() 106 | def preview( 107 |     cluster_config: str = typer.Option( 108 |         "", 109 |         "--file", 110 |         "-f", 111 |         help="Path to the cluster config file. The cluster config file is a " 112 |         "YAML file that contains the configuration of the cluster", 113 |     ), 114 |     policy_packs: List[str] = typer.Option( 115 |         [], 116 |         "--policy-pack", 117 |         "-p", 118 |         help="Path to the policy pack.", 119 |     ), 120 | ) -> None: 121 |     """ 122 |     Previews the changes that will be applied to the cloud resources. 123 |     """ 124 |     cluster_manager = load_cluster_manager(cluster_config) 125 |     if policy_packs: 126 |         cluster_manager.preview(policy_packs=policy_packs) 127 |     else: 128 |         cluster_manager.preview() 129 | 130 | 131 | @cluster_app.command() 132 | def refresh( 133 |     cluster_config: str = typer.Option( 134 |         "", 135 |         "--file", 136 |         "-f", 137 |         help="Path to the cluster config file. The cluster config file is a " 138 |         "YAML file that contains the configuration of the cluster", 139 |     ), 140 | ) -> None: 141 |     """ 142 |     Synchronizes the local cluster state with the state in the cloud.
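    Example (illustrative invocation; assumes the `cluster` sub-command is mounted
    under the standard `paka` entry point):

        paka cluster refresh -f cluster.yaml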
143 | """ 144 | cluster_manager = load_cluster_manager(cluster_config) 145 | cluster_manager.refresh() 146 | -------------------------------------------------------------------------------- /paka/k8s/model_group/runtime/llama_cpp.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import re 5 | from typing import List, Optional 6 | 7 | from huggingface_hub import HfFileSystem 8 | from huggingface_hub.utils import validate_repo_id 9 | 10 | from paka.cluster.context import Context 11 | from paka.cluster.utils import get_model_store 12 | from paka.config import CloudModelGroup 13 | from paka.constants import MODEL_MOUNT_PATH 14 | 15 | 16 | # Heuristic to determine if the image is a llama.cpp image 17 | def is_llama_cpp_image(image: str) -> bool: 18 | return "llama.cpp" in image.lower() 19 | 20 | 21 | def get_model_file_from_model_store( 22 | ctx: Context, 23 | model_group: CloudModelGroup, 24 | ) -> Optional[str]: 25 | if model_group.model and model_group.model.useModelStore: 26 | store = get_model_store(ctx, with_progress_bar=False) 27 | # Find the file that ends with .gguf or .ggml 28 | model_files = [ 29 | file 30 | for file in store.glob(f"{model_group.name}/*") 31 | if re.search(r"\.(gguf|ggml)$", file, re.IGNORECASE) 32 | ] 33 | 34 | if not model_files: 35 | model_files = [ 36 | file 37 | for file in store.glob(f"{model_group.name}/*") 38 | if any( 39 | re.match(file_pattern, file) 40 | for file_pattern in model_group.model.files 41 | ) 42 | ] 43 | 44 | if len(model_files) > 1: 45 | raise ValueError( 46 | f"Multiple model files found in {model_group.name}/ directory." 47 | ) 48 | 49 | if len(model_files) == 1: 50 | return os.path.basename(model_files[0]) 51 | 52 | return None 53 | 54 | 55 | def get_runtime_command_llama_cpp( 56 | ctx: Context, model_group: CloudModelGroup 57 | ) -> List[str]: 58 | runtime = model_group.runtime 59 | if runtime.command: 60 | command_str = " ".join(runtime.command) if runtime.command else "" 61 | # If the command knows where or how to load the model file, we don't need to do anything. 
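        # Illustrative commands that already specify a model source (the paths and
        # repo names below are placeholders):
        #   /server --model /data/model.gguf ...
        #   /server --hf-repo some-org/some-repo --hf-file model.gguf ...
        #   /server --model-url https://example.com/model.gguf ...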
62 |         if ( 63 |             re.search(r"(--model|-m)[ \t]*\S+", command_str) 64 |             or ( 65 |                 re.search(r"--hf-repo|-hfr", command_str) 66 |                 and re.search(r"--hf-file|-hff", command_str) 67 |             ) 68 |             or re.search(r"(--model-url|-mu)[ \t]*\S+", command_str) 69 |         ): 70 |             return runtime.command 71 | 72 |     model_file = get_model_file_from_model_store(ctx, model_group) 73 | 74 |     def attach_model_to_command(command: List[str]) -> List[str]: 75 |         if model_file: 76 |             return command + ["--model", f"{MODEL_MOUNT_PATH}/{model_file}"] 77 |         elif model_group.model and model_group.model.hfRepoId: 78 | 79 |             validate_repo_id(model_group.model.hfRepoId) 80 |             hf_fs = HfFileSystem() 81 |             files = [ 82 |                 file 83 |                 for pattern in model_group.model.files 84 |                 for file in hf_fs.glob(f"{model_group.model.hfRepoId}/{pattern}") 85 |             ] 86 | 87 |             if len(files) > 1: 88 |                 raise ValueError("Multiple model files found in HuggingFace repo.") 89 |             if len(files) == 0: 90 |                 raise ValueError("No model file found in HuggingFace repo.") 91 | 92 |             hf_file = os.path.basename(files[0]) 93 | 94 |             return command + [ 95 |                 "--hf-repo", 96 |                 model_group.model.hfRepoId, 97 |                 "--hf-file", 98 |                 hf_file, 99 |                 "--model", 100 |                 os.path.basename( 101 |                     hf_file 102 |                 ),  # This is the model file name that the HuggingFace model is saved as 103 |             ] 104 |         else: 105 |             raise ValueError("Did not find a model to load.") 106 | 107 |     if runtime.command: 108 |         return attach_model_to_command(runtime.command) 109 | 110 |     # https://github.com/ggerganov/llama.cpp/tree/master/examples/server 111 |     command = [ 112 |         "/server", 113 |         "--host", 114 |         "0.0.0.0", 115 |         "--parallel",  # Number of parallel requests to handle 116 |         "1", 117 |         "--cont-batching",  # Enable continuous batching 118 |         "--ctx-size", 119 |         "4096", 120 |         "--batch-size",  # Maximum number of tokens to decode in a batch 121 |         "512", 122 |         "--ubatch-size",  # Physical batch size 123 |         "512", 124 |         "--n-predict",  # Maximum number of tokens to predict. 125 |         "-1", 126 |         "--embedding", 127 |         "--flash-attn",  # Enable flash attention 128 |         "--metrics",  # Enable metrics 129 |     ] 130 | 131 |     if hasattr(model_group, "gpu") and model_group.gpu and model_group.gpu.enabled: 132 |         # The value 999 is typically sufficient for most models, as it attempts to offload as many layers as possible to the GPU. 133 |         # However, for particularly large models, this may result in exceeding the GPU's memory capacity and cause errors. 134 |         # A more effective approach would be to conduct a series of experiments with varying values for --n-gpu-layers to find the optimal setting.
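        # (Illustrative reference point: a 7B Llama-style model has on the order of 32
        # transformer layers, so any value at or above that offloads the entire model;
        # 999 simply means "offload as many layers as possible".)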
135 |         command.extend(["--n-gpu-layers", "999"]) 136 | 137 |     return attach_model_to_command(command) 138 | -------------------------------------------------------------------------------- /tests/k8s/model_group/runtime/test_llama_cpp.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock, patch 2 | 3 | import pytest 4 | 5 | import paka.cluster 6 | import paka.cluster.utils 7 | import paka.k8s.model_group.runtime.llama_cpp 8 | from paka.cluster.context import Context 9 | from paka.config import AwsModelGroup, Model, Runtime 10 | from paka.constants import MODEL_MOUNT_PATH 11 | from paka.k8s.model_group.runtime.llama_cpp import get_runtime_command_llama_cpp 12 | 13 | 14 | @pytest.fixture 15 | def model_group() -> AwsModelGroup: 16 |     return AwsModelGroup( 17 |         name="test-model-group", 18 |         minInstances=1, 19 |         maxInstances=2, 20 |         nodeType="t2.micro", 21 |         runtime=Runtime( 22 |             image="johndoe/llama.cpp:server", 23 |             command=["/server", "--model", f"{MODEL_MOUNT_PATH}/model.gguf"], 24 |         ), 25 |         resourceRequest={"cpu": "1000", "memory": "1Gi"}, 26 |     ) 27 | 28 | 29 | def test_get_runtime_command_llama_cpp(model_group: AwsModelGroup) -> None: 30 |     mock_store = MagicMock() 31 |     with patch.object( 32 |         paka.k8s.model_group.runtime.llama_cpp, 33 |         "get_model_store", 34 |         return_value=mock_store, 35 |     ) as mock_get_model_store, patch.object( 36 |         paka.k8s.model_group.runtime.llama_cpp, "HfFileSystem" 37 |     ) as mock_hf_fs, patch.object( 38 |         paka.k8s.model_group.runtime.llama_cpp, 39 |         "validate_repo_id", 40 |         return_value=True, 41 |     ) as mock_validate_repo_id: 42 |         # Test case: runtime command is already provided 43 |         assert get_runtime_command_llama_cpp(Context(), model_group) == [ 44 |             "/server", 45 |             "--model", 46 |             f"{MODEL_MOUNT_PATH}/model.gguf", 47 |         ] 48 | 49 |         # Test case: model file is found in model store 50 |         model_group.runtime.command = None 51 |         model_group.model = Model(useModelStore=True) 52 |         # Mock the model store glob to return a single model file 53 |         mock_store.glob.return_value = ["model.gguf"] 54 |         command = get_runtime_command_llama_cpp(Context(), model_group) 55 |         assert "--model" in command, "Expected '--model' to be in command list" 56 |         model_index = command.index("--model") 57 |         assert ( 58 |             command[model_index + 1] == f"{MODEL_MOUNT_PATH}/model.gguf" 59 |         ), f"Expected '--model' to be followed by '{MODEL_MOUNT_PATH}/model.gguf'" 60 | 61 |         # Test case: model file is not found in the model store but found in HuggingFace repo 62 |         model_group.model = Model( 63 |             useModelStore=True, hfRepoId="repoId", files=["model.gguf"] 64 |         ) 65 |         # Mock the model store glob to return an empty list 66 |         mock_store.glob.return_value = [] 67 |         # Mock HfFileSystem.glob to return a specific list of files 68 |         mock_hf_fs.return_value.glob.return_value = ["repoId/model.gguf"] 69 |         command = get_runtime_command_llama_cpp(Context(), model_group) 70 |         assert "--hf-repo" in command, "Expected '--hf-repo' to be in command list" 71 |         repo_index = command.index("--hf-repo") 72 |         assert ( 73 |             command[repo_index + 1] == "repoId" 74 |         ), "Expected '--hf-repo' to be followed by 'repoId'" 75 | 76 |         assert "--hf-file" in command, "Expected '--hf-file' to be in command list" 77 |         file_index = command.index("--hf-file") 78 |         assert ( 79 |             command[file_index + 1] == "model.gguf" 80 |         ), "Expected '--hf-file' to be followed by 'model.gguf'" 81 |         assert "--model" in command, "Expected '--model' to be in command list" 82 |         model_index =
command.index("--model") 83 |         assert ( 84 |             command[model_index + 1] == "model.gguf" 85 |         ), "Expected '--model' to be followed by 'model.gguf'" 86 | 87 |         # Test case: model file is not found in the model store and not found in HuggingFace repo 88 |         model_group.model = Model( 89 |             useModelStore=True, hfRepoId="repoId", files=["model.gguf"] 90 |         ) 91 |         # Mock the model store glob to return an empty list 92 |         mock_store.glob.return_value = [] 93 |         # Mock HfFileSystem.glob to return an empty list 94 |         mock_hf_fs.return_value.glob.return_value = [] 95 |         with pytest.raises( 96 |             ValueError, match="No model file found in HuggingFace repo." 97 |         ): 98 |             get_runtime_command_llama_cpp(Context(), model_group) 99 | 100 |         # Test case: Multiple model files found in the model store 101 |         model_group.model = Model(useModelStore=True) 102 |         # Mock the model store glob to return multiple model files 103 |         mock_store.glob.return_value = ["model1.ggml", "model2.ggml"] 104 |         with pytest.raises( 105 |             ValueError, 106 |             match=f"Multiple model files found in {model_group.name}/ directory.", 107 |         ): 108 |             get_runtime_command_llama_cpp(Context(), model_group) 109 | 110 |         # Test case: No model file found in the model store 111 |         model_group.model = Model(useModelStore=True) 112 |         # Mock the model store glob to return an empty list 113 |         mock_store.glob.return_value = [] 114 |         mock_hf_fs.return_value.glob.return_value = [] 115 |         with pytest.raises(ValueError, match="Did not find a model to load."): 116 |             get_runtime_command_llama_cpp(Context(), model_group) 117 | --------------------------------------------------------------------------------
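# Illustrative usage sketch (not part of the repository): assembling the llama.cpp
# server command for a model group outside the test harness. The image, repo ID, and
# file pattern below are placeholder assumptions; the HuggingFace branch performs a
# network lookup via HfFileSystem, and Context() must carry a valid config for
# model-store lookups.
#
#     from paka.cluster.context import Context
#     from paka.config import AwsModelGroup, Model, Runtime
#     from paka.k8s.model_group.runtime.llama_cpp import get_runtime_command_llama_cpp
#
#     group = AwsModelGroup(
#         name="demo-model-group",
#         minInstances=1,
#         maxInstances=1,
#         nodeType="t2.micro",
#         runtime=Runtime(image="johndoe/llama.cpp:server"),
#         model=Model(useModelStore=False, hfRepoId="some-org/some-repo", files=["*.gguf"]),
#         resourceRequest={"cpu": "1000", "memory": "1Gi"},
#     )
#     print(get_runtime_command_llama_cpp(Context(), group))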