├── packyak ├── py.typed ├── synth │ ├── __init__.py │ ├── call.py │ ├── file_utils.py │ ├── loaded_module.py │ └── __main__.py ├── cli │ ├── cli.py │ ├── node_type.py │ ├── new.py │ ├── __init__.py │ ├── __main__.py │ ├── run.py │ ├── materialize.py │ ├── list.py │ ├── instances.py │ ├── logs.py │ ├── clusters.py │ └── synth.py ├── asset │ ├── manifest.py │ ├── partition_key.py │ ├── asset.py │ ├── source.py │ └── namespace.py ├── web │ └── streamlit-site.py ├── spark │ ├── __init__.py │ └── session.py ├── util │ ├── git.py │ ├── fqn.py │ ├── memoize.py │ └── typed_resource.py ├── runtime │ ├── pool.py │ ├── integration.py │ ├── binding.py │ ├── runnable.py │ ├── job.py │ ├── cluster.py │ └── function.py ├── scheduling │ ├── cron.py │ └── every.py ├── resource.py ├── duration.py ├── __init__.py ├── spec.py └── storage │ └── folder.py ├── .python-version ├── packyak-docs ├── static │ ├── .nojekyll │ └── img │ │ ├── favicon.ico │ │ ├── docusaurus.png │ │ └── docusaurus-social-card.jpg ├── babel.config.js ├── docs │ ├── tutorial-extras │ │ ├── img │ │ │ ├── localeDropdown.png │ │ │ └── docsVersionDropdown.png │ │ ├── _category_.json │ │ ├── manage-docs-versions.md │ │ └── translate-your-site.md │ ├── tutorial-basics │ │ ├── _category_.json │ │ ├── deploy-your-site.md │ │ ├── create-a-blog-post.md │ │ ├── congratulations.md │ │ ├── create-a-page.md │ │ ├── create-a-document.md │ │ └── markdown-features.mdx │ └── intro.md ├── src │ ├── pages │ │ ├── markdown-page.md │ │ ├── index.module.css │ │ └── index.tsx │ ├── components │ │ └── HomepageFeatures │ │ │ ├── styles.module.css │ │ │ └── index.tsx │ └── css │ │ └── custom.css ├── blog │ ├── authors.yml │ ├── 2024-01-21-DeltaLake_vs_Iceberg_vs_Hudi.md │ └── 2024-01-22-Data_Lineage.md ├── tsconfig.json ├── .gitignore ├── sidebars.ts ├── README.md ├── package.json └── docusaurus.config.ts ├── packyak-aws-cdk ├── .gitignore ├── src │ ├── sagemaker │ │ ├── cleanup │ │ │ └── package.json │ │ ├── user-profile.ts │ │ └── sage-maker-image.ts │ ├── .DS_Store │ ├── emr │ │ ├── instance-market.ts │ │ ├── scala-version.ts │ │ ├── spark-version.ts │ │ ├── step.ts │ │ ├── application.ts │ │ ├── bootstrap-action.ts │ │ ├── catalog.ts │ │ ├── python-version.ts │ │ ├── release-label.ts │ │ ├── spark-sql-extension.ts │ │ ├── spark-config.ts │ │ ├── block-device.ts │ │ ├── uniform-cluster.ts │ │ ├── jdbc.ts │ │ ├── configuration.ts │ │ ├── fleet-cluster.ts │ │ └── glue-catalog.ts │ ├── bind.ts │ ├── version.ts │ ├── workspace │ │ ├── group.ts │ │ ├── workspace.ts │ │ └── home.ts │ ├── dns-configuration.ts │ ├── python-poetry.ts │ ├── sst-compat.ts │ ├── export-requirements.ts │ ├── packyak-resource.ts │ ├── nessie │ │ ├── nessie-lambda-catalog.ts │ │ ├── nessie-version-store.ts │ │ ├── nessie-config.ts │ │ └── nessie-ecs-catalog.ts │ ├── index.ts │ ├── experimental.ts │ ├── streamlit-site.ts │ └── dagster.ts ├── .env.example ├── scripts │ ├── release.sh │ ├── install-pyenv.sh │ ├── install-github-cli.sh │ ├── install-docker-compose.sh │ ├── mount-yarn-cgroups.sh │ ├── setup-hadoop-users.sh │ ├── write-env-variables.sh │ ├── install-nvidia-container-toolkit.sh │ ├── install-ssm-agent.sh │ ├── install-nvidia-drivers.sh │ ├── debug-docker.sh │ └── mount-efs.sh ├── tsconfig.ref.json ├── .npmignore ├── bin │ └── packyak.mjs ├── tsconfig.json ├── package.json └── README.md ├── poetry.toml ├── packyak-nessie ├── src │ ├── main │ │ ├── resources │ │ │ └── application.properties │ │ ├── java │ │ │ └── org │ │ │ │ └── acme │ │ │ │ └── GreetingResource.java │ │ └── docker │ 
│ │ ├── Dockerfile.native │ │ │ └── Dockerfile.native-micro │ ├── native-test │ │ └── java │ │ │ └── org │ │ │ └── acme │ │ │ └── GreetingResourceIT.java │ └── test │ │ └── java │ │ └── org │ │ └── acme │ │ └── GreetingResourceTest.java ├── .dockerignore ├── gradle │ └── wrapper │ │ ├── gradle-wrapper.jar │ │ └── gradle-wrapper.properties ├── target │ ├── classes │ │ └── org │ │ │ └── acme │ │ │ └── GreetingResource.class │ └── test-classes │ │ └── org │ │ └── acme │ │ └── GreetingResourceTest.class ├── gradle.properties ├── settings.gradle.kts ├── .gitignore ├── build.gradle.kts ├── README.md └── gradlew.bat ├── cdk.json ├── create-packyak ├── src │ └── index.ts └── tsconfig.json ├── examples └── streamlit-aws-cdk │ ├── README.md │ ├── cdk.json │ ├── poetry.toml │ ├── .dockerignore │ ├── my_app │ ├── pages │ │ └── chat.py │ ├── __init__.py │ ├── videos.py │ ├── spark_job.py │ └── home.py │ ├── tsconfig.json │ ├── Dockerfile │ ├── pyproject.toml │ ├── package.json │ └── notebooks │ ├── spark.ipynb │ └── nessie.ipynb ├── pnpm-workspace.yaml ├── .quarkus └── cli │ └── plugins │ └── quarkus-cli-catalog.json ├── scripts ├── python ├── publish.sh └── bump.mjs ├── .gitignore ├── tsconfig.json ├── tsconfig.base.json ├── pyrightconfig.json ├── biome.json ├── .vscode ├── launch.json └── settings.json ├── packyak.config.ts ├── pyproject.toml ├── README.md └── package.json /packyak/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.12.1 2 | -------------------------------------------------------------------------------- /packyak-docs/static/.nojekyll: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /packyak-aws-cdk/.gitignore: -------------------------------------------------------------------------------- 1 | lib.jsii 2 | jsii -------------------------------------------------------------------------------- /poetry.toml: -------------------------------------------------------------------------------- 1 | [virtualenvs] 2 | in-project = true 3 | -------------------------------------------------------------------------------- /packyak-nessie/src/main/resources/application.properties: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "./scripts/python examples/app.py" 3 | } 4 | -------------------------------------------------------------------------------- /create-packyak/src/index.ts: -------------------------------------------------------------------------------- 1 | export function createPackyak() {} 2 | -------------------------------------------------------------------------------- /examples/streamlit-aws-cdk/README.md: -------------------------------------------------------------------------------- 1 | # Streamlit Example 2 | 3 | 4 | -------------------------------------------------------------------------------- /examples/streamlit-aws-cdk/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "packyak" 3 | } 4 | 
-------------------------------------------------------------------------------- /examples/streamlit-aws-cdk/poetry.toml: -------------------------------------------------------------------------------- 1 | [virtualenvs] 2 | in-project = true 3 | -------------------------------------------------------------------------------- /create-packyak/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "../tsconfig.base.json" 3 | } 4 | -------------------------------------------------------------------------------- /packyak-aws-cdk/src/sagemaker/cleanup/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "module" 3 | } 4 | -------------------------------------------------------------------------------- /packyak/synth/__init__.py: -------------------------------------------------------------------------------- 1 | from packyak.synth.synth import synth 2 | 3 | __all__ = ["synth"] 4 | -------------------------------------------------------------------------------- /pnpm-workspace.yaml: -------------------------------------------------------------------------------- 1 | packages: 2 | - packyak-docs 3 | - packyak-aws-cdk 4 | - examples/* 5 | -------------------------------------------------------------------------------- /packyak-aws-cdk/.env.example: -------------------------------------------------------------------------------- 1 | export TWINE_USERNAME=samgoodwin89 2 | export TWINE_PASSWORD=(pypi token) -------------------------------------------------------------------------------- /packyak/cli/cli.py: -------------------------------------------------------------------------------- 1 | import asyncclick as click 2 | 3 | 4 | @click.group() 5 | def cli(): 6 | pass 7 | -------------------------------------------------------------------------------- /packyak-aws-cdk/src/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sam-goodwin/packyak/HEAD/packyak-aws-cdk/src/.DS_Store -------------------------------------------------------------------------------- /packyak-nessie/.dockerignore: -------------------------------------------------------------------------------- 1 | * 2 | !build/*-runner 3 | !build/*-runner.jar 4 | !build/lib/* 5 | !build/quarkus-app/* -------------------------------------------------------------------------------- /packyak/asset/manifest.py: -------------------------------------------------------------------------------- 1 | from packyak.asset.asset import Asset 2 | 3 | 4 | class Manifest(Asset): 5 | pass 6 | -------------------------------------------------------------------------------- /packyak-docs/static/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sam-goodwin/packyak/HEAD/packyak-docs/static/img/favicon.ico -------------------------------------------------------------------------------- /packyak-aws-cdk/src/emr/instance-market.ts: -------------------------------------------------------------------------------- 1 | export enum InstanceMarket { 2 | ON_DEMAND = "ON_DEMAND", 3 | SPOT = "SPOT", 4 | } 5 | -------------------------------------------------------------------------------- /packyak-docs/babel.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | presets: [require.resolve('@docusaurus/core/lib/babel/preset')], 3 | }; 4 | 
-------------------------------------------------------------------------------- /packyak-docs/static/img/docusaurus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sam-goodwin/packyak/HEAD/packyak-docs/static/img/docusaurus.png -------------------------------------------------------------------------------- /packyak/cli/node_type.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | 3 | NodeType = Literal["primary"] | Literal["core"] | Literal["task"] 4 | -------------------------------------------------------------------------------- /packyak/web/streamlit-site.py: -------------------------------------------------------------------------------- 1 | class StreamlitSite: 2 | def __init__(self, path: str) -> None: 3 | self.path = path 4 | -------------------------------------------------------------------------------- /packyak-aws-cdk/src/emr/scala-version.ts: -------------------------------------------------------------------------------- 1 | import { Version } from "../version"; 2 | 3 | export class ScalaVersion extends Version {} 4 | -------------------------------------------------------------------------------- /packyak-aws-cdk/src/emr/spark-version.ts: -------------------------------------------------------------------------------- 1 | import { Version } from "../version"; 2 | 3 | export class SparkVersion extends Version {} 4 | -------------------------------------------------------------------------------- /.quarkus/cli/plugins/quarkus-cli-catalog.json: -------------------------------------------------------------------------------- 1 | { 2 | "version" : "v1", 3 | "lastUpdate" : "08/02/2024 00:21:28", 4 | "plugins" : { } 5 | } -------------------------------------------------------------------------------- /packyak-aws-cdk/scripts/release.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | source .env 4 | cd lib.jsii/python 5 | python3 -m twine upload --verbose --skip-existing * -------------------------------------------------------------------------------- /packyak/spark/__init__.py: -------------------------------------------------------------------------------- 1 | from packyak.spark.session import init_session, session_builder 2 | 3 | __all__ = ["init_session", "session_builder"] 4 | -------------------------------------------------------------------------------- /packyak-nessie/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sam-goodwin/packyak/HEAD/packyak-nessie/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /packyak-aws-cdk/tsconfig.ref.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "./tsconfig.json", 3 | "compilerOptions": { 4 | "composite": true 5 | } 6 | } 7 | 8 | -------------------------------------------------------------------------------- /packyak-docs/static/img/docusaurus-social-card.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sam-goodwin/packyak/HEAD/packyak-docs/static/img/docusaurus-social-card.jpg -------------------------------------------------------------------------------- /packyak-aws-cdk/src/emr/step.ts: -------------------------------------------------------------------------------- 1 | import type { CfnCluster } from "aws-cdk-lib/aws-emr"; 2 | 3 | export interface Step extends CfnCluster.StepConfigProperty {} 4 | -------------------------------------------------------------------------------- /packyak-docs/docs/tutorial-extras/img/localeDropdown.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sam-goodwin/packyak/HEAD/packyak-docs/docs/tutorial-extras/img/localeDropdown.png -------------------------------------------------------------------------------- /packyak-aws-cdk/src/emr/application.ts: -------------------------------------------------------------------------------- 1 | export enum Application { 2 | AMAZON_CLOUDWATCH_AGENT = "AmazonCloudWatchAgent", 3 | LIVY = "Livy", 4 | SPARK = "Spark", 5 | } 6 | -------------------------------------------------------------------------------- /packyak-docs/docs/tutorial-extras/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "Tutorial - Extras", 3 | "position": 3, 4 | "link": { 5 | "type": "generated-index" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /packyak-docs/docs/tutorial-extras/img/docsVersionDropdown.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sam-goodwin/packyak/HEAD/packyak-docs/docs/tutorial-extras/img/docsVersionDropdown.png -------------------------------------------------------------------------------- /packyak-nessie/target/classes/org/acme/GreetingResource.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sam-goodwin/packyak/HEAD/packyak-nessie/target/classes/org/acme/GreetingResource.class -------------------------------------------------------------------------------- /scripts/python: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" 4 | 5 | PYTHONPYCACHEPREFIX=/.pycache $DIR/../.venv/bin/python "$@" 6 | -------------------------------------------------------------------------------- /packyak-docs/src/pages/markdown-page.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Markdown page example 3 | --- 4 | 5 | # Markdown page example 6 | 7 | You don't need React to write simple standalone pages. 8 | -------------------------------------------------------------------------------- /packyak-docs/blog/authors.yml: -------------------------------------------------------------------------------- 1 | sam: 2 | name: Sam Goodwin 3 | title: Maintainer of PackYak 4 | url: https://github.com/sam-goodwin 5 | image_url: https://github.com/sam-goodwin.png 6 | -------------------------------------------------------------------------------- /packyak-aws-cdk/scripts/install-pyenv.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Install pyenv 4 | curl https://pyenv.run | bash 5 | 6 | git clone https://github.com/pyenv/pyenv.git --depth=1 ~/.pyenv 7 | 8 | -------------------------------------------------------------------------------- /packyak-nessie/target/test-classes/org/acme/GreetingResourceTest.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sam-goodwin/packyak/HEAD/packyak-nessie/target/test-classes/org/acme/GreetingResourceTest.class -------------------------------------------------------------------------------- /packyak-aws-cdk/src/bind.ts: -------------------------------------------------------------------------------- 1 | import { IGrantable } from "aws-cdk-lib/aws-iam"; 2 | 3 | export interface IBindable extends IGrantable { 4 | addEnvironment(key: string, value: string): void; 5 | } 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .env 3 | .pycache 4 | .ruff_cache 5 | .venv 6 | .packyak 7 | *.tsbuildinfo 8 | cdk.out 9 | dist 10 | lib 11 | node_modules 12 | tmp* 13 | custom 14 | .DS_Store 15 | 16 | -------------------------------------------------------------------------------- /examples/streamlit-aws-cdk/.dockerignore: -------------------------------------------------------------------------------- 1 | !.packyak 2 | .venv 3 | cdk.json 4 | node_modules 5 | package.json 6 | packyak.config.ts 7 | poetry.lock 8 | pyproject.toml 9 | README.md 10 | tsconfig.json -------------------------------------------------------------------------------- /examples/streamlit-aws-cdk/my_app/pages/chat.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from my_app.videos import get_video 3 | 4 | if __name__ == "__main__": 5 | st.write("Hello world!") 6 | video = get_video("foo") 7 | -------------------------------------------------------------------------------- /packyak-docs/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | // This file is not used in compilation. It is here just for a nice editor experience. 3 | "extends": "@docusaurus/tsconfig", 4 | "compilerOptions": { 5 | "baseUrl": "." 
6 | } 7 | } 8 | -------------------------------------------------------------------------------- /packyak-aws-cdk/src/version.ts: -------------------------------------------------------------------------------- 1 | export class Version { 2 | constructor(readonly semverString: string) {} 3 | 4 | public get majorMinorVersion(): string { 5 | return this.semverString.split(".").slice(0, 2).join("."); 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /examples/streamlit-aws-cdk/my_app/__init__.py: -------------------------------------------------------------------------------- 1 | from my_app.videos import videos, get_video, process_video 2 | from my_app.spark_job import download_hackernews 3 | 4 | __all__ = ["videos", "get_video", "process_video", "download_hackernews"] 5 | -------------------------------------------------------------------------------- /packyak-aws-cdk/src/emr/bootstrap-action.ts: -------------------------------------------------------------------------------- 1 | import type { Asset } from "aws-cdk-lib/aws-s3-assets"; 2 | 3 | export interface BootstrapAction { 4 | readonly name: string; 5 | readonly script: Asset; 6 | readonly args?: string[]; 7 | } 8 | -------------------------------------------------------------------------------- /packyak/util/git.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | 4 | def get_git_branch(): 5 | return ( 6 | subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"]) 7 | .decode("utf-8") 8 | .strip() 9 | ) 10 | -------------------------------------------------------------------------------- /packyak-aws-cdk/scripts/install-github-cli.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | type -p yum-config-manager >/dev/null || sudo yum install yum-utils 4 | sudo yum-config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo -y 5 | sudo yum install gh -y -------------------------------------------------------------------------------- /packyak-docs/src/components/HomepageFeatures/styles.module.css: -------------------------------------------------------------------------------- 1 | .features { 2 | display: flex; 3 | align-items: center; 4 | padding: 2rem 0; 5 | width: 100%; 6 | } 7 | 8 | .featureSvg { 9 | height: 200px; 10 | width: 200px; 11 | } 12 | -------------------------------------------------------------------------------- /packyak-docs/docs/tutorial-basics/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "Tutorial - Basics", 3 | "position": 2, 4 | "link": { 5 | "type": "generated-index", 6 | "description": "5 minutes to learn the most important Docusaurus concepts." 
7 | } 8 | } 9 | -------------------------------------------------------------------------------- /packyak/cli/new.py: -------------------------------------------------------------------------------- 1 | import asyncclick as click 2 | 3 | from packyak.cli.cli import cli 4 | 5 | 6 | @cli.command() 7 | @click.option("--name", prompt="Your name", help="The person to greet.") 8 | def new(name: str): 9 | click.echo(f"Hello {name}!") 10 | -------------------------------------------------------------------------------- /packyak/util/fqn.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable 2 | 3 | 4 | def get_fully_qualified_name(function: Callable[..., Any] | type): 5 | module_name = function.__module__ 6 | qual_name = function.__qualname__ 7 | return f"{module_name}.{qual_name}" 8 | -------------------------------------------------------------------------------- /packyak-nessie/gradle.properties: -------------------------------------------------------------------------------- 1 | #Gradle properties 2 | #Thu Feb 08 00:25:08 PST 2024 3 | quarkusPluginVersion=3.7.1 4 | quarkusPlatformArtifactId=quarkus-bom 5 | quarkusPluginId=io.quarkus 6 | quarkusPlatformGroupId=io.quarkus.platform 7 | quarkusPlatformVersion=3.7.1 8 | -------------------------------------------------------------------------------- /packyak-aws-cdk/scripts/install-docker-compose.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo curl -L https://github.com/docker/compose/releases/latest/download/docker-compose-$(uname -s)-$(uname -m) -o /usr/bin/docker-compose 4 | sudo chmod +x /usr/bin/docker-compose 5 | docker-compose --version -------------------------------------------------------------------------------- /packyak-nessie/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-8.5-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /packyak/runtime/pool.py: -------------------------------------------------------------------------------- 1 | from packyak.spec import ResourceType 2 | from packyak.resource import Resource 3 | 4 | 5 | class Pool(Resource): 6 | def __init__(self, pool_id: str): 7 | super().__init__(resource_id=pool_id, resource_type=ResourceType.Pool) 8 | self.name = pool_id 9 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "files": [], 3 | "references": [ 4 | { 5 | "path": "./packyak-aws-cdk/tsconfig.json" 6 | }, 7 | { 8 | "path": "./packyak-docs/tsconfig.json" 9 | }, 10 | { 11 | "path": "./examples/streamlit-aws-cdk" 12 | } 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /packyak-nessie/src/native-test/java/org/acme/GreetingResourceIT.java: -------------------------------------------------------------------------------- 1 | package org.acme; 2 | 3 | import io.quarkus.test.junit.QuarkusIntegrationTest; 4 | 5 | @QuarkusIntegrationTest 6 | class GreetingResourceIT extends GreetingResourceTest { 7 | // Execute the same tests but in packaged mode. 
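    // (A descriptive note, added for clarity: @QuarkusIntegrationTest launches the packaged application, either the runner JAR or a native executable, and reruns the inherited HTTP assertions against it.)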
8 | } 9 | -------------------------------------------------------------------------------- /packyak-aws-cdk/src/workspace/group.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * A statically defined POSIX Group. 3 | */ 4 | export interface PosixGroup { 5 | /** 6 | * Unique name of the POSIX group. 7 | */ 8 | readonly name: string; 9 | /** 10 | * Unique ID of the POSIX group. 11 | */ 12 | readonly gid: number; 13 | } 14 | -------------------------------------------------------------------------------- /packyak-aws-cdk/.npmignore: -------------------------------------------------------------------------------- 1 | 2 | # Exclude typescript source and config 3 | *.ts 4 | tsconfig.json 5 | *.tsbuildinfo 6 | 7 | # Include javascript files and typescript declarations 8 | !*.js 9 | !*.d.ts 10 | 11 | # Exclude jsii outdir 12 | lib.jsii 13 | 14 | # Include .jsii and .jsii.gz 15 | !.jsii 16 | !.jsii.gz 17 | -------------------------------------------------------------------------------- /packyak-docs/blog/2024-01-21-DeltaLake_vs_Iceberg_vs_Hudi.md: -------------------------------------------------------------------------------- 1 | --- 2 | slug: deltalake-vs-iceberg-vs-hudi 3 | title: Delta Lake vs Iceberg vs Hudi - which Lakehouse is best? 4 | author: sam 5 | tags: [lakehouse, data lake, data warehouse, delta lake, iceberg, hudi] 6 | --- 7 | 8 | # Delta Lake vs Iceberg vs Hudi - which Lakehouse is best? 9 | -------------------------------------------------------------------------------- /packyak-aws-cdk/src/dns-configuration.ts: -------------------------------------------------------------------------------- 1 | import type { ICertificate } from "aws-cdk-lib/aws-certificatemanager"; 2 | import type { IHostedZone } from "aws-cdk-lib/aws-route53"; 3 | 4 | export interface DNSConfiguration { 5 | readonly certificate: ICertificate; 6 | readonly domainName: string; 7 | readonly hostedZone: IHostedZone; 8 | } 9 | -------------------------------------------------------------------------------- /packyak-docs/.gitignore: -------------------------------------------------------------------------------- 1 | # Dependencies 2 | /node_modules 3 | 4 | # Production 5 | /build 6 | 7 | # Generated files 8 | .docusaurus 9 | .cache-loader 10 | 11 | # Misc 12 | .DS_Store 13 | .env.local 14 | .env.development.local 15 | .env.test.local 16 | .env.production.local 17 | 18 | npm-debug.log* 19 | yarn-debug.log* 20 | yarn-error.log* 21 | -------------------------------------------------------------------------------- /packyak-aws-cdk/src/python-poetry.ts: -------------------------------------------------------------------------------- 1 | export interface PythonPoetryArgs { 2 | readonly include: string[] | undefined; 3 | readonly exclude: string[] | undefined; 4 | readonly dev: boolean | undefined; 5 | readonly allExtras: boolean | undefined; 6 | readonly withoutHashes: boolean | undefined; 7 | readonly withoutUrls: boolean | undefined; 8 | } 9 | -------------------------------------------------------------------------------- /packyak/util/memoize.py: -------------------------------------------------------------------------------- 1 | from typing import TypeVar, Callable 2 | 3 | T = TypeVar("T") 4 | 5 | 6 | def memoize(fn: Callable[[], T]) -> Callable[[], T]: 7 | cache = None 8 | 9 | def wrapper(): 10 | nonlocal cache 11 | if cache is None: 12 | cache = fn() 13 | return cache 14 | 15 | return wrapper 16 | 
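17 | # Illustrative usage (not part of the original module). memoize caches the
18 | # first result of a zero-argument callable; later calls skip the recompute:
19 | #
20 | #   @memoize
21 | #   def load_settings() -> dict[str, str]:
22 | #       return {"env": "dev"}  # body runs once; the result is reused afterwards
23 | #
24 | # Note: a result of None is treated as "not computed yet" and is recomputed.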
-------------------------------------------------------------------------------- /examples/streamlit-aws-cdk/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "../../tsconfig.base.json", 3 | "exclude": ["node_modules", ".venv"], 4 | "include": ["packyak.config.ts"], 5 | "compilerOptions": { 6 | "noEmit": true 7 | }, 8 | "references": [ 9 | { 10 | "path": "../../packyak-aws-cdk/tsconfig.ref.json" 11 | } 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /examples/streamlit-aws-cdk/my_app/videos.py: -------------------------------------------------------------------------------- 1 | from typing import NamedTuple 2 | from packyak import Bucket 3 | 4 | 5 | videos = Bucket("videos") 6 | 7 | 8 | @videos.on("create") 9 | def process_video(event: Bucket.ObjectCreatedEvent): 10 | print("Processing video", event.key) 11 | 12 | 13 | def get_video(key: str): 14 | return videos.get_sync(key) 15 | -------------------------------------------------------------------------------- /packyak/scheduling/cron.py: -------------------------------------------------------------------------------- 1 | from packyak.runtime.function import LambdaFunction 2 | from packyak.runtime.job import Job 3 | from packyak.duration import TimeUnit 4 | from typing import Any, TypeVar 5 | 6 | T = TypeVar("T", bound=Job[Any] | LambdaFunction[[], Any]) 7 | 8 | 9 | def cron(cron: str): 10 | def wrapper(func: T) -> T: 11 | return func 12 | 13 | return wrapper 14 | -------------------------------------------------------------------------------- /packyak-aws-cdk/src/sst-compat.ts: -------------------------------------------------------------------------------- 1 | // TODO: SST compat 2 | 3 | import { Node } from "constructs"; 4 | import type { App } from "sst/constructs"; 5 | 6 | declare module "constructs" { 7 | interface ConstructNode { 8 | root: App; 9 | } 10 | } 11 | Object.defineProperty(Node.prototype, "root", { 12 | get() { 13 | throw new Error("WOOF"); 14 | return this.scope.root; 15 | }, 16 | }); 17 | -------------------------------------------------------------------------------- /packyak-nessie/settings.gradle.kts: -------------------------------------------------------------------------------- 1 | pluginManagement { 2 | val quarkusPluginVersion: String by settings 3 | val quarkusPluginId: String by settings 4 | repositories { 5 | mavenCentral() 6 | gradlePluginPortal() 7 | mavenLocal() 8 | } 9 | plugins { 10 | id(quarkusPluginId) version quarkusPluginVersion 11 | } 12 | } 13 | rootProject.name="code-with-quarkus" 14 | -------------------------------------------------------------------------------- /packyak/scheduling/every.py: -------------------------------------------------------------------------------- 1 | from packyak.runtime.function import LambdaFunction 2 | from packyak.runtime.job import Job 3 | from packyak.duration import TimeUnit 4 | from typing import Any, TypeVar 5 | 6 | 7 | T = TypeVar("T", bound=Job[Any] | LambdaFunction[[], Any]) 8 | 9 | 10 | def every(amount: int, unit: TimeUnit): 11 | def wrapper(func: T) -> T: 12 | return func 13 | 14 | return wrapper 15 | -------------------------------------------------------------------------------- /packyak/synth/call.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | from packyak.runtime.integration import Integration 3 | 4 | 5 | class Call: 6 | def __init__( 7 | self, 8 | func: 
Integration[Any, Any], 9 | *, 10 | obj: Any = None, 11 | metadata: dict[str, property] | None = None, 12 | ): 13 | self.func = func 14 | self.obj = obj 15 | self.metadata = metadata 16 | -------------------------------------------------------------------------------- /packyak/synth/file_utils.py: -------------------------------------------------------------------------------- 1 | def file_path_to_parent_package(file_path: str) -> str: 2 | components = file_path.split("/") 3 | package_name = ".".join(components[:-1]) 4 | return package_name 5 | 6 | 7 | def file_path_to_module_name(file_path: str) -> str: 8 | file_path = file_path.rsplit(".", 1)[0] 9 | components = file_path.split("/") 10 | package_name = ".".join(components) 11 | return package_name 12 | -------------------------------------------------------------------------------- /tsconfig.base.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@tsconfig/node20", 3 | "exclude": ["node_modules", ".venv"], 4 | "compilerOptions": { 5 | "moduleResolution": "Bundler", 6 | "module": "ESNext", 7 | "target": "ESNext", 8 | "declarationMap": true, 9 | "sourceMap": true, 10 | "declaration": true, 11 | "strict": true, 12 | "strictNullChecks": true, 13 | "typeRoots": ["node_modules/@types"] 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /packyak-nessie/src/main/java/org/acme/GreetingResource.java: -------------------------------------------------------------------------------- 1 | package org.acme; 2 | 3 | import jakarta.ws.rs.GET; 4 | import jakarta.ws.rs.Path; 5 | import jakarta.ws.rs.Produces; 6 | import jakarta.ws.rs.core.MediaType; 7 | 8 | @Path("/hello") 9 | public class GreetingResource { 10 | 11 | @GET 12 | @Produces(MediaType.TEXT_PLAIN) 13 | public String hello() { 14 | return "Hello from RESTEasy Reactive"; 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /packyak/synth/loaded_module.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import types 3 | 4 | 5 | class LoadedModule: 6 | def __init__( 7 | self, 8 | module: types.ModuleType, 9 | ast: ast.Module, 10 | module_name: str, 11 | file_name: str, 12 | ): 13 | self.module = module 14 | self.ast = ast 15 | self.module_name = module_name 16 | self.file_name = file_name 17 | self.vars = vars(module) 18 | -------------------------------------------------------------------------------- /pyrightconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "exclude": [ 3 | "**/node_modules", 4 | "**/cdk.out", 5 | "**/.venv", 6 | "**/dist", 7 | "**/lib", 8 | "**/pycache" 9 | ], 10 | "reportUnusedExpression": false, 11 | "reportUnknownArgumentType": false, 12 | "reportUnknownVariableType": false, 13 | "reportUnknownMemberType": false, 14 | "reportUnusedImport": false, 15 | "reportOptionalMemberAccess": false, 16 | "reportMissingTypeStubs": false 17 | } 18 | -------------------------------------------------------------------------------- /packyak/resource.py: -------------------------------------------------------------------------------- 1 | class Resource: 2 | def __init__(self, resource_id: str): 3 | self.resource_id = resource_id 4 | self.bucket_id = self.resource_id 5 | if resource_id in RESOURCES: 6 | raise ValueError(f"Resource with id {resource_id} already exists") 7 | RESOURCES[self.resource_id] = self 8 | 9 | @property 10 | def 
resource_type(self) -> str: 11 | return self.__class__.__name__ 12 | 13 | 14 | RESOURCES: dict[str, Resource] = {} 15 | -------------------------------------------------------------------------------- /packyak-docs/src/pages/index.module.css: -------------------------------------------------------------------------------- 1 | /** 2 | * CSS files with the .module.css suffix will be treated as CSS modules 3 | * and scoped locally. 4 | */ 5 | 6 | .heroBanner { 7 | padding: 4rem 0; 8 | text-align: center; 9 | position: relative; 10 | overflow: hidden; 11 | } 12 | 13 | @media screen and (max-width: 996px) { 14 | .heroBanner { 15 | padding: 2rem; 16 | } 17 | } 18 | 19 | .buttons { 20 | display: flex; 21 | align-items: center; 22 | justify-content: center; 23 | } 24 | -------------------------------------------------------------------------------- /packyak-nessie/.gitignore: -------------------------------------------------------------------------------- 1 | # Gradle 2 | .gradle/ 3 | build/ 4 | 5 | # Eclipse 6 | .project 7 | .classpath 8 | .settings/ 9 | bin/ 10 | 11 | # IntelliJ 12 | .idea 13 | *.ipr 14 | *.iml 15 | *.iws 16 | 17 | # NetBeans 18 | nb-configuration.xml 19 | 20 | # Visual Studio Code 21 | .vscode 22 | .factorypath 23 | 24 | # OSX 25 | .DS_Store 26 | 27 | # Vim 28 | *.swp 29 | *.swo 30 | 31 | # patch 32 | *.orig 33 | *.rej 34 | 35 | # Local environment 36 | .env 37 | 38 | # Plugin directory 39 | /.quarkus/cli/plugins/ 40 | -------------------------------------------------------------------------------- /packyak/asset/partition_key.py: -------------------------------------------------------------------------------- 1 | from pydantic import Field 2 | from typing import Any, NamedTuple, TypeVar 3 | 4 | 5 | # Define a generic type variable 6 | PartitionKeyValue = str | int | float | bool | None 7 | 8 | T = TypeVar("T", bound=PartitionKeyValue) 9 | 10 | 11 | def PartitionKey() -> Any: 12 | return Field(..., partition_key=True) # type: ignore - i think pydantic actually allows this 13 | 14 | 15 | class PartitionKeyField(NamedTuple): 16 | name: str 17 | type: type[PartitionKeyValue] 18 | -------------------------------------------------------------------------------- /packyak-aws-cdk/src/emr/catalog.ts: -------------------------------------------------------------------------------- 1 | import type { Cluster } from "./cluster"; 2 | 3 | /** 4 | * A Table Catalog implementation provides 5 | */ 6 | export interface ICatalog { 7 | /** 8 | * Bind this Catalog to a {@link Cluster} by granting any required IAM Policies 9 | * and adding any required configurations to the Cluster. 
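 * (For example, a Glue Data Catalog implementation might grant Glue and IAM permissions and set the cluster's Hive metastore configuration.)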
10 | * 11 | * @param cluster the cluster to bind this catalog to 12 | * @param catalogName the name to bind the catalog under 13 | */ 14 | bind(cluster: Cluster, catalogName: string): void; 15 | } 16 | -------------------------------------------------------------------------------- /packyak/asset/asset.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Generic, NamedTuple, TypeVar 2 | 3 | Asset = NamedTuple 4 | 5 | TInput = TypeVar("TInput", bound=Asset) 6 | TOutput = TypeVar("TOutput", bound=Asset) 7 | 8 | 9 | class AssetNode(Generic[TOutput]): 10 | def __init__(self, input: "AssetNode[Any]") -> None: 11 | self.input = input 12 | 13 | 14 | class TableAssetNode(AssetNode[TOutput]): 15 | pass 16 | 17 | 18 | def asset(): 19 | def wrapper(func: Callable[..., Asset]): 20 | pass 21 | 22 | return wrapper 23 | -------------------------------------------------------------------------------- /packyak/duration.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | 3 | TimeUnit = ( 4 | Literal["second"] 5 | | Literal["seconds"] 6 | | Literal["minute"] 7 | | Literal["minutes"] 8 | | Literal["hour"] 9 | | Literal["hours"] 10 | | Literal["day"] 11 | | Literal["days"] 12 | ) 13 | 14 | 15 | class Duration: 16 | def __init__(self, amount: int, unit: TimeUnit) -> None: 17 | self.amount = amount 18 | self.unit = unit 19 | 20 | 21 | def duration(amount: int, unit: TimeUnit) -> Duration: 22 | return Duration(amount, unit) 23 | -------------------------------------------------------------------------------- /packyak-aws-cdk/src/emr/python-version.ts: -------------------------------------------------------------------------------- 1 | import { Version } from "../version"; 2 | 3 | export class PythonVersion extends Version { 4 | public static readonly V3_7 = new PythonVersion("3.7"); 5 | public static readonly V3_8 = new PythonVersion("3.8"); 6 | public static readonly V3_9 = new PythonVersion("3.9"); 7 | public static readonly V3_10 = new PythonVersion("3.10"); 8 | public static readonly V3_11 = new PythonVersion("3.11"); 9 | public static readonly V3_12 = new PythonVersion("3.12"); 10 | public static readonly LATEST = PythonVersion.V3_8; 11 | } 12 | -------------------------------------------------------------------------------- /packyak-nessie/src/test/java/org/acme/GreetingResourceTest.java: -------------------------------------------------------------------------------- 1 | package org.acme; 2 | 3 | import io.quarkus.test.junit.QuarkusTest; 4 | import org.junit.jupiter.api.Test; 5 | 6 | import static io.restassured.RestAssured.given; 7 | import static org.hamcrest.CoreMatchers.is; 8 | 9 | @QuarkusTest 10 | class GreetingResourceTest { 11 | @Test 12 | void testHelloEndpoint() { 13 | given() 14 | .when().get("/hello") 15 | .then() 16 | .statusCode(200) 17 | .body(is("Hello from RESTEasy Reactive")); 18 | } 19 | 20 | } -------------------------------------------------------------------------------- /examples/streamlit-aws-cdk/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.12-alpine3.19 2 | 3 | RUN apk update && apk upgrade 4 | 5 | RUN apk add --no-cache \ 6 | build-base \ 7 | gcc \ 8 | cmake \ 9 | python3-dev \ 10 | apache-arrow \ 11 | apache-arrow-dev 12 | 13 | RUN pip install pyarrow pandas 14 | 15 | ARG REQUIREMENTS_PATH 16 | 17 | COPY ${REQUIREMENTS_PATH} requirements.txt 18 | 19 | RUN pip install -r 
requirements.txt 20 | 21 | COPY app /app 22 | 23 | ENV PYTHONPATH=/ 24 | 25 | EXPOSE 8501 26 | 27 | CMD ["streamlit", "run", "app/home.py", "--server.port=8501", "--server.address=0.0.0.0", "--server.fileWatcherType=none"] 28 | -------------------------------------------------------------------------------- /packyak/cli/__init__.py: -------------------------------------------------------------------------------- 1 | from packyak.cli.cli import cli 2 | from packyak.cli.list import list 3 | from packyak.cli.new import new 4 | from packyak.cli.ssh import ssh 5 | from packyak.cli.synth import synth 6 | from packyak.cli.run import run 7 | from packyak.cli.materialize import materialize 8 | from packyak.cli.logs import logs 9 | from packyak.cli.instances import instances 10 | from packyak.cli.clusters import clusters 11 | 12 | 13 | __all__ = [ 14 | "cli", 15 | "new", 16 | "ssh", 17 | "synth", 18 | "run", 19 | "materialize", 20 | "list", 21 | "logs", 22 | "instances", 23 | "clusters", 24 | ] 25 | -------------------------------------------------------------------------------- /packyak/cli/__main__.py: -------------------------------------------------------------------------------- 1 | from packyak.cli.cli import cli 2 | from packyak.cli.list import list 3 | from packyak.cli.materialize import materialize 4 | from packyak.cli.new import new 5 | from packyak.cli.run import run 6 | from packyak.cli.ssh import ssh 7 | from packyak.cli.synth import synth 8 | from packyak.cli.logs import logs 9 | from packyak.cli.instances import instances 10 | from packyak.cli.clusters import clusters 11 | 12 | 13 | __all__ = [ 14 | "cli", 15 | "new", 16 | "ssh", 17 | "synth", 18 | "run", 19 | "materialize", 20 | "list", 21 | "logs", 22 | "instances", 23 | "clusters", 24 | ] 25 | 26 | cli() 27 | -------------------------------------------------------------------------------- /scripts/publish.sh: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env bash 2 | 3 | set -e 4 | 5 | extra_args=$* 6 | version=$(node -p "require('./package.json').version") 7 | 8 | script_dir=$(dirname -- "${BASH_SOURCE[0]}") 9 | echo "Script directory: $script_dir" 10 | 11 | if [[ " $extra_args " =~ " --no-bump " ]]; then 12 | echo "Skipping version bump due to --no-bump flag" 13 | else 14 | env node $script_dir/bump.mjs ${extra_args} 15 | fi 16 | 17 | pnpm run clean 18 | pnpm run build 19 | 20 | if [ "$BUMP_ROOT" == "true" ]; then 21 | poetry build && poetry publish 22 | fi 23 | 24 | if [ "$BUMP_CDK" == "true" ]; then 25 | cd packyak-aws-cdk 26 | pnpm run publish:pypi 27 | npm publish 28 | fi 29 | 30 | 31 | -------------------------------------------------------------------------------- /examples/streamlit-aws-cdk/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "my_app" 3 | 4 | version = "0.1.0" 5 | description = "" 6 | authors = ["sam "] 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | aioboto3 = "^12.1.0" 11 | packyak = { path = "../../" } 12 | pynessie = "^0.66.0" 13 | python = "3.10.13" 14 | streamlit = "^1.29.0" 15 | types-aiobotocore = { extras = ["essential"], version = "^2.9.0" } 16 | 17 | [tool.poetry.group.dev.dependencies] 18 | packyak-aws-cdk = { file = "../../packyak-aws-cdk/lib.jsii/python/packyak_aws_cdk-0.4.0-py3-none-any.whl" } 19 | poetry-plugin-export = "^1.6.0" 20 | 21 | [build-system] 22 | requires = ["poetry-core"] 23 | build-backend = "poetry.core.masonry.api" 24 | -------------------------------------------------------------------------------- /packyak-aws-cdk/scripts/mount-yarn-cgroups.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | 4 | while [[ "$#" -gt 0 ]]; do 5 | case $1 in 6 | --emr-version) EMR_VERSION="$2"; shift ;; 7 | *) echo "Unknown parameter passed: $1"; exit 1 ;; 8 | esac 9 | shift 10 | done 11 | 12 | if [ -z "$EMR_VERSION" ]; then 13 | echo "Error: EMR_VERSION is not set." 14 | exit 1 15 | fi 16 | 17 | 18 | if [ "$EMR_VERSION" -eq "7" ]; then 19 | sudo mkdir -p /yarn-cgroup/devices 20 | sudo mount -t cgroup -o devices cgroupv1-devices /yarn-cgroup/devices 21 | sudo chmod a+rwx -R /yarn-cgroup 22 | else 23 | sudo chmod a+rwx -R /sys/fs/cgroup/cpu,cpuacct 24 | sudo chmod a+rwx -R /sys/fs/cgroup/devices 25 | fi 26 | 27 | 28 | -------------------------------------------------------------------------------- /packyak-aws-cdk/scripts/setup-hadoop-users.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # Check if the user yarn exists, if not, create it 5 | if id "yarn" &>/dev/null; then 6 | echo "User yarn exists, proceeding." 7 | else 8 | echo "User yarn does not exist, creating user." 9 | sudo adduser yarn 10 | fi 11 | 12 | 13 | # Define the groups to be checked and potentially created 14 | groups=("docker" "hadoop" "hdfsadmingroup" "hdfs" "spark") 15 | 16 | # Loop through each group and create it if it doesn't already exist 17 | for group in "${groups[@]}"; do 18 | if getent group "$group" > /dev/null; then 19 | echo "Group $group already exists." 20 | else 21 | sudo groupadd "$group" 22 | echo "Group $group created." 
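        # (the getent check above means groupadd only runs for groups that are missing, so it never fails on duplicates)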
23 | fi 24 | done 25 | -------------------------------------------------------------------------------- /biome.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://biomejs.dev/schemas/1.5.3/schema.json", 3 | "organizeImports": { 4 | "enabled": true 5 | }, 6 | "linter": { 7 | "enabled": false, 8 | "rules": { 9 | "recommended": true, 10 | "suspicious": { 11 | "noExplicitAny": "off" 12 | } 13 | } 14 | }, 15 | "formatter": { 16 | "enabled": true, 17 | "formatWithErrors": false, 18 | "indentStyle": "space", 19 | "indentWidth": 2, 20 | "lineWidth": 80, 21 | "lineEnding": "lf", 22 | "ignore": [] 23 | }, 24 | "javascript": { 25 | "formatter": { 26 | "indentWidth": 2, 27 | "indentStyle": "space" 28 | } 29 | }, 30 | "json": { 31 | "formatter": { 32 | "indentWidth": 2, 33 | "indentStyle": "space" 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /examples/streamlit-aws-cdk/my_app/spark_job.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | from packyak import Bucket, Cluster, every, duration 3 | import json 4 | from datetime import datetime 5 | 6 | spark = Cluster("spark") 7 | 8 | from pyspark import SparkContext 9 | # from pyspark.sql import SparkSession 10 | 11 | 12 | data = Bucket("data") 13 | 14 | orion = data / "orion" 15 | 16 | scans = data / "scans" 17 | 18 | 19 | @every(1, "day") 20 | @spark.job() 21 | def download_hackernews(sc: SparkContext): 22 | ( 23 | sc.binaryFiles(f"{scans}/") 24 | .map(lambda x: x[1].decode("utf-8")) 25 | .flatMap(lambda x: x.split("\n")) 26 | .map(lambda x: json.loads(x)) 27 | .toDF() 28 | .write.parquet(f"{scans}/{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}/") 29 | ) 30 | -------------------------------------------------------------------------------- /examples/streamlit-aws-cdk/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "examples-streamlit-aws-cdk", 3 | "version": "0.1.0", 4 | "description": "A Streamlit app deployed to AWS with Packyak for AWS CDK", 5 | "scripts": { 6 | "start": "poetry run streamlit run ./app/home.py", 7 | "start:docker": "docker build -t packyak . && docker run -e AWS_DEFAULT_REGION=us-east-2 -p 8501:8501 -t packyak", 8 | "synth": "cdk synth --quiet", 9 | "dev": "cdk deploy --require-approval never", 10 | "deploy": "cdk deploy --require-approval never" 11 | }, 12 | "dependencies": { 13 | "@packyak/aws-cdk": "workspace:^" 14 | }, 15 | "devDependencies": { 16 | "aws-cdk": "2.134.0", 17 | "aws-cdk-lib": "2.134.0", 18 | "ts-node": "^10.9.2", 19 | "tsx": "^4.7.0", 20 | "typescript": "^5" 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /packyak/cli/run.py: -------------------------------------------------------------------------------- 1 | import asyncclick as click 2 | from packyak.cli.cli import cli 3 | from packyak.util.git import get_git_branch 4 | 5 | 6 | @cli.command() 7 | @click.argument("root_dir", type=str) 8 | @click.option( 9 | "--branch", 10 | type=str, 11 | help="Name of the data branch to materialize. Default to the branch of the current git repository.", 12 | ) 13 | @click.option( 14 | "--profile", 15 | type=str, 16 | help="AWS CLI profile to use when authenticating to SSM", 17 | ) 18 | @click.option( 19 | "--asset", 20 | multiple=True, 21 | type=str, 22 | help="List of assets to materialize. 
Can be specified multiple times. Defaults to all.", 23 | ) 24 | def run( 25 | root_dir: str, 26 | branch: str = get_git_branch(), 27 | profile: str | None = None, 28 | assets: list[str] = [], 29 | ): 30 | pass 31 | -------------------------------------------------------------------------------- /packyak-nessie/src/main/docker/Dockerfile.native: -------------------------------------------------------------------------------- 1 | #### 2 | # This Dockerfile is used in order to build a container that runs the Quarkus application in native (no JVM) mode. 3 | # 4 | # Before building the container image run: 5 | # 6 | # ./gradlew build -Dquarkus.package.type=native 7 | # 8 | # Then, build the image with: 9 | # 10 | # docker build -f src/main/docker/Dockerfile.native -t quarkus/code-with-quarkus . 11 | # 12 | # Then run the container using: 13 | # 14 | # docker run -i --rm -p 8080:8080 quarkus/code-with-quarkus 15 | # 16 | ### 17 | FROM registry.access.redhat.com/ubi8/ubi-minimal:8.9 18 | WORKDIR /work/ 19 | RUN chown 1001 /work \ 20 | && chmod "g+rwX" /work \ 21 | && chown 1001:root /work 22 | COPY --chown=1001:root build/*-runner /work/application 23 | 24 | EXPOSE 8080 25 | USER 1001 26 | 27 | ENTRYPOINT ["./application", "-Dquarkus.http.host=0.0.0.0"] 28 | -------------------------------------------------------------------------------- /packyak-docs/docs/tutorial-basics/deploy-your-site.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 5 3 | --- 4 | 5 | # Deploy your site 6 | 7 | Docusaurus is a **static-site-generator** (also called **[Jamstack](https://jamstack.org/)**). 8 | 9 | It builds your site as simple **static HTML, JavaScript and CSS files**. 10 | 11 | ## Build your site 12 | 13 | Build your site **for production**: 14 | 15 | ```bash 16 | npm run build 17 | ``` 18 | 19 | The static files are generated in the `build` folder. 20 | 21 | ## Deploy your site 22 | 23 | Test your production build locally: 24 | 25 | ```bash 26 | npm run serve 27 | ``` 28 | 29 | The `build` folder is now served at [http://localhost:3000/](http://localhost:3000/). 30 | 31 | You can now deploy the `build` folder **almost anywhere** easily, **for free** or very small cost (read the **[Deployment Guide](https://docusaurus.io/docs/deployment)**). 
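32 | You can also serve the `build` folder with any static file server; for example, assuming Python 3 is available:
33 | 
34 | ```bash
35 | python3 -m http.server 3000 --directory build
36 | ```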
37 | 
--------------------------------------------------------------------------------
/packyak-aws-cdk/scripts/write-env-variables.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -e
 4 | 
 5 | ENV_FILE=/etc/profile.d/packyak.sh
 6 | 
 7 | # Create the ENV_FILE with explicit permissions
 8 | sudo touch $ENV_FILE
 9 | sudo chmod 644 $ENV_FILE
10 | sudo chown root:root $ENV_FILE
11 | sudo chmod +x $ENV_FILE
12 | 
13 | while [[ "$#" -gt 0 ]]; do
14 |     case $1 in
15 |         --*=*)
16 |             key=$(echo $1 | sed 's/--//;s/=.*//')
17 |             value=$(echo $1 | sed 's/[^=]*=//')
18 |             echo export $key="$value" | sudo tee -a $ENV_FILE > /dev/null
19 |             ;;
20 |         --*)
21 |             key=$(echo $1 | sed 's/--//')
22 |             value=$2
23 |             echo export $key="$value" | sudo tee -a $ENV_FILE > /dev/null
24 |             shift ;;
25 |         *) echo "Unknown parameter passed: $1"; exit 1 ;;
26 |     esac
27 |     shift
28 | done
29 | 
--------------------------------------------------------------------------------
/packyak/cli/materialize.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import asyncclick as click
 3 | from packyak.cli.cli import cli
 4 | from packyak.util.git import get_git_branch
 5 | 
 6 | 
 7 | @cli.command()
 8 | @click.argument(
 9 |     "assets",
10 |     nargs=-1,
11 |     type=str,
12 | )
13 | @click.option(
14 |     "--root",
15 |     type=str,
16 |     help="The working directory of the PackYak project. Defaults to CWD.",
17 | )
18 | @click.option(
19 |     "--branch",
20 |     type=str,
21 |     help="Name of the data branch to materialize. Defaults to the branch of the current git repository.",
22 | )
23 | @click.option(
24 |     "--profile",
25 |     type=str,
26 |     help="AWS CLI profile to use when authenticating to SSM",
27 | )
28 | def materialize(
29 |     assets: tuple[str, ...] = (),
30 |     root: str = os.getcwd(),
31 |     branch: str = get_git_branch(),
32 |     profile: str | None = None,
33 | ):
34 |     pass
35 | 
--------------------------------------------------------------------------------
/packyak-aws-cdk/scripts/install-nvidia-container-toolkit.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-with-yum-or-dnf
 4 | 
 5 | curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | \
 6 |   sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
 7 | 
 8 | sudo yum install -y nvidia-container-toolkit
 9 | 
10 | # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#configuring-docker
11 | 
12 | sudo nvidia-ctk runtime configure --runtime=docker
13 | 
14 | # (CRITICAL) this sets the value "no-cgroups = false" in /etc/nvidia-container-runtime/config.toml
15 | # without it, YARN docker containers will fail with "Failed to initialize NVML: Unknown Error"
16 | sudo nvidia-ctk config --set nvidia-container-cli.no-cgroups=false -i
17 | 
18 | sudo systemctl restart docker
--------------------------------------------------------------------------------
/packyak-docs/sidebars.ts:
--------------------------------------------------------------------------------
 1 | import type {SidebarsConfig} from '@docusaurus/plugin-content-docs';
 2 | 
 3 | /**
 4 |  * Creating a sidebar enables you to:
 5 |  - create an ordered group of docs
 6 |  - render a sidebar for each doc of that group
 7 |  - provide next/previous navigation
 8 | 
 9 |  The sidebars can be generated 
-------------------------------------------------------------------------------- /packyak/cli/materialize.py: -------------------------------------------------------------------------------- 1 | import os 2 | import asyncclick as click 3 | from packyak.cli.cli import cli 4 | from packyak.util.git import get_git_branch 5 | 6 | 7 | @cli.command() 8 | @click.argument( 9 | "assets", 10 | nargs=-1, 11 | type=str, 12 | ) 13 | @click.option( 14 | "--root", 15 | type=str, 16 | help="The working directory of the PackYak project. Defaults to CWD.", 17 | ) 18 | @click.option( 19 | "--branch", 20 | type=str, 21 | help="Name of the data branch to materialize. Defaults to the branch of the current git repository.", 22 | ) 23 | @click.option( 24 | "--profile", 25 | type=str, 26 | help="AWS CLI profile to use when authenticating to SSM", 27 | ) 28 | def materialize( 29 | assets: tuple[str, ...], 30 | root: str | None = None, 31 | branch: str | None = None, 32 | profile: str | None = None, 33 | ): 34 | # resolve defaults at call time (not import time) so the CLI sees the caller's CWD and git branch 35 | root, branch = root or os.getcwd(), branch or get_git_branch() -------------------------------------------------------------------------------- /packyak-aws-cdk/scripts/install-nvidia-container-toolkit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-with-yum-or-dnf 4 | 5 | curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | \ 6 | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo 7 | 8 | sudo yum install -y nvidia-container-toolkit 9 | 10 | # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#configuring-docker 11 | 12 | sudo nvidia-ctk runtime configure --runtime=docker 13 | 14 | # (CRITICAL) this sets the value "no-cgroups = false" in /etc/nvidia-container-runtime/config.toml 15 | # without this, YARN Docker containers will fail with "Failed to initialize NVML: Unknown Error" 16 | sudo nvidia-ctk config --set nvidia-container-cli.no-cgroups=false -i 17 | 18 | sudo systemctl restart docker -------------------------------------------------------------------------------- /packyak-docs/sidebars.ts: -------------------------------------------------------------------------------- 1 | import type {SidebarsConfig} from '@docusaurus/plugin-content-docs'; 2 | 3 | /** 4 | * Creating a sidebar enables you to: 5 | - create an ordered group of docs 6 | - render a sidebar for each doc of that group 7 | - provide next/previous navigation 8 | 9 | The sidebars can be generated from the filesystem, or explicitly defined here. 10 | 11 | Create as many sidebars as you want. 12 | */ 13 | const sidebars: SidebarsConfig = { 14 | // By default, Docusaurus generates a sidebar from the docs folder structure 15 | tutorialSidebar: [{type: 'autogenerated', dirName: '.'}], 16 | 17 | // But you can create a sidebar manually 18 | /* 19 | tutorialSidebar: [ 20 | 'intro', 21 | 'hello', 22 | { 23 | type: 'category', 24 | label: 'Tutorial', 25 | items: ['tutorial-basics/create-a-document'], 26 | }, 27 | ], 28 | */ 29 | }; 30 | 31 | export default sidebars; 32 | -------------------------------------------------------------------------------- /packyak-aws-cdk/src/sagemaker/user-profile.ts: -------------------------------------------------------------------------------- 1 | import { Resource } from "aws-cdk-lib"; 2 | import { IRole } from "aws-cdk-lib/aws-iam"; 3 | import { CfnUserProfile } from "aws-cdk-lib/aws-sagemaker"; 4 | import { Construct } from "constructs"; 5 | import { Domain } from "./domain"; 6 | 7 | export interface UserProfileProps { 8 | readonly domain: Domain; 9 | readonly userProfileName: string; 10 | readonly executionRole?: IRole; 11 | } 12 | 13 | export class UserProfile extends Resource { 14 | protected readonly resource: CfnUserProfile; 15 | constructor(scope: Construct, id: string, props: UserProfileProps) { 16 | super(scope, id); 17 | 18 | this.resource = new CfnUserProfile(this, "Resource", { 19 | domainId: props.domain.domainId, 20 | userProfileName: props.userProfileName, 21 | userSettings: { 22 | executionRole: props.executionRole?.roleArn, 23 | }, 24 | }); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /packyak/runtime/integration.py: -------------------------------------------------------------------------------- 1 | from typing import ( 2 | Any, 3 | Callable, 4 | Generic, 5 | Protocol, 6 | runtime_checkable, 7 | TypeVar, 8 | List, 9 | ParamSpec, 10 | ) 11 | 12 | P = ParamSpec("P") 13 | R = TypeVar("R", covariant=True) 14 | 15 | 16 | @runtime_checkable 17 | class Integration(Protocol, Generic[P, R]): 18 | scopes: List[str] 19 | metadata: dict[str, Any] | None 20 | 21 | def __call__(self, *args: P.args, **kwargs: P.kwargs) -> R: 22 | ...
23 | 24 | 25 | def is_integration(func: Any) -> bool: 26 | return hasattr(func, "scopes") and hasattr(func, "metadata") 27 | 28 | 29 | def integration(*scopes: str, **kwargs: Any): 30 | def decorator(func: Callable[P, R]) -> Callable[P, R]: 31 | setattr(func, "scopes", scopes) 32 | setattr(func, "metadata", kwargs or None) 33 | return func 34 | 35 | return decorator 36 |
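A minimal sketch of how the `@integration` decorator above might be applied — the scope strings and metadata keyword are hypothetical, chosen only to show what `is_integration` inspects:

```python
from packyak.runtime.integration import integration, is_integration


# hypothetical scopes/metadata; the decorator attaches them as function attributes
@integration("s3:GetObject", "s3:PutObject", timeout_seconds=30)
def copy_object(src_key: str, dst_key: str) -> None:
    ...


assert is_integration(copy_object)
assert copy_object.scopes == ("s3:GetObject", "s3:PutObject")  # type: ignore[attr-defined]
assert copy_object.metadata == {"timeout_seconds": 30}  # type: ignore[attr-defined]
```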
-------------------------------------------------------------------------------- /packyak-aws-cdk/bin/packyak.mjs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | import * as path from "path"; 4 | import { promises as fs } from "fs"; 5 | import { exec } from "child_process"; 6 | import { promisify } from "util"; 7 | 8 | const execAsync = promisify(exec); 9 | 10 | const packyakConfig = (await fs.readdir(".")).find((file) => 11 | file.match(/packyak\.config\.(js|ts)/), 12 | ); 13 | if (!packyakConfig) { 14 | console.error("No packyak.config.(js|ts) file found in current directory"); 15 | process.exit(1); 16 | } 17 | 18 | const ext = path.extname(packyakConfig); 19 | 20 | const command = `npx tsx -e 'import("./packyak.config${ext}")'`; 21 | 22 | try { 23 | const { stdout, stderr } = await execAsync(command); 24 | if (stdout.trim()) { 25 | process.stdout.write(stdout); 26 | } 27 | if (stderr.trim()) { 28 | process.stderr.write(stderr); 29 | } 30 | } catch (error) { 31 | console.error("Execution error:", error); 32 | process.exit(1); 33 | } 34 | -------------------------------------------------------------------------------- /packyak-docs/README.md: -------------------------------------------------------------------------------- 1 | # Website 2 | 3 | This website is built using [Docusaurus](https://docusaurus.io/), a modern static website generator. 4 | 5 | ### Installation 6 | 7 | ``` 8 | $ yarn 9 | ``` 10 | 11 | ### Local Development 12 | 13 | ``` 14 | $ yarn start 15 | ``` 16 | 17 | This command starts a local development server and opens up a browser window. Most changes are reflected live without having to restart the server. 18 | 19 | ### Build 20 | 21 | ``` 22 | $ yarn build 23 | ``` 24 | 25 | This command generates static content into the `build` directory and can be served using any static contents hosting service. 26 | 27 | ### Deployment 28 | 29 | Using SSH: 30 | 31 | ``` 32 | $ USE_SSH=true yarn deploy 33 | ``` 34 | 35 | Not using SSH: 36 | 37 | ``` 38 | $ GIT_USER=<Your GitHub username> yarn deploy 39 | ``` 40 | 41 | If you are using GitHub pages for hosting, this command is a convenient way to build the website and push to the `gh-pages` branch. 42 | -------------------------------------------------------------------------------- /packyak/spark/session.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | if TYPE_CHECKING: 6 | from pyspark.sql import SparkSession 7 | 8 | 9 | def init_session(cwd: str = os.getcwd(), venv: str | None = None) -> "SparkSession": 10 | return session_builder(cwd, venv).getOrCreate() 11 | 12 | 13 | def session_builder( 14 | cwd: str = os.getcwd(), venv: str | None = None 15 | ) -> "SparkSession.Builder": 16 | import findspark 17 | 18 | findspark.init() 19 | # import after findspark.init() so pyspark resolves on sys.path; the quoted annotations above keep module import from failing 20 | from pyspark.sql import SparkSession 21 | 22 | venv = venv if venv is not None else os.path.join(cwd, ".venv", "bin", "python") 23 | 24 | builder: SparkSession.Builder = ( 25 | SparkSession.builder.master("yarn") # type: ignore - not sure why builder is of type classproperty instead of SparkSession.builder 26 | .config("spark.pyspark.python", venv) 27 | .config("spark.pyspark.driver.python", venv) 28 | .config("spark.pyspark.virtualenv.enabled", "false") 29 | ) 30 | return builder 31 | -------------------------------------------------------------------------------- /packyak-aws-cdk/src/emr/release-label.ts: -------------------------------------------------------------------------------- 1 | import { PythonVersion } from "./python-version"; 2 | import { ScalaVersion } from "./scala-version"; 3 | import { SparkVersion } from "./spark-version"; 4 | 5 | export class ReleaseLabel { 6 | public static readonly EMR_7_0_0 = new ReleaseLabel( 7 | "emr-7.0.0", 8 | new SparkVersion("3.5.0"), 9 | new PythonVersion("3.9"), 10 | new ScalaVersion("2.12.17"), 11 | ); 12 | public static readonly EMR_6_15_0 = new ReleaseLabel( 13 | "emr-6.15.0", 14 | new SparkVersion("3.4.1"), 15 | new PythonVersion("3.7"), 16 | new ScalaVersion("2.12.17"), 17 | ); 18 | public static readonly EMR_6 = this.EMR_6_15_0; 19 | public static readonly LATEST = this.EMR_7_0_0; 20 | 21 | constructor( 22 | readonly label: string, 23 | readonly sparkVersion: SparkVersion, 24 | readonly pythonVersion: PythonVersion, 25 | readonly scalaVersion: ScalaVersion, 26 | ) {} 27 | 28 | public get majorVersion(): number { 29 | return Number(this.label.split("-")[1].split(".")[0]); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /packyak/util/typed_resource.py: -------------------------------------------------------------------------------- 1 | from typing import Any, TypeVar, AsyncContextManager 2 | 3 | # Define a type variable for the casted type 4 | T = TypeVar("T") 5 | 6 | 7 | class TypedResource(AsyncContextManager[T]): 8 | def __init__(self, untyped_resource: AsyncContextManager[Any]): 9 | self.untyped_resource = untyped_resource 10 | self.typed_resource: T | None = None # To be set in __aenter__ 11 | 12 | async def __aenter__(self) -> T: 13 | # Proxy the __aenter__ call to the untyped resource, then cast it 14 | raw_resource = await self.untyped_resource.__aenter__() 15 | self.typed_resource = self.cast_to_type(raw_resource) 16 | return self.typed_resource 17 | 18 | async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any): 19 | # Proxy the __aexit__ call to the untyped resource 20 | await self.untyped_resource.__aexit__(exc_type, exc_val, exc_tb) 21 | 22 | def cast_to_type(self, resource: Any) -> T: 23 | # default implementation is type-only 24 | return resource 25 | --------------------------------------------------------------------------------
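A minimal sketch of how `TypedResource` could be specialized — the aioboto3/S3 usage is an assumption for illustration (the stub type comes from the `types-aiobotocore` dependency declared in `pyproject.toml`); only `cast_to_type` is overridden, and the cast is type-only:

```python
from typing import Any

import aioboto3
from types_aiobotocore_s3 import S3Client  # stub type; provided by types-aiobotocore[essential]

from packyak.util.typed_resource import TypedResource


class S3ClientResource(TypedResource[S3Client]):
    def cast_to_type(self, resource: Any) -> S3Client:
        # no runtime conversion: the proxy only narrows the static type
        return resource


async def main() -> None:
    session = aioboto3.Session()
    # session.client("s3") is an async context manager; TypedResource proxies it
    async with S3ClientResource(session.client("s3")) as s3:
        # bucket and key are hypothetical
        await s3.put_object(Bucket="my-bucket", Key="example", Body=b"hello")
```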
/packyak-docs/docs/tutorial-basics/create-a-blog-post.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 3 3 | --- 4 | 5 | # Create a Blog Post 6 | 7 | Docusaurus creates a **page for each blog post**, but also a **blog index page**, a **tag system**, an **RSS** feed... 8 | 9 | ## Create your first Post 10 | 11 | Create a file at `blog/2021-02-28-greetings.md`: 12 | 13 | ```md title="blog/2021-02-28-greetings.md" 14 | --- 15 | slug: greetings 16 | title: Greetings! 17 | authors: 18 | - name: Joel Marcey 19 | title: Co-creator of Docusaurus 1 20 | url: https://github.com/JoelMarcey 21 | image_url: https://github.com/JoelMarcey.png 22 | - name: Sébastien Lorber 23 | title: Docusaurus maintainer 24 | url: https://sebastienlorber.com 25 | image_url: https://github.com/slorber.png 26 | tags: [greetings] 27 | --- 28 | 29 | Congratulations, you have made your first post! 30 | 31 | Feel free to play around and edit this post as much as you like. 32 | ``` 33 | 34 | A new blog post is now available at [http://localhost:3000/blog/greetings](http://localhost:3000/blog/greetings). 35 | -------------------------------------------------------------------------------- /examples/streamlit-aws-cdk/my_app/home.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from packyak import Bucket, function 3 | from my_app.videos import get_video 4 | 5 | bucket = Bucket("bucket") 6 | 7 | 8 | @function() 9 | def some_func(): 10 | return bucket.get_sync("my-key") 11 | 12 | 13 | if __name__ == "__main__": 14 | import streamlit as st 15 | 16 | df = pd.DataFrame( 17 | { 18 | "first column": pd.Series([1, 2, 3, 4], dtype=int), 19 | "second column": pd.Series([10, 20, 30, 40], dtype=int), 20 | } 21 | ) 22 | 23 | def get_stuff(key: str): 24 | return bucket.get_sync(key) 25 | 26 | col1 = df["first column"] 27 | 28 | df 29 | 30 | option = st.selectbox("Which number do you like best?", col1) 31 | 32 | "You selected: ", option 33 | 34 | key = st.text_input("Enter a key to get from the bucket", "my-key") 35 | 36 | obj = get_stuff(key) 37 | video = get_video(key) 38 | 39 | if obj: 40 | st.write(obj.body.read().decode("utf-8")) 41 | else: 42 | st.write("No object found") 43 | -------------------------------------------------------------------------------- /packyak/runtime/binding.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from packyak.runtime.function import LambdaFunction 4 | from packyak.runtime.job import Job 5 | from packyak.resource import Resource 6 | from packyak.runtime.runnable import Runnable 7 | from packyak.spec import BindingSpec 8 | from packyak.synth.loaded_module import LoadedModule 9 | 10 | 11 | BindingTarget = Runnable[Any, Any] | LoadedModule 12 | 13 | 14 | class Binding: 15 | def __init__( 16 | self, 17 | function: BindingTarget, 18 | resource: Resource, 19 | scopes: list[str], 20 | metadata: dict[str, Any] | None = None, 21 | ) -> None: 22 | self.resource = resource 23 | self.scopes = scopes 24 | self.function = function 25 | self.metadata = metadata 26 | 27 | def to_binding_spec(self) -> BindingSpec: 28 | return BindingSpec( 29 | resource_type=self.resource.resource_type, 30 | resource_id=self.resource.resource_id, 31 | scopes=self.scopes, 32 | props=self.metadata, 33 | ) 34 | -------------------------------------------------------------------------------- /packyak-aws-cdk/src/emr/spark-sql-extension.ts:
-------------------------------------------------------------------------------- 1 | import { ScalaVersion } from "./scala-version"; 2 | import { SparkVersion } from "./spark-version"; 3 | 4 | export class SparkSqlExtension { 5 | public static readonly Nessie = new SparkSqlExtension( 6 | "org.projectnessie.nessie-integrations", 7 | "nessie-spark-extensions", 8 | "0.76.6", 9 | "org.projectnessie.spark.extensions.NessieSparkSessionExtensions", 10 | ); 11 | public static readonly Iceberg = new SparkSqlExtension( 12 | "org.apache.iceberg", 13 | "iceberg-spark-runtime", 14 | "1.4.3", 15 | "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions", 16 | ); 17 | 18 | constructor( 19 | readonly groupId: string, 20 | readonly artifactId: string, 21 | readonly pkgVersion: string, 22 | readonly className: string, 23 | ) {} 24 | 25 | public maven(sparkVersion: SparkVersion, scalaVersion: ScalaVersion): string { 26 | return `${this.groupId}:${this.artifactId}-${sparkVersion.majorMinorVersion}_${scalaVersion.majorMinorVersion}:${this.pkgVersion}`; 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /packyak-nessie/src/main/docker/Dockerfile.native-micro: -------------------------------------------------------------------------------- 1 | #### 2 | # This Dockerfile is used in order to build a container that runs the Quarkus application in native (no JVM) mode. 3 | # It uses a micro base image, tuned for Quarkus native executables. 4 | # It reduces the size of the resulting container image. 5 | # Check https://quarkus.io/guides/quarkus-runtime-base-image for further information about this image. 6 | # 7 | # Before building the container image run: 8 | # 9 | # ./gradlew build -Dquarkus.package.type=native 10 | # 11 | # Then, build the image with: 12 | # 13 | # docker build -f src/main/docker/Dockerfile.native-micro -t quarkus/code-with-quarkus . 
14 | # 15 | # Then run the container using: 16 | # 17 | # docker run -i --rm -p 8080:8080 quarkus/code-with-quarkus 18 | # 19 | ### 20 | FROM quay.io/quarkus/quarkus-micro-image:2.0 21 | WORKDIR /work/ 22 | RUN chown 1001 /work \ 23 | && chmod "g+rwX" /work \ 24 | && chown 1001:root /work 25 | COPY --chown=1001:root build/*-runner /work/application 26 | 27 | EXPOSE 8080 28 | USER 1001 29 | 30 | ENTRYPOINT ["./application", "-Dquarkus.http.host=0.0.0.0"] 31 | -------------------------------------------------------------------------------- /packyak/__init__.py: -------------------------------------------------------------------------------- 1 | from packyak.runtime.function import function 2 | from packyak.runtime.cluster import Cluster, Engine 3 | from packyak.runtime.job import Job 4 | from packyak.duration import Duration, duration, TimeUnit 5 | from packyak.scheduling.every import every 6 | from packyak.scheduling.cron import cron 7 | from packyak.storage.bucket import Bucket 8 | from packyak.streaming.queue import Queue, Message, ReceivedMessagesEvent 9 | from packyak.synth.synth import synth 10 | from packyak.asset.asset import Asset, asset 11 | from packyak.asset.manifest import Manifest 12 | from packyak.asset.namespace import DB, Namespace 13 | from packyak.asset.partition_key import PartitionKey 14 | from packyak.asset.source import source 15 | 16 | __all__ = [ 17 | "asset", 18 | "Asset", 19 | "Bucket", 20 | "Cluster", 21 | "cron", 22 | "DB", 23 | "duration", 24 | "Duration", 25 | "Engine", 26 | "every", 27 | "function", 28 | "Job", 29 | "Manifest", 30 | "Message", 31 | "Namespace", 32 | "PartitionKey", 33 | "Queue", 34 | "ReceivedMessagesEvent", 35 | "source", 36 | "synth", 37 | "TimeUnit", 38 | ] 39 | -------------------------------------------------------------------------------- /packyak/asset/source.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Generic, TypeVar 2 | 3 | from packyak.asset.asset import AssetNode, TInput, TOutput 4 | from packyak.asset.manifest import Manifest 5 | 6 | 7 | TManifest = TypeVar("TManifest", bound=Manifest) 8 | 9 | 10 | class MaterializedAssetNode(Generic[TInput, TOutput], AssetNode[TOutput]): 11 | def __init__(self, upstream: AssetNode[TInput], f: Callable[[TInput], TOutput]): 12 | self.upstream = upstream 13 | self.f = f 14 | 15 | def materialize(self, input: TInput) -> TOutput: 16 | return self.f(input) 17 | 18 | 19 | def source(id: str, data: type[TManifest]) -> "SourceAssetNode[TManifest]": 20 | return SourceAssetNode[TManifest](id, data) 21 | 22 | 23 | class SourceAssetNode(Generic[TManifest], AssetNode[TManifest]): 24 | def __init__(self, id: str, type: type[TManifest]): 25 | self.id = id 26 | self.type = type 27 | 28 | def table(self): 29 | def wrapper( 30 | f: Callable[[TManifest], TOutput], 31 | ) -> MaterializedAssetNode[TManifest, TOutput]: 32 | return MaterializedAssetNode(self, f) 33 | 34 | return wrapper 35 | -------------------------------------------------------------------------------- /packyak-docs/src/css/custom.css: -------------------------------------------------------------------------------- 1 | /** 2 | * Any CSS included here will be global. The classic template 3 | * bundles Infima by default. Infima is a CSS framework designed to 4 | * work well for content-centric websites. 5 | */ 6 | 7 | /* You can override the default Infima variables here. 
*/ 8 | :root { 9 | --ifm-color-primary: #2e8555; 10 | --ifm-color-primary-dark: #29784c; 11 | --ifm-color-primary-darker: #277148; 12 | --ifm-color-primary-darkest: #205d3b; 13 | --ifm-color-primary-light: #33925d; 14 | --ifm-color-primary-lighter: #359962; 15 | --ifm-color-primary-lightest: #3cad6e; 16 | --ifm-code-font-size: 95%; 17 | --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.1); 18 | } 19 | 20 | /* For readability concerns, you should choose a lighter palette in dark mode. */ 21 | [data-theme='dark'] { 22 | --ifm-color-primary: #25c2a0; 23 | --ifm-color-primary-dark: #21af90; 24 | --ifm-color-primary-darker: #1fa588; 25 | --ifm-color-primary-darkest: #1a8870; 26 | --ifm-color-primary-light: #29d5b0; 27 | --ifm-color-primary-lighter: #32d8b4; 28 | --ifm-color-primary-lightest: #4fddbf; 29 | --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.3); 30 | } 31 | -------------------------------------------------------------------------------- /packyak-nessie/build.gradle.kts: -------------------------------------------------------------------------------- 1 | plugins { 2 | java 3 | id("io.quarkus") 4 | } 5 | 6 | repositories { 7 | mavenCentral() 8 | mavenLocal() 9 | } 10 | 11 | val quarkusPlatformGroupId: String by project 12 | val quarkusPlatformArtifactId: String by project 13 | val quarkusPlatformVersion: String by project 14 | 15 | dependencies { 16 | implementation("io.quarkus:quarkus-amazon-lambda-http") 17 | implementation(enforcedPlatform("${quarkusPlatformGroupId}:${quarkusPlatformArtifactId}:${quarkusPlatformVersion}")) 18 | implementation("io.quarkus:quarkus-arc") 19 | implementation("io.quarkus:quarkus-resteasy-reactive") 20 | testImplementation("io.quarkus:quarkus-junit5") 21 | testImplementation("io.rest-assured:rest-assured") 22 | } 23 | 24 | group = "org.acme" 25 | version = "1.0.0-SNAPSHOT" 26 | 27 | java { 28 | sourceCompatibility = JavaVersion.VERSION_17 29 | targetCompatibility = JavaVersion.VERSION_17 30 | } 31 | 32 | tasks.withType<Test> { 33 | systemProperty("java.util.logging.manager", "org.jboss.logmanager.LogManager") 34 | } 35 | tasks.withType<JavaCompile> { 36 | options.encoding = "UTF-8" 37 | options.compilerArgs.add("-parameters") 38 | } 39 | -------------------------------------------------------------------------------- /packyak-docs/docs/tutorial-basics/congratulations.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 6 3 | --- 4 | 5 | # Congratulations! 6 | 7 | You have just learned the **basics of Docusaurus** and made some changes to the **initial template**. 8 | 9 | Docusaurus has **much more to offer**! 10 | 11 | Have **5 more minutes**? Take a look at **[versioning](../tutorial-extras/manage-docs-versions.md)** and **[i18n](../tutorial-extras/translate-your-site.md)**. 12 | 13 | Anything **unclear** or **buggy** in this tutorial? [Please report it!](https://github.com/facebook/docusaurus/discussions/4610) 14 | 15 | ## What's next?
16 | 17 | - Read the [official documentation](https://docusaurus.io/) 18 | - Modify your site configuration with [`docusaurus.config.js`](https://docusaurus.io/docs/api/docusaurus-config) 19 | - Add navbar and footer items with [`themeConfig`](https://docusaurus.io/docs/api/themes/configuration) 20 | - Add a custom [Design and Layout](https://docusaurus.io/docs/styling-layout) 21 | - Add a [search bar](https://docusaurus.io/docs/search) 22 | - Find inspirations in the [Docusaurus showcase](https://docusaurus.io/showcase) 23 | - Get involved in the [Docusaurus Community](https://docusaurus.io/community/support) 24 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "Python: Streamlit AWS CDK", 6 | "type": "python", 7 | "request": "launch", 8 | "console": "integratedTerminal", 9 | "justMyCode": true, 10 | "python": "${workspaceFolder}/.venv/bin/python", 11 | "module": "app", 12 | "cwd": "${workspaceFolder}/examples/streamlit-aws-cdk" 13 | }, 14 | { 15 | "name": "Python: packyak.synth", 16 | "type": "python", 17 | "request": "launch", 18 | "console": "integratedTerminal", 19 | "justMyCode": true, 20 | "python": "${workspaceFolder}/.venv/bin/python", 21 | "module": "packyak.synth", 22 | "cwd": "${workspaceFolder}/examples/streamlit-aws-cdk", 23 | "args": ["--root=app"], 24 | "env": { 25 | "AWS_DEFAULT_REGION": "us-east-1" 26 | } 27 | }, 28 | { 29 | "name": "Python: CodeGen", 30 | "type": "python", 31 | "request": "launch", 32 | "program": "${workspaceFolder}/scripts/codegen.py", 33 | "console": "integratedTerminal", 34 | "justMyCode": true, 35 | "python": "${workspaceFolder}/.venv/bin/python" 36 | } 37 | ] 38 | } 39 | -------------------------------------------------------------------------------- /packyak/runtime/runnable.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, ParamSpec, TypeVar, Generic 2 | from packyak.resource import Resource 3 | from packyak.util.fqn import get_fully_qualified_name 4 | from packyak.spec import DependencyGroup 5 | 6 | Params = ParamSpec("Params") 7 | Return = TypeVar("Return") 8 | 9 | 10 | class Runnable(Resource, Generic[Params, Return]): 11 | def __init__( 12 | self, 13 | handler: Callable[Params, Return], 14 | resource_id: str | None, 15 | file_name: str | None, 16 | with_: DependencyGroup | None, 17 | without: DependencyGroup | None, 18 | dev: bool | None, 19 | all_extras: bool | None, 20 | without_hashes: bool | None, 21 | without_urls: bool | None, 22 | ): 23 | super().__init__( 24 | resource_id=resource_id or get_fully_qualified_name(handler), 25 | ) 26 | self.file_name = file_name or handler.__code__.co_filename 27 | self.handler = handler 28 | self.with_ = with_ 29 | self.without = without 30 | self.dev = dev 31 | self.all_extras = all_extras 32 | self.without_hashes = without_hashes 33 | self.without_urls = without_urls 34 | -------------------------------------------------------------------------------- /packyak-docs/docs/tutorial-basics/create-a-page.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 1 3 | --- 4 | 5 | # Create a Page 6 | 7 | Add **Markdown or React** files to `src/pages` to create a **standalone page**: 8 | 9 | - `src/pages/index.js` → `localhost:3000/` 10 | - `src/pages/foo.md` → `localhost:3000/foo` 
11 | - `src/pages/foo/bar.js` → `localhost:3000/foo/bar` 12 | 13 | ## Create your first React Page 14 | 15 | Create a file at `src/pages/my-react-page.js`: 16 | 17 | ```jsx title="src/pages/my-react-page.js" 18 | import React from 'react'; 19 | import Layout from '@theme/Layout'; 20 | 21 | export default function MyReactPage() { 22 | return ( 23 | <Layout> 24 | <h1>My React page</h1> 25 | <p>This is a React page</p>
27 | ); 28 | } 29 | ``` 30 | 31 | A new page is now available at [http://localhost:3000/my-react-page](http://localhost:3000/my-react-page). 32 | 33 | ## Create your first Markdown Page 34 | 35 | Create a file at `src/pages/my-markdown-page.md`: 36 | 37 | ```mdx title="src/pages/my-markdown-page.md" 38 | # My Markdown page 39 | 40 | This is a Markdown page 41 | ``` 42 | 43 | A new page is now available at [http://localhost:3000/my-markdown-page](http://localhost:3000/my-markdown-page). 44 | -------------------------------------------------------------------------------- /packyak/runtime/job.py: -------------------------------------------------------------------------------- 1 | from typing import ( 2 | Callable, 3 | TypeVar, 4 | TYPE_CHECKING, 5 | ) 6 | from packyak.runtime.runnable import Runnable 7 | 8 | from packyak.spec import DependencyGroup 9 | 10 | 11 | # if TYPE_CHECKING: 12 | from pyspark import SparkContext 13 | 14 | Return = TypeVar("Return") 15 | 16 | 17 | class Job(Runnable[[SparkContext], Return]): 18 | def __init__( 19 | self, 20 | handler: Callable[[SparkContext], Return], 21 | job_id: str | None = None, 22 | file_name: str | None = None, 23 | with_: DependencyGroup | None = None, 24 | without: DependencyGroup | None = None, 25 | dev: bool | None = None, 26 | all_extras: bool | None = None, 27 | without_hashes: bool | None = None, 28 | without_urls: bool | None = None, 29 | ) -> None: 30 | super().__init__( 31 | resource_id=job_id, 32 | handler=handler, 33 | file_name=file_name, 34 | with_=with_, 35 | without=without, 36 | dev=dev, 37 | all_extras=all_extras, 38 | without_hashes=without_hashes, 39 | without_urls=without_urls, 40 | ) 41 | self.job_id = job_id 42 | -------------------------------------------------------------------------------- /packyak-aws-cdk/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "outDir": "lib", 4 | "rootDir": "src", 5 | "declarationMap": false, 6 | "inlineSourceMap": true, 7 | "inlineSources": true, 8 | "alwaysStrict": true, 9 | "declaration": true, 10 | "experimentalDecorators": true, 11 | "incremental": true, 12 | "lib": [ 13 | "es2020" 14 | ], 15 | "module": "CommonJS", 16 | "noEmitOnError": true, 17 | "noFallthroughCasesInSwitch": true, 18 | "noImplicitAny": true, 19 | "noImplicitReturns": true, 20 | "noImplicitThis": true, 21 | "noUnusedLocals": true, 22 | "noUnusedParameters": true, 23 | "resolveJsonModule": true, 24 | "skipLibCheck": true, 25 | "strict": true, 26 | "strictNullChecks": true, 27 | "strictPropertyInitialization": true, 28 | "stripInternal": false, 29 | "target": "ES2020", 30 | "composite": false, 31 | "tsBuildInfoFile": "lib/tsconfig.tsbuildinfo" 32 | }, 33 | "include": [ 34 | "src/**/*.ts" 35 | ], 36 | "exclude": [ 37 | "node_modules", 38 | "/Users/sam/workspaces/packyak/packyak-aws-cdk/lib/.types-compat" 39 | ], 40 | "_generated_by_jsii_": "Generated by jsii - safe to delete, and ideally should be in .gitignore" 41 | } -------------------------------------------------------------------------------- /packyak.config.ts: -------------------------------------------------------------------------------- 1 | import "./packyak-aws-cdk/lib/index"; 2 | 3 | import { 4 | Certificate, 5 | CertificateValidation, 6 | } from "aws-cdk-lib/aws-certificatemanager"; 7 | import { HostedZone } from "aws-cdk-lib/aws-route53"; 8 | import { App, Stack } from "aws-cdk-lib/core"; 9 | 10 | async function Dns(this: Stack) { 11 | const hostedZone = new HostedZone(this, 
"hostedZone", { 12 | zoneName: "packyak.ai", 13 | }); 14 | const certificate = new Certificate(this, "certificate", { 15 | domainName: "packyak.ai", 16 | validation: CertificateValidation.fromDns(hostedZone), 17 | }); 18 | 19 | return { 20 | hostedZone, 21 | certificate, 22 | }; 23 | } 24 | 25 | // https://spark.apache.org/docs/latest/sql-data-sources-binaryFile.html 26 | 27 | // CDK App start here 28 | 29 | const app = new App(); 30 | 31 | // const dns = await app.create(Dns); 32 | 33 | // app.create(function DocsSite(this: Construct) { 34 | // const site = new StaticSite(this, "docs.packyak.ai", { 35 | // path: "docs", 36 | // customDomain: { 37 | // domainName: "docs.packyak.ai", 38 | // hostedZone: dns.hostedZone.hostedZoneArn, 39 | // }, 40 | // }); 41 | 42 | // return { 43 | // SiteUrl: site.url!, 44 | // }; 45 | // }); 46 | -------------------------------------------------------------------------------- /packyak-aws-cdk/src/emr/spark-config.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Format a set of CLI options into a string where {key}={value}. 3 | */ 4 | export function toCLIArgs(options: Record): string { 5 | return Object.entries(options) 6 | .map(([key, value]) => `${key}=${value}`) 7 | .join(" "); 8 | } 9 | 10 | /** 11 | * Parse a CLI options string into a key-value pair record. 12 | */ 13 | export function parseCLIArgs(optionsString: string): Record { 14 | const optionsArray = optionsString.split(" "); 15 | const optionsRecord: Record = {}; 16 | 17 | optionsArray.forEach((option) => { 18 | const [key, value] = option.split("="); 19 | if (key && value) { 20 | optionsRecord[key] = value; 21 | } 22 | }); 23 | 24 | return optionsRecord; 25 | } 26 | 27 | export function mergeSparkExtraJars( 28 | ...args: (string | Record | undefined)[] 29 | ): string { 30 | return toCLIArgs( 31 | args.reduce((acc: Record, current) => { 32 | if (typeof current === "string") { 33 | current = parseCLIArgs(current); 34 | } else if (current === undefined) { 35 | current = {}; 36 | } 37 | return { ...acc, ...current }; 38 | }, {}), 39 | ); 40 | } 41 | -------------------------------------------------------------------------------- /packyak-aws-cdk/src/export-requirements.ts: -------------------------------------------------------------------------------- 1 | import * as fs from "fs"; 2 | import { execSync } from "child_process"; 3 | import * as path from "path"; 4 | import type { PythonPoetryArgs } from "./python-poetry"; 5 | 6 | export function exportRequirementsSync( 7 | dir: string, 8 | options?: PythonPoetryArgs, 9 | ): string { 10 | const requirements = path.join(dir, "requirements.txt"); 11 | const command = [ 12 | "poetry export -f requirements.txt", 13 | arg("with", options?.include), 14 | arg("without", options?.exclude), 15 | arg("without-urls", options?.withoutUrls), 16 | arg("without-hashes", options?.withoutHashes ?? true), 17 | arg("dev", options?.dev), 18 | arg("all-extras", options?.allExtras), 19 | `> ${requirements}`, 20 | ]; 21 | 22 | fs.mkdirSync(dir, { recursive: true }); 23 | execSync(command.join(" ")); 24 | return requirements; 25 | 26 | function arg( 27 | flag: string, 28 | value: T | undefined, 29 | ) { 30 | if (value === undefined) { 31 | return ""; 32 | } else if (typeof value === "boolean") { 33 | return value ? ` --${flag}` : ""; 34 | } else { 35 | return ` --${flag}=${Array.isArray(value) ? 
value.join(",") : value}`; 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /packyak-docs/docs/tutorial-basics/create-a-document.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 2 3 | --- 4 | 5 | # Create a Document 6 | 7 | Documents are **groups of pages** connected through: 8 | 9 | - a **sidebar** 10 | - **previous/next navigation** 11 | - **versioning** 12 | 13 | ## Create your first Doc 14 | 15 | Create a Markdown file at `docs/hello.md`: 16 | 17 | ```md title="docs/hello.md" 18 | # Hello 19 | 20 | This is my **first Docusaurus document**! 21 | ``` 22 | 23 | A new document is now available at [http://localhost:3000/docs/hello](http://localhost:3000/docs/hello). 24 | 25 | ## Configure the Sidebar 26 | 27 | Docusaurus automatically **creates a sidebar** from the `docs` folder. 28 | 29 | Add metadata to customize the sidebar label and position: 30 | 31 | ```md title="docs/hello.md" {1-4} 32 | --- 33 | sidebar_label: 'Hi!' 34 | sidebar_position: 3 35 | --- 36 | 37 | # Hello 38 | 39 | This is my **first Docusaurus document**! 40 | ``` 41 | 42 | It is also possible to create your sidebar explicitly in `sidebars.js`: 43 | 44 | ```js title="sidebars.js" 45 | export default { 46 | tutorialSidebar: [ 47 | 'intro', 48 | // highlight-next-line 49 | 'hello', 50 | { 51 | type: 'category', 52 | label: 'Tutorial', 53 | items: ['tutorial-basics/create-a-document'], 54 | }, 55 | ], 56 | }; 57 | ``` 58 | -------------------------------------------------------------------------------- /packyak-aws-cdk/scripts/install-ssm-agent.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | ## Name: SSM Agent Installer Script 5 | ## Description: Installs SSM Agent on EMR cluster EC2 instances and update hosts file 6 | ## Reference: https://aws.amazon.com/blogs/big-data/securing-access-to-emr-clusters-using-aws-systems-manager/ 7 | ## 8 | sudo yum install -y https://s3.amazonaws.com/ec2-downloads-windows/SSMAgent/latest/linux_amd64/amazon-ssm-agent.rpm 9 | sudo status amazon-ssm-agent >>/tmp/ssm-status.log 10 | ## Update hosts file 11 | echo "\n ########### localhost mapping check ########### \n" > /tmp/localhost.log 12 | lhost=`sudo cat /etc/hosts | grep localhost | grep '127.0.0.1' | grep -v '^#'` 13 | v_ipaddr=`hostname --ip-address` 14 | lhostmapping=`sudo cat /etc/hosts | grep $v_ipaddr | grep -v '^#'` 15 | if [ -z "${lhostmapping}" ]; 16 | then 17 | echo "\n ########### IP address to localhost mapping NOT defined in hosts files. 
add now ########### \n " >> /tmp/localhost.log 18 | sudo echo "${v_ipaddr} localhost" >>/etc/hosts 19 | else 20 | echo "\n IP address to localhost mapping already defined in hosts file \n" >> /tmp/localhost.log 21 | fi 22 | echo "\n ########### IP Address to localhost mapping check complete and below is the content ########### " >> /tmp/localhost.log 23 | sudo cat /etc/hosts >> /tmp/localhost.log 24 | 25 | echo "\n ########### Exit script ########### " >> /tmp/localhost.log -------------------------------------------------------------------------------- /packyak/cli/list.py: -------------------------------------------------------------------------------- 1 | import asyncclick as click 2 | import os 3 | from tabulate import tabulate 4 | 5 | from packyak.cli.cli import cli 6 | from packyak.util.emr import EMR 7 | 8 | 9 | @cli.command() 10 | @click.option( 11 | "--profile", type=str, help="AWS CLI profile to use when authenticating to SSM" 12 | ) 13 | @click.option("-v", "--verbose", type=str, is_flag=True) 14 | async def list(profile: str | None, verbose: bool = False): 15 | if profile is not None: 16 | os.environ["AWS_PROFILE"] = profile 17 | 18 | emr = EMR() 19 | 20 | try: 21 | clusters = await emr.list_clusters() 22 | if len(clusters) > 0: 23 | print( 24 | tabulate( 25 | [ 26 | [ 27 | cluster.cluster_id, 28 | cluster.cluster_name, 29 | cluster.cluster_status.value, 30 | ] 31 | for cluster in clusters 32 | ], 33 | # headers=["Cluster ID", "Cluster Name", "Cluster Status"], 34 | headers=[], 35 | tablefmt="plain", 36 | ) 37 | ) 38 | else: 39 | pass 40 | except Exception as e: 41 | raise click.ClickException(f"Error listing EMR clusters: {e}") 42 | -------------------------------------------------------------------------------- /packyak-docs/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "packyak-docs", 3 | "version": "0.0.0", 4 | "private": true, 5 | "scripts": { 6 | "docusaurus": "docusaurus", 7 | "start": "docusaurus start", 8 | "build": "docusaurus build", 9 | "swizzle": "docusaurus swizzle", 10 | "deploy": "docusaurus deploy", 11 | "clear": "docusaurus clear", 12 | "serve": "docusaurus serve", 13 | "write-translations": "docusaurus write-translations", 14 | "write-heading-ids": "docusaurus write-heading-ids", 15 | "typecheck": "tsc" 16 | }, 17 | "dependencies": { 18 | "@docusaurus/core": "3.1.0", 19 | "@docusaurus/preset-classic": "3.1.0", 20 | "@mdx-js/react": "^3.0.0", 21 | "clsx": "^2.1.0", 22 | "prism-react-renderer": "^2.3.1", 23 | "react": "^18.2.0", 24 | "react-dom": "^18.2.0" 25 | }, 26 | "devDependencies": { 27 | "@docusaurus/module-type-aliases": "3.1.0", 28 | "@docusaurus/theme-classic": "^3.1.0", 29 | "@docusaurus/tsconfig": "3.1.0", 30 | "@docusaurus/types": "3.1.0", 31 | "typescript": "^5.3.3" 32 | }, 33 | "browserslist": { 34 | "production": [ 35 | ">0.5%", 36 | "not dead", 37 | "not op_mini all" 38 | ], 39 | "development": [ 40 | "last 3 chrome version", 41 | "last 3 firefox version", 42 | "last 5 safari version" 43 | ] 44 | }, 45 | "engines": { 46 | "node": ">=18.0" 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /packyak-aws-cdk/src/packyak-resource.ts: -------------------------------------------------------------------------------- 1 | import { CustomResource, Duration } from "aws-cdk-lib"; 2 | import { Architecture, Function, Runtime, Code } from "aws-cdk-lib/aws-lambda"; 3 | import { Provider } from "aws-cdk-lib/custom-resources"; 4 | import type { 
Construct } from "constructs"; 5 | 6 | export interface PackYakResourceProps { 7 | resourceType: string; 8 | properties: Record<string, any>; 9 | code: Code; 10 | architecture?: Architecture; 11 | runtime?: Runtime; 12 | handler?: string; 13 | timeout?: Duration; 14 | environment?: Record<string, string>; 15 | } 16 | 17 | export class PackYakResource extends Function { 18 | constructor(scope: Construct, id: string, props: PackYakResourceProps) { 19 | super(scope, id, { 20 | code: props.code, 21 | architecture: props.architecture ?? Architecture.ARM_64, 22 | runtime: props.runtime ?? Runtime.NODEJS_20_X, 23 | handler: props.handler ?? "index.handler", 24 | timeout: props.timeout ?? Duration.minutes(1), 25 | environment: { 26 | NODE_OPTIONS: "--enable-source-maps", 27 | ...props.environment, 28 | }, 29 | }); 30 | 31 | const provider = new Provider(this, "CreateUsers", { 32 | onEventHandler: this, 33 | }); 34 | 35 | new CustomResource(this, "Users", { 36 | serviceToken: provider.serviceToken, 37 | resourceType: props.resourceType, 38 | properties: props.properties, 39 | }); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "packyak" 3 | version = "0.4.22" 4 | description = "Infrastructure for AI applications and machine learning pipelines" 5 | authors = ["sam "] 6 | readme = "README.md" 7 | include = ["./packyak/py.typed"] 8 | 9 | [tool.poetry.scripts] 10 | yak = "packyak.cli:cli" 11 | packyak = "packyak.cli:cli" 12 | 13 | [tool.poetry.dependencies] 14 | types-aiobotocore = { extras = ["essential", "emr"], version = "^2.9.0" } 15 | types-boto3 = "^1.0.2" 16 | aioboto3 = "^12.1.0" 17 | aiobotocore = { version = "2.8.0", extras = ["boto3"] } 18 | aws-lambda-typing = "^2.18.0" 19 | boto3-stubs = { extras = [ 20 | "essential", 21 | "sts", 22 | "ssm", 23 | "emr", 24 | ], version = "^1.34.15" } 25 | pandas = ">2" 26 | pandera = "*" 27 | pydantic = "^2.5.3" 28 | python = "^3.10" 29 | pandas-stubs = ">2" 30 | aiofiles = "^23.2.1" 31 | pyspark = { extras = ["pandas-on-spark"], version = "^3.5.0" } 32 | click = "^8.1.7" 33 | tabulate = "^0.9.0" 34 | findspark = "^2.0.1" 35 | asyncclick = "^8.1.7.1" 36 | 37 | [tool.poetry.group.dev.dependencies] 38 | matplotlib = "^3.8.2" 39 | pylint = "^3.0.3" 40 | pyright = "^1.1.351" 41 | requests = "^2.31.0" 42 | ruff = "^0.1.11" 43 | streamlit = "^1.29.0" 44 | pynessie = "^0.66.0" 45 | ipykernel = "^6.29.2" 46 | twine = "^5.0.0" 47 | 48 | [build-system] 49 | requires = ["poetry-core"] 50 | build-backend = "poetry.core.masonry.api" 51 | 52 | [tool.ruff] 53 | ignore = ["E402", "F401"] 54 | -------------------------------------------------------------------------------- /packyak-docs/src/pages/index.tsx: -------------------------------------------------------------------------------- 1 | import clsx from 'clsx'; 2 | import Link from '@docusaurus/Link'; 3 | import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; 4 | import Layout from '@theme/Layout'; 5 | import HomepageFeatures from '@site/src/components/HomepageFeatures'; 6 | import Heading from '@theme/Heading'; 7 | 8 | import styles from './index.module.css'; 9 | 10 | function HomepageHeader() { 11 | const {siteConfig} = useDocusaurusContext(); 12 | return ( 13 |
<header className={clsx('hero hero--primary', styles.heroBanner)}> 14 | <div className="container"> 15 | <Heading as="h1" className="hero__title"> 16 | {siteConfig.title} 17 | </Heading> 18 | <p className="hero__subtitle">{siteConfig.tagline}</p> 19 | <div className={styles.buttons}> 20 | <Link 21 | className="button button--secondary button--lg" 22 | to="/docs/intro"> 23 | Docusaurus Tutorial - 5min ⏱️ 24 | </Link> 25 | </div> 26 | </div> 27 | </header> 28 | ); 29 | } 30 | 31 | export default function Home(): JSX.Element { 32 | const {siteConfig} = useDocusaurusContext(); 33 | return ( 34 | <Layout 35 | title={`Hello from ${siteConfig.title}`} 36 | description="Description will go into a meta tag in <head />"> 37 | <HomepageHeader /> 38 | <main> 39 | <HomepageFeatures /> 40 | </main> 41 | </Layout>
42 | ); 43 | } 44 | -------------------------------------------------------------------------------- /packyak/cli/instances.py: -------------------------------------------------------------------------------- 1 | import asyncclick as click 2 | from packyak.cli.cli import cli 3 | from packyak.util.emr import EMR, NodeType 4 | import os 5 | 6 | 7 | @cli.command() 8 | @click.argument( 9 | "cluster_id", 10 | required=True, 11 | type=str, 12 | ) 13 | @click.option( 14 | "--node-type", 15 | type=click.Choice(["primary", "core", "task"], case_sensitive=False), 16 | default=None, 17 | help="The node type to filter by.", 18 | ) 19 | @click.option( 20 | "--profile", 21 | required=False, 22 | type=str, 23 | default=None, 24 | help="The AWS profile to use for the session.", 25 | ) 26 | async def instances(cluster_id: str, node_type: NodeType | None, profile: str | None): 27 | """ 28 | Lists instances in an EMR cluster with optional filtering by node type. 29 | """ 30 | if profile is not None: 31 | os.environ["AWS_PROFILE"] = profile 32 | emr = EMR() 33 | 34 | if node_type is None: 35 | types_to_fetch: list[NodeType] = ["primary", "core", "task"] 36 | else: 37 | types_to_fetch = [node_type] 38 | 39 | output_messages = [] 40 | for node_type in types_to_fetch: 41 | instances = await emr.list_instance_ids(cluster_id, node_type) 42 | if instances: 43 | output_messages.append(f"{node_type.capitalize()}:") 44 | for instance in instances: 45 | output_messages.append(instance) 46 | 47 | for message in output_messages: 48 | click.echo(message) 49 | -------------------------------------------------------------------------------- /packyak-docs/docs/tutorial-extras/manage-docs-versions.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 1 3 | --- 4 | 5 | # Manage Docs Versions 6 | 7 | Docusaurus can manage multiple versions of your docs. 8 | 9 | ## Create a docs version 10 | 11 | Release a version 1.0 of your project: 12 | 13 | ```bash 14 | npm run docusaurus docs:version 1.0 15 | ``` 16 | 17 | The `docs` folder is copied into `versioned_docs/version-1.0` and `versions.json` is created. 18 | 19 | Your docs now have 2 versions: 20 | 21 | - `1.0` at `http://localhost:3000/docs/` for the version 1.0 docs 22 | - `current` at `http://localhost:3000/docs/next/` for the **upcoming, unreleased docs** 23 | 24 | ## Add a Version Dropdown 25 | 26 | To navigate seamlessly across versions, add a version dropdown. 
27 | 28 | Modify the `docusaurus.config.js` file: 29 | 30 | ```js title="docusaurus.config.js" 31 | export default { 32 | themeConfig: { 33 | navbar: { 34 | items: [ 35 | // highlight-start 36 | { 37 | type: 'docsVersionDropdown', 38 | }, 39 | // highlight-end 40 | ], 41 | }, 42 | }, 43 | }; 44 | ``` 45 | 46 | The docs version dropdown appears in your navbar: 47 | 48 | ![Docs Version Dropdown](./img/docsVersionDropdown.png) 49 | 50 | ## Update an existing version 51 | 52 | It is possible to edit versioned docs in their respective folder: 53 | 54 | - `versioned_docs/version-1.0/hello.md` updates `http://localhost:3000/docs/hello` 55 | - `docs/hello.md` updates `http://localhost:3000/docs/next/hello` 56 | -------------------------------------------------------------------------------- /packyak/runtime/cluster.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import Callable, TypeVar, TYPE_CHECKING 3 | 4 | from packyak.runtime.job import Job 5 | from packyak.spec import DependencyGroup 6 | from packyak.resource import Resource 7 | from pyspark import SparkContext 8 | 9 | Return = TypeVar("Return") 10 | 11 | 12 | class Engine(Enum): 13 | Spark = "SPARK" 14 | 15 | 16 | class Cluster(Resource): 17 | def __init__(self, cluster_id: str): 18 | super().__init__(cluster_id) 19 | self.cluster_id = cluster_id 20 | self.engine = Engine.Spark 21 | 22 | def job( 23 | self, 24 | job_id: str | None = None, 25 | *, 26 | file_name: str | None = None, 27 | with_: DependencyGroup = None, 28 | without: DependencyGroup = None, 29 | # deprecated, use with_ and without 30 | dev: bool | None = None, 31 | all_extras: bool | None = None, 32 | without_hashes: bool | None = None, 33 | without_urls: bool | None = None, 34 | ): 35 | def decorator(handler: Callable[[SparkContext], Return]) -> Job[Return]: 36 | return Job( 37 | handler=handler, 38 | job_id=job_id, 39 | file_name=file_name, 40 | with_=with_, 41 | without=without, 42 | dev=dev, 43 | all_extras=all_extras, 44 | without_hashes=without_hashes, 45 | without_urls=without_urls, 46 | ) 47 | 48 | return decorator 49 | -------------------------------------------------------------------------------- /packyak-aws-cdk/src/nessie/nessie-lambda-catalog.ts: -------------------------------------------------------------------------------- 1 | import { 2 | Code, 3 | Function, 4 | FunctionUrl, 5 | FunctionUrlAuthType, 6 | InvokeMode, 7 | Runtime, 8 | SnapStartConf, 9 | } from "aws-cdk-lib/aws-lambda"; 10 | import { Construct } from "constructs"; 11 | import * as path from "path"; 12 | import { BaseNessieCatalog, BaseNessieRepoProps } from "./base-nessie-catalog"; 13 | 14 | export interface NessieLambdaCatalogProps extends BaseNessieRepoProps {} 15 | 16 | export class NessieLambdaCatalog extends BaseNessieCatalog { 17 | public readonly function: Function; 18 | public readonly functionUrl: FunctionUrl; 19 | 20 | public override readonly endpoint: string; 21 | 22 | constructor(scope: Construct, id: string, props: NessieLambdaCatalogProps) { 23 | super(scope, id, props); 24 | 25 | // TODO: none of this is right 26 | // see: https://project-nessie.zulipchat.com/#narrow/stream/371187-general/topic/AWS.20Lambda.20with.20SnapStart 27 | this.function = new Function(this, "Function", { 28 | runtime: Runtime.JAVA_17, 29 | snapStart: SnapStartConf.ON_PUBLISHED_VERSIONS, 30 | code: Code.fromAsset(path.join(__dirname, "lambda")), 31 | handler: 32 | 
"io.quarkus.amazon.lambda.runtime.QuarkusStreamHandler::handleRequest", 33 | }); 34 | this.functionUrl = this.function.addFunctionUrl({ 35 | authType: FunctionUrlAuthType.AWS_IAM, 36 | // TODO: what's right here? Maybe streaming? 37 | invokeMode: InvokeMode.BUFFERED, 38 | }); 39 | this.endpoint = this.functionUrl.url; 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /packyak-docs/docs/intro.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 1 3 | --- 4 | 5 | # Tutorial Intro 6 | 7 | Let's discover **Docusaurus in less than 5 minutes**. 8 | 9 | ## Getting Started 10 | 11 | Get started by **creating a new site**. 12 | 13 | Or **try Docusaurus immediately** with **[docusaurus.new](https://docusaurus.new)**. 14 | 15 | ### What you'll need 16 | 17 | - [Node.js](https://nodejs.org/en/download/) version 18.0 or above: 18 | - When installing Node.js, you are recommended to check all checkboxes related to dependencies. 19 | 20 | ## Generate a new site 21 | 22 | Generate a new Docusaurus site using the **classic template**. 23 | 24 | The classic template will automatically be added to your project after you run the command: 25 | 26 | ```bash 27 | npm init docusaurus@latest my-website classic 28 | ``` 29 | 30 | You can type this command into Command Prompt, Powershell, Terminal, or any other integrated terminal of your code editor. 31 | 32 | The command also installs all necessary dependencies you need to run Docusaurus. 33 | 34 | ## Start your site 35 | 36 | Run the development server: 37 | 38 | ```bash 39 | cd my-website 40 | npm run start 41 | ``` 42 | 43 | The `cd` command changes the directory you're working with. In order to work with your newly created Docusaurus site, you'll need to navigate the terminal there. 44 | 45 | The `npm run start` command builds your website locally and serves it through a development server, ready for you to view at http://localhost:3000/. 46 | 47 | Open `docs/intro.md` (this page) and edit some lines: the site **reloads automatically** and displays your changes. 48 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PackYak ![image](https://github.com/sam-goodwin/packyak/assets/38672686/249af136-45fb-4d13-82bb-5818e803eeb0) 2 | 3 | [![PyPI version](https://badge.fury.io/py/packyak.svg)](https://badge.fury.io/py/packyak) 4 | 5 | > [!NOTE] 6 | > Still in active development. 7 | 8 | # PackYak 9 | 10 | PackYak is an open source platform for building modern Data Engineering projects with Python in your own AWS account. It provides everything you need to iterate quickly on massive scale data science, build reliable and flexible production pipelines, version control your data sets and expose interactive data applications and reports. 11 | 12 | PackYak leverages the AWS CDK to automatically deploy the following software in to your own AWS Account: 13 | * [Kubernetes](https://kubernetes.io/) - fully automated AWS EKS set up of a Kubernetes cluster with pre-installed services 14 | * [Coder](https://coder.com/) - run developer environments in the cloud with Terraform-automated environments 15 | * [Ray](https://www.ray.io/) - run heterogeneous clusters consisting of CPU, GPU and Memory optimized EC2 instances. 16 | * [Dask](https://www.dask.org/), [Daft](https://www.getdaft.io/), [Spark](https://spark.apache.org/), etc. 
- run any compute framework supported by Ray 17 | * [Dagster](https://dagster.io/) - orchestrate the production of inter-connected "Software-defined Assets" 18 | * [Nessie](https://projectnessie.org/) & [Iceberg](https://iceberg.apache.org/) - version your data catalog with Git-like branching, tags and commits. 19 | * [Streamlit](https://streamlit.io/) - build interactive reports over your data with simple Python scripts 20 | 21 | # Get Started 22 | 23 | > [!Note] 24 | > Coming Soon. -------------------------------------------------------------------------------- /packyak/synth/__main__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Any 3 | 4 | from packyak.synth.synth import synth 5 | 6 | 7 | async def main(): 8 | import argparse 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("--root", type=str, help="root directory", required=True) 12 | args = parser.parse_args() 13 | 14 | packyak_spec = await synth(root_dir=args.root) 15 | packyak_dir = ".packyak" 16 | 17 | if not os.path.exists(packyak_dir): 18 | os.makedirs(packyak_dir) 19 | with open(f"{packyak_dir}/spec.json", "w") as f: 20 | spec = packyak_spec.model_dump( 21 | exclude_unset=True, 22 | exclude_none=True, 23 | ) 24 | 25 | def transform(obj: Any) -> Any: 26 | if isinstance(obj, dict): 27 | d = dict( 28 | ( 29 | k, 30 | transform(v), 31 | # TODO: decide whether the file_name should be relative or absolute 32 | # os.path.relpath(v, packyak_dir) 33 | # if k == "file_name" 34 | # else transform(v), 35 | ) 36 | for k, v in obj.items() 37 | if not k.startswith("_") 38 | ) 39 | return d 40 | 41 | elif isinstance(obj, list): 42 | return [transform(v) for v in obj] 43 | return obj 44 | 45 | import json 46 | 47 | f.write(json.dumps(transform(spec), indent=2)) 48 | 49 | 50 | if __name__ == "__main__": 51 | import asyncio 52 | 53 | asyncio.run(main()) 54 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@packyak/packyak", 3 | "private": true, 4 | "version": "0.4.22", 5 | "type": "module", 6 | "exports": { 7 | ".": { 8 | "import": "./lib/index.js", 9 | "require": "./lib/index.js" 10 | } 11 | }, 12 | "scripts": { 13 | "clean": "tsc -b --clean && rm -rf packyak-aws-cdk/*.tsbuildinfo packyak-aws-cdk/lib packyak-aws-cdk/lib.jsii", 14 | "watch": "tsc -b -w", 15 | "codegen": "poetry run python ./scripts/codegen.py", 16 | "typecheck": "poetry run pyright", 17 | "build": "tsc -b && pnpm run --filter @packyak/aws-cdk build", 18 | "build:cdk": "pnpm --filter @packyak/aws-cdk run build && pnpm --filter @packyak/aws-cdk run package", 19 | "synth": "poetry run python ./examples/app.py", 20 | "synth:website": "cdk synth --app 'tsx ./packyak.config.ts'", 21 | "publish:all": "BUMP_ROOT=true BUMP_CDK=true ./scripts/publish.sh", 22 | "publish:packyak": "BUMP_ROOT=true ./scripts/publish.sh", 23 | "publish:cdk": "BUMP_CDK=true ./scripts/publish.sh" 24 | }, 25 | "keywords": [], 26 | "author": "", 27 | "license": "ISC", 28 | "devDependencies": { 29 | "@aws-sdk/client-efs": "^3.515.0", 30 | "@aws-sdk/client-sso": "^3.515.0", 31 | "@aws-sdk/credential-provider-ini": "^3.515.0", 32 | "@aws-sdk/credential-provider-sso": "^3.515.0", 33 | "@biomejs/biome": "^1.6.1", 34 | "@tsconfig/node20": "^20.1.2", 35 | "@types/node": "^20.10.8", 36 | "aws-cdk-lib": "^2.127.0", 37 | "bun": "^1.0.22", 38 | "constructs": "10.3.0", 39 | "jsii": 
"^5.3.20", 40 | "jsii-config": "^1.94.0", 41 | "semver": "^7.6.0", 42 | "sst": "^2.39.5", 43 | "tsx": "^4.7.0", 44 | "typescript": "^5.3.3" 45 | } 46 | } -------------------------------------------------------------------------------- /packyak/cli/logs.py: -------------------------------------------------------------------------------- 1 | import asyncclick as click 2 | from packyak.cli.cli import cli 3 | from packyak.util.emr import EMR 4 | import asyncio 5 | import os 6 | 7 | 8 | @cli.command() 9 | @click.argument( 10 | "cluster_id", 11 | required=True, 12 | type=str, 13 | ) 14 | @click.option( 15 | "--instance-id", 16 | required=True, 17 | type=str, 18 | help="The unique identifier of the EMR instance.", 19 | ) 20 | @click.option( 21 | "--profile", 22 | required=False, 23 | type=str, 24 | default=None, 25 | help="The AWS profile to use for the session.", 26 | ) 27 | @click.argument("log-files", nargs=-1, type=str) 28 | @click.option( 29 | "--get", 30 | is_flag=True, 31 | help="Retrieve and display the content of the log files.", 32 | ) 33 | async def logs( 34 | cluster_id: str, 35 | instance_id: str, 36 | profile: str | None, 37 | log_files: list[str] = [], 38 | get: bool = False, 39 | ): 40 | """ 41 | Lists all log files for a given EMR cluster and instance ID. 42 | """ 43 | if profile is not None: 44 | os.environ["AWS_PROFILE"] = profile 45 | emr = EMR() 46 | 47 | logs = await emr.list_logs(cluster_id, instance_id) 48 | import fnmatch 49 | 50 | filtered_logs = logs 51 | if log_files: 52 | filtered_logs = [] 53 | for log_file_pattern in log_files: 54 | filtered_logs.extend(fnmatch.filter(logs, log_file_pattern)) 55 | 56 | if get: 57 | log_text = await asyncio.gather( 58 | *[emr.get_log_text(cluster_id, log) for log in filtered_logs] 59 | ) 60 | for text in log_text: 61 | print(text) 62 | else: 63 | for log in filtered_logs: 64 | print(log) 65 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "files.exclude": { 3 | "**/__pycache__": true, 4 | ".pycache": true, 5 | ".ruff_cache": true 6 | }, 7 | "editor.formatOnSave": true, 8 | "prettier.resolveGlobalModules": false, 9 | "editor.indentSize": 2, 10 | "[json]": { 11 | "editor.defaultFormatter": "biomejs.biome" 12 | }, 13 | "[typescript]": { 14 | "editor.defaultFormatter": "biomejs.biome", 15 | "editor.indentSize": 2, 16 | }, 17 | "[javascript]": { 18 | "editor.defaultFormatter": "biomejs.biome", 19 | "editor.indentSize": 2, 20 | }, 21 | "[python]": { 22 | "editor.defaultFormatter": "charliermarsh.ruff", 23 | "editor.indentSize": 4 24 | }, 25 | "python.languageServer": "Default", 26 | "python.analysis.typeCheckingMode": "strict", 27 | "python.analysis.autoImportCompletions": true, 28 | "python.defaultInterpreterPath": ".venv/bin/python", 29 | "python.pyright.enabled": true, 30 | "python.pyright.autoImportCompletions": true, 31 | "python.pyright.typeCheckingMode": "strict", 32 | "cSpell.words": [ 33 | "aioboto", 34 | "aiobotocore", 35 | "aiofiles", 36 | "boto", 37 | "botocore", 38 | "certificatemanager", 39 | "cgroups", 40 | "dagster", 41 | "dedupe", 42 | "Fargate", 43 | "Gids", 44 | "HDFS", 45 | "hudi", 46 | "JDBC", 47 | "JSII", 48 | "Lakehouse", 49 | "Lakehouses", 50 | "Milvus", 51 | "MWAA", 52 | "Nessie", 53 | "OLAP", 54 | "OLTP", 55 | "packyak", 56 | "projectnessie", 57 | "pydantic", 58 | "pyspark", 59 | "Qdrant", 60 | "quarkus", 61 | "selectbox", 62 | "streamlit", 63 | "thriftserver", 64 | 
"transactionally", 65 | "Unversioned", 66 | "VCPU", 67 | "virtualenv", 68 | "Weaviate", 69 | "xlarge" 70 | ], 71 | } 72 | -------------------------------------------------------------------------------- /packyak/asset/namespace.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Callable 2 | from enum import Enum 3 | from packyak.asset.asset import AssetNode, TInput, TOutput, TableAssetNode 4 | 5 | if TYPE_CHECKING: 6 | import pyspark.sql as sql 7 | import pyspark.pandas as ps 8 | 9 | 10 | class TableFormat(Enum): 11 | AVRO = "avro" 12 | CSV = "csv" 13 | DELTA = "delta" 14 | JSON = "json" 15 | ORC = "orc" 16 | PARQUET = "parquet" 17 | 18 | 19 | class Namespace: 20 | def __init__(self, name: str, parent: "Namespace | None" = None): 21 | self.name = name 22 | self.parent = parent 23 | self.children: dict[str, Namespace] = {} 24 | 25 | def __getattr__(self, name: str) -> "Namespace": 26 | if name not in self.children: 27 | self.children[name] = Namespace(name, self) 28 | return self.children[name] 29 | 30 | def table( 31 | self, 32 | *, 33 | input: AssetNode[TInput], 34 | output: type[TOutput] | None = None, 35 | format: TableFormat = TableFormat.PARQUET, 36 | ): 37 | def create_table_node( 38 | f: Callable[ 39 | [ps.DataFrame[TInput]], 40 | ps.DataFrame[TOutput] | sql.DataFrame, 41 | ] 42 | | Callable[ 43 | [sql.DataFrame], 44 | ps.DataFrame[TOutput] | sql.DataFrame, 45 | ], 46 | ) -> TableAssetNode[TOutput]: 47 | return TableAssetNode[TOutput](input=input) 48 | 49 | return create_table_node 50 | 51 | 52 | class DB(Namespace): 53 | def __init__(self, name: str): 54 | super().__init__(name) 55 | if name in dbs: 56 | raise ValueError(f"DB {name} already exists") 57 | dbs[name] = self 58 | 59 | 60 | dbs: dict[str, DB] = {} 61 | -------------------------------------------------------------------------------- /packyak-aws-cdk/src/index.ts: -------------------------------------------------------------------------------- 1 | export * from "./bind"; 2 | export * from "./dagster"; 3 | export * from "./dns-configuration"; 4 | export * from "./emr/block-device"; 5 | export * from "./emr/bootstrap-action"; 6 | export * from "./emr/catalog"; 7 | export * from "./emr/cluster"; 8 | export * from "./emr/spark-config"; 9 | export * from "./emr/configuration"; 10 | export * from "./emr/fleet-cluster"; 11 | export * from "./emr/glue-catalog"; 12 | export * from "./emr/instance-fleet"; 13 | export * from "./emr/instance-group"; 14 | export * from "./emr/instance-market"; 15 | export * from "./emr/jdbc"; 16 | export * from "./emr/managed-scaling"; 17 | export * from "./emr/python-version"; 18 | export * from "./emr/release-label"; 19 | export * from "./emr/scala-version"; 20 | export * from "./emr/uniform-cluster"; 21 | export * from "./emr/spark-version"; 22 | export * from "./emr/step"; 23 | export * from "./nessie/base-nessie-catalog"; 24 | export * from "./nessie/nessie-ecs-catalog"; 25 | export * from "./nessie/nessie-lambda-catalog"; 26 | export * from "./nessie/nessie-version-store"; 27 | export * from "./python-poetry"; 28 | export * from "./sagemaker/domain"; 29 | export * from "./sagemaker/sage-maker-image"; 30 | export * from "./sagemaker/user-profile"; 31 | export * from "./streamlit-site"; 32 | export * from "./version"; 33 | export * from "./workspace/group"; 34 | export * from "./workspace/home"; 35 | export * from "./workspace/workspace"; 36 | 37 | import { CfnOutput, Stack } from "aws-cdk-lib/core"; 38 | 39 | declare module 
"aws-cdk-lib/core" { 40 | interface Stack { 41 | addOutputs(outputs: Record): void; 42 | } 43 | } 44 | 45 | Stack.prototype.addOutputs = function (outputs: Record) { 46 | for (const [key, value] of Object.entries(outputs)) { 47 | new CfnOutput(this, key, { value }); 48 | } 49 | }; 50 | -------------------------------------------------------------------------------- /examples/streamlit-aws-cdk/notebooks/spark.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Full url of the Nessie API endpoint to nessie\n", 10 | "url = \"http://localhost:19120/api/v1\"\n", 11 | "# Where to store nessie tables\n", 12 | "full_path_to_warehouse = ...\n", 13 | "# The ref or context that nessie will operate on (if different from default branch).\n", 14 | "# Can be the name of a Nessie branch or tag name.\n", 15 | "ref = \"main\"\n", 16 | "# Nessie authentication type (NONE, BEARER, OAUTH2 or AWS)\n", 17 | "auth_type = \"NONE\"\n", 18 | "\n", 19 | "spark = SparkSession.builder \\\n", 20 | " .config(\"spark.jars.packages\",\"org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.3.0,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.3_2.12:0.76.6\") \\\n", 21 | " .config(\"spark.sql.extensions\", \"org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions\") \\\n", 22 | " .config(\"spark.sql.catalog.nessie.uri\", url) \\\n", 23 | " .config(\"spark.sql.catalog.nessie.ref\", ref) \\\n", 24 | " .config(\"spark.sql.catalog.nessie.authentication.type\", auth_type) \\\n", 25 | " .config(\"spark.sql.catalog.nessie.catalog-impl\", \"org.apache.iceberg.nessie.NessieCatalog\") \\\n", 26 | " .config(\"spark.sql.catalog.nessie.warehouse\", full_path_to_warehouse) \\\n", 27 | " .config(\"spark.sql.catalog.nessie\", \"org.apache.iceberg.spark.SparkCatalog\") \\\n", 28 | " .getOrCreate()" 29 | ] 30 | } 31 | ], 32 | "metadata": { 33 | "language_info": { 34 | "name": "python" 35 | } 36 | }, 37 | "nbformat": 4, 38 | "nbformat_minor": 2 39 | } 40 | -------------------------------------------------------------------------------- /examples/streamlit-aws-cdk/notebooks/nessie.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pynessie import init" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "client = init()" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 9, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "main = client.get_reference(\"main\")" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 10, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "test_branch = client.create_branch(\"test\", \"main\", main.hash_)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 11, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "Branch(name='test', hash_='2e1cfa82b035c26cbbbdae632cea070514eb8b773f616aaeaf668e2f0be8f10d', metadata=None)" 48 | ] 49 | }, 50 | "execution_count": 11, 51 | "metadata": {}, 52 | "output_type": "execute_result" 53 | } 54 | ], 55 | "source": [ 56 | "test_branch" 57 | 
] 58 | } 59 | ], 60 | "metadata": { 61 | "kernelspec": { 62 | "display_name": ".venv", 63 | "language": "python", 64 | "name": "python3" 65 | }, 66 | "language_info": { 67 | "codemirror_mode": { 68 | "name": "ipython", 69 | "version": 3 70 | }, 71 | "file_extension": ".py", 72 | "mimetype": "text/x-python", 73 | "name": "python", 74 | "nbconvert_exporter": "python", 75 | "pygments_lexer": "ipython3", 76 | "version": "3.10.13" 77 | } 78 | }, 79 | "nbformat": 4, 80 | "nbformat_minor": 2 81 | } 82 | -------------------------------------------------------------------------------- /packyak/cli/clusters.py: -------------------------------------------------------------------------------- 1 | import asyncclick as click 2 | import os 3 | from packyak.util.emr import EMR, ClusterStatus 4 | from tabulate import tabulate 5 | from packyak.cli.cli import cli 6 | 7 | from fnmatch import fnmatch 8 | 9 | 10 | @cli.command() 11 | @click.option( 12 | "--status", 13 | required=False, 14 | type=str, 15 | default="STARTING,BOOTSTRAPPING,RUNNING,WAITING", 16 | help="Comma-separated list of cluster statuses to include in the list. Allowed values are STARTING, BOOTSTRAPPING, RUNNING, WAITING, TERMINATING, TERMINATED, and TERMINATED_WITH_ERRORS.", 17 | ) 18 | @click.option( 19 | "--profile", 20 | required=False, 21 | type=str, 22 | default=None, 23 | help="The AWS profile to use for the session.", 24 | ) 25 | @click.option( 26 | "--filter", 27 | required=False, 28 | type=str, 29 | default="*", 30 | help="Glob pattern to filter clusters by name.", 31 | ) 32 | async def clusters(status: str, profile: str | None, filter: str): 33 | """ 34 | Lists all clusters in the account. By default, it only lists clusters that are waiting, bootstrapping, running, etc. 35 | Filters clusters by name using a glob pattern if provided. 
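
    Example (hypothetical cluster names; assumes the CLI is installed as `packyak`):

        packyak clusters --status RUNNING,WAITING --filter 'dev-*'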
36 | """ 37 | if profile is not None: 38 | os.environ["AWS_PROFILE"] = profile 39 | emr = EMR() 40 | 41 | status_filters = [ClusterStatus[s] for s in status.split(",")] 42 | all_clusters = await emr.list_clusters(states=status_filters) 43 | filtered_clusters = [ 44 | cluster for cluster in all_clusters if fnmatch(cluster.cluster_name, filter) 45 | ] 46 | click.echo( 47 | tabulate( 48 | [ 49 | [ 50 | cluster.cluster_name, 51 | cluster.cluster_id, 52 | cluster.cluster_status.value, 53 | ] 54 | for cluster in filtered_clusters 55 | ], 56 | headers=[], 57 | tablefmt="plain", 58 | ) 59 | ) 60 | -------------------------------------------------------------------------------- /packyak-aws-cdk/scripts/install-nvidia-drivers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | 4 | # see: https://github.com/amazonlinux/amazon-linux-2023/issues/538 5 | 6 | TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" -s) 7 | INSTANCE_TYPE=$(curl -s http://169.254.169.254/latest/meta-data/instance-type -H "X-aws-ec2-metadata-token: $TOKEN") 8 | echo "EC2 Instance Type: $INSTANCE_TYPE" 9 | 10 | if [[ $INSTANCE_TYPE == g* ]] || [[ $INSTANCE_TYPE == p* ]]; then 11 | sudo yum install -y cmake gcc docker kernel-devel-$(uname -r) 12 | BASE_URL=https://us.download.nvidia.com/tesla 13 | DRIVER_VERSION=515.105.01 14 | sudo curl -fSsl -O $BASE_URL/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run 15 | sudo chmod +x NVIDIA-Linux-x86_64-$DRIVER_VERSION.run 16 | sudo dnf install -y kernel-modules-extra 17 | sudo ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run -s -a --accept-license 18 | sudo curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | tee /etc/yum.repos.d/nvidia-container-toolkit.repo 19 | sudo yum install -y nvidia-container-toolkit 20 | sudo nvidia-ctk runtime configure --runtime=docker 21 | sudo systemctl restart docker 22 | sudo rm -rf ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run 23 | else 24 | echo "This is not a GPU instance, skipping NVIDIA driver installation." 25 | fi 26 | 27 | #WARNING: nvidia-installer was forced to guess the X library path '/usr/lib64' and X module path '/usr/lib64/xorg/modules'; these paths were not queryable from the system. If X fails to find the NVIDIA X driver 28 | # module, please install the `pkg-config` utility and the X.Org SDK/development package for your distribution and reinstall the driver. 29 | 30 | 31 | #WARNING: Unable to determine the path to install the libglvnd EGL vendor library config files. Check that you have pkg-config and the libglvnd development libraries installed, or specify a path with 32 | # --glvnd-egl-config-path. -------------------------------------------------------------------------------- /packyak-docs/docs/tutorial-extras/translate-your-site.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 2 3 | --- 4 | 5 | # Translate your site 6 | 7 | Let's translate `docs/intro.md` to French. 
8 | 
9 | ## Configure i18n
10 | 
11 | Modify `docusaurus.config.js` to add support for the `fr` locale:
12 | 
13 | ```js title="docusaurus.config.js"
14 | export default {
15 |   i18n: {
16 |     defaultLocale: 'en',
17 |     locales: ['en', 'fr'],
18 |   },
19 | };
20 | ```
21 | 
22 | ## Translate a doc
23 | 
24 | Copy the `docs/intro.md` file to the `i18n/fr` folder:
25 | 
26 | ```bash
27 | mkdir -p i18n/fr/docusaurus-plugin-content-docs/current/
28 | 
29 | cp docs/intro.md i18n/fr/docusaurus-plugin-content-docs/current/intro.md
30 | ```
31 | 
32 | Translate `i18n/fr/docusaurus-plugin-content-docs/current/intro.md` into French.
33 | 
34 | ## Start your localized site
35 | 
36 | Start your site on the French locale:
37 | 
38 | ```bash
39 | npm run start -- --locale fr
40 | ```
41 | 
42 | Your localized site is accessible at [http://localhost:3000/fr/](http://localhost:3000/fr/) and the `Getting Started` page is translated.
43 | 
44 | :::caution
45 | 
46 | In development, you can only use one locale at a time.
47 | 
48 | :::
49 | 
50 | ## Add a Locale Dropdown
51 | 
52 | To navigate seamlessly across languages, add a locale dropdown.
53 | 
54 | Modify the `docusaurus.config.js` file:
55 | 
56 | ```js title="docusaurus.config.js"
57 | export default {
58 |   themeConfig: {
59 |     navbar: {
60 |       items: [
61 |         // highlight-start
62 |         {
63 |           type: 'localeDropdown',
64 |         },
65 |         // highlight-end
66 |       ],
67 |     },
68 |   },
69 | };
70 | ```
71 | 
72 | The locale dropdown now appears in your navbar:
73 | 
74 | ![Locale Dropdown](./img/localeDropdown.png)
75 | 
76 | ## Build your localized site
77 | 
78 | Build your site for a specific locale:
79 | 
80 | ```bash
81 | npm run build -- --locale fr
82 | ```
83 | 
84 | Or build your site to include all the locales at once:
85 | 
86 | ```bash
87 | npm run build
88 | ```
89 | 
-------------------------------------------------------------------------------- /packyak/cli/synth.py: --------------------------------------------------------------------------------
1 | import asyncclick as click
2 | from packyak.cli.cli import cli
3 | from packyak.util.git import get_git_branch
4 | from packyak.synth import synth as _synth
5 | import os
6 | import sys
7 | import aiofiles as aio
8 | import asyncio
9 | 
10 | 
11 | @click.option(
12 |     "-c",
13 |     "--config",
14 |     type=str,
15 |     help="Path to the packyak.config.py file.",
16 |     default="./packyak.config.py",
17 | )
18 | @click.option(
19 |     "--branch",
20 |     type=str,
21 |     help="Name of the data branch to materialize. Defaults to the branch of the current git repository.",
22 |     default=get_git_branch(),
23 | )
24 | @click.option(
25 |     "--profile",
26 |     type=str,
27 |     help="AWS CLI profile to use when authenticating to SSM",
28 | )
29 | @cli.command()
30 | async def synth(
31 |     config: str,
32 |     branch: str,
33 |     profile: str | None,
34 | ):
35 |     import importlib
36 |     import importlib.util
37 | 
38 |     # Add the root directory to the path so we can import relative to the user's code
39 |     sys.path.append(os.path.dirname(os.path.abspath(config)))
40 | 
41 |     config_module_spec = importlib.util.spec_from_file_location(
42 |         "packyak.config", config
43 |     )
44 |     if config_module_spec is None:
45 |         raise ImportError(
46 |             f"Could not find module {config}. Please ensure the file exists and is a valid Python module."
47 |         )
48 |     if config_module_spec.loader is not None:
49 |         config_module = importlib.util.module_from_spec(config_module_spec)
50 |         config_module_spec.loader.exec_module(config_module)
51 |     else:
52 |         raise ImportError(
53 |             f"Could not load module {config}. 
Please ensure the loader is available." 54 | ) 55 | packyak_spec = await _synth() # Assuming _synth can be an async function 56 | os.makedirs(".packyak", exist_ok=True) 57 | async with aio.open(os.path.join(".packyak", "manifest.json"), "w") as f: 58 | await f.write(packyak_spec.model_dump_json(indent=2, exclude_none=True)) 59 | -------------------------------------------------------------------------------- /packyak-nessie/README.md: -------------------------------------------------------------------------------- 1 | # packyak-nessie 2 | 3 | This project uses Quarkus, the Supersonic Subatomic Java Framework. 4 | 5 | If you want to learn more about Quarkus, please visit its website: https://quarkus.io/ . 6 | 7 | ## Running the application in dev mode 8 | 9 | You can run your application in dev mode that enables live coding using: 10 | ```shell script 11 | ./gradlew quarkusDev 12 | ``` 13 | 14 | > **_NOTE:_** Quarkus now ships with a Dev UI, which is available in dev mode only at http://localhost:8080/q/dev/. 15 | 16 | ## Packaging and running the application 17 | 18 | The application can be packaged using: 19 | ```shell script 20 | ./gradlew build 21 | ``` 22 | It produces the `quarkus-run.jar` file in the `build/quarkus-app/` directory. 23 | Be aware that it’s not an _über-jar_ as the dependencies are copied into the `build/quarkus-app/lib/` directory. 24 | 25 | The application is now runnable using `java -jar build/quarkus-app/quarkus-run.jar`. 26 | 27 | If you want to build an _über-jar_, execute the following command: 28 | ```shell script 29 | ./gradlew build -Dquarkus.package.type=uber-jar 30 | ``` 31 | 32 | The application, packaged as an _über-jar_, is now runnable using `java -jar build/*-runner.jar`. 33 | 34 | ## Creating a native executable 35 | 36 | You can create a native executable using: 37 | ```shell script 38 | ./gradlew build -Dquarkus.package.type=native 39 | ``` 40 | 41 | Or, if you don't have GraalVM installed, you can run the native executable build in a container using: 42 | ```shell script 43 | ./gradlew build -Dquarkus.package.type=native -Dquarkus.native.container-build=true 44 | ``` 45 | 46 | You can then execute your native executable with: `./build/code-with-quarkus-1.0.0-SNAPSHOT-runner` 47 | 48 | If you want to learn more about building native executables, please consult https://quarkus.io/guides/gradle-tooling. 
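
As a quick smoke test of the provided code described below — a minimal sketch that assumes the template's `GreetingResource` still serves the default Quarkus endpoint at `/hello` on port 8080 — run the dev server and curl it from a second terminal:

```shell script
# terminal 1: start dev mode with live coding
./gradlew quarkusDev

# terminal 2: hit the template's greeting endpoint (hypothetical default route)
curl http://localhost:8080/hello
```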
49 | 
50 | ## Provided Code
51 | 
52 | ### RESTEasy Reactive
53 | 
54 | Easily start your Reactive RESTful Web Services
55 | 
56 | [Related guide section...](https://quarkus.io/guides/getting-started-reactive#reactive-jax-rs-resources)
57 | 
-------------------------------------------------------------------------------- /scripts/bump.mjs: --------------------------------------------------------------------------------
1 | #!/usr/bin/env node
2 | 
3 | import fs from "fs/promises";
4 | import path from "path";
5 | import semver from "semver";
6 | import { argv } from "process";
7 | 
8 | let bumpType; // defaults to "patch" when no bump flag is passed
9 | 
10 | function inc(version) {
11 |   return semver.inc(version, bumpType ?? "patch");
12 | }
13 | 
14 | const args = argv.slice(2); // remove the first two default args
15 | 
16 | for (const arg of args) {
17 |   const validBumpTypes = ["major", "minor", "patch"];
18 |   const argBumpType = arg.replace("--", "");
19 |   if (validBumpTypes.includes(argBumpType)) {
20 |     if (bumpType !== undefined) {
21 |       throw new Error("Cannot specify multiple bump types.");
22 |     }
23 |     bumpType = argBumpType;
24 |   }
25 | }
26 | 
27 | const __dirname = path.dirname(new URL(import.meta.url).pathname);
28 | const root = path.join(__dirname, "..");
29 | 
30 | if (process.env.BUMP_ROOT) {
31 |   const rootPackage = JSON.parse(
32 |     await fs.readFile(path.join(root, "package.json"), "utf-8"),
33 |   );
34 |   rootPackage.version = inc(rootPackage.version);
35 | 
36 |   await fs.writeFile(
37 |     path.join(root, "package.json"),
38 |     JSON.stringify(rootPackage, null, 2),
39 |   );
40 | 
41 |   const packyakTomlPath = path.join(root, "pyproject.toml");
42 |   const packyakToml = await fs.readFile(packyakTomlPath, "utf-8");
43 | 
44 |   const versionRegex =
45 |     /\[tool\.poetry\]\nname\s*=\s*"packyak"\nversion\s*=\s*"\d+\.\d+\.\d+"/;
46 |   const updatedPackyakToml = packyakToml.replace(
47 |     versionRegex,
48 |     `[tool.poetry]\nname = "packyak"\nversion = "${rootPackage.version}"`,
49 |   );
50 |   await fs.writeFile(packyakTomlPath, updatedPackyakToml);
51 | }
52 | 
53 | if (process.env.BUMP_CDK) {
54 |   const awsCDK = path.join(root, "packyak-aws-cdk");
55 | 
56 |   const awsCDKPackage = JSON.parse(
57 |     await fs.readFile(path.join(awsCDK, "package.json"), "utf-8"),
58 |   );
59 |   awsCDKPackage.version = inc(awsCDKPackage.version);
60 |   await fs.writeFile(
61 |     path.join(awsCDK, "package.json"),
62 |     JSON.stringify(awsCDKPackage, null, 2),
63 |   );
64 | }
65 | 
-------------------------------------------------------------------------------- /packyak-docs/src/components/HomepageFeatures/index.tsx: --------------------------------------------------------------------------------
1 | import clsx from 'clsx';
2 | import Heading from '@theme/Heading';
3 | import styles from './styles.module.css';
4 | 
5 | type FeatureItem = {
6 |   title: string;
7 |   Svg: React.ComponentType<React.ComponentProps<'svg'>>;
8 |   description: JSX.Element;
9 | };
10 | 
11 | const FeatureList: FeatureItem[] = [
12 |   {
13 |     title: 'Easy to Use',
14 |     Svg: require('@site/static/img/undraw_docusaurus_mountain.svg').default,
15 |     description: (
16 |       <>
17 |         Docusaurus was designed from the ground up to be easily installed and
18 |         used to get your website up and running quickly.
19 |       </>
20 |     ),
21 |   },
22 |   {
23 |     title: 'Focus on What Matters',
24 |     Svg: require('@site/static/img/undraw_docusaurus_tree.svg').default,
25 |     description: (
26 |       <>
27 |         Docusaurus lets you focus on your docs, and we'll do the chores. Go
28 |         ahead and move your docs into the <code>docs</code> directory.
29 |       </>
30 |     ),
31 |   },
32 |   {
33 |     title: 'Powered by React',
34 |     Svg: require('@site/static/img/undraw_docusaurus_react.svg').default,
35 |     description: (
36 |       <>
37 |         Extend or customize your website layout by reusing React. Docusaurus can
38 |         be extended while reusing the same header and footer.
39 |       </>
40 |     ),
41 |   },
42 | ];
43 | 
44 | function Feature({title, Svg, description}: FeatureItem) {
45 |   return (
46 |     <div className={clsx('col col--4')}>
47 |       <div className="text--center">
48 |         <Svg className={styles.featureSvg} role="img" />
49 |       </div>
50 |       <div className="text--center padding-horiz--md">
51 |         <Heading as="h3">{title}</Heading>
52 |         <p>{description}</p>
53 |       </div>
54 |     </div>
55 |   );
56 | }
57 | 
58 | export default function HomepageFeatures(): JSX.Element {
59 |   return (
60 |     <section className={styles.features}>
61 |       <div className="container">
62 |         <div className="row">
63 |           {FeatureList.map((props, idx) => (
64 |             <Feature key={idx} {...props} />
65 |           ))}
66 |         </div>
67 |       </div>
68 |     </section>
69 |   );
70 | }
71 | 
-------------------------------------------------------------------------------- /packyak/spec.py: --------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | 
3 | from typing import List, Literal, TypeVar
4 | from enum import Enum
5 | from pydantic import BaseModel
6 | 
7 | BucketSubscriptionScope = Literal["create"] | Literal["delete"]
8 | 
9 | 
10 | BucketIntegrationScope = (
11 |     Literal["get"] | Literal["list"] | Literal["put"] | Literal["delete"]
12 | )
13 | 
14 | 
15 | Item = TypeVar("Item")
16 | 
17 | NonEmptyList = list[Item]
18 | 
19 | DependencyGroup = NonEmptyList[str] | str | None
20 | 
21 | 
22 | class BindingSpec(BaseModel):
23 |     resource_type: str
24 |     resource_id: str
25 |     scopes: list[str]
26 |     props: dict[str, str] | None
27 | 
28 | 
29 | class BucketBindingSpec(BindingSpec):
30 |     selector: str | None
31 | 
32 | 
33 | class PythonPoetryArgs(BaseModel):
34 |     with_: DependencyGroup | None = None
35 |     without: DependencyGroup | None = None
36 |     dev: bool | None = None
37 |     all_extras: bool | None = None
38 |     without_hashes: bool | None = None
39 |     without_urls: bool | None = None
40 | 
41 | 
42 | # defines the bindings within a Python module (file)
43 | class ModuleSpec(PythonPoetryArgs):
44 |     file_name: str
45 |     bindings: list[BindingSpec] | None
46 | 
47 | 
48 | class FunctionSpec(PythonPoetryArgs):
49 |     function_id: str
50 |     file_name: str
51 |     bindings: list[BindingSpec] | None
52 | 
53 | 
54 | class JobSpec(PythonPoetryArgs):
55 |     job_id: str
56 |     file_name: str
57 |     bindings: list[BindingSpec] | None
58 | 
59 | 
60 | class BucketSubscriptionSpec(BaseModel):
61 |     scopes: List[BucketSubscriptionScope]
62 |     function_id: str
63 | 
64 | 
65 | class BucketSpec(BaseModel):
66 |     bucket_id: str
67 |     subscriptions: list[BucketSubscriptionSpec]
68 | 
69 | 
70 | class QueueSubscriptionSpec(BaseModel):
71 |     function_id: str
72 | 
73 | 
74 | class QueueSpec(BaseModel):
75 |     queue_id: str
76 |     fifo: bool
77 |     subscriptions: list[QueueSubscriptionSpec]
78 | 
79 | 
80 | class ClusterSpec(BaseModel):
81 |     cluster_id: str
82 |     # bindings: list[BindingSpec] | None
83 | 
84 | 
85 | class PackyakSpec(BaseModel):
86 |     modules: list[ModuleSpec]
87 |     buckets: list[BucketSpec]
88 |     queues: list[QueueSpec]
89 |     clusters: list[ClusterSpec]
90 |     functions: list[FunctionSpec]
91 |     jobs: list[JobSpec]
92 | 
-------------------------------------------------------------------------------- /packyak-aws-cdk/src/emr/block-device.ts: --------------------------------------------------------------------------------
1 | import type { EbsDeviceVolumeType } from "aws-cdk-lib/aws-ec2";
2 | 
3 | /**
4 |  * @see https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-plan-storage.html
5 |  */
6 | export interface EbsBlockDevice {
7 |   /**
8 |    * The number of I/O operations per second (IOPS) that the volume supports.
9 |    *
10 |    * @see http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-emr-cluster-volumespecification.html#cfn-emr-cluster-volumespecification-iops
11 |    */
12 |   readonly iops?: number;
13 |   /**
14 |    * The volume size, in gibibytes (GiB).
15 |    *
16 |    * This can be a number from 1 - 1024. If the volume type is EBS-optimized, the minimum value is 10.
17 |    *
18 |    * @see http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-emr-cluster-volumespecification.html#cfn-emr-cluster-volumespecification-sizeingb
19 |    */
20 |   readonly sizeInGb: number;
21 |   /**
22 |    * The throughput, in mebibyte per second (MiB/s).
23 |    *
24 |    * This optional parameter can be a number from `125` - `1000` and is valid
25 |    * only for {@link EbsDeviceVolumeType.GENERAL_PURPOSE_SSD_GP3} volumes.
26 |    *
27 |    * @see http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-emr-cluster-volumespecification.html#cfn-emr-cluster-volumespecification-throughput
28 |    */
29 |   readonly throughput?: number;
30 |   /**
31 |    * The volume type.
32 |    *
33 |    * Volume types supported are:
34 |    * - gp3 ({@link EbsDeviceVolumeType.GENERAL_PURPOSE_SSD_GP3})
35 |    * - gp2 ({@link EbsDeviceVolumeType.GENERAL_PURPOSE_SSD})
36 |    * - io1 ({@link EbsDeviceVolumeType.PROVISIONED_IOPS_SSD})
37 |    * - st1 ({@link EbsDeviceVolumeType.THROUGHPUT_OPTIMIZED_HDD})
38 |    * - sc1 ({@link EbsDeviceVolumeType.COLD_HDD})
39 |    * - standard ({@link EbsDeviceVolumeType.STANDARD})
40 |    *
41 |    * @see http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-emr-cluster-volumespecification.html#cfn-emr-cluster-volumespecification-volumetype
42 |    * @default standard
43 |    */
44 |   readonly volumeType: EbsDeviceVolumeType;
45 |   /**
46 |    * The number of EBS volumes with a specific volume configuration to attach to each instance.
47 |    *
48 |    * @default 1
49 |    */
50 |   readonly volumesPerInstance?: number;
51 | }
52 | 
-------------------------------------------------------------------------------- /packyak-aws-cdk/src/experimental.ts: --------------------------------------------------------------------------------
1 | import { type App, Stack } from "aws-cdk-lib/core";
2 | import type { Construct } from "constructs";
3 | import { isPromise } from "util/types";
4 | 
5 | type ConstructInstance<
6 |   F extends (this: InstanceType<Base>, props?: any) => any,
7 |   Base extends new (
8 |     scope: Construct,
9 |     id: string,
10 |     props?: any,
11 |   ) => any = typeof Construct,
12 | > = {
13 |   construct: InstanceType<Base> & Awaited<ReturnType<F>>;
14 | } & (undefined extends Parameters<F>[0]
15 |   ? {
16 |       new (scope: Construct, id: string): InstanceType<Base> & ReturnType<F>;
17 |     }
18 |   : Parameters<F>[0] extends undefined
19 |     ? {
20 |         new (
21 |           scope: Construct,
22 |           id: string,
23 |           props?: Parameters<F>[0],
24 |         ): InstanceType<Base> & ReturnType<F>;
25 |       }
26 |     : {
27 |         new (
28 |           scope: Construct,
29 |           id: string,
30 |           props: Parameters<F>[0],
31 |         ): InstanceType<Base> & ReturnType<F>;
32 |       });
33 | 
34 | export function stack<F extends (this: Stack, props?: any) => any>(
35 |   func: F,
36 | ): ConstructInstance<F, typeof Stack> {
37 |   return construct(func, Stack);
38 | }
39 | 
40 | export function construct<
41 |   F extends (this: InstanceType<Base>, props?: any) => any,
42 |   Base extends new (
43 |     scope: Construct,
44 |     id: string,
45 |     props?: any,
46 |   ) => any = typeof Construct,
47 | >(
48 |   func: F,
49 |   // @ts-ignore
50 |   base: Base = Construct,
51 | ): ConstructInstance<F, Base> {
52 |   // @ts-ignore
53 |   return class extends base {
54 |     static construct: any;
55 |     constructor(scope: App, id: string, props?: any) {
56 |       super(scope, id);
57 |       // @ts-expect-error - we are being naughty and we know it
58 |       const result = func.bind(this)(props);
59 | 
60 |       if (isPromise(result)) {
61 |         // biome-ignore lint/correctness/noConstructorReturn:
62 |         return result.then((outputs: any) => {
63 |           Object.assign(this, outputs);
64 |           return this;
65 |         });
66 |       } else {
67 |         Object.assign(this, result);
68 |       }
69 |     }
70 |   };
71 | }
72 | 
73 | declare module "constructs" {
74 |   interface Construct {
75 |     create<F extends () => any>(id: string, func: F): ReturnType<F>;
76 |     create<F extends () => Promise<any>>(
77 |       id: string,
78 |       func: F,
79 |     ): ReturnType<F>;
80 |     create<F extends () => any>(func: F): ReturnType<F>;
81 |     create<F extends () => Promise<any>>(func: F): ReturnType<F>;
82 |   }
83 | }
84 | 
-------------------------------------------------------------------------------- /packyak/runtime/function.py: --------------------------------------------------------------------------------
1 | from typing import Callable, TypeVar, ParamSpec
2 | from packyak.runtime.runnable import Runnable
3 | from packyak.util.fqn import get_fully_qualified_name
4 | 
5 | 
6 | from packyak.spec import DependencyGroup
7 | from packyak.duration import Duration
8 | 
9 | Params = ParamSpec("Params")
10 | Return = TypeVar("Return")
11 | 
12 | 
13 | class LambdaFunction(Runnable[Params, Return]):
14 |     def __init__(
15 |         self,
16 |         file_name: str,
17 |         handler: Callable[Params, Return],
18 |         function_id: str | None = None,
19 |         memory: int | None = None,
20 |         timeout: Duration | None = None,
21 |         with_: DependencyGroup | None = None,
22 |         without: DependencyGroup | None = None,
23 |         dev: bool | None = None,
24 |         all_extras: bool | None = None,
25 |         without_hashes: bool | None = None,
26 |         without_urls: bool | None = None,
27 |     ) -> None:
28 |         super().__init__(
29 |             resource_id=function_id or get_fully_qualified_name(handler),
30 |             handler=handler,
31 |             file_name=file_name,
32 |             with_=with_,
33 |             without=without,
34 |             dev=dev,
35 |             all_extras=all_extras,
36 |             without_hashes=without_hashes,
37 |             without_urls=without_urls,
38 |         )
39 | 
40 |         self.memory = memory
41 |         self.timeout = timeout
42 | 
43 | 
44 | def function(
45 |     *,
46 |     function_id: str | None = None,
47 |     memory: int | None = None,
48 |     timeout: Duration | None = None,
49 |     file_name: str | None = None,
50 |     with_: DependencyGroup = None,
51 |     without: DependencyGroup = None,
52 |     # deprecated, use with_ and without
53 |     dev: bool | None = None,
54 |     all_extras: bool | None = None,
55 |     without_hashes: bool | None = None,
56 |     without_urls: bool | None = None,
57 | ):
58 |     def decorator(handler: Callable[Params, Return]) -> LambdaFunction[Params, Return]:
59 |         _function_id = (
60 |             function_id
61 |             if function_id is not None
62 |             else get_fully_qualified_name(handler)
63 |         )
64 | 65 | func = LambdaFunction( 66 | function_id=_function_id, 67 | memory=memory, 68 | timeout=timeout, 69 | handler=handler, 70 | file_name=file_name or handler.__code__.co_filename, 71 | with_=with_, 72 | without=without, 73 | dev=dev, 74 | all_extras=all_extras, 75 | without_hashes=without_hashes, 76 | without_urls=without_urls, 77 | ) 78 | 79 | return func 80 | 81 | return decorator 82 | -------------------------------------------------------------------------------- /packyak-aws-cdk/src/nessie/nessie-version-store.ts: -------------------------------------------------------------------------------- 1 | import { Table, AttributeType, BillingMode } from "aws-cdk-lib/aws-dynamodb"; 2 | import { IGrantable } from "aws-cdk-lib/aws-iam"; 3 | import { RemovalPolicy } from "aws-cdk-lib/core"; 4 | import { Construct } from "constructs"; 5 | 6 | export interface NessieVersionStoreProps { 7 | /** 8 | * Nessie has two tables, `objs` and `refs`. 9 | * 10 | * Nessie supports configuring a "prefix" that will be used to determine the names of these tables. 11 | * 12 | * @default - "nessie" 13 | * @see https://project-nessie.zulipchat.com/#narrow/stream/371187-general/topic/AWS.20Lambda.20with.20SnapStart/near/420329834 14 | */ 15 | readonly versionStoreName?: string; 16 | /** 17 | * @default - RemovalPolicy.DESTROY 18 | */ 19 | readonly removalPolicy?: RemovalPolicy; 20 | } 21 | 22 | /** 23 | * @see https://projectnessie.org/try/configuration/#dynamodb-version-store-settings 24 | */ 25 | export class DynamoDBNessieVersionStore extends Construct { 26 | public readonly refs: Table; 27 | public readonly objs: Table; 28 | public readonly tablePrefix: string; 29 | constructor(scope: Construct, id: string, props?: NessieVersionStoreProps) { 30 | super(scope, id); 31 | this.tablePrefix = props?.versionStoreName ?? "nessie"; 32 | 33 | this.objs = new NessieVersionStoreTable(this, "objs", { 34 | tableName: `${this.tablePrefix}_objs`, 35 | removalPolicy: props?.removalPolicy, 36 | }); 37 | this.refs = new NessieVersionStoreTable(this, "refs", { 38 | tableName: `${this.tablePrefix}_refs`, 39 | removalPolicy: props?.removalPolicy, 40 | }); 41 | } 42 | 43 | public grantReadData(grantee: IGrantable) { 44 | this.objs.grantReadData(grantee); 45 | this.refs.grantReadData(grantee); 46 | } 47 | 48 | public grantWriteData(grantee: IGrantable) { 49 | this.objs.grantWriteData(grantee); 50 | this.refs.grantWriteData(grantee); 51 | } 52 | 53 | public grantReadWriteData(grantee: IGrantable) { 54 | this.objs.grantReadWriteData(grantee); 55 | this.refs.grantReadWriteData(grantee); 56 | } 57 | } 58 | 59 | interface NessieVersionStoreTableProps { 60 | tableName: string; 61 | /** 62 | * @default - RemovalPolicy.DESTROY 63 | */ 64 | removalPolicy?: RemovalPolicy; 65 | } 66 | 67 | class NessieVersionStoreTable extends Table { 68 | constructor( 69 | scope: Construct, 70 | id: string, 71 | props: NessieVersionStoreTableProps, 72 | ) { 73 | super(scope, id, { 74 | tableName: props.tableName, 75 | partitionKey: { 76 | name: "k", 77 | type: AttributeType.STRING, 78 | }, 79 | billingMode: BillingMode.PAY_PER_REQUEST, 80 | }); 81 | this.applyRemovalPolicy(props.removalPolicy ?? 
RemovalPolicy.DESTROY); 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /packyak-aws-cdk/scripts/debug-docker.sh: -------------------------------------------------------------------------------- 1 | 2 | 3 | cat < /lib/udev/rules.d/71-nvidia-dev-char.rules 4 | # This will create /dev/char symlinks to all device nodes 5 | ACTION=="add", DEVPATH=="/bus/pci/drivers/nvidia", RUN+="/usr/bin/nvidia-ctk system create-dev-char-symlinks --create-all" 6 | EOF 7 | 8 | sudo nvidia-ctk system create-dev-char-symlinks --create-all 9 | 10 | cat < /etc/docker/daemon.json 11 | { 12 | "exec-opts": [ 13 | "native.cgroupdriver=cgroupfs" 14 | ], 15 | "runtimes": { 16 | "nvidia": { 17 | "args": [], 18 | "path": "nvidia-container-runtime" 19 | } 20 | } 21 | } 22 | EOF 23 | 24 | 25 | sudo systemctl restart docker 26 | 27 | docker run --rm --runtime=nvidia --gpus all \ 28 | --device=/dev/nvidia-uvm \ 29 | --device=/dev/nvidia-uvm-tools \ 30 | --device=/dev/nvidia-modeset \ 31 | --device=/dev/nvidiactl \ 32 | --device=/dev/nvidia0 \ 33 | ubuntu nvidia-smi 34 | 35 | 36 | docker run --rm --runtime=nvidia --gpus all \ 37 | --device=/dev/nvidia-uvm \ 38 | --device=/dev/nvidia-uvm-tools \ 39 | --device=/dev/nvidia0 \ 40 | ubuntu nvidia-smi 41 | 42 | sudo docker run --rm --runtime=nvidia --gpus all ubuntu nvidia-smi 43 | sudo docker run --rm --gpus all ubuntu nvidia-smi 44 | 45 | 46 | docker \ 47 | run --rm \ 48 | -e NVIDIA_VISIBLE_DEVICES="0" \ 49 | --runtime=nvidia \ 50 | --device=/dev/nvidia-uvm \ 51 | --device=/dev/nvidia-uvm-tools \ 52 | --device=/dev/nvidia-modeset \ 53 | --device=/dev/nvidiactl \ 54 | --device=/dev/nvidia0 \ 55 | ubuntu nvidia-smi 56 | 210070991806.dkr.ecr.us-east-1.amazonaws.com/orion-analysis:artemis-dask nvidia-smi 57 | 58 | docker run --rm \ 59 | --runtime=nvidia \ 60 | --device=/dev/nvidia-uvm \ 61 | --device=/dev/nvidia-uvm-tools \ 62 | --device=/dev/nvidia-modeset \ 63 | --device=/dev/nvidiactl \ 64 | --device=/dev/nvidia0 \ 65 | ubuntu nvidia-smi 66 | 210070991806.dkr.ecr.us-east-1.amazonaws.com/orion-analysis:artemis-dask nvidia-smi 67 | 68 | 69 | 70 | 71 | docker run -e NVIDIA_VISIBLE_DEVICES="/dev/nvidia-uvm,/dev/nvidia-uvm-tools,/dev/nvidia-modeset,/dev/nvidiactl,/dev/nvidia0" -it 210070991806.dkr.ecr.us-east-1.amazonaws.com/orion-analysis:artemis-dask nvidia-smi 72 | docker run -e NVIDIA_VISIBLE_DEVICES="0" -it 210070991806.dkr.ecr.us-east-1.amazonaws.com/orion-analysis:artemis-dask nvidia-smi 73 | 74 | 75 | yarn jar /usr/lib/hadoop-yarn/hadoop-yarn-applications-distributedshell.jar \ 76 | -jar /usr/lib/hadoop-yarn/hadoop-yarn-applications-distributedshell.jar \ 77 | -shell_env YARN_CONTAINER_RUNTIME_TYPE=docker \ 78 | -shell_env YARN_CONTAINER_RUNTIME_DOCKER_IMAGE=ubuntu \ 79 | -shell_command nvidia-smi \ 80 | -container_resources memory-mb=3072,vcores=1,yarn.io/gpu=1 \ 81 | -num_containers 1 -------------------------------------------------------------------------------- /packyak/storage/folder.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING, Tuple 4 | 5 | from packyak.runtime.integration import integration 6 | 7 | if TYPE_CHECKING: 8 | from packyak.storage.bucket import Bucket 9 | from packyak.spec import BucketSubscriptionScope 10 | from pyspark import SparkContext, RDD 11 | 12 | 13 | class Folder: 14 | sc: SparkContext | None = None 15 | 16 | def __init__(self, parent: "Bucket | Folder", name: str): 17 | 
self.parent = parent 18 | self.name = name 19 | 20 | def __str__(self) -> str: 21 | return f"{self.bucket}{self.path}/" 22 | 23 | @property 24 | def resource_id(self) -> str: 25 | return self.bucket.resource_id 26 | 27 | @property 28 | def resource_type(self) -> str: 29 | return self.bucket.resource_type 30 | 31 | @property 32 | def path(self) -> str: 33 | from packyak.storage.bucket import Bucket 34 | 35 | if isinstance(self.parent, Bucket): 36 | return self.name 37 | else: 38 | return f"{self.parent.path}/{self.name}" 39 | 40 | @property 41 | def prefix(self) -> str: 42 | return f"{self.path}/*" 43 | 44 | @property 45 | def bucket(self) -> "Bucket": 46 | from packyak.storage.bucket import Bucket 47 | 48 | if isinstance(self.parent, Bucket): 49 | return self.parent 50 | else: 51 | return self.parent.bucket 52 | 53 | @integration("get", prefix=prefix) 54 | def get(self, key: str): 55 | return self.bucket.get(f"{self.name}/{key}") 56 | 57 | @integration("delete", prefix=prefix) 58 | def delete(self, key: str): 59 | return self.bucket.delete(f"{self.name}/{key}") 60 | 61 | @integration("put", prefix=prefix) 62 | def put(self, key: str, body: str): 63 | return self.bucket.put(f"{self.name}/{key}", body) 64 | 65 | @integration("list", prefix=prefix) 66 | async def list(self, prefix: str, *, limit: int | None, next_token: str | None): 67 | return self.bucket.list( 68 | f"{self.path}/{prefix}", limit=limit, next_token=next_token 69 | ) 70 | 71 | def folder(self, path: str) -> "Folder": 72 | return self / path 73 | 74 | # Overload the '/' operator 75 | def __truediv__(self, other: str) -> "Folder": 76 | return Folder(self, other) 77 | 78 | def on(self, scope: BucketSubscriptionScope, prefix: str | None = None): 79 | return self.bucket.on( 80 | scope, prefix=self.path + prefix if prefix else self.path + "/*" 81 | ) 82 | 83 | @integration("get", "list") 84 | def binaryFiles( 85 | self, prefix: str | None = None, *, minPartitions: int | None = None 86 | ) -> RDD[Tuple[str, bytes]]: 87 | return self.parent.binaryFiles(prefix, minPartitions=minPartitions) 88 | -------------------------------------------------------------------------------- /packyak-aws-cdk/src/emr/uniform-cluster.ts: -------------------------------------------------------------------------------- 1 | import { Construct } from "constructs"; 2 | import { type BaseClusterProps, Cluster } from "./cluster"; 3 | import type { InstanceGroup, PrimaryInstanceGroup } from "./instance-group"; 4 | import { ComputeUnit } from "./managed-scaling"; 5 | 6 | export interface UniformClusterProps extends BaseClusterProps { 7 | /** 8 | * Describes the EC2 instances and instance configurations for the primary {@link InstanceGroup}. 9 | * 10 | * @see http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-emr-cluster-jobflowinstancesconfig.html#cfn-emr-cluster-jobflowinstancesconfig-masterinstancegroup 11 | */ 12 | readonly primaryInstanceGroup: PrimaryInstanceGroup; 13 | /** 14 | * Describes the EC2 instances and instance configurations for core {@link InstanceGroup}s. 15 | * 16 | * @see http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-emr-cluster-jobflowinstancesconfig.html#cfn-emr-cluster-jobflowinstancesconfig-coreinstancegroup 17 | */ 18 | readonly coreInstanceGroup: InstanceGroup; 19 | /** 20 | * Describes the EC2 instances and instance configurations for task {@link InstanceGroup}s. 21 | * 22 | * These task {@link InstanceGroup}s are added to the cluster as part of the cluster launch. 
23 | * Each task {@link InstanceGroup} must have a unique name specified so that CloudFormation 24 | * can differentiate between the task {@link InstanceGroup}s. 25 | * 26 | * > After creating the cluster, you can only modify the mutable properties of `InstanceGroupConfig` , which are `AutoScalingPolicy` and `InstanceCount` . Modifying any other property results in cluster replacement. 27 | * 28 | * @see http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-emr-cluster-jobflowinstancesconfig.html#cfn-emr-cluster-jobflowinstancesconfig-taskinstancegroups 29 | */ 30 | readonly taskInstanceGroups?: InstanceGroup[]; 31 | } 32 | 33 | /** 34 | * Creates an EMR Cluster that is comprised of {@link InstanceGroup}s. 35 | * 36 | * @see https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-emr-cluster-jobflowinstancesconfig.html 37 | * @see https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-uniform-instance-group.html 38 | * @see https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-plan-instances-guidelines.html 39 | */ 40 | export class UniformCluster extends Cluster { 41 | constructor(scope: Construct, id: string, props: UniformClusterProps) { 42 | if (props.managedScalingPolicy) { 43 | if ( 44 | props.managedScalingPolicy.computeLimits.unitType === 45 | ComputeUnit.INSTANCE_FLEET_UNITS 46 | ) { 47 | throw new Error( 48 | `Uniform Clusters must use either Instances or VCPU as ComputeLimitsUnitType`, 49 | ); 50 | } 51 | } 52 | super(scope, id, props); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /packyak-aws-cdk/src/nessie/nessie-config.ts: -------------------------------------------------------------------------------- 1 | export function nessieConfigToEnvironment(config: NessieConfig): { 2 | [key: string]: string; 3 | } { 4 | return Object.fromEntries( 5 | Object.entries(config).map(([k, v]) => [ 6 | k.toUpperCase().replace(/[\._-]/g, "_"), 7 | v.toString(), 8 | ]), 9 | ); 10 | } 11 | 12 | /** 13 | * TODO: support others if necessary. For now DynamoDB is ideal for AWS. 14 | * 15 | * @see https://projectnessie.org/try/configuration/#support-for-the-database-specific-implementations 16 | */ 17 | export enum NessieVersionStoreType { 18 | DYNAMODB = "DYNAMODB", 19 | } 20 | 21 | /** 22 | * Nessie configuration settings. 
23 |  *
24 |  * @see https://projectnessie.org/try/configuration/#configuration
25 |  */
26 | export interface NessieConfig {
27 |   /**
28 |    * @default main
29 |    */
30 |   readonly "nessie.server.default-branch"?: string;
31 |   /**
32 |    * @see https://projectnessie.org/try/configuration/#support-for-the-database-specific-implementations
33 |    */
34 |   readonly "nessie.version.store.type": NessieVersionStoreType;
35 |   /**
36 |    * @default - region CDK stack is deployed to
37 |    * @see https://docs.quarkiverse.io/quarkus-amazon-services/dev/amazon-dynamodb.html#quarkus-amazon-dynamodb_quarkus.dynamodb.aws.region
38 |    */
39 |   readonly "quarkus.dynamodb.aws.region"?: string;
40 |   /**
41 |    * @default aws-crt
42 |    * @see https://docs.quarkiverse.io/quarkus-amazon-services/dev/amazon-dynamodb.html#quarkus-amazon-dynamodb_quarkus.dynamodb.sync-client.type
43 |    */
44 |   readonly "quarkus.dynamodb.sync-client.type"?: "aws-crt" | "apache" | "url";
45 |   /**
46 |    * @default aws-crt
47 |    * @see https://docs.quarkiverse.io/quarkus-amazon-services/dev/amazon-dynamodb.html#quarkus-amazon-dynamodb_quarkus.dynamodb.async-client.type
48 |    */
49 |   readonly "quarkus.dynamodb.async-client.type"?: "aws-crt" | "netty";
50 |   /**
51 |    * @see https://docs.quarkiverse.io/quarkus-amazon-services/dev/amazon-dynamodb.html#quarkus-amazon-dynamodb_quarkus.dynamodb.devservices.enabled
52 |    */
53 |   readonly "quarkus.dynamodb.devservices.enabled"?: boolean;
54 |   /**
55 |    * Determines the name of the `objs` and `refs` tables:
56 |    * Objects table: `{prefix}_objs`
57 |    * References table: `{prefix}_refs`
58 |    *
59 |    * @see https://projectnessie.org/try/configuration/#dynamodb-version-store-settings
60 |    */
61 |   readonly "nessie.version.store.persist.dynamodb.table-prefix"?: string;
62 | 
63 |   // auth: https://github.com/projectnessie/nessie/blob/ae208dd02d18e003da6c4223e42da1b0099ebb19/servers/quarkus-server/src/main/resources/application.properties#L171-L177
64 | 
65 |   readonly "quarkus.oidc.tenant-enabled"?: boolean;
66 | 
67 |   readonly "quarkus.smallrye-health.root-path"?: string;
68 | 
69 |   /**
70 |    * @see https://quarkus.io/guides/aws-lambda-snapstart
71 |    */
72 |   readonly "quarkus.snapstart.enable"?: boolean;
73 | }
74 | 
-------------------------------------------------------------------------------- /packyak-aws-cdk/src/emr/jdbc.ts: --------------------------------------------------------------------------------
1 | import { IConnectable, Port } from "aws-cdk-lib/aws-ec2";
2 | import type { Cluster } from "./cluster";
3 | import { mergeSparkExtraJars } from "./spark-config";
4 | 
5 | /**
6 |  * https://mr3docs.datamonad.com/docs/k8s/advanced/transport/
7 |  */
8 | export enum TransportMode {
9 |   BINARY = "binary",
10 |   HTTP = "http",
11 |   ALL = "all",
12 | }
13 | 
14 | export interface JdbcProps {
15 |   /**
16 |    * @see https://spark.apache.org/docs/latest/sql-distributed-sql-engine.html
17 |    */
18 |   readonly port: number;
19 |   /**
20 |    * @default
21 |    */
22 |   readonly hiveConf?: Record<string, string>;
23 |   readonly sparkConf?: Record<string, string>;
24 |   readonly extraJavaOptions?: Record<string, string>;
25 | }
26 | 
27 | /**
28 |  * Configures an EMR Cluster to start a Thrift Server daemon.
29 |  */
30 | export class Jdbc {
31 |   constructor(
32 |     private readonly cluster: Cluster,
33 |     private readonly options: JdbcProps,
34 |   ) {
35 |     const hiveConf = options.hiveConf ?? {};
36 | 
37 |     hiveConf["hive.server2.thrift.port"] = options.port.toString(10);
38 | 
39 |     const sparkConf = options.sparkConf ?? {};
40 |     const extraJavaOptions = mergeSparkExtraJars(
41 |       cluster.extraJavaOptions,
42 |       sparkConf["spark.driver.extraJavaOptions"],
43 |       options.extraJavaOptions,
44 |     );
45 |     if (extraJavaOptions) {
46 |       sparkConf["spark.driver.extraJavaOptions"] = `'${extraJavaOptions}'`;
47 |     }
48 |     this.cluster.addStep({
49 |       name: "StartThriftServer",
50 |       hadoopJarStep: {
51 |         jar: "command-runner.jar",
52 |         args: [
53 |           "sudo",
54 |           "-u",
55 |           "spark",
56 |           "bash",
57 |           "-c",
58 |           // sudo -u spark bash -c "/lib/spark/sbin/start-thriftserver.sh --hiveconf hive.server2.thrift.port=10001 --hiveconf hive.execution.engine=spark --conf spark.sql.hive.thriftServer.singleSession=true --conf spark.driver.extraJavaOptions='-Djdk.httpclient.allowRestrictedHeaders=host'"
59 |           `/lib/spark/sbin/start-thriftserver.sh --hiveconf hive.server2.thrift.port=10001 --hiveconf hive.execution.engine=spark --conf spark.driver.extraJavaOptions='-Djdk.httpclient.allowRestrictedHeaders=host' ${[
60 |             ...Object.entries(hiveConf).flatMap(([k, v]) => [
61 |               "--hiveconf",
62 |               `${k}=${v}`,
63 |             ]),
64 |             ...Object.entries(sparkConf).flatMap(([k, v]) => [
65 |               "--conf",
66 |               `${k}=${v}`,
67 |             ]),
68 |           ].join(" ")}`,
69 |         ],
70 |       },
71 |       actionOnFailure: "CANCEL_AND_WAIT",
72 |     });
73 |   }
74 | 
75 |   public allowFrom(...connectables: IConnectable[]) {
76 |     for (const connectable of connectables) {
77 |       this.cluster.connections.allowFrom(
78 |         connectable,
79 |         Port.tcp(this.options.port),
80 |       );
81 |     }
82 |   }
83 | }
84 | 
-------------------------------------------------------------------------------- /packyak-aws-cdk/src/emr/configuration.ts: --------------------------------------------------------------------------------
1 | import { mergeSparkExtraJars } from "./spark-config";
2 | 
3 | export interface Configuration {
4 |   readonly classification: string;
5 |   readonly configurationProperties: Record<string, string>;
6 |   readonly configurations?: Configuration[];
7 | }
8 | // TODO: if keys like `"spark.jars.packages"` collide, join by , and dedupe
9 | export function combineConfigurations(
10 |   ...configs: ((Configuration | undefined)[] | Configuration | undefined)[]
11 | ): Configuration[] | undefined {
12 |   const mergedConfigurations =
13 |     configs
14 |       ?.flat()
15 |       .reduce(
16 |         (
17 |           finalConfig: { [classification: string]: Configuration },
18 |           next: Configuration | undefined,
19 |         ) => {
20 |           if (next === undefined) {
21 |             return finalConfig;
22 |           }
23 |           const { classification, configurationProperties, configurations } =
24 |             next;
25 |           if (!finalConfig[classification]) {
26 |             finalConfig[classification] = {
27 |               classification,
28 |               configurationProperties: {},
29 |               configurations: undefined,
30 |             };
31 |           }
32 | 
33 |           // @ts-expect-error - slight hack to overwrite the readonly array. JSII requires readonly properties.
34 |           finalConfig[classification].configurations = combineConfigurations(
35 |             finalConfig[classification].configurations,
36 |             configurations,
37 |           );
38 |           const csvProperties = new Set([
39 |             "spark.jars.packages",
40 |             "spark.sql.extensions",
41 |           ]);
42 |           for (const [key, value] of Object.entries(configurationProperties)) {
43 |             if (csvProperties.has(key)) {
44 |               const existing = finalConfig[classification]
45 |                 .configurationProperties[key]
46 |                 ? 
finalConfig[classification].configurationProperties[ 47 | key 48 | ].split(",") 49 | : []; 50 | const newValues = value.split(","); 51 | const merged = [...new Set([...existing, ...newValues])].join( 52 | ",", 53 | ); 54 | finalConfig[classification].configurationProperties[key] = merged; 55 | } else if (key == "spark.driver.extraJavaOptions") { 56 | finalConfig[classification].configurationProperties[key] = 57 | mergeSparkExtraJars( 58 | finalConfig[classification].configurationProperties[key], 59 | value, 60 | ); 61 | } else { 62 | finalConfig[classification].configurationProperties[key] = value; 63 | } 64 | } 65 | 66 | return finalConfig; 67 | }, 68 | {}, 69 | ) ?? []; 70 | const configurations = Object.values(mergedConfigurations); 71 | if (configurations.length == 0) { 72 | return undefined; 73 | } 74 | return configurations; 75 | } 76 | -------------------------------------------------------------------------------- /packyak-aws-cdk/src/emr/fleet-cluster.ts: -------------------------------------------------------------------------------- 1 | import { Construct } from "constructs"; 2 | import { type BaseClusterProps, Cluster } from "./cluster"; 3 | import type { InstanceFleet } from "./instance-fleet"; 4 | import { ComputeUnit } from "./managed-scaling"; 5 | 6 | export interface FleetClusterProps extends BaseClusterProps { 7 | /** 8 | * Describes the EC2 instances and instance configurations for the primary 9 | * {@link InstanceFleet} when using {@link FleetCluster}s. 10 | * 11 | * @see http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-emr-cluster-jobflowinstancesconfig.html#cfn-emr-cluster-jobflowinstancesconfig-masterinstancefleet 12 | */ 13 | readonly primaryInstanceFleet: InstanceFleet; 14 | /** 15 | * Describes the EC2 instances and instance configurations for the core {@link InstanceFleet}. 16 | * 17 | * @see http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-emr-cluster-jobflowinstancesconfig.html#cfn-emr-cluster-jobflowinstancesconfig-coreinstancefleet 18 | */ 19 | readonly coreInstanceFleet: InstanceFleet; 20 | /** 21 | * Describes the EC2 instances and instance configurations for the task {@link InstanceFleet}s. 22 | * 23 | * These task {@link InstanceFleet}s are added to the cluster as part of the cluster launch. 24 | * Each task {@link InstanceFleet} must have a unique name specified so that CloudFormation 25 | * can differentiate between the task {@link InstanceFleet}s. 26 | * 27 | * > You can currently specify only one task instance fleet for a cluster. After creating the cluster, you can only modify the mutable properties of `InstanceFleetConfig` , which are `TargetOnDemandCapacity` and `TargetSpotCapacity` . Modifying any other property results in cluster replacement. > To allow a maximum of 30 Amazon EC2 instance types per fleet, include `TaskInstanceFleets` when you create your cluster. If you create your cluster without `TaskInstanceFleets` , Amazon EMR uses its default allocation strategy, which allows for a maximum of five Amazon EC2 instance types. 28 | * 29 | * @see http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-emr-cluster-jobflowinstancesconfig.html#cfn-emr-cluster-jobflowinstancesconfig-taskinstancefleets 30 | */ 31 | readonly taskInstanceFleets?: InstanceFleet[]; 32 | } 33 | 34 | /** 35 | * An EMR Cluster that is comprised of {@link InstanceFleet}s. 
36 | * 37 | * @see https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-instance-fleet.html 38 | * @see https://docs.aws.amazon.com/emr/latest/ManagementGuide/on-demand-capacity-reservations.html 39 | */ 40 | export class FleetCluster extends Cluster { 41 | constructor(scope: Construct, id: string, props: FleetClusterProps) { 42 | if (props.managedScalingPolicy) { 43 | if ( 44 | props.managedScalingPolicy.computeLimits.unitType !== 45 | ComputeUnit.INSTANCE_FLEET_UNITS 46 | ) { 47 | throw new Error( 48 | `If you are using a FleetCluster, you must use INSTANCE_FLEET_UNITS as the ComputeLimitsUnitType`, 49 | ); 50 | } 51 | } 52 | super(scope, id, props); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /packyak-aws-cdk/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@packyak/aws-cdk", 3 | "description": "AWS CDK Constructs for the PackYak Lakehouse Platform", 4 | "version": "0.4.37", 5 | "author": "Sam Goodwin", 6 | "license": "Apache-2.0", 7 | "repository": { 8 | "type": "git", 9 | "url": "https://github.com/sam-goodwin/packyak" 10 | }, 11 | "keywords": [ 12 | "packyak", 13 | "aws-cdk", 14 | "lakehouse" 15 | ], 16 | "main": "lib/index.js", 17 | "types": "lib/index.d.ts", 18 | "typesVersions": { 19 | "<=3.9": { 20 | "lib/*": [ 21 | "lib/.types-compat/ts3.9/*", 22 | "lib/.types-compat/ts3.9/*/index.d.ts" 23 | ] 24 | } 25 | }, 26 | "bin": { 27 | "packyak": "./bin/packyak.mjs" 28 | }, 29 | "exports": { 30 | ".": { 31 | "import": "./lib/index.js", 32 | "require": "./lib/index.js", 33 | "types": "./lib/index.d.ts" 34 | } 35 | }, 36 | "files": [ 37 | "bin", 38 | "lib", 39 | "scripts", 40 | ".jsii" 41 | ], 42 | "scripts": { 43 | "build": "jsii && jsii-pacmak", 44 | "publish:pypi": "./scripts/release.sh", 45 | "build:watch": "jsii --watch", 46 | "clean-src": "find ./src -name '*.d.ts' -exec rm {} \\; && find ./src -name '*.js' -exec rm {} \\;" 47 | }, 48 | "devDependencies": { 49 | "@aws-cdk/aws-glue-alpha": "2.134.0-alpha.0", 50 | "@aws-cdk/aws-lambda-python-alpha": "2.134.0-alpha.0", 51 | "@aws-cdk/aws-sagemaker-alpha": "2.134.0-alpha.0", 52 | "@aws-sdk/client-efs": "^3.515.0", 53 | "@aws-sdk/client-sagemaker": "^3.515.0", 54 | "@biomejs/biome": "^1.5.3", 55 | "@tsconfig/node20": "^20.1.2", 56 | "@types/aws-lambda": "^8.10.134", 57 | "@types/node": "^20.10.8", 58 | "aws-cdk": "2.134.0", 59 | "aws-cdk-lib": "2.134.0", 60 | "bun": "^1.0.22", 61 | "constructs": "10.3.0", 62 | "esbuild": "^0.20.1", 63 | "jsii": "^5.3.20", 64 | "jsii-config": "^1.94.0", 65 | "jsii-pacmak": "^1.94.0", 66 | "sst": "^2.39.5", 67 | "tsx": "^4.7.0", 68 | "typescript": "^5.3.3" 69 | }, 70 | "peerDependencies": { 71 | "@aws-cdk/aws-glue-alpha": "2.134.0-alpha.0", 72 | "@aws-cdk/aws-lambda-python-alpha": "2.134.0-alpha.0", 73 | "@aws-cdk/aws-sagemaker-alpha": "2.134.0-alpha.0", 74 | "aws-cdk-lib": "2.134.0", 75 | "constructs": "^10.3.0" 76 | }, 77 | "peerDependenciesMeta": { 78 | "constructs": { 79 | "optional": true 80 | }, 81 | "aws-cdk-lib": { 82 | "optional": true 83 | }, 84 | "@aws-cdk/aws-lambda-python-alpha": { 85 | "optional": true 86 | }, 87 | "@aws-cdk/aws-sagemaker-alpha": { 88 | "optional": true 89 | }, 90 | "@aws-cdk/aws-glue-alpha": { 91 | "optional": true 92 | } 93 | }, 94 | "publishConfig": { 95 | "access": "public" 96 | }, 97 | "stability": "experimental", 98 | "jsii": { 99 | "targets": { 100 | "python": { 101 | "module": "packyak_aws_cdk", 102 | "distName": "packyak-aws-cdk", 103 | 
"classifiers": [ 104 | "Framework :: AWS CDK", 105 | "Framework :: AWS CDK :: 1" 106 | ] 107 | } 108 | }, 109 | "outdir": "lib.jsii", 110 | "versionFormat": "full", 111 | "tsc": { 112 | "outDir": "lib", 113 | "rootDir": "src" 114 | } 115 | } 116 | } -------------------------------------------------------------------------------- /packyak-nessie/gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 15 | @rem 16 | 17 | @if "%DEBUG%" == "" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%" == "" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 34 | 35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 37 | 38 | @rem Find java.exe 39 | if defined JAVA_HOME goto findJavaFromJavaHome 40 | 41 | set JAVA_EXE=java.exe 42 | %JAVA_EXE% -version >NUL 2>&1 43 | if "%ERRORLEVEL%" == "0" goto init 44 | 45 | echo. 46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 47 | echo. 48 | echo Please set the JAVA_HOME variable in your environment to match the 49 | echo location of your Java installation. 50 | 51 | goto fail 52 | 53 | :findJavaFromJavaHome 54 | set JAVA_HOME=%JAVA_HOME:"=% 55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 56 | 57 | if exist "%JAVA_EXE%" goto init 58 | 59 | echo. 60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 61 | echo. 62 | echo Please set the JAVA_HOME variable in your environment to match the 63 | echo location of your Java installation. 64 | 65 | goto fail 66 | 67 | :init 68 | @rem Get command-line arguments, handling Windows variants 69 | 70 | if not "%OS%" == "Windows_NT" goto win9xME_args 71 | 72 | :win9xME_args 73 | @rem Slurp the command line arguments. 
74 | set CMD_LINE_ARGS= 75 | set _SKIP=2 76 | 77 | :win9xME_args_slurp 78 | if "x%~1" == "x" goto execute 79 | 80 | set CMD_LINE_ARGS=%* 81 | 82 | :execute 83 | @rem Setup the command line 84 | 85 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 86 | 87 | 88 | @rem Execute Gradle 89 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 90 | 91 | :end 92 | @rem End local scope for the variables with windows NT shell 93 | if "%ERRORLEVEL%"=="0" goto mainEnd 94 | 95 | :fail 96 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 97 | rem the _cmd.exe /c_ return code! 98 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 99 | exit /b 1 100 | 101 | :mainEnd 102 | if "%OS%"=="Windows_NT" endlocal 103 | 104 | :omega 105 |
-------------------------------------------------------------------------------- /packyak-docs/docs/tutorial-basics/markdown-features.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 4 3 | --- 4 | 5 | # Markdown Features 6 | 7 | Docusaurus supports **[Markdown](https://daringfireball.net/projects/markdown/syntax)** and a few **additional features**. 8 | 9 | ## Front Matter 10 | 11 | Markdown documents have metadata at the top called [Front Matter](https://jekyllrb.com/docs/front-matter/): 12 | 13 | ```text title="my-doc.md" 14 | // highlight-start 15 | --- 16 | id: my-doc-id 17 | title: My document title 18 | description: My document description 19 | slug: /my-custom-url 20 | --- 21 | // highlight-end 22 | 23 | ## Markdown heading 24 | 25 | Markdown text with [links](./hello.md) 26 | ``` 27 | 28 | ## Links 29 | 30 | Regular Markdown links are supported, using url paths or relative file paths. 31 | 32 | ```md 33 | Let's see how to [Create a page](/create-a-page). 34 | ``` 35 | 36 | ```md 37 | Let's see how to [Create a page](./create-a-page.md). 38 | ``` 39 | 40 | **Result:** Let's see how to [Create a page](./create-a-page.md). 41 | 42 | ## Images 43 | 44 | Regular Markdown images are supported. 45 | 46 | You can use absolute paths to reference images in the static directory (`static/img/docusaurus.png`): 47 | 48 | ```md 49 | ![Docusaurus logo](/img/docusaurus.png) 50 | ``` 51 | 52 | ![Docusaurus logo](/img/docusaurus.png) 53 | 54 | You can reference images relative to the current file as well. This is particularly useful to colocate images close to the Markdown files using them: 55 | 56 | ```md 57 | ![Docusaurus logo](./img/docusaurus.png) 58 | ``` 59 | 60 | ## Code Blocks 61 | 62 | Markdown code blocks are supported with Syntax highlighting. 63 | 64 | ````md 65 | ```jsx title="src/components/HelloDocusaurus.js" 66 | function HelloDocusaurus() { 67 | return <h1>Hello, Docusaurus!</h1>; 68 | } 69 | ``` 70 | ```` 71 | 72 | ```jsx title="src/components/HelloDocusaurus.js" 73 | function HelloDocusaurus() { 74 | return <h1>Hello, Docusaurus!</h1>; 75 | } 76 | ``` 77 | 78 | ## Admonitions 79 | 80 | Docusaurus has a special syntax to create admonitions and callouts: 81 | 82 | ```md 83 | :::tip My tip 84 | 85 | Use this awesome feature option 86 | 87 | ::: 88 | 89 | :::danger Take care 90 | 91 | This action is dangerous 92 | 93 | ::: 94 | ``` 95 | 96 | :::tip My tip 97 | 98 | Use this awesome feature option 99 | 100 | ::: 101 | 102 | :::danger Take care 103 | 104 | This action is dangerous 105 | 106 | ::: 107 | 108 | ## MDX and React Components 109 | 110 | [MDX](https://mdxjs.com/) can make your documentation more **interactive** and allows using any **React components inside Markdown**: 111 | 112 | ```jsx 113 | export const Highlight = ({children, color}) => ( 114 | <span 115 | style={{ 116 | backgroundColor: color, 117 | borderRadius: '20px', 118 | color: '#fff', 119 | padding: '10px', 120 | cursor: 'pointer', 121 | }} 122 | onClick={() => { 123 | alert(`You clicked the color ${color} with label ${children}`) 124 | }}> 125 | {children} 126 | </span> 127 | ); 128 | 129 | This is <Highlight color="#25c2a0">Docusaurus green</Highlight> ! 130 | 131 | This is <Highlight color="#1877F2">Facebook blue</Highlight> ! 132 | ``` 133 | 134 | export const Highlight = ({children, color}) => ( 135 | <span 136 | style={{ 137 | backgroundColor: color, 138 | borderRadius: '20px', 139 | color: '#fff', 140 | padding: '10px', 141 | cursor: 'pointer', 142 | }} 143 | onClick={() => { 144 | alert(`You clicked the color ${color} with label ${children}`); 145 | }}> 146 | {children} 147 | </span> 148 | ); 149 | 150 | This is <Highlight color="#25c2a0">Docusaurus green</Highlight> ! 151 | 152 | This is <Highlight color="#1877F2">Facebook blue</Highlight> ! 153 |
-------------------------------------------------------------------------------- /packyak-aws-cdk/src/nessie/nessie-ecs-catalog.ts: -------------------------------------------------------------------------------- 1 | import { Platform } from "aws-cdk-lib/aws-ecr-assets"; 2 | import { 3 | AwsLogDriver, 4 | ContainerImage, 5 | CpuArchitecture, 6 | OperatingSystemFamily, 7 | } from "aws-cdk-lib/aws-ecs"; 8 | import { 9 | ApplicationLoadBalancedFargateService, 10 | ApplicationLoadBalancedFargateServiceProps, 11 | } from "aws-cdk-lib/aws-ecs-patterns"; 12 | import { 13 | IGrantable, 14 | IPrincipal, 15 | Role, 16 | ServicePrincipal, 17 | } from "aws-cdk-lib/aws-iam"; 18 | import { Construct } from "constructs"; 19 | import { BaseNessieCatalog, BaseNessieRepoProps } from "./base-nessie-catalog"; 20 | import type { DNSConfiguration } from "../dns-configuration"; 21 | import { ILogGroup, LogGroup } from "aws-cdk-lib/aws-logs"; 22 | 23 | export interface NessieECSCatalogProps 24 | extends BaseNessieRepoProps, 25 | ApplicationLoadBalancedFargateServiceProps { 26 | readonly platform?: Platform; 27 | readonly dns?: DNSConfiguration; 28 | } 29 | 30 | export class NessieECSCatalog extends BaseNessieCatalog implements IGrantable { 31 | public readonly service: ApplicationLoadBalancedFargateService; 32 | 33 | public override readonly endpoint: string; 34 | 35 | public readonly grantPrincipal: IPrincipal; 36 | public readonly logGroup: ILogGroup; 37 | 38 | constructor(scope: Construct, id: string, props: NessieECSCatalogProps) { 39 | super(scope, id, props); 40 | 41 | const platform = props?.platform ?? Platform.LINUX_AMD64; 42 | 43 | const taskRole = new Role(this, "TaskRole", { 44 | assumedBy: new ServicePrincipal("ecs-tasks.amazonaws.com"), 45 | }); 46 | 47 | // TODO: logs 48 | this.grantPrincipal = taskRole; 49 | 50 | this.logGroup = 51 | props.logGroup ?? 52 | new LogGroup(this, "LogGroup", { 53 | logGroupName: `/nessie/${this.catalogName}`, 54 | }); 55 | 56 | this.service = new ApplicationLoadBalancedFargateService(this, "Service", { 57 | cluster: props?.cluster, 58 | vpc: props.vpc, 59 | serviceName: props.serviceName, 60 | runtimePlatform: { 61 | cpuArchitecture: 62 | platform === Platform.LINUX_AMD64 63 | ?
CpuArchitecture.X86_64 64 | : CpuArchitecture.ARM64, 65 | operatingSystemFamily: OperatingSystemFamily.LINUX, 66 | }, 67 | cpu: props?.cpu ?? 256, 68 | memoryLimitMiB: props?.memoryLimitMiB ?? 512, 69 | publicLoadBalancer: props?.publicLoadBalancer, 70 | certificate: props?.dns?.certificate, 71 | domainName: props?.dns?.domainName, 72 | domainZone: props?.dns?.hostedZone, 73 | taskImageOptions: { 74 | ...(props?.taskImageOptions ?? {}), 75 | environment: { 76 | ...this.configAsEnvVars(), 77 | ...props?.taskImageOptions?.environment, 78 | }, 79 | logDriver: AwsLogDriver.awsLogs({ 80 | streamPrefix: "nessie", 81 | logGroup: this.logGroup, 82 | }), 83 | containerPort: props?.taskImageOptions?.containerPort ?? 19120, 84 | taskRole, 85 | image: 86 | props?.taskImageOptions?.image ?? 87 | ContainerImage.fromRegistry("ghcr.io/projectnessie/nessie"), 88 | }, 89 | }); 90 | 91 | // this.service.loadBalancer.addListener("HTTPS", { 92 | // port: 443, 93 | // protocol: ApplicationProtocol.HTTPS, 94 | // }); 95 | this.versionStore.grantReadWriteData(taskRole); 96 | 97 | this.service.targetGroup.configureHealthCheck({ 98 | // uses smallrye-health: 99 | // see: https://redhat-developer-demos.github.io/quarkus-tutorial/quarkus-tutorial/health.html 100 | path: "/q/health", 101 | }); 102 | 103 | if (props?.dns) { 104 | this.endpoint = `https://${props.dns.domainName}`; 105 | } else { 106 | this.endpoint = `http://${this.service.loadBalancer.loadBalancerDnsName}`; 107 | } 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /packyak-aws-cdk/scripts/mount-efs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | while [[ "$#" -gt 0 ]]; do 6 | case $1 in 7 | --file-system-id) FILE_SYSTEM_ID="$2"; shift ;; 8 | --mount-point) MOUNT_POINT="$2"; shift ;; 9 | --access-point-id) ACCESS_POINT_ID="$2"; shift ;; 10 | --user) USERNAME="$2"; shift ;; 11 | --uid) USER_ID="$2"; shift ;; 12 | --gid) GROUP_ID="$2"; shift ;; 13 | *) echo "Unknown parameter passed: $1"; exit 1 ;; 14 | esac 15 | shift 16 | done 17 | 18 | check_variable() { 19 | if [ -z "${!1}" ]; then 20 | echo "Error: $1 is undefined." 21 | exit 1 22 | fi 23 | } 24 | 25 | check_variable FILE_SYSTEM_ID 26 | check_variable MOUNT_POINT 27 | check_variable USERNAME 28 | check_variable GROUP_ID 29 | check_variable USER_ID 30 | 31 | 32 | # EMR 6 is having problems with parallel yum operations (for unknown reasons); loop until the lock is released 33 | max_wait_seconds=60 34 | start_time=$(date +%s) 35 | 36 | while sudo fuser /var/run/yum.pid >/dev/null 2>&1; do 37 | current_time=$(date +%s) 38 | elapsed_seconds=$((current_time - start_time)) 39 | if [[ $elapsed_seconds -ge $max_wait_seconds ]]; then 40 | echo "Yum lock could not be acquired within $max_wait_seconds seconds. Exiting." 41 | exit 1 42 | fi 43 | echo "Yum lock is held by another process. Retrying in 1 second" 44 | sleep 1 45 | done 46 | 47 | sudo yum update -y 48 | sudo yum check-update -y 49 | sudo yum upgrade -y 50 | sudo yum install -y amazon-efs-utils nfs-utils 51 | sudo yum install -y openssl-devel bzip2-devel libffi-devel zlib-devel xz-devel sqlite-devel readline-devel 52 | 53 | if ! getent group $GROUP_ID &>/dev/null; then 54 | echo "Group with GID $GROUP_ID does not exist, creating group." 55 | sudo groupadd --gid $GROUP_ID ${USERNAME} 56 | else 57 | echo "Group with GID $GROUP_ID already exists, proceeding."
58 | fi 59 | 60 | # create the user if it does not exist 61 | if id "$USERNAME" &>/dev/null; then 62 | echo "User ${USERNAME} exists, proceeding to mount EFS." 63 | else 64 | echo "User ${USERNAME} does not exist, creating user." 65 | sudo adduser --uid ${USER_ID} --gid ${GROUP_ID} ${USERNAME} 66 | 67 | # user needs to be able to run docker 68 | sudo usermod -aG docker ${USERNAME} 69 | sudo usermod -aG hadoop ${USERNAME} 70 | sudo usermod -aG hdfsadmingroup ${USERNAME} 71 | sudo usermod -aG hdfs ${USERNAME} 72 | sudo usermod -aG spark ${USERNAME} 73 | 74 | # allow yarn to access the user's files that allow read access at the group level 75 | sudo usermod -aG ${USERNAME} yarn 76 | fi 77 | 78 | # create a local directory on the EBS device owned by the user 79 | # this will not be persisted across sessions but gives a directory 80 | # for users to store latency-sensitive files 81 | LOCAL_DIR=/mnt/${USERNAME} 82 | sudo mkdir -p $LOCAL_DIR 83 | sudo chown ${USERNAME}:${GROUP_ID} $LOCAL_DIR 84 | 85 | # TODO: add ssh pub keys, set them up in the bootstrap 86 | # TODO: remove ssm-user from the sudoers file 87 | # result: now i can't log in as tyler by adding my ssh key .. 88 | 89 | sudo mkdir -p ${MOUNT_POINT} 90 | sudo chown ${USERNAME}:${GROUP_ID} ${MOUNT_POINT} 91 | sudo chmod 750 ${MOUNT_POINT} 92 | 93 | sudo mkdir -p ${MOUNT_POINT}/.ssh 94 | sudo chown ${USERNAME}:${GROUP_ID} ${MOUNT_POINT}/.ssh 95 | sudo chmod 750 ${MOUNT_POINT}/.ssh 96 | 97 | if [ ! -z "${ACCESS_POINT_ID}" ]; then 98 | PARAMS="${PARAMS:+${PARAMS},}accesspoint=${ACCESS_POINT_ID}" # append without producing a leading comma 99 | fi 100 | if [ ! -z "$PARAMS" ]; then 101 | PARAMS=",$PARAMS" 102 | fi 103 | 104 | 105 | # see: https://docs.aws.amazon.com/efs/latest/ug/mounting-fs-mount-helper-ec2-linux.html 106 | 107 | # file-system-id:/ efs-mount-point efs _netdev,noresvport,tls,iam,accesspoint=access-point-id 0 0 108 | 109 | # Modify the section for adding the mount point to /etc/fstab 110 | echo "${FILE_SYSTEM_ID}:/ ${MOUNT_POINT} efs _netdev,noresvport,tls,iam${PARAMS} 0 0" | sudo tee -a /etc/fstab 111 | 112 | # mount the newly added file system 113 | sudo mount ${MOUNT_POINT} 114 | 115 | echo Mounted ${MOUNT_POINT} successfully. -------------------------------------------------------------------------------- /packyak-aws-cdk/src/workspace/workspace.ts: -------------------------------------------------------------------------------- 1 | import type { Connections, IConnectable } from "aws-cdk-lib/aws-ec2"; 2 | import { 3 | FileSystem, 4 | ThroughputMode, 5 | type FileSystemProps, 6 | } from "aws-cdk-lib/aws-efs"; 7 | import { RemovalPolicy } from "aws-cdk-lib/core"; 8 | import { Construct } from "constructs"; 9 | import { PosixGroup } from "./group"; 10 | import { Home } from "./home"; 11 | 12 | export interface MountFileSystemOptions { 13 | readonly mountPoint: string; 14 | readonly username: string; 15 | readonly uid: number; 16 | readonly gid: number; 17 | } 18 | 19 | export interface AddHomeRequest { 20 | /** 21 | * The username for the user. This should be unique across all users. 22 | */ 23 | readonly username: string; 24 | /** 25 | * The POSIX user ID for the user. This should be a unique identifier. 26 | */ 27 | readonly uid: string; 28 | /** 29 | * The POSIX group ID for the user. This is used for file system permissions. 30 | * 31 | * @default - same as the uid 32 | */ 33 | readonly gid?: string; 34 | /** 35 | * Secondary groups to assign to files written to this home directory.
36 | */ 37 | readonly secondaryGroups?: PosixGroup[]; 38 | } 39 | 40 | export interface WorkspaceProps extends FileSystemProps {} 41 | 42 | /** 43 | * A Workspace is a shared environment for a team of developers to work on a project together. 44 | * 45 | * A Workspace contains a shared EFS {@link FileSystem} with {@link AccessPoint}s 46 | * for each {@link User} granted access to the system. 47 | * 48 | * A Workspace can be mounted to EC2 machines, SageMaker Domains and AWS EMR Clusters. 49 | */ 50 | export class Workspace extends Construct implements IConnectable { 51 | /** 52 | * EFS File System shared by all users of the Workspace. 53 | */ 54 | public readonly fileSystem: FileSystem; 55 | /** 56 | * Home directory of the `ssm-user` POSIX user. 57 | * 58 | * This is the default user assigned when logging into a machine via SSM. 59 | */ 60 | public readonly ssm: Home; 61 | /** 62 | * Connections for the EFS file system 63 | */ 64 | public readonly connections: Connections; 65 | 66 | // isolated scoping for Home directories 67 | private readonly homes = new Construct(this, "Homes"); 68 | 69 | constructor(scope: Construct, id: string, props: WorkspaceProps) { 70 | super(scope, id); 71 | 72 | this.fileSystem = new FileSystem(this, "FileSystem", { 73 | ...props, 74 | vpc: props.vpc, 75 | // switch default to Elastic as it seems more hands off 76 | throughputMode: props.throughputMode ?? ThroughputMode.ELASTIC, 77 | // switch the default to encrypted, this is designed to store sensitive user data in a home directory 78 | // e.g. ssh keys, .env files, API keys, credentials, proprietary code 79 | encrypted: props.encrypted ?? true, 80 | removalPolicy: props.removalPolicy ?? RemovalPolicy.RETAIN, 81 | }); 82 | this.connections = this.fileSystem.connections; 83 | 84 | // TODO: disable root permissions from ssm-user 85 | // https://docs.aws.amazon.com/systems-manager/latest/userguide/session-manager-getting-started-ssm-user-permissions.html 86 | 87 | this.ssm = this.addHome({ 88 | username: "ssm-user", 89 | // TODO: what's the default UID and GID for the ssm-user when created by AWS? 90 | uid: "2000", 91 | gid: "2000", 92 | }); 93 | } 94 | 95 | /** 96 | * Allow access to the EFS file system from a connectable, e.g. SecurityGroup. 97 | * 98 | * @param connectable the connectable to allow access to the shared EFS file system 99 | */ 100 | public allowFrom(connectable: IConnectable) { 101 | this.fileSystem.connections.allowDefaultPortFrom(connectable); 102 | } 103 | 104 | /** 105 | * Add a home directory to the workspace 106 | */ 107 | public addHome(props: AddHomeRequest) { 108 | return new Home(this.homes, props.username, { 109 | fileSystem: this.fileSystem, 110 | username: props.username, 111 | uid: props.uid, 112 | gid: props.gid, 113 | // forward secondary groups so they are applied to the Home's access point 114 | secondaryGroups: props.secondaryGroups, 115 | }); 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /packyak-aws-cdk/src/emr/glue-catalog.ts: -------------------------------------------------------------------------------- 1 | import { Stack } from "aws-cdk-lib"; 2 | import { PolicyStatement } from "aws-cdk-lib/aws-iam"; 3 | import type { ICatalog } from "./catalog"; 4 | import type { Cluster } from "./cluster"; 5 | import { Bucket, IBucket } from "aws-cdk-lib/aws-s3"; 6 | import { Construct } from "constructs"; 7 | import { SparkSqlExtension } from "./spark-sql-extension"; 8 | 9 | export interface IcebergGlueCatalogProps { 10 | /** 11 | * The S3 bucket where the Iceberg table data is stored.
12 | * 13 | * @default - one is created for you 14 | */ 15 | readonly warehouseBucket?: IBucket; 16 | /** 17 | * The prefix for the Iceberg table data in the S3 bucket. 18 | * 19 | * @default - no prefix (e.g. use the root: `s3://bucket/`) 20 | */ 21 | readonly warehousePrefix?: string; 22 | } 23 | 24 | export interface FromBucketProps { 25 | readonly warehouseBucketName: string; 26 | readonly warehousePrefix?: string; 27 | } 28 | 29 | export class IcebergGlueCatalog extends Construct implements ICatalog { 30 | private readonly warehouseBucket: IBucket; 31 | private readonly warehousePrefix: string | undefined; 32 | 33 | public static fromBucketName( 34 | scope: Construct, 35 | id: string, 36 | props: FromBucketProps, 37 | ) { 38 | return new IcebergGlueCatalog(scope, id, { 39 | warehouseBucket: Bucket.fromBucketName( 40 | scope, 41 | `${id}WarehouseBucket`, 42 | props.warehouseBucketName, 43 | ), 44 | warehousePrefix: props.warehousePrefix, 45 | }); 46 | } 47 | 48 | constructor(scope: Construct, id: string, props: IcebergGlueCatalogProps) { 49 | super(scope, id); 50 | this.warehouseBucket = 51 | props.warehouseBucket ?? new Bucket(this, "WarehouseBucket"); 52 | this.warehousePrefix = props.warehousePrefix; 53 | } 54 | 55 | public bind(cluster: Cluster, catalogName: string): void { 56 | // TODO: should we limit this to the warehouse prefix 57 | this.warehouseBucket.grantReadWrite(cluster, "*"); 58 | const { partition, region, account } = Stack.of(cluster); 59 | cluster.grantPrincipal.addToPrincipalPolicy( 60 | new PolicyStatement({ 61 | actions: ["glue:GetDatabase"], 62 | resources: [`arn:${partition}:glue:${region}:${account}:catalog`], 63 | }), 64 | ); 65 | const sparkVersion = cluster.release.sparkVersion; 66 | const scalaVersion = cluster.release.scalaVersion; 67 | const icebergExt = SparkSqlExtension.Iceberg.maven( 68 | sparkVersion, 69 | scalaVersion, 70 | ); 71 | const catalogNamespace = `spark.sql.catalog.${catalogName}`; 72 | cluster.addConfig( 73 | { 74 | classification: "spark-hive-site", 75 | configurationProperties: { 76 | "hive.metastore.client.factory.class": 77 | "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory", 78 | }, 79 | }, 80 | { 81 | classification: "spark-defaults", 82 | configurationProperties: { 83 | "spark.jars.packages": icebergExt, 84 | "spark.sql.extensions": SparkSqlExtension.Iceberg.className, 85 | // "spark.sql.catalogImplementation": "hive", 86 | [catalogNamespace]: "org.apache.iceberg.spark.SparkCatalog", 87 | [`${catalogNamespace}.warehouse`]: `s3://${ 88 | this.warehouseBucket.bucketName 89 | }${ 90 | this.warehousePrefix 91 | ? 
`/${this.warehousePrefix.replace(/^[\/]*/g, "")}` 92 | : "" 93 | }`, 94 | [`${catalogNamespace}.catalog-impl`]: 95 | "org.apache.iceberg.aws.glue.GlueCatalog", 96 | [`${catalogNamespace}.io-impl`]: "org.apache.iceberg.aws.s3.S3FileIO", 97 | }, 98 | }, 99 | ); 100 | } 101 | 102 | /* 103 | spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions 104 | --conf spark.sql.catalog.glue_catalog=org.apache.iceberg.spark.SparkCatalog 105 | --conf spark.sql.catalog.glue_catalog.warehouse=s3:/// 106 | --conf spark.sql.catalog.glue_catalog.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog 107 | --conf spark.sql.catalog.glue_catalog.io-impl=org.apache.iceberg.aws.s3.S3FileIO 108 | */ 109 | } 110 | -------------------------------------------------------------------------------- /packyak-aws-cdk/src/sagemaker/sage-maker-image.ts: -------------------------------------------------------------------------------- 1 | import { type Stack, CfnMapping, Arn } from "aws-cdk-lib/core"; 2 | 3 | export const SageMakerImageSingletonID = "sagemaker:image:mapping"; 4 | export const SageMakerDistributionImageSingletonID = 5 | "sagemaker:image:mapping:distribution"; 6 | 7 | export enum SageMakerImageType { 8 | DISTRIBUTION = "Distribution", 9 | IMAGE = "Image", 10 | } 11 | 12 | // AWS stores each image in a different account based on two factors: 13 | // 1. The region 14 | // 2. Whether it is a "Distribution Image" or a "Standard Image" 15 | // https://docs.aws.amazon.com/sagemaker/latest/dg/notebooks-available-images.html 16 | const imageArnAccounts = { 17 | "us-east-1": "081325390199", 18 | "us-east-2": "429704687514", 19 | "us-west-1": "742091327244", 20 | "us-west-2": "236514542706", 21 | "af-south-1": "559312083959", 22 | "ap-east-1": "493642496378", 23 | "ap-south-1": "394103062818", 24 | "ap-northeast-2": "806072073708", 25 | "ap-southeast-1": "492261229750", 26 | "ap-southeast-2": "452832661640", 27 | "ap-northeast-1": "102112518831", 28 | "ca-central-1": "310906938811", 29 | "eu-central-1": "936697816551", 30 | "eu-west-1": "470317259841", 31 | "eu-west-2": "712779665605", 32 | "eu-west-3": "615547856133", 33 | "eu-north-1": "243637512696", 34 | "eu-south-1": "592751261982", 35 | "sa-east-1": "782484402741", 36 | "ap-northeast-3": "792733760839", 37 | "ap-southeast-3": "276181064229", 38 | "me-south-1": "117516905037", 39 | "me-central-1": "103105715889", 40 | }; 41 | const distributionImageArnAccounts = { 42 | "us-east-1": "885854791233", 43 | "us-east-2": "137914896644", 44 | "us-west-1": "053634841547", 45 | "us-west-2": "542918446943", 46 | "af-south-1": "238384257742", 47 | "ap-east-1": "523751269255", 48 | "ap-south-1": "245090515133", 49 | "ap-northeast-2": "064688005998", 50 | "ap-southeast-1": "022667117163", 51 | "ap-southeast-2": "648430277019", 52 | "ap-northeast-1": "010972774902", 53 | "ca-central-1": "481561238223", 54 | "eu-central-1": "545423591354", 55 | "eu-west-1": "819792524951", 56 | "eu-west-2": "021081402939", 57 | "eu-west-3": "856416204555", 58 | "eu-north-1": "175620155138", 59 | "eu-south-1": "810671768855", 60 | "sa-east-1": "567556641782", 61 | "ap-northeast-3": "564864627153", 62 | "ap-southeast-3": "370607712162", 63 | "me-south-1": "523774347010", 64 | "me-central-1": "358593528301", 65 | }; 66 | 67 | export class SageMakerImage { 68 | public static readonly CPU_V1 = new SageMakerImage( 69 | "sagemaker-distribution-cpu-v1", 70 | SageMakerImageType.DISTRIBUTION, 71 | ); 72 | public static readonly CPU_V0 = new SageMakerImage( 73 | 
"sagemaker-distribution-cpu-v0", 74 | SageMakerImageType.DISTRIBUTION, 75 | ); 76 | public static readonly GPU_V0 = new SageMakerImage( 77 | "sagemaker-distribution-gpu-v0", 78 | SageMakerImageType.DISTRIBUTION, 79 | ); 80 | public static readonly GPU_V1 = new SageMakerImage( 81 | "sagemaker-distribution-gpu-v1", 82 | SageMakerImageType.DISTRIBUTION, 83 | ); 84 | 85 | constructor( 86 | private readonly resourceId: string, 87 | private readonly imageType: SageMakerImageType, 88 | ) {} 89 | 90 | public getArnForStack(stack: Stack) { 91 | const [singletonId, mappings] = 92 | this.imageType === SageMakerImageType.IMAGE 93 | ? [SageMakerImageSingletonID, imageArnAccounts] 94 | : [SageMakerDistributionImageSingletonID, distributionImageArnAccounts]; 95 | 96 | // this maps the region to the AWS-owned account that owns the image 97 | const regionToAccount = 98 | (stack.node.tryFindChild(singletonId) as CfnMapping | undefined) ?? 99 | new CfnMapping(stack, singletonId, { 100 | mapping: Object.fromEntries( 101 | Object.entries(mappings).map(([region, account]) => [ 102 | region, 103 | { account }, 104 | ]), 105 | ), 106 | }); 107 | 108 | const region = stack.region; 109 | return Arn.format({ 110 | partition: "aws", 111 | service: "sagemaker", 112 | region, 113 | account: regionToAccount.findInMap(region, "account"), 114 | resource: "image", 115 | resourceName: this.resourceId, 116 | }); 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /packyak-docs/docusaurus.config.ts: -------------------------------------------------------------------------------- 1 | import {themes as prismThemes} from 'prism-react-renderer'; 2 | import type {Config} from '@docusaurus/types'; 3 | import type * as Preset from '@docusaurus/preset-classic'; 4 | 5 | const config: Config = { 6 | title: 'My Site', 7 | tagline: 'Dinosaurs are cool', 8 | favicon: 'img/favicon.ico', 9 | 10 | // Set the production url of your site here 11 | url: 'https://your-docusaurus-site.example.com', 12 | // Set the // pathname under which your site is served 13 | // For GitHub pages deployment, it is often '//' 14 | baseUrl: '/', 15 | 16 | // GitHub pages deployment config. 17 | // If you aren't using GitHub pages, you don't need these. 18 | organizationName: 'facebook', // Usually your GitHub org/user name. 19 | projectName: 'docusaurus', // Usually your repo name. 20 | 21 | onBrokenLinks: 'throw', 22 | onBrokenMarkdownLinks: 'warn', 23 | 24 | // Even if you don't use internationalization, you can use this field to set 25 | // useful metadata like html lang. For example, if your site is Chinese, you 26 | // may want to replace "en" with "zh-Hans". 27 | i18n: { 28 | defaultLocale: 'en', 29 | locales: ['en'], 30 | }, 31 | 32 | presets: [ 33 | [ 34 | 'classic', 35 | { 36 | docs: { 37 | sidebarPath: './sidebars.ts', 38 | // Please change this to your repo. 39 | // Remove this to remove the "edit this page" links. 40 | editUrl: 41 | 'https://github.com/facebook/docusaurus/tree/main/packages/create-docusaurus/templates/shared/', 42 | }, 43 | blog: { 44 | showReadingTime: true, 45 | // Please change this to your repo. 46 | // Remove this to remove the "edit this page" links. 
47 | editUrl: 48 | 'https://github.com/facebook/docusaurus/tree/main/packages/create-docusaurus/templates/shared/', 49 | }, 50 | theme: { 51 | customCss: './src/css/custom.css', 52 | }, 53 | } satisfies Preset.Options, 54 | ], 55 | ], 56 | 57 | themeConfig: { 58 | // Replace with your project's social card 59 | image: 'img/docusaurus-social-card.jpg', 60 | navbar: { 61 | title: 'My Site', 62 | logo: { 63 | alt: 'My Site Logo', 64 | src: 'img/logo.svg', 65 | }, 66 | items: [ 67 | { 68 | type: 'docSidebar', 69 | sidebarId: 'tutorialSidebar', 70 | position: 'left', 71 | label: 'Tutorial', 72 | }, 73 | {to: '/blog', label: 'Blog', position: 'left'}, 74 | { 75 | href: 'https://github.com/facebook/docusaurus', 76 | label: 'GitHub', 77 | position: 'right', 78 | }, 79 | ], 80 | }, 81 | footer: { 82 | style: 'dark', 83 | links: [ 84 | { 85 | title: 'Docs', 86 | items: [ 87 | { 88 | label: 'Tutorial', 89 | to: '/docs/intro', 90 | }, 91 | ], 92 | }, 93 | { 94 | title: 'Community', 95 | items: [ 96 | { 97 | label: 'Stack Overflow', 98 | href: 'https://stackoverflow.com/questions/tagged/docusaurus', 99 | }, 100 | { 101 | label: 'Discord', 102 | href: 'https://discordapp.com/invite/docusaurus', 103 | }, 104 | { 105 | label: 'Twitter', 106 | href: 'https://twitter.com/docusaurus', 107 | }, 108 | ], 109 | }, 110 | { 111 | title: 'More', 112 | items: [ 113 | { 114 | label: 'Blog', 115 | to: '/blog', 116 | }, 117 | { 118 | label: 'GitHub', 119 | href: 'https://github.com/facebook/docusaurus', 120 | }, 121 | ], 122 | }, 123 | ], 124 | copyright: `Copyright © ${new Date().getFullYear()} My Project, Inc. Built with Docusaurus.`, 125 | }, 126 | prism: { 127 | theme: prismThemes.github, 128 | darkTheme: prismThemes.dracula, 129 | }, 130 | } satisfies Preset.ThemeConfig, 131 | }; 132 | 133 | export default config; 134 | -------------------------------------------------------------------------------- /packyak-aws-cdk/src/streamlit-site.ts: -------------------------------------------------------------------------------- 1 | import { IgnoreMode } from "aws-cdk-lib"; 2 | import { Platform } from "aws-cdk-lib/aws-ecr-assets"; 3 | import { 4 | ContainerImage, 5 | CpuArchitecture, 6 | OperatingSystemFamily, 7 | } from "aws-cdk-lib/aws-ecs"; 8 | import { 9 | ApplicationLoadBalancedFargateService, 10 | ApplicationLoadBalancedFargateServiceProps, 11 | } from "aws-cdk-lib/aws-ecs-patterns"; 12 | import { Construct } from "constructs"; 13 | import { exportRequirementsSync } from "./export-requirements"; 14 | import * as path from "path"; 15 | import { Role, ServicePrincipal } from "aws-cdk-lib/aws-iam"; 16 | import type { PythonPoetryArgs } from "./python-poetry"; 17 | 18 | export interface StreamlitSiteProps 19 | extends ApplicationLoadBalancedFargateServiceProps { 20 | /** 21 | * The {@link LakeHouse} that this Streamlit application will source and contribute data to. 22 | */ 23 | // readonly lakeHouse: LakeHouse; 24 | /** 25 | * Entrypoint to the streamlit application. 26 | * 27 | * @example "my/app.py" 28 | */ 29 | readonly home: string; 30 | /** 31 | * The name of the Dockerfile to use to build this Streamlit site. 32 | * 33 | * @default "Dockerfile" 34 | */ 35 | readonly dockerfile?: string; 36 | /** 37 | * The platform to use to build this Streamlit site. 
38 | * 39 | * @default {@link Platform.LINUX_AMD64} 40 | */ 41 | readonly platform?: Platform; 42 | /** 43 | * Override how the `requirements.txt` file is generated with Python Poetry 44 | * 45 | * @default - see {@link exportRequirementsSync} 46 | */ 47 | readonly pythonPoetryArgs?: PythonPoetryArgs; 48 | } 49 | 50 | export class StreamlitSite extends Construct { 51 | readonly service; 52 | readonly url; 53 | 54 | constructor(scope: Construct, id: string, props: StreamlitSiteProps) { 55 | super(scope, id); 56 | 57 | const requirementsPath = exportRequirementsSync( 58 | path.join(".packyak", this.node.addr), 59 | props.pythonPoetryArgs, 60 | ); 61 | 62 | // enumerate over the module specs to discover what the home and pages/*.py depend on 63 | // const homeFilePath = path.resolve(props.home); 64 | // const pagesDirPath = path.join(path.dirname(homeFilePath), "pages"); 65 | 66 | // const homeAndPagesModules = props.lakeHouse.spec.modules.flatMap((module) => 67 | // module.file_name === homeFilePath || 68 | // module.file_name.startsWith(path.join(pagesDirPath, "")) 69 | // ? [module] 70 | // : [], 71 | // ); 72 | 73 | const platform = props.platform ?? Platform.LINUX_AMD64; 74 | 75 | const taskRole = 76 | props.taskImageOptions?.taskRole ?? 77 | new Role(this, "TaskRole", { 78 | assumedBy: new ServicePrincipal("ecs-tasks.amazonaws.com"), 79 | }); 80 | const environment: Record = { 81 | ...props.taskImageOptions?.environment, 82 | }; 83 | // props.lakeHouse.bind( 84 | // { 85 | // grantPrincipal: taskRole, 86 | // addEnvironment: (key, value) => { 87 | // environment[key] = value; 88 | // }, 89 | // }, 90 | // homeAndPagesModules, 91 | // ); 92 | 93 | this.service = new ApplicationLoadBalancedFargateService(this, "Service", { 94 | ...props, 95 | cluster: props.cluster, 96 | vpc: props.vpc, 97 | // cluster: props.lakeHouse.cluster, 98 | runtimePlatform: { 99 | cpuArchitecture: 100 | platform === Platform.LINUX_AMD64 101 | ? CpuArchitecture.X86_64 102 | : CpuArchitecture.ARM64, 103 | operatingSystemFamily: OperatingSystemFamily.LINUX, 104 | }, 105 | cpu: props.cpu ?? 256, 106 | memoryLimitMiB: props.memoryLimitMiB ?? 512, 107 | taskImageOptions: { 108 | ...(props.taskImageOptions ?? {}), 109 | environment, 110 | containerPort: props.taskImageOptions?.containerPort ?? 8501, 111 | taskRole, 112 | image: 113 | props.taskImageOptions?.image ?? 114 | ContainerImage.fromAsset(".", { 115 | ignoreMode: IgnoreMode.DOCKER, 116 | platform, 117 | buildArgs: { 118 | REQUIREMENTS_PATH: requirementsPath, 119 | }, 120 | }), 121 | }, 122 | }); 123 | 124 | this.service.targetGroup.configureHealthCheck( 125 | props.healthCheck ?? 
{ 126 | path: "/_stcore/health", 127 | }, 128 | ); 129 | 130 | this.url = `https://${this.service.loadBalancer.loadBalancerDnsName}`; 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /packyak-docs/blog/2024-01-22-Data_Lineage.md: -------------------------------------------------------------------------------- 1 | --- 2 | slug: data-lineage-governance-and-compliance 3 | title: "Data Lineage: governance and compliance" 4 | author: sam 5 | tags: [lakehouse, data lake, data warehouse, delta lake, iceberg, hudi] 6 | --- 7 | 8 | # Data Lineage: governance and compliance 9 | 10 | ## Tools 11 | 12 | - [Open Lineage](https://openlineage.io/) 13 | - [Apache Atlas](https://atlas.apache.org/#/) 14 | - [Amundsen](https://www.amundsen.io/) 15 | - [Egeria](https://egeria-project.org/) 16 | - [Atlan](https://atlan.com/) 17 | 18 | ## Blogs 19 | 20 | - [Data Lineage: State-of-the-art and Implementation Challenges](https://medium.com/bliblidotcom-techblog/data-lineage-state-of-the-art-and-implementation-challenges-1ea8dccde9de) 21 | 22 | ## Research 23 | 24 | Data lineage refers to the life-cycle of data, including its origins, movements, transformations, and interactions within a system. It provides a clear and comprehensive visual representation of data flow, which is crucial for various aspects of data management and governance[6][12]. 25 | 26 | Standards like OpenLineage and tools like Apache Atlas are designed to facilitate data lineage tracking and management. OpenLineage is an open standard for lineage metadata collection and analysis. It provides a standard API for capturing lineage events, enabling consistent collection of lineage metadata across different data pipeline components. This helps create a deeper understanding of how data is produced and used[1][4][7]. Apache Atlas, on the other hand, is an open-source metadata and data governance framework. It helps in mapping and organizing metadata representations, thereby enabling control over data across the data ecosystem[2][5][8]. A minimal sketch of emitting lineage events with the OpenLineage Python client is shown after the list below. 27 | 28 | Data lineage solves several core problems: 29 | 30 | 1. **Improving Data Quality**: Data lineage provides a transparent view of how data is collected, transformed, and integrated across systems. This helps ensure that the data used for analysis and decision-making is accurate and trustworthy[6][9]. 31 | 32 | 2. **Facilitating Root Cause Analysis**: When performance issues arise, data lineage can help pinpoint the root causes, allowing organizations to resolve them effectively[6][9]. 33 | 34 | 3. **Enhancing Data Governance and Compliance**: Data lineage aids in compliance with data privacy and security regulations. It provides a clear record of data transformations and movements, which is crucial for audit trails and regulatory compliance[6][9]. 35 | 36 | 4. **Optimizing Data Transformation Processes**: Understanding the transformations data undergoes is essential for optimizing data transformation processes. This can lead to streamlined data processing, reduced processing times, and enhanced resource utilization[12]. 37 | 38 | 5. **Facilitating Impact Analysis**: Data lineage allows organizations to predict the effects of changes to data sources, transformations, or destinations. This foresight is invaluable in risk assessment and change management[12].
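
To make that concrete, here is a minimal sketch of emitting run-level lineage events with the `openlineage-python` client. The backend URL, namespace, and job/dataset names below are illustrative placeholders, not part of any real deployment:

```python
from datetime import datetime, timezone
from uuid import uuid4

from openlineage.client import OpenLineageClient
from openlineage.client.run import Dataset, Job, Run, RunEvent, RunState

# Point the client at an OpenLineage-compatible backend (e.g. Marquez).
client = OpenLineageClient(url="http://localhost:5000")  # illustrative URL

producer = "https://example.com/my-etl"  # identifies the tool emitting the events
run = Run(runId=str(uuid4()))
job = Job(namespace="example", name="daily_orders_etl")

# START event: the job run has begun.
client.emit(
    RunEvent(
        eventType=RunState.START,
        eventTime=datetime.now(timezone.utc).isoformat(),
        run=run,
        job=job,
        producer=producer,
    )
)

# COMPLETE event: declare inputs and outputs so a lineage backend can
# stitch the cross-tool dependency graph together.
client.emit(
    RunEvent(
        eventType=RunState.COMPLETE,
        eventTime=datetime.now(timezone.utc).isoformat(),
        run=run,
        job=job,
        producer=producer,
        inputs=[Dataset(namespace="example", name="raw.orders")],
        outputs=[Dataset(namespace="example", name="analytics.daily_orders")],
    )
)
```

Because every pipeline component emits the same standardized events, the backend can assemble an end-to-end lineage graph without a bespoke integration per tool.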
39 | 40 | OpenLineage and Apache Atlas, by providing a standardized and efficient way to track and manage data lineage, help organizations address these challenges, thereby enhancing data trust, quality, governance, and scalability[1][2][4][5][7][8]. 41 | 42 | Citations: 43 | [1] https://openlineage.io 44 | [2] https://atlas.apache.org 45 | [3] https://alexsolutions.com/about-us/blog/lineage-solves-business-problems/ 46 | [4] https://openlineage.io/docs/ 47 | [5] https://atlan.com/what-is-apache-atlas/ 48 | [6] https://securityaffairs.com/151541/security/top-5-problems-solved-by-data-lineage.html 49 | [7] https://openlineage.io/blog/why-open-standard/ 50 | [8] https://blog.dotmodus.com/what-is-apache-atlas-and-why-is-it-important/ 51 | [9] https://atlan.com/data-lineage-tools/ 52 | [10] https://atlan.com/openmetadata-vs-openlineage/ 53 | [11] https://community.cloudera.com/t5/Community-Articles/Using-Apache-Atlas-to-view-Data-Lineage/ta-p/246305 54 | [12] https://www.red-gate.com/simple-talk/development/other-development/understanding-the-importance-of-data-lineage-in-modern-data-management/ 55 | [13] https://openlineage.io/getting-started/ 56 | [14] https://atlas.apache.org/1.2.0/index.html 57 | [15] https://www.imperva.com/learn/data-security/data-lineage/ 58 | [16] https://docs.astronomer.io/astro/data-lineage-concepts 59 | [17] https://www.cloudera.com/products/open-source/apache-hadoop/apache-atlas.html 60 | [18] https://www.youtube.com/watch?v=5YvMnQ6O0RI 61 | [19] https://hightouch.com/blog/exploring-data-lineage-with-open-lineage 62 | [20] https://www.clearpeaks.com/data-governance-with-apache-atlas-introduction-to-atlas/ 63 | [21] https://www.linkedin.com/pulse/data-lineage-more-important-than-you-think-shaun-ryan-xjzle 64 | [22] https://www.youtube.com/watch?v=M3IBHe8bnu0 65 | [23] https://stackoverflow.com/questions/tagged/apache-atlas?tab=Unanswered 66 | [24] https://docs.getdbt.com/terms/data-lineage 67 | -------------------------------------------------------------------------------- /packyak-aws-cdk/src/workspace/home.ts: -------------------------------------------------------------------------------- 1 | import { type FileSystem, AccessPoint } from "aws-cdk-lib/aws-efs"; 2 | import { Construct } from "constructs"; 3 | import type { Workspace } from "./workspace"; 4 | import { type IGrantable, PolicyStatement } from "aws-cdk-lib/aws-iam"; 5 | import type { Connections, IConnectable } from "aws-cdk-lib/aws-ec2"; 6 | import type { PosixGroup } from "./group"; 7 | 8 | export interface HomeProps { 9 | /** 10 | * The file system associated with the user. 11 | */ 12 | readonly fileSystem: FileSystem; 13 | /** 14 | * The username for the user. This should be unique across all users. 15 | */ 16 | readonly username: string; 17 | /** 18 | * The POSIX user ID for the user. This should be a unique identifier. 19 | */ 20 | readonly uid: string; 21 | /** 22 | * The POSIX group ID for the user. This is used for file system permissions. 23 | * 24 | * @default - same as the uid 25 | */ 26 | readonly gid?: string; 27 | /** 28 | * Secondary groups to assign to files written to this home directory. 29 | */ 30 | readonly secondaryGroups?: PosixGroup[]; 31 | } 32 | 33 | /** 34 | * A Home directory is a secure directory in a {@link Workspace} only 35 | * accessible by the User who owns it. 36 | */ 37 | export class Home extends Construct implements IConnectable { 38 | /** 39 | * The connections for the EFS file system. 
40 | */ 41 | public readonly connections: Connections; 42 | /** 43 | * An {@link AccessPoint} to the user's home directory 44 | */ 45 | public readonly accessPoint: AccessPoint; 46 | /** 47 | * The username of the user. 48 | * 49 | * Should match the AWS SSO username. 50 | */ 51 | public readonly username: string; 52 | /** 53 | * The POSIX user ID 54 | */ 55 | public readonly uid: string; 56 | /** 57 | * The POSIX group ID 58 | */ 59 | public readonly gid: string; 60 | /** 61 | * Absolute path to the home directory 62 | */ 63 | public readonly path: string; 64 | 65 | constructor(scope: Construct, id: string, props: HomeProps) { 66 | super(scope, id); 67 | 68 | this.username = props.username; 69 | this.uid = props.uid; 70 | this.gid = props.gid ?? props.uid; 71 | this.path = `/home/${props.username}`; 72 | 73 | this.connections = props.fileSystem.connections; 74 | 75 | this.accessPoint = new AccessPoint(this, "AccessPoint", { 76 | fileSystem: props.fileSystem, 77 | createAcl: { 78 | ownerGid: this.gid, 79 | ownerUid: this.uid, 80 | // locked down for the user 81 | // user: rwx 82 | // group: r-x 83 | // other: --- 84 | permissions: "750", 85 | }, 86 | // TODO: this forces all files written through this file system to have this ownership. 87 | // TODO: is this right? Or should we force consistent username, gid and uid across all EC2 instances? 88 | posixUser: { 89 | uid: this.uid, 90 | gid: this.gid, 91 | secondaryGids: props.secondaryGroups?.map((g) => `${g.gid}`), 92 | }, 93 | path: this.path, 94 | }); 95 | } 96 | 97 | public allowFrom(connectable: IConnectable) { 98 | this.accessPoint.fileSystem.connections.allowDefaultPortFrom(connectable); 99 | } 100 | 101 | public grantRead({ grantPrincipal }: IGrantable) { 102 | grantPrincipal.addToPrincipalPolicy( 103 | new PolicyStatement({ 104 | actions: ["elasticfilesystem:DescribeMountTargets"], 105 | resources: [ 106 | this.accessPoint.fileSystem.fileSystemArn, 107 | this.accessPoint.accessPointArn, 108 | ], 109 | }), 110 | ); 111 | this.grant(grantPrincipal, ["elasticfilesystem:ClientMount"]); 112 | } 113 | 114 | public grantReadWrite({ grantPrincipal }: IGrantable) { 115 | this.grantRead({ grantPrincipal }); 116 | this.grant(grantPrincipal, ["elasticfilesystem:ClientWrite"]); 117 | } 118 | 119 | public grantRootAccess({ grantPrincipal }: IGrantable) { 120 | this.grantReadWrite({ grantPrincipal }); 121 | this.grant(grantPrincipal, ["elasticfilesystem:ClientRootAccess"]); 122 | } 123 | 124 | public grant({ grantPrincipal }: IGrantable, actions: string[]) { 125 | grantPrincipal.addToPrincipalPolicy( 126 | new PolicyStatement({ 127 | actions: actions, 128 | resources: [this.accessPoint.fileSystem.fileSystemArn], 129 | conditions: { 130 | // see: https://docs.aws.amazon.com/service-authorization/latest/reference/list_amazonelasticfilesystem.html#amazonelasticfilesystem-resources-for-iam-policies 131 | StringEquals: { 132 | "elasticfilesystem:AccessPointArn": this.accessPoint.accessPointArn, 133 | }, 134 | Bool: { 135 | "elasticfilesystem:AccessedViaMountTarget": true, 136 | }, 137 | }, 138 | }), 139 | ); 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /packyak-aws-cdk/src/dagster.ts: -------------------------------------------------------------------------------- 1 | import { Construct } from "constructs"; 2 | import { IConnectable, IVpc, Port } from "aws-cdk-lib/aws-ec2"; 3 | import { 4 | DatabaseClusterEngine, 5 | Credentials, 6 | DatabaseCluster, 7 | AuroraPostgresEngineVersion, 8 | 
ClusterInstance, 9 | IClusterInstance, 10 | } from "aws-cdk-lib/aws-rds"; 11 | import { Cluster } from "aws-cdk-lib/aws-ecs"; 12 | import { RemovalPolicy } from "aws-cdk-lib/core"; 13 | import type { ISecret } from "aws-cdk-lib/aws-secretsmanager"; 14 | 15 | export interface DagsterServiceProps { 16 | /** 17 | * The VPC to deploy the service to. 18 | * 19 | * You must specify either {@link vpc} or {@link cluster}. 20 | */ 21 | readonly vpc?: IVpc; 22 | /** 23 | * The ECS cluster to deploy the service to. 24 | * 25 | * You must specify either {@link vpc} or {@link cluster}. 26 | */ 27 | readonly cluster?: Cluster; 28 | /** 29 | * The database to deploy to. 30 | */ 31 | readonly database?: DagsterDatabaseProps; 32 | /** 33 | * The removal policy to use for the database and service. 34 | * 35 | * @default - The database is not removed automatically. 36 | */ 37 | readonly removalPolicy?: RemovalPolicy; 38 | } 39 | 40 | export interface DagsterDatabaseProps { 41 | /** 42 | * Credentials for the administrative user 43 | * 44 | * @default - A username of 'admin' and SecretsManager-generated password 45 | */ 46 | readonly credentials?: Credentials; 47 | /** 48 | * An optional identifier for the cluster 49 | * 50 | * @default - A name is automatically generated. 51 | */ 52 | readonly clusterIdentifier?: string; 53 | /** 54 | * The writer instance to use for the database. 55 | * 56 | * @default - A serverless instance is created. 57 | */ 58 | readonly writer?: IClusterInstance; 59 | /** 60 | * The readers instances to use for the database. 61 | * 62 | * @default - No readers are created. 63 | */ 64 | readonly readers?: IClusterInstance[]; 65 | /** 66 | * The port to connect to the database on. 67 | * 68 | * @default - 5432 69 | */ 70 | readonly port?: number; 71 | } 72 | 73 | /** 74 | * Represents a Dagster service deployment in AWS, encapsulating the necessary AWS resources. 75 | * 76 | * This class allows for the easy setup of a Dagster service with a connected Aurora Postgres database 77 | * within an ECS cluster. It abstracts away the complexity of directly dealing with AWS CDK constructs 78 | * for creating and configuring the ECS service, database, and necessary permissions. 79 | */ 80 | export class DagsterService extends Construct { 81 | public readonly database: DatabaseCluster; 82 | public readonly databaseSecret: ISecret; 83 | 84 | constructor(scope: Construct, id: string, props: DagsterServiceProps) { 85 | super(scope, id); 86 | 87 | if (props.cluster === undefined && props.vpc === undefined) { 88 | throw new Error("One of cluster or vpc must be provided."); 89 | } 90 | if (props.cluster && props.vpc) { 91 | throw new Error("Only one of cluster or vpc can be provided, not both."); 92 | } 93 | const cluster = 94 | props.cluster ?? 95 | new Cluster(this, "DagsterCluster", { 96 | vpc: props.vpc, 97 | }); 98 | const vpc = props.vpc ?? 
cluster.vpc; 99 | 100 | // TODO: deploy the service once we are ready to move away from hand running things on EMR 101 | // this.service = new ApplicationLoadBalancedFargateService(this, "Service", { 102 | // cluster, 103 | // taskImageOptions: { 104 | // image: ContainerImage.fromRegistry(props.dagsterImage), 105 | // }, 106 | // publicLoadBalancer: true, 107 | // }); 108 | 109 | this.database = new DatabaseCluster(this, "Database", { 110 | vpc, 111 | engine: DatabaseClusterEngine.auroraPostgres({ 112 | // version is 14.6 according to HELM charts 113 | // @see https://github.com/dagster-io/dagster/blob/4bb81fdb84a7775d3fd03190a2edf1a173def4b6/helm/dagster/values.yaml#L765 114 | version: AuroraPostgresEngineVersion.VER_14_6, 115 | }), 116 | writer: props.database?.writer ?? ClusterInstance.serverlessV2("writer"), 117 | readers: props.database?.readers, 118 | credentials: props.database?.credentials, 119 | removalPolicy: props.removalPolicy, 120 | clusterIdentifier: props.database?.clusterIdentifier, 121 | port: props.database?.port, 122 | }); 123 | this.databaseSecret = this.database.secret!; 124 | } 125 | 126 | /** 127 | * Allow a connectable to access the database. 128 | * 129 | * @param connectable The connectable to allow access from. 130 | */ 131 | public allowDBAccessFrom(connectable: IConnectable) { 132 | this.database.connections.allowFrom( 133 | connectable, 134 | Port.tcp(this.database.clusterEndpoint.port), 135 | ); 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /packyak-aws-cdk/README.md: -------------------------------------------------------------------------------- 1 | # PackYak ![image](https://github.com/sam-goodwin/packyak/assets/38672686/249af136-45fb-4d13-82bb-5818e803eeb0) 2 | 3 | [![PyPI version](https://badge.fury.io/py/packyak.svg)](https://badge.fury.io/py/packyak) 4 | 5 | # Packyak AWS CDK 6 | 7 | PackYak is a next-generation framework for building and deploying Data Lakehouses in AWS with a Git-like versioned developer workflow that simplifies how Data Scientists and Data Engineers collaborate. 8 | 9 | It enables you to deploy your entire Data Lakehouse, ETL and Machine Learning platforms on AWS with no external dependencies, maintain your Data Tables with Git-like versioning semantics and scale data production with Dagster-like Software-defined Asset Graphs. 10 | 11 | It combines 5 key technologies into one framework that makes scaling Data Lakehouses and Data Science teams dead simple: 12 | 1. Git-like versioning of Data Tables with [Project Nessie](https://projectnessie.org/) - no more worrying about the version of data, simply use branches, tags and commits to freeze data or roll back mistakes. 13 | 2. Software-defined Assets (as seen in Dagster) - think of your data pipelines in terms of the data it produces. Greatly simplify how data is produced, modified over time and backfilled in the event of errors. 14 | 3. Infrastructure-as-Code (AWS CDK and Pulumi) - deploy in minutes and manage it all yourself with minimal effort. 15 | 4. Apache Spark - write your ETL as simple python processes that are then scaled automatically over a managed AWS EMR Spark Cluster. 16 | 5. Streamlit - build Streamlit applications that integrate the Data Lakehouse and Apache Spark to provide interactive reports and exploratory tools over the versioned data lake. 17 | 18 | # Get Started 19 | 20 | ## Install Docker 21 | 22 | If you haven't already, install [Docker](https://docs.docker.com/get-docker/). 
23 | 24 | ## Install Python Poetry & Plugins 25 | 26 | ```sh 27 | # Install the Python Poetry CLI 28 | curl -sSL https://install.python-poetry.org | python3 - 29 | 30 | # Add the export plugin to generate narrow requirements.txt 31 | poetry self add poetry-plugin-export 32 | ``` 33 | 34 | ## Install the `packyak` CLI: 35 | ```sh 36 | pip install packyak 37 | ``` 38 | 39 | ## Create a new Project 40 | 41 | ```sh 42 | packyak new my-project 43 | cd ./my-project 44 | ``` 45 | 46 | ## Deploy to AWS 47 | ```sh 48 | poetry run cdk deploy 49 | ``` 50 | 51 | ## Git-like Data Catalog (Project Nessie) 52 | 53 | PackYak comes with a Construct for hosting a [Project Nessie](https://projectnessie.org/) catalog that supports Git-like versioning of the tables in a Data Lakehouse. 54 | 55 | It deploys with an AWS DynamoDB Versioned store and an API hosted in AWS Lambda or AWS ECS. The Nessie Server is stateless and can be scaled easily with minimal-to-zero operational overhead. 56 | 57 | ### Create a `NessieDynamoDBVersionStore` 58 | 59 | ```py 60 | from packyak.aws_cdk import DynamoDBNessieVersionStore 61 | 62 | versionStore = DynamoDBNessieVersionStore( 63 | scope=stack, 64 | id="VersionStore", 65 | versionStoreName="my-version-store", 66 | ) 67 | ``` 68 | 69 | ### Create a Bucket to store the Data Tables (e.g. Parquet files) that hold the "Repository"'s data 70 | 71 | ```py 72 | myRepoBucket = Bucket( 73 | scope=stack, 74 | id="MyCatalogBucket", 75 | ) 76 | ``` 77 | 78 | ### Create the Nessie Catalog Service 79 | 80 | ```py 81 | # hosted on AWS ECS 82 | myCatalog = NessieECSCatalog( 83 | scope=stack, 84 | id="MyCatalog", 85 | vpc=vpc, 86 | warehouseBucket=myRepoBucket, 87 | catalogName=lakeHouseName, 88 | versionStore=versionStore, 89 | ) 90 | ``` 91 | 92 | ### Create a Branch 93 | 94 | Branch off the `main` branch of data into a `dev` branch to "freeze" the data as of a particular commit. 95 | 96 | ```sql 97 | CREATE BRANCH dev FROM main 98 | ``` 99 | 100 | ## Deploy a Spark Cluster 101 | 102 | Create an EMR Cluster for processing data. 103 | 104 | ```py 105 | spark = Cluster( 106 | scope=stack, 107 | id="Spark", 108 | clusterName="my-cluster", 109 | vpc=vpc, 110 | catalogs={ 111 | # use the Nessie Catalog as the default data catalog for Spark SQL queries 112 | "spark_catalog": myCatalog, 113 | }, 114 | installSSMAgent=True, 115 | ) 116 | ``` 117 | 118 | ## Configure SparkSQL to be served over JDBC 119 | 120 | ```py 121 | sparkSQL = spark.jdbc(port=10001) 122 | ``` 123 | 124 | ## Deploy Streamlit Site 125 | 126 | Stand up a Streamlit Site to serve interactive reports and applications over your data. 127 | 128 | ```py 129 | site = StreamlitSite( 130 | scope=stack, 131 | # Point it at the Streamlit site entrypoint 132 | home="app/home.py", 133 | # Where the Streamlit pages/tabs are, defaults to `dirname(home)/pages/*.py` 134 | # pages="app/pages" 135 | ) 136 | ``` 137 | 138 | ## Deploy to AWS 139 | 140 | ```sh 141 | packyak deploy 142 | ``` 143 | 144 | Or via the AWS CDK CLI: 145 | ```sh 146 | poetry run cdk deploy 147 | ``` 148 | --------------------------------------------------------------------------------