├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── custom.md │ └── feature-request.md └── workflows │ ├── build_docs.yaml │ ├── cluster_tests.yaml │ ├── copy_docs.yaml │ ├── generate_docs_for_tag.yaml │ ├── local_den_unit_tests.yaml │ ├── local_tests.yaml │ ├── local_tests_den_dev.yaml │ ├── nightly_release_testing.yaml │ ├── precommit.yaml │ ├── push_to_ecr_rh_all.yaml │ ├── release_precheck.yaml │ ├── setup_release_testing │ └── action.yaml │ ├── setup_rh_config │ └── action.yaml │ ├── setup_runhouse │ └── action.yaml │ └── unit_tests.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── collect_env.py ├── docker ├── cuda │ └── Dockerfile ├── slim └── testing │ ├── password-file-auth │ └── Dockerfile │ ├── public-key-auth-conda │ └── Dockerfile │ └── public-key-auth │ ├── Dockerfile │ └── instructions.md ├── docs ├── Makefile ├── _ext │ └── json_globaltoc.py ├── _static │ ├── favicon.ico │ └── rh_1.png ├── _templates │ └── layout.html ├── api │ ├── cli.rst │ ├── python.rst │ └── python │ │ ├── cluster.rst │ │ ├── folder.rst │ │ ├── function.rst │ │ ├── image.rst │ │ ├── login.rst │ │ ├── module.rst │ │ ├── package.rst │ │ ├── resource.rst │ │ └── secrets.rst ├── assets │ ├── img.png │ └── img_1.png ├── conf.py ├── debugging-logging.rst ├── development-guide.rst ├── docker-setup.rst ├── docker-workflows.rst ├── index.rst ├── installation-setup.rst ├── installation.rst ├── make.bat ├── requirements.txt ├── runhouse-in-your-stack.rst ├── security-and-authentication.rst └── tutorials │ ├── api-clusters.rst │ ├── api-folders.rst │ ├── api-images.rst │ ├── api-modules.rst │ ├── api-process.rst │ ├── api-resources.rst │ ├── api-secrets.rst │ ├── async.rst │ ├── quick-start-cloud.rst │ └── quick-start-local.rst ├── examples ├── README.md ├── dask-basic │ ├── __init__.py │ └── lgbm_train.py ├── dask-preprocessing-and-training │ ├── dask_on_ray.py │ ├── lightgbm_dask.py │ ├── notebook - 
interact with remote objects.ipynb │ └── requirements.txt ├── deepseek_inference │ ├── deepseek_llama_70b_vllm.py │ └── deepseek_qwen_32b_vllm.py ├── dlrm-movielens │ ├── Dockerfile │ ├── __init__.py │ ├── dlrm_data_prepoc.py │ ├── dlrm_inference.py │ ├── dlrm_training.py │ ├── my_pipeline.yaml │ └── requirements.txt ├── embedding-batch-inference │ └── embedding_batch_inference.py ├── fastapi-embeddings-rag │ ├── Dockerfile │ ├── README.md │ ├── app │ │ ├── __init__.py │ │ ├── main.py │ │ └── modules │ │ │ ├── __init__.py │ │ │ ├── embedding.py │ │ │ └── llm.py │ └── requirements.txt ├── flux │ ├── flux.py │ ├── readme.md │ └── requirements.txt ├── hello-world │ ├── hello_world.py │ └── requirements.txt ├── hpo │ ├── __init__.py │ ├── hpo.py │ ├── hpo_bayes_opt.py │ ├── hpo_bayes_opt_low_level.py │ ├── hpo_pytorch_distributed.py │ └── hpo_ray_tune_remote.py ├── inference_llama70b │ ├── llama70b_hf_accelerate.py │ └── llama70b_vllm.py ├── langchain-rag-ec2 │ ├── README.md │ └── langchain_rag.py ├── lightning-resnet │ ├── requirements.txt │ └── resnet_training.py ├── llama2-13b-ec2 │ ├── README.md │ ├── llama2_ec2.py │ └── requirements.txt ├── llama2-fine-tuning-with-lora │ ├── README.md │ ├── llama2_fine_tuning.py │ └── requirements.txt ├── llama2-with-tgi-aws-inferentia2 │ ├── README.md │ ├── requirements.txt │ └── tgi_llama2_inferentia.py ├── llama2-with-tgi-ec2 │ ├── README.md │ ├── requirements.txt │ └── tgi_llama_ec2.py ├── llama3-8b-ec2 │ ├── README.md │ ├── llama3_ec2.py │ └── requirements.txt ├── llama3-8b-tgi-ec2 │ ├── README.md │ ├── llama3_tgi_ec2.py │ └── requirements.txt ├── llama3-fine-tuning-lora │ ├── README.md │ ├── llama3_fine_tuning.py │ ├── llama3_fine_tuning_distributed.py │ ├── requirements.txt │ └── runhouse_marimo.py ├── llama3-vllm-gcp │ ├── README.md │ ├── llama3_vllm_aws.py │ ├── llama3_vllm_gcp.py │ └── requirements.txt ├── lora-example-with-notebook │ ├── Lora Fine Tuning Notebook.ipynb │ ├── LoraFineTuner.py │ ├── 
LoraFineTuner_check_status.py │ └── readme.md ├── mistral-with-tgi-ec2 │ ├── README.md │ ├── requirements.txt │ └── tgi_mistral_ec2.py ├── parallel-hf-embedding │ ├── README.md │ ├── parallel_hf_embedding.py │ └── requirements.txt ├── pytorch-distributed-basic │ ├── README.md │ ├── __init__.py │ ├── pytorch_distributed.py │ └── requirements.txt ├── pytorch-resnet │ ├── __init__.py │ ├── imagenet_preproc.py │ ├── requirements.txt │ ├── resnet_training.py │ └── resnet_training_full.py ├── pytorch-torchvision-mnist-training │ ├── README.md │ ├── airflow-multicloud │ │ ├── DataProcessing.py │ │ ├── airflow_multicloud_torch_train.py │ │ └── local_run_of_callables.py │ ├── airflow │ │ ├── airflow_example_torch_train.py │ │ ├── local_run_of_callables.py │ │ ├── readme.md │ │ └── requirements.txt │ ├── kfp_training.py │ ├── my_simple_model.py │ ├── my_transforms.py │ ├── requirements.txt │ ├── torch_basic_example.py │ └── work_with_remote_TorchTrainer.py ├── ray-data-lightgbm │ ├── lightgbm_ray_fns.py │ └── ray_6_nodes_lightgbm.ipynb ├── spark-basic │ └── spark_taxi_preprocess.py ├── stable-diffusion-xl-ec2 │ ├── README.md │ ├── requirements.txt │ └── sdxl.py ├── tensorflow-distributed │ ├── README.md │ ├── requirements.txt │ └── tensorflow_distributed.py ├── xgboost-gpu │ ├── requirements.txt │ ├── xgboost_fashionmnist_training.py │ └── xgboost_training_hpo.py └── yolo-fastapi │ ├── requirements.txt │ └── yolo_fastapi.py ├── pyproject.toml ├── pytest.ini ├── requirements.txt ├── runhouse ├── __init__.py ├── builtins │ ├── __init__.py │ ├── config.json │ ├── generate_builtins.py │ ├── rh-32-cpu │ │ └── config.json │ ├── rh-4-gpu │ │ └── config.json │ ├── rh-4-v100 │ │ └── config.json │ ├── rh-8-cpu │ │ └── config.json │ ├── rh-8-gpu │ │ └── config.json │ ├── rh-8-v100 │ │ └── config.json │ ├── rh-cpu │ │ └── config.json │ ├── rh-gpu │ │ └── config.json │ └── rh-v100 │ │ └── config.json ├── cli_utils.py ├── constants.py ├── exceptions.py ├── globals.py ├── logger.py ├── 
main.py ├── resources │ ├── __init__.py │ ├── asgi.py │ ├── distributed │ │ ├── __init__.py │ │ ├── dask_distributed.py │ │ ├── distributed_pool.py │ │ ├── pytorch_distributed.py │ │ ├── ray_distributed.py │ │ ├── spark_distributed.py │ │ ├── supervisor.py │ │ └── utils.py │ ├── folders │ │ ├── __init__.py │ │ ├── folder.py │ │ ├── folder_factory.py │ │ ├── gcs_folder.py │ │ └── s3_folder.py │ ├── functions │ │ ├── __init__.py │ │ ├── function.py │ │ └── function_factory.py │ ├── future_module.py │ ├── hardware │ │ ├── __init__.py │ │ ├── cluster.py │ │ ├── cluster_factory.py │ │ ├── constants.py │ │ ├── docker_cluster.py │ │ ├── kubernetes │ │ │ └── rsync_helper.sh │ │ ├── launcher_utils.py │ │ ├── on_demand_cluster.py │ │ ├── ray_utils.py │ │ ├── sky │ │ │ ├── __init__.py │ │ │ ├── command_runner.py │ │ │ ├── common_utils.py │ │ │ ├── constants.py │ │ │ ├── log_lib.py │ │ │ ├── subprocess_daemon.py │ │ │ └── subprocess_utils.py │ │ ├── sky_command_runner.py │ │ ├── ssh_tunnel.py │ │ └── utils.py │ ├── images │ │ ├── __init__.py │ │ ├── builtin_images.py │ │ └── image.py │ ├── module.py │ ├── packages │ │ ├── __init__.py │ │ └── package.py │ ├── resource.py │ └── secrets │ │ ├── __init__.py │ │ ├── provider_secrets │ │ ├── __init__.py │ │ ├── anthropic_secret.py │ │ ├── api_key_secret.py │ │ ├── aws_secret.py │ │ ├── azure_secret.py │ │ ├── cohere_secret.py │ │ ├── docker_secret.py │ │ ├── gcp_secret.py │ │ ├── github_secret.py │ │ ├── huggingface_secret.py │ │ ├── kubeconfig_secret.py │ │ ├── lambda_secret.py │ │ ├── langchain_secret.py │ │ ├── openai_secret.py │ │ ├── pinecone_secret.py │ │ ├── provider_secret.py │ │ ├── providers.py │ │ ├── sky_secret.py │ │ ├── ssh_secret.py │ │ └── wandb_secret.py │ │ ├── secret.py │ │ ├── secret_factory.py │ │ └── utils.py ├── rns │ ├── __init__.py │ ├── defaults.py │ ├── login.py │ ├── rns_client.py │ ├── top_level_rns_fns.py │ └── utils │ │ ├── __init__.py │ │ ├── api.py │ │ └── names.py ├── servers │ ├── __init__.py │ ├── 
autostop_helper.py │ ├── caddy │ │ ├── __init__.py │ │ └── config.py │ ├── cluster_servlet.py │ ├── http │ │ ├── __init__.py │ │ ├── auth.py │ │ ├── certs.py │ │ ├── http_client.py │ │ ├── http_server.py │ │ └── http_utils.py │ ├── node_servlet.py │ ├── obj_store.py │ └── servlet.py └── utils.py ├── scripts ├── __init__.py ├── build_package.sh ├── generating_docs.py └── kubernetes_cluster │ ├── README.md │ ├── tf-aks │ ├── 0-locals.tf │ ├── 1-provider.tf │ ├── 2-resource-group.tf │ ├── 3-vpc.tf │ ├── 4-subnets.tf │ ├── 5-aks.tf │ └── README.md │ ├── tf-eks │ ├── README.md │ └── main.tf │ └── tf-gke │ ├── 1-provider.tf │ ├── 2-vpc.tf │ ├── 3-subnets.tf │ ├── 4-router.tf │ ├── 5-nat.tf │ ├── 6-firewalls.tf │ ├── 7-kubernetes.tf │ ├── 8-node-pools.tf │ └── README.md ├── setup.py └── tests ├── README.md ├── __init__.py ├── conftest.py ├── constants.py ├── fixtures ├── docker_cluster_fixtures.py ├── folder_fixtures.py ├── on_demand_cluster_fixtures.py ├── package_fixtures.py ├── resource_fixtures.py ├── secret_fixtures.py ├── static_cluster_fixtures.py ├── test_fake_package │ ├── setup.py │ └── test_fake_package │ │ ├── __init__.py │ │ ├── function_to_import.py │ │ └── module_to_import.py ├── test_fake_package_copy │ ├── setup.py │ └── test_fake_package_copy │ │ ├── __init__.py │ │ ├── function_to_import.py │ │ └── module_to_import.py └── utils.py ├── requirements.txt ├── test_den ├── __init__.py ├── test_defaults.py └── test_rns.py ├── test_login.py ├── test_obj_store.py ├── test_performance.py ├── test_requirements ├── aws_test_requirements.txt ├── google_tests_requirements.txt └── tutorial_requirements.txt ├── test_resources ├── __init__.py ├── test_clusters │ ├── __init__.py │ ├── cluster_tests.py │ ├── test_cluster.py │ ├── test_docker_cluster.py │ ├── test_multinode_cluster.py │ └── test_on_demand_cluster.py ├── test_data │ ├── test_folder.py │ └── test_package.py ├── test_modules │ ├── __init__.py │ ├── exception_module.py │ ├── test_folder.py │ ├── test_folders 
│ │ ├── __init__.py │ │ └── test_packages │ │ │ ├── __init__.py │ │ │ ├── conftest.py │ │ │ └── test_package.py │ ├── test_functions │ │ ├── __init__.py │ │ ├── conftest.py │ │ └── test_function.py │ ├── test_module.py │ └── test_server_modules │ │ ├── __init__.py │ │ ├── assets │ │ ├── __init__.py │ │ └── sample_fastapi_app.py │ │ └── dont_test_server_module.py ├── test_resource.py ├── test_resource_sharing.py └── test_secrets │ ├── __init__.py │ └── test_secret.py ├── test_servers ├── __init__.py ├── conftest.py ├── test_caddy.py ├── test_certs.py ├── test_http_client.py ├── test_http_server.py ├── test_nginx.py ├── test_server_obj_store.py └── test_servlet.py ├── test_tutorials.py └── utils.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us reproduce and fix the bug. 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | Please provide a clear and concise description of what the bug and the expected behavior is. 12 | 13 | If relevant, include the steps or code snippet to reproduce the error. 14 | 15 | **Versions** 16 | Please run the following and paste the output below. 17 | ``` 18 | wget https://raw.githubusercontent.com/run-house/runhouse/main/collect_env.py 19 | # For security purposes, please check the contents of collect_env.py before running it. 20 | python collect_env.py 21 | ``` 22 | 23 | **Additional context** 24 | Add any other context about the problem here. 25 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/custom.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Custom issue template 3 | about: Describe this issue template's purpose here. 
4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Submit a proposal or request for a new Runhouse feature. 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **The feature** 11 | A clear and concise description of the feature proposal. 12 | 13 | **Motivation** 14 | What is the motivation for the feature request? Is it related to a problem you're running into? 15 | 16 | **What the ideal solution looks like** 17 | A clear and concise description of what you want to happen. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/workflows/build_docs.yaml: -------------------------------------------------------------------------------- 1 | name: Build docs 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | 8 | jobs: 9 | build_docs: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Check out repository code 13 | uses: actions/checkout@v3 14 | - name: Install 15 | run: python -m pip install --upgrade pip && pip install -e . 16 | - name: Install docs requirements 17 | run: pip install -r docs/requirements.txt 18 | - name: Build docs 19 | run: cd docs && make html && cd .. 
20 | - name: Upload artifacts 21 | uses: actions/upload-artifact@v4 22 | with: 23 | name: docs 24 | path: docs/_build/html 25 | -------------------------------------------------------------------------------- /.github/workflows/cluster_tests.yaml: -------------------------------------------------------------------------------- 1 | name: cluster-tests 2 | 3 | on: workflow_dispatch 4 | 5 | jobs: 6 | cluster-tests: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - name: Check out repository code 10 | uses: actions/checkout@v3 11 | 12 | - name: setup python 13 | uses: actions/setup-python@v4 14 | with: 15 | python-version: '3.10' 16 | 17 | - name: Configure aws 18 | run: | 19 | aws configure set aws_access_key_id ${{ secrets.DEV_AWS_ACCESS_KEY }} 20 | aws configure set aws_secret_access_key ${{ secrets.DEV_AWS_SECRET_KEY }} 21 | aws configure set default.region us-east-1 22 | 23 | - name: Install & check skypilot configuration 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install skypilot 27 | sky check 28 | sky status 29 | 30 | - name: Install python packages & dependencies 31 | run: | 32 | pip install runhouse[aws] 33 | pip install -r tests/requirements.txt 34 | 35 | - name: Run all cluster unit tests tests 36 | env: 37 | KITCHEN_TESTER_TOKEN: ${{ secrets.KITCHEN_TESTER_PROD_TOKEN }} 38 | run: pytest -v tests/test_resources/test_cluster.py --level unit 39 | 40 | - name: Teardown all test clusters 41 | run: | 42 | sky status 43 | sky down --all -y 44 | sky status 45 | -------------------------------------------------------------------------------- /.github/workflows/copy_docs.yaml: -------------------------------------------------------------------------------- 1 | name: Generate docs for runhouse-docs 2 | # https://cpina.github.io/push-to-another-repository-docs/index.html 3 | 4 | on: 5 | push: 6 | branches: 7 | - '*' 8 | 9 | jobs: 10 | generate-docs: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Check out repository code 14 | uses: 
actions/checkout@v3 15 | 16 | - name: Install 17 | run: python -m pip install --upgrade pip && pip install -e . 18 | 19 | - name: Install docs requirements 20 | run: pip install -r docs/requirements.txt 21 | 22 | - name: Build docs 23 | run: cd docs && make json 24 | 25 | - name: Get current branch name 26 | run: echo "CURRENT_BRANCH=${GITHUB_REF#refs/heads/}" >> $GITHUB_ENV 27 | 28 | - name: Set target branch name 29 | run: echo "BRANCH_NAME=${CURRENT_BRANCH}" >> $GITHUB_ENV 30 | 31 | - name: Push directory to another repository 32 | uses: cpina/github-action-push-to-another-repository@v1.7.2 33 | env: 34 | SSH_DEPLOY_KEY: ${{ secrets.SSH_DEPLOY_KEY }} 35 | with: 36 | source-directory: 'docs/_build/json/' 37 | destination-github-username: 'run-house' 38 | destination-repository-name: 'runhouse-docs' 39 | target-branch: ${{ env.BRANCH_NAME }} 40 | create-target-branch-if-needed: true 41 | -------------------------------------------------------------------------------- /.github/workflows/generate_docs_for_tag.yaml: -------------------------------------------------------------------------------- 1 | name: Generate Docs for Tag 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' # Triggers on any tag push 7 | workflow_dispatch: 8 | inputs: 9 | tag-name: 10 | description: 'Tag Name (ex: v0.0.32)' 11 | required: false # Allow empty for cases where the release event provides the tag 12 | default: '' 13 | 14 | jobs: 15 | build-docs-for-tag: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Checkout code 19 | uses: actions/checkout@v2 20 | 21 | - name: Set up Python 22 | uses: actions/setup-python@v2 23 | with: 24 | python-version: '3.8' 25 | 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install -r docs/requirements.txt 30 | pip install runhouse sshtunnel python-dotenv gitpython 31 | 32 | - name: Determine Tag Name 33 | id: determine-tag 34 | run: | 35 | if [[ "${{ github.event_name }}" == "release" ]]; then 36 | echo "Tag name 
from release: ${{ github.event.release.tag_name }}" 37 | echo "tag_name=${{ github.event.release.tag_name }}" >> $GITHUB_ENV 38 | elif [[ -n "${{ github.event.inputs.tag-name }}" ]]; then 39 | echo "Tag name from manual input: ${{ github.event.inputs.tag-name }}" 40 | echo "tag_name=${{ github.event.inputs.tag-name }}" >> $GITHUB_ENV 41 | else 42 | echo "Error: No tag name provided" 43 | exit 1 44 | fi 45 | shell: bash 46 | 47 | - name: Run docs build script for specific tag 48 | env: 49 | GH_TOKEN: ${{ secrets.GH_TOKEN }} 50 | TAG_NAME: ${{ env.tag_name }} 51 | run: | 52 | python scripts/generating_docs.py --tag-name "${{ env.TAG_NAME }}" 53 | -------------------------------------------------------------------------------- /.github/workflows/precommit.yaml: -------------------------------------------------------------------------------- 1 | name: Run pre-commit 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | 8 | jobs: 9 | linting: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Check out repository code 13 | uses: actions/checkout@v3 14 | - name: Install pre-commit 15 | run: python -m pip install --upgrade pip && pip install pre-commit 16 | - name: Lint code 17 | run: pre-commit run --all-files 18 | - name: Show diff 19 | run: git --no-pager diff --color=always 20 | -------------------------------------------------------------------------------- /.github/workflows/push_to_ecr_rh_all.yaml: -------------------------------------------------------------------------------- 1 | name: Push to ECR 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | workflow_dispatch: 8 | 9 | jobs: 10 | push_to_ecr: 11 | name: Build and Push Image 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - name: Check out code 16 | uses: actions/checkout@v2 17 | 18 | - name: Configure AWS credentials 19 | uses: aws-actions/configure-aws-credentials@v1 20 | with: 21 | aws-access-key-id: ${{ secrets.DEV_AWS_ACCESS_KEY }} 22 | aws-secret-access-key: ${{ 
secrets.DEV_AWS_SECRET_KEY }} 23 | aws-region: us-east-1 24 | 25 | - name: Login to Amazon ECR Public 26 | id: login-ecr-public 27 | uses: aws-actions/amazon-ecr-login@v2 28 | with: 29 | registry-type: public 30 | 31 | - name: Set the environment 32 | id: set-image-tag 33 | run: | 34 | 35 | BRANCH_NAME="${GITHUB_REF#refs/heads/}" # Extract branch name from refs/heads/ 36 | PR_BRANCH_NAME="${GITHUB_HEAD_REF:-}" # For pull requests, GITHUB_HEAD_REF contains the branch name 37 | 38 | if [[ -n "$PR_BRANCH_NAME" ]]; then 39 | # If it's a pull request, use the PR branch name instead 40 | BRANCH_NAME="$PR_BRANCH_NAME" 41 | fi 42 | 43 | if [[ "$BRANCH_NAME" != "main" ]]; then 44 | # Build a dev image for pull requests or feature branches 45 | IMAGE_TAG="rh-all-${BRANCH_NAME}-${GITHUB_SHA::8}" 46 | else 47 | # Build an image for main branch pushes 48 | IMAGE_TAG="rh-all-main-${GITHUB_SHA::8}" 49 | fi 50 | 51 | # Replace "/" with "-" in the image tag 52 | IMAGE_TAG="${IMAGE_TAG//\//-}" 53 | 54 | # Save environment variables 55 | echo "BRANCH_NAME=$BRANCH_NAME" >> $GITHUB_ENV 56 | echo "IMAGE_TAG=$IMAGE_TAG" >> $GITHUB_ENV 57 | echo "GITHUB_REF=$GITHUB_REF" >> $GITHUB_ENV 58 | 59 | - name: Build, tag, and push image to Amazon ECR 60 | env: 61 | ECR_REGISTRY: public.ecr.aws/a9j3d7s3 62 | ECR_REPOSITORY: run-house/runhouse 63 | run: | 64 | docker build --platform linux/amd64 -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG -f ./docker/slim . 
--build-arg RUNHOUSE_EXTRAS=all --build-arg BRANCH_NAME=$BRANCH_NAME 65 | docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG 66 | -------------------------------------------------------------------------------- /.github/workflows/release_precheck.yaml: -------------------------------------------------------------------------------- 1 | name: Conda Environment Setup and Test 2 | 3 | on: 4 | release: 5 | types: [ created ] 6 | workflow_dispatch: 7 | 8 | 9 | jobs: 10 | build-and-test: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] 15 | 16 | steps: 17 | - name: Checkout repository 18 | uses: actions/checkout@v2 19 | 20 | - name: Setup Miniconda 21 | uses: conda-incubator/setup-miniconda@v2 22 | with: 23 | auto-update-conda: true 24 | python-version: ${{ matrix.python-version }} 25 | 26 | - name: Create Conda environment 27 | run: conda create --yes --name test-env python=${{ matrix.python-version }} 28 | 29 | - name: Install current package in editable mode 30 | run: | 31 | source $CONDA/etc/profile.d/conda.sh 32 | conda activate test-env 33 | pip install -e . 
34 | 35 | - name: Test package import 36 | run: | 37 | source $CONDA/etc/profile.d/conda.sh 38 | conda activate test-env 39 | python -c "import runhouse" 40 | -------------------------------------------------------------------------------- /.github/workflows/setup_rh_config/action.yaml: -------------------------------------------------------------------------------- 1 | name: Setup an RH config 2 | 3 | description: Reusable short flow for setting up a fake ~/.rh/config.yaml 4 | 5 | inputs: 6 | username: 7 | description: 'The username to log in with' 8 | required: true 9 | 10 | token: 11 | description: 'The token of the logged in username' 12 | required: true 13 | 14 | api_server_url: 15 | description: 'The den api server to send the requests to' 16 | required: true 17 | 18 | runs: 19 | using: composite 20 | steps: 21 | - name: Setup ~/.rh/config.yaml 22 | shell: bash 23 | run: | 24 | mkdir ~/.rh && touch ~/.rh/config.yaml 25 | echo "default_folder: /${{ inputs.username }}" > ~/.rh/config.yaml 26 | echo "token: ${{ inputs.token }}" >> ~/.rh/config.yaml 27 | echo "username: ${{ inputs.username }}" >> ~/.rh/config.yaml 28 | echo "api_server_url: ${{ inputs.api_server_url }}" >> ~/.rh/config.yaml 29 | echo "default_ssh_key: ssh-sky-key" >> ~/.rh/config.yaml 30 | echo "autosave: false" >> ~/.rh/config.yaml 31 | echo "disable_observability: false" >> ~/.rh/config.yaml 32 | -------------------------------------------------------------------------------- /.github/workflows/setup_runhouse/action.yaml: -------------------------------------------------------------------------------- 1 | name: Setup Runhouse 2 | 3 | description: Reusable steps for setting up Runhouse 4 | 5 | inputs: 6 | den_tester_ssh_private_key: 7 | description: 'SSH private key value' 8 | required: true 9 | 10 | den_tester_ssh_public_key: 11 | description: 'SSH public key value' 12 | required: true 13 | 14 | runs: 15 | using: composite 16 | steps: 17 | - name: Setup python 18 | uses: actions/setup-python@v4 
19 | with: 20 | python-version: '3.10' 21 | 22 | # Note: using the default SSH keys stored for Den tester 23 | - name: Set up local default SSH keys 24 | shell: bash 25 | run: | 26 | mkdir -p ~/.ssh 27 | echo "${{ inputs.den_tester_ssh_private_key }}" > ~/.ssh/sky-key 28 | echo "${{ inputs.den_tester_ssh_public_key }}" > ~/.ssh/sky-key.pub 29 | chmod 600 ~/.ssh/sky-key 30 | chmod 644 ~/.ssh/sky-key.pub 31 | echo "password" > $GITHUB_WORKSPACE/../docker_user_passwd 32 | 33 | - name: Install runhouse from source code 34 | shell: bash 35 | run: | 36 | pip install -e $GITHUB_WORKSPACE 37 | 38 | - name: Install python packages & dependencies for unit and local tests 39 | shell: bash 40 | run: 41 | pip install -r tests/requirements.txt scipy boto3 google-cloud-storage 42 | -------------------------------------------------------------------------------- /.github/workflows/unit_tests.yaml: -------------------------------------------------------------------------------- 1 | name: Tests with level "unit" 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | 8 | jobs: 9 | all-tests-logged-out-level-unit: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Check out repository code 13 | uses: actions/checkout@v3 14 | 15 | - name: Setup Runhouse 16 | uses: ./.github/workflows/setup_runhouse 17 | with: 18 | den_tester_ssh_private_key: ${{ secrets.DEN_TESTER_SSH_PRIVATE_KEY }} 19 | den_tester_ssh_public_key: ${{ secrets.DEN_TESTER_SSH_PUBLIC_KEY }} 20 | 21 | - name: pytest -v --level unit -k "not secrettest" 22 | run: pytest -v --level unit -k "not secrettest" 23 | timeout-minutes: 20 24 | 25 | # all-tests-logged-in-level-unit: 26 | # runs-on: ubuntu-latest 27 | # steps: 28 | # - name: Check out repository code 29 | # uses: actions/checkout@v3 30 | # 31 | # - name: Setup Runhouse 32 | # uses: ./.github/workflows/setup_runhouse 33 | # 34 | # - name: Setup ~/.rh/config.yaml 35 | # uses: ./.github/workflows/setup_rh_config 36 | # with: 37 | # username: ${{ 
secrets.CI_ACCOUNT_USERNAME }} 38 | # token: ${{ secrets.CI_ACCOUNT_TOKEN }} 39 | # 40 | # - name: pytest -v --level unit -k "not den_auth" 41 | # env: 42 | # TEST_TOKEN: ${{ secrets.TEST_TOKEN }} 43 | # TEST_USERNAME: ${{ secrets.TEST_USERNAME }} 44 | # run: pytest -v --level unit 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | .DS_Store 132 | 133 | # IDE project files 134 | .idea/ 135 | .vscode/ 136 | 137 | # Config files 138 | rh/ 139 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.0.1 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: end-of-file-fixer 7 | 8 | - repo: https://github.com/omnilib/ufmt 9 | rev: v1.3.2 10 | hooks: 11 | - id: ufmt 12 | exclude: (runhouse/servers/grpc/unary_pb2.py|runhouse/servers/grpc/unary_pb2_grpc.py|runhouse/resources/hardware/sky/) 13 | additional_dependencies: 14 | - black == 22.6.0 15 | - usort == 1.0.4 16 | 17 | - repo: https://github.com/astral-sh/ruff-pre-commit 18 | rev: v0.3.0 19 | hooks: 20 | - id: ruff 21 | args: [--line-length=120, '--ignore=E402,E721,E722,E731,F821'] 22 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-20.04 11 | tools: 
12 | python: "3.9" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | 18 | python: 19 | install: 20 | - method: pip 21 | path: . 22 | - requirements: docs/requirements.txt 23 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Runhouse 2 | Please file an [issue](https://github.com/run-house/runhouse/issues) if you encounter a bug. 3 | 4 | If you would like to submit a bug-fix or improve an existing feature, please submit a pull request following the 5 | process outlined below. 6 | 7 | If you would like to contribute, but don't know what to add, you can look for open issues labeled 8 | `good first issue`, or take a look at the [funhouse repo](https://github.com/run-house/funhouse) to 9 | create and add your own ML application using Runhouse! 10 | 11 | ## Development Process 12 | If you want to modify code, please follow the instructions for creating a Pull Request. 13 | 14 | 1. Fork the Github repository, and then clone the forked repo to local. 15 | ``` 16 | git clone git@github.com:/runhouse.git 17 | cd runhouse 18 | git remote add upstream https://github.com/run-house/runhouse.git 19 | ``` 20 | 21 | 2. Create a new branch for your development changes: 22 | ``` 23 | git checkout -b branch-name 24 | ``` 25 | 26 | 3. Install Runhouse 27 | ``` 28 | pip install -e . 29 | ``` 30 | 31 | 4. Develop your features 32 | 33 | 5. Download and run pre-commit to automatically format your code using black and ruff. 34 | 35 | ``` 36 | pip install pre-commit 37 | pre-commit run --files [FILES [FILES ...]] 38 | ``` 39 | 40 | 6. Add, commit, and push your changes. Create a "Pull Request" on GitHub to submit the changes for review. 
41 | 42 | ``` 43 | git push -u origin branch-name 44 | ``` 45 | 46 | ## Testing 47 | 48 | To run tests, please install test/requirements.txt. 49 | ``` 50 | pip install -r tests/requirements.txt 51 | ``` 52 | 53 | Additional optional packages to install to run related tests: 54 | 55 | aws related tests 56 | ``` 57 | pip install -r tests/test_requirements/aws_test_requirements.txt 58 | ``` 59 | 60 | google related tests 61 | ``` 62 | pip install -r tests/test_requirements/google_tests_requirements.txt 63 | ``` 64 | 65 | 66 | 67 | ## Documentation 68 | Docs source code is located in `docs/`. To build and review docs locally: 69 | 70 | ``` 71 | pip install -r docs/requirements.txt 72 | cd docs/ 73 | make clean html 74 | ``` 75 | 76 | ### Tutorials and Examples 77 | Notebook (`.ipynb`) code lives in [run-house/notebooks](https://github.com/run-house/notebooks). If modifying 78 | a tutorial or example involving a `.ipynb` file, please refer to these 79 | [instructions](https://github.com/run-house/notebooks?tab=readme-ov-file#syncing-docs-to-run-houserunhouse) for 80 | how to upload your notebook to the notebooks repo and sync the rendered `.rst` file over to the runhouse repo. 81 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include runhouse/builtins/* 2 | include runhouse/resources/hardware/kubernetes/* 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🏃‍♀️Runhouse🏠 is now 📦Kubetorch🔥 2 | 3 | This repository is no longer supported as we privately beta 📦Kubetorch🔥, the next generation of this project. 4 | Kubetorch is for production people that like Kubernetes and ML people that don't. 5 | It's a Pythonic, debuggable successor to Kubeflow just as PyTorch succeeded Tensorflow. 
"""Collect environment info (Python version, platform, relevant pip packages, sky status)
for bug reports and debugging."""
import os
import platform
import sys

try:
    from pip._internal.operations import freeze
except ImportError:  # pip < 10.0 kept the freeze op in a public path
    from pip.operations import freeze

# Single-line Python version string (sys.version embeds a newline).
py_version = sys.version.replace("\n", " ")
py_platform = platform.platform()

pkgs = freeze.freeze()
# Keep only the frozen requirement lines that mention a package we care about.
pip_pkgs = "\n".join(
    pkg
    for pkg in pkgs
    if any(
        name in pkg
        for name in {
            # runhouse
            "runhouse",
            # required installs
            "wheel",
            "rich",
            "typer",
            "skypilot",
            "fastapi",
            "uvicorn",
            # NOTE: comma was previously missing here, which silently
            # concatenated "pyOpenSSL" + "awscli" into one useless entry.
            "pyOpenSSL",
            # aws
            "awscli",
            "boto3",
            "pycryptodome",
            "s3fs",
            "sshtunnel",
            # azure
            "azure-cli",
            "azure-core",
            # gcp
            "google-api-python-client",
            "google-cloud-storage",
            "gcsfs",
            # docker
            "docker",
        }
    )
)

print(f"Python Platform: {py_platform}")
print(f"Python Version: {py_version}")
print()
print(f"Relevant packages: \n{pip_pkgs}")
print()
# Shell out to skypilot to report cloud credential and cluster state.
os.system("sky check")
os.system("sky status --refresh")
$DOCKER_USER_PASSWORD_FILE /app/ssh/docker_user_password_file 18 | 19 | RUN pip install runhouse 20 | RUN pip install -e . 21 | 22 | # Create the privilege separation directory required by sshd 23 | RUN mkdir -p /run/sshd 24 | 25 | # Create a user for SSH access (using password from $DOCKER_USER_PASSWORD_FILE) 26 | RUN useradd -m rh-docker-user && \ 27 | echo "rh-docker-user:$(cat /app/ssh/docker_user_password_file)" | chpasswd && \ 28 | echo "PermitRootLogin no" >> /etc/ssh/sshd_config && \ 29 | echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config 30 | 31 | # Create supervisord configuration file 32 | RUN echo "[supervisord]" > /etc/supervisor/conf.d/supervisord.conf && \ 33 | echo "nodaemon=true" >> /etc/supervisor/conf.d/supervisord.conf && \ 34 | echo "user=root" >> /etc/supervisor/conf.d/supervisord.conf && \ 35 | echo "[program:sshd]" >> /etc/supervisor/conf.d/supervisord.conf && \ 36 | echo "command=/usr/sbin/sshd -D" >> /etc/supervisor/conf.d/supervisord.conf && \ 37 | echo "stdout_logfile=/var/log/sshd.log" >> /etc/supervisor/conf.d/supervisord.conf && \ 38 | echo "stderr_logfile=/var/log/sshd.err" >> /etc/supervisor/conf.d/supervisord.conf && \ 39 | echo "[program:runhouse]" >> /etc/supervisor/conf.d/supervisord.conf && \ 40 | echo "command=runhouse server start --host "0.0.0.0"" >> /etc/supervisor/conf.d/supervisord.conf && \ 41 | echo "stdout_logfile=/var/log/runhouse.log" >> /etc/supervisor/conf.d/supervisord.conf && \ 42 | echo "stderr_logfile=/var/log/runhouse.err" >> /etc/supervisor/conf.d/supervisord.conf 43 | 44 | # Runhouse server port 45 | EXPOSE 32300 46 | # HTTPS port 47 | EXPOSE 443 48 | # HTTP port 49 | EXPOSE 80 50 | # SSH port 51 | EXPOSE 22 52 | 53 | # Run supervisord as the main process to manage the others 54 | CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"] 55 | -------------------------------------------------------------------------------- /docker/slim: 
# A simple container to use as a local runhouse cluster
# docker build -t runhouse -f ./runhouse/docker/slim ./runhouse

# Use the official Python image as a parent image
FROM python:3.11-slim

ARG RUNHOUSE_EXTRAS=server
ARG BRANCH_NAME=None
RUN echo "RUNHOUSE_EXTRAS is: $RUNHOUSE_EXTRAS"
# Fix: previously echoed the literal text "BRANCH_NAME" instead of the build arg's value.
RUN echo "BRANCH_NAME is: $BRANCH_NAME"

# get the correct runhouse version based on BRANCH_NAME, install screen and Runhouse
RUN if [ "$BRANCH_NAME" = "None" ]; then \
        export rh_version="runhouse[${RUNHOUSE_EXTRAS}]"; \
    else \
        export rh_version="runhouse[${RUNHOUSE_EXTRAS}] @ git+https://github.com/run-house/runhouse.git@$BRANCH_NAME"; \
    fi && \
    echo "install_pkg is: $rh_version" && \
    apt-get update && apt-get install -y screen procps rsync ssh netcat-traditional git && \
    python -m pip install --upgrade pip && \
    python -m pip install "$rh_version"

# Alias python3 as python
RUN ln -s /usr/bin/python3 /usr/bin/python

# Make port 32300 available to the world outside this container
EXPOSE 32300

# Start ray and the runhouse server
CMD ["runhouse", "server", "start", "--no-restart-ray", "--host", "0.0.0.0", "--port", "32300", "--no-screen", "--no-nohup"]
--no-install-recommends gcc python3-dev openssh-server rsync supervisor screen wget curl sudo ufw git awscli && \ 16 | apt-get clean && \ 17 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 18 | 19 | # Copy the password file into the image 20 | COPY $DOCKER_USER_PASSWORD_FILE /app/ssh/docker_user_password_file 21 | 22 | # COPY local Runhouse package into the image if provided 23 | COPY $RUNHOUSE_PATH /app/runhouse 24 | 25 | # If using a local version of runhouse, install it from the local directory 26 | RUN if [ -d "/app/runhouse" ]; then pip install -U -e /app/runhouse; else pip install -U runhouse==$RUNHOUSE_VERSION; fi 27 | 28 | # Create the privilege separation directory required by sshd 29 | RUN mkdir -p /run/sshd 30 | 31 | # Create a user for SSH access (using password from $DOCKER_USER_PASSWORD_FILE) 32 | RUN useradd -m rh-docker-user && \ 33 | echo "rh-docker-user:$(cat /app/ssh/docker_user_password_file)" | chpasswd && \ 34 | echo "PermitRootLogin no" >> /etc/ssh/sshd_config && \ 35 | echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config 36 | 37 | # Create supervisord configuration file 38 | RUN echo "[supervisord]" > /etc/supervisor/conf.d/supervisord.conf && \ 39 | echo "nodaemon=true" >> /etc/supervisor/conf.d/supervisord.conf && \ 40 | echo "user=root" >> /etc/supervisor/conf.d/supervisord.conf && \ 41 | echo "[program:sshd]" >> /etc/supervisor/conf.d/supervisord.conf && \ 42 | echo "command=/usr/sbin/sshd -D" >> /etc/supervisor/conf.d/supervisord.conf && \ 43 | echo "stdout_logfile=/var/log/sshd.log" >> /etc/supervisor/conf.d/supervisord.conf && \ 44 | echo "stderr_logfile=/var/log/sshd.err" >> /etc/supervisor/conf.d/supervisord.conf 45 | 46 | # Runhouse server port 47 | EXPOSE 32300 48 | # HTTPS port 49 | EXPOSE 443 50 | # HTTP port 51 | EXPOSE 80 52 | # SSH port 53 | EXPOSE 22 54 | 55 | # Run supervisord as the main process to manage the others 56 | CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"] 57 | 
-------------------------------------------------------------------------------- /docker/testing/public-key-auth/instructions.md: -------------------------------------------------------------------------------- 1 | How to start a local Docker container with public key based authentication 2 | 3 | 1. Configure Docker to use secrets in the build process 4 | ``` 5 | echo "DOCKER_BUILDKIT=1" >> ~/.docker/config.json 6 | ``` 7 | 8 | or edit the file manually to make sure it includes 9 | ``` 10 | { 11 | "features": { 12 | "buildkit": true 13 | } 14 | } 15 | ``` 16 | 17 | 2. Generate a public private key pair 18 | ``` 19 | mkdir -p ~/.ssh/runhouse/docker 20 | ssh-keygen -t rsa -b 4096 -C "your_email@example.com" -f ~/.ssh/runhouse/docker/id_rsa 21 | ``` 22 | 23 | 3. The Dockerfile in the current directory should support public key based authentication using Docker Secrets for its build process 24 | 25 | 4. Build the Docker container 26 | ``` 27 | docker build --no-cache --pull --rm -f "docker/testing/public-key-auth/Dockerfile" --secret id=ssh_key,src=$HOME/.ssh/runhouse/docker/id_rsa.pub -t runhouse:start . 28 | ``` 29 | 30 | 5. Run the Docker container 31 | ``` 32 | docker run --rm --shm-size=4gb -it -p 32300:32300 -p 6379:6379 -p 52365:52365 -p 22:22 -p 443:443 -p 80:80 runhouse:start 33 | ``` 34 | 35 | 6. Verify via SSH 36 | ``` 37 | ssh -i ~/.ssh/runhouse/docker/id_rsa rh-docker-user@localhost 38 | ``` 39 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | json: 16 | @$(SPHINXBUILD) -M json "$(SOURCEDIR)" "$(BUILDDIR)" -b json -t json 17 | 18 | .PHONY: help Makefile 19 | 20 | # Catch-all target: route all unknown targets to Sphinx using the new 21 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 22 | %: Makefile 23 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 24 | -------------------------------------------------------------------------------- /docs/_ext/json_globaltoc.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict 2 | 3 | from sphinx.application import Sphinx 4 | from sphinx.environment.adapters.toctree import TocTree 5 | from sphinxcontrib.serializinghtml import JSONHTMLBuilder 6 | 7 | __version__ = "0.0.1" 8 | 9 | 10 | def setup(app: Sphinx) -> Dict[str, Any]: 11 | app.add_builder(SphinxGlobalTOCJSONHTMLBuilder, override=True) 12 | 13 | return {"version": __version__, "parallel_read_safe": True} 14 | 15 | 16 | class SphinxGlobalTOCJSONHTMLBuilder(JSONHTMLBuilder): 17 | 18 | name: str = "json" 19 | 20 | def get_doc_context(self, docname: str, body: str, metatags: str) -> Dict[str, Any]: 21 | """ 22 | Extends :py:class:`sphinxcontrib.serializinghtml.JSONHTMLBuilder`. 23 | 24 | Add a ``globaltoc`` key to our document that contains the HTML for the 25 | global table of contents. 26 | 27 | Note: 28 | 29 | We're rendering the **full global toc** for the entire documentation 30 | set into every page. We do this to easily render the toc on each 31 | page and allow for a unique toc for each branch and repo version. 32 | """ 33 | doc = super().get_doc_context(docname, body, metatags) 34 | # Get the entire doctree. It is the 3rd argument (``collapse``) that 35 | # does this. If you set that to ``True`` you will only get the submenu 36 | # HTML included if you are on a page that is within that submenu. 
37 | self_toctree = TocTree(self.env).get_toctree_for( 38 | "index", self, False, titles_only=True, includehidden=False, maxdepth=2 39 | ) 40 | toctree = self.render_partial(self_toctree)["fragment"] 41 | doc["globaltoc"] = toctree 42 | return doc 43 | -------------------------------------------------------------------------------- /docs/_static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/docs/_static/favicon.ico -------------------------------------------------------------------------------- /docs/_static/rh_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/docs/_static/rh_1.png -------------------------------------------------------------------------------- /docs/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "!layout.html" %} 2 | {% block extrahead %} 3 | 4 | {% endblock %} 5 | -------------------------------------------------------------------------------- /docs/api/cli.rst: -------------------------------------------------------------------------------- 1 | Command Line Interface 2 | ------------------------------------ 3 | Runhouse provides CLI commands for the following use cases: 4 | 5 | * logging in and out (``runhouse login/logout``) 6 | * interacting with or retrieving information about clusters (``runhouse cluster ``) 7 | * interacting with the Runhouse server (``runhouse server ``) 8 | 9 | The commands can be run using either ``runhouse`` or the ``rh``` alias 10 | 11 | .. 
automodule:: runhouse.main 12 | :members: login, logout, cluster_ssh, server_start, server_restart, server_stop, server_status, cluster_status, cluster_list, cluster_keep_warm, cluster_up, cluster_down, cluster_logs 13 | :undoc-members: 14 | :show-inheritance: 15 | -------------------------------------------------------------------------------- /docs/api/python.rst: -------------------------------------------------------------------------------- 1 | Python API 2 | ==================================== 3 | Runhouse offers a programmatic API in Python to manage your account and resources. 4 | 5 | 6 | Resources 7 | ------------------------------------ 8 | Resources are the Runhouse abstraction for objects that can be saved, shared, and reused. 9 | This includes both compute abstractions (clusters, functions, packages, environments) and 10 | data abstractions (folders). 11 | 12 | .. toctree:: 13 | :maxdepth: 1 14 | 15 | python/resource 16 | 17 | 18 | Compute Abstractions 19 | ------------------------------------ 20 | The Function, Cluster, Env, Package, and Module APIs allow a seamless flow of code and execution across local and remote compute. 21 | They blur the line between program execution and deployment, providing both a path of least resistence for running 22 | a sub-routine on specific hardware, while unceremoniously turning that sub-routine into a reusable service. 23 | They also provide convenient dependency isolation and management, provider-agnostic provisioning and termination, 24 | and rich debugging and accessibility interfaces built-in. 25 | 26 | .. toctree:: 27 | :maxdepth: 1 28 | 29 | python/function 30 | 31 | .. toctree:: 32 | :maxdepth: 1 33 | 34 | python/cluster 35 | 36 | .. toctree:: 37 | :maxdepth: 1 38 | 39 | python/image 40 | 41 | .. toctree:: 42 | :maxdepth: 1 43 | 44 | python/package 45 | 46 | .. 
toctree:: 47 | :maxdepth: 1 48 | 49 | python/module 50 | 51 | 52 | Data Abstractions 53 | ------------------------------------ 54 | The Folder APIs provide a simple interface for storing, recalling, and moving data between 55 | the user's laptop, remote compute, and cloud storage (currently we support `S3` and `GCS`). They provide 56 | least-common-denominator APIs across providers, allowing users to easily specify the actions 57 | they want to take on the data without needed to dig into provider-specific APIs. 58 | 59 | .. toctree:: 60 | :maxdepth: 1 61 | 62 | python/folder 63 | 64 | 65 | 66 | Secrets 67 | ------------------------------------ 68 | Runhouse provides a convenient interface for managing your secrets in a secure manner. 69 | Secrets are stored in `Vault `__, an industry standard for 70 | secrets management, and never touches Runhouse servers. Please see 71 | :ref:`Security and Authentication` for more information on security. 72 | 73 | .. toctree:: 74 | :maxdepth: 1 75 | 76 | python/secrets 77 | 78 | .. toctree:: 79 | :maxdepth: 1 80 | 81 | python/login 82 | -------------------------------------------------------------------------------- /docs/api/python/folder.rst: -------------------------------------------------------------------------------- 1 | Folder 2 | ==================================== 3 | A Folder represents a specified location for organizing and storing other Runhouse primitives 4 | across various systems. 5 | 6 | 7 | Folder Factory Method 8 | ~~~~~~~~~~~~~~~~~~~~~ 9 | 10 | .. autofunction:: runhouse.folder 11 | 12 | 13 | Folder Class 14 | ~~~~~~~~~~~~ 15 | 16 | .. autoclass:: runhouse.Folder 17 | :members: 18 | :exclude-members: 19 | 20 | .. 
automethod:: __init__ 21 | -------------------------------------------------------------------------------- /docs/api/python/function.rst: -------------------------------------------------------------------------------- 1 | Function 2 | ==================================== 3 | 4 | A Function is a portable code block that can be sent to remote hardware to run as a subroutine or service. 5 | It is comprised of the entrypoint, system (:ref:`Cluster`), and requirements necessary to run it. 6 | 7 | 8 | Function Factory Methods 9 | ~~~~~~~~~~~~~~~~~~~~~~~~ 10 | 11 | .. autofunction:: runhouse.function 12 | 13 | Function Class 14 | ~~~~~~~~~~~~~~ 15 | 16 | .. autoclass:: runhouse.Function 17 | :members: 18 | :exclude-members: map, starmap, get_or_call 19 | 20 | .. automethod:: __init__ 21 | -------------------------------------------------------------------------------- /docs/api/python/image.rst: -------------------------------------------------------------------------------- 1 | Image 2 | ===== 3 | A Runhouse image allows you to easily encapsulate various setup steps to take across each node on the cluster before 4 | it is launched. See the :ref:`Images` section for a more in-depth explanation. 5 | 6 | Image Class 7 | ~~~~~~~~~~~ 8 | 9 | .. autoclass:: runhouse.Image 10 | :members: 11 | :exclude-members: 12 | 13 | .. automethod:: __init__ 14 | 15 | ImageSteupStepType 16 | ~~~~~~~~~~~~~~~~~~ 17 | 18 | .. autoclass:: runhouse.resources.images.ImageSetupStepType 19 | 20 | .. autoattribute:: PACKAGES 21 | .. autoattribute:: CMD_RUN 22 | .. autoattribute:: SETUP_CONDA_ENV 23 | .. autoattribute:: RSYNC 24 | .. autoattribute:: SYNC_SECRETS 25 | .. autoattribute:: SET_ENV_VARS 26 | .. autoattribute:: PIP_INSTALL 27 | .. autoattribute:: CONDA_INSTALL 28 | .. autoattribute:: UV_INSTALL 29 | .. autoattribute:: SYNC_PACKAGE 30 | .. autoattribute:: SET_VENV 31 | 32 | ImageSetupStep 33 | ~~~~~~~~~~~~~~ 34 | 35 | .. 
autoclass:: runhouse.resources.images.ImageSetupStep 36 | :members: 37 | :exclude-members: 38 | 39 | .. automethod:: __init__ 40 | -------------------------------------------------------------------------------- /docs/api/python/login.rst: -------------------------------------------------------------------------------- 1 | Login/Logout 2 | ==================================== 3 | Functions for logging in and out of your Runhouse account. 4 | 5 | .. autofunction:: runhouse.login 6 | 7 | .. autofunction:: runhouse.logout 8 | -------------------------------------------------------------------------------- /docs/api/python/module.rst: -------------------------------------------------------------------------------- 1 | Module 2 | ==================================== 3 | 4 | A Module represents a class that can be sent to and used on remote clusters and environments. Modules can live on remote hardware and its class methods called remotely. 5 | 6 | 7 | Module Factory Method 8 | ~~~~~~~~~~~~~~~~~~~~~ 9 | 10 | .. autofunction:: runhouse.module 11 | 12 | Module Class 13 | ~~~~~~~~~~~~ 14 | 15 | .. autoclass:: runhouse.Module 16 | :members: 17 | :exclude-members: 18 | 19 | .. automethod:: __init__ 20 | -------------------------------------------------------------------------------- /docs/api/python/package.rst: -------------------------------------------------------------------------------- 1 | Package 2 | ==================================== 3 | A Package is a Runhouse primitive for sharing code between various systems (ex: s3, cluster, local). 4 | 5 | 6 | Package Factory Method 7 | ~~~~~~~~~~~~~~~~~~~~~~ 8 | 9 | .. autofunction:: runhouse.package 10 | 11 | 12 | Package Class 13 | ~~~~~~~~~~~~~ 14 | 15 | .. autoclass:: runhouse.Package 16 | :members: 17 | :exclude-members: 18 | 19 | .. 
automethod:: __init__ 20 | -------------------------------------------------------------------------------- /docs/api/python/resource.rst: -------------------------------------------------------------------------------- 1 | Resource 2 | ======== 3 | Resources are the Runhouse abstraction for objects that can be saved, shared, and reused. 4 | 5 | 6 | Resource Class 7 | ~~~~~~~~~~~~~~ 8 | .. autoclass:: runhouse.resources.resource.Resource 9 | :members: 10 | :exclude-members: 11 | 12 | .. automethod:: __init__ 13 | -------------------------------------------------------------------------------- /docs/assets/img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/docs/assets/img.png -------------------------------------------------------------------------------- /docs/assets/img_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/docs/assets/img_1.png -------------------------------------------------------------------------------- /docs/debugging-logging.rst: -------------------------------------------------------------------------------- 1 | Debugging and Logging 2 | ===================== 3 | 4 | Below, we describe how to access log outputs and show a sample debugging flow. 5 | 6 | 7 | Logging 8 | ~~~~~~~ 9 | 10 | There are three main ways to access logs: 11 | 12 | (1) **On the cluster** 13 | 14 | Logs are automatically output onto the cluster, in the file ``~/.rh/server.log``. You can ssh 15 | into the cluster with ``runhouse cluster ssh cluster-name`` to view these logs. 16 | 17 | (2) **Streaming** 18 | 19 | To see logs on your local machine while running a remote function, you can add the ``stream_logs=True`` 20 | argument to your function call. 21 | 22 | .. 
code:: ipython3 23 | 24 | remote_fn = rh.function(fn) 25 | fn(fn_args, stream_logs=True) 26 | 27 | (3) **Runhouse CLI** 28 | 29 | You can view the latest logs by running the command: ``runhouse cluster logs cluster-name``. 30 | 31 | Log Levels 32 | ---------- 33 | You can set the log level to control the verbosity of the Runhouse logs. You can adjust the log level by setting the 34 | environment variable ``RH_LOG_LEVEL`` to your desired level. 35 | 36 | Debugging 37 | ~~~~~~~~~ 38 | 39 | For general debugging that doesn't occur within remote function calls, you can add ``breakpoint()`` wherever you want 40 | to set your debugging session. If the code is being run locally at the point of the debugger, you'll be able to access 41 | the session from your local machine. If the code is being run remotely on a cluster, you will need to ssh into the 42 | cluster with ``runhouse cluster ssh cluster-name``, and then run ``screen -r`` inside the cluster. 43 | From there, you will see the RPC logs being printed out, and can debug normally inside the ``screen``. 44 | 45 | .. note:: 46 | 47 | When debugging inside ``screen``, please use ``Ctrl A+D`` to exit out of the screen. Do NOT use ``Ctrl C``, 48 | which will terminate the RPC server. 49 | 50 | If you accidentally terminate the RPC server, you can run ``cluster.restart_server()`` to restart the 51 | server. 52 | 53 | For debugging remote functions, which are launched using ``ray``, we can utilize Ray's debugger. Add a ``breakpoint()`` 54 | call inside the function where you want to start the debugging session, then ssh into the cluster with 55 | ``runhouse cluster ssh cluster-name``, and call ``ray debug`` to view select the breakpoint to enter. 56 | You can run normal ``pdb`` commands within the debugging session, and can refer to `Ray Debugger 57 | `__ for more information. 
58 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.https://www.sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | chardet==4.0.0 2 | myst-parser==2.0.0 3 | pint==0.20.1 4 | pydata-sphinx-theme==0.13.3 5 | ray>=2.2.0 6 | sphinx-book-theme==1.0.1 7 | sphinx-click==4.3.0 8 | sphinx-copybutton==0.5.1 9 | sphinx-thebe==0.2.1 10 | sphinx==6.2.1 11 | sphinx_autodoc_typehints==1.17.0 12 | sphinxcontrib-serializinghtml==1.1.5 13 | -------------------------------------------------------------------------------- /docs/security-and-authentication.rst: -------------------------------------------------------------------------------- 1 | Security and Authentication 2 | =========================== 3 | By default, Runhouse collects metadata from provisioned clusters and data relating to performance and error monitoring. 
4 | This data will only be used by Runhouse to improve the product. 5 | 6 | Cluster Metadata Collection 7 | --------------------------- 8 | We collect non-sensitive data on the cluster that helps us understand how Runhouse is being used. This data includes: 9 | 10 | - Python version 11 | - Resources (cpus, gpus, memory) 12 | - Cloud provider 13 | - Region 14 | - Instance type 15 | 16 | 17 | Cluster Observability 18 | --------------------------------------- 19 | Runhouse collects various telemetry data by default on clusters. This data will be used to provide better observability 20 | into logs, traces, and metrics associated with clusters. We will not sell data or buy any observability data collected. 21 | 22 | To disable observability globally for all clusters, set the environment variable :code:`disable_observability` 23 | to :code:`True`. Alternatively, set :code:`disable_observability` to :code:`true` in your 24 | local Runhouse config (:code:`~/.rh/config.yaml`), or in Python: 25 | 26 | .. code-block:: python 27 | 28 | import runhouse as rh 29 | rh.configs.disable_observability() 30 | -------------------------------------------------------------------------------- /docs/tutorials/api-images.rst: -------------------------------------------------------------------------------- 1 | Images 2 | ====== 3 | 4 | .. raw:: html 5 | 6 |

7 | Open In Colab

8 | 9 | Runhouse clusters expose various functions that allow you to set up 10 | state, dependencies, and whatnot on all nodes of your cluster. These 11 | include: 12 | 13 | - ``cluster.pip_install(...)`` 14 | - ``cluster.rsync(...)`` 15 | - ``cluster.set_env_vars(...)`` 16 | - ``cluster.run_bash(...)`` 17 | 18 | A Runhouse “Image” is simply an abstraction that allows you to run 19 | several setup steps *before* we install ``runhouse`` and bring up the 20 | Runhouse daemon and initial set up on your cluster’s nodes. You can also 21 | specify a Docker ``image_id`` as the “base image” of your Runhouse 22 | image. 23 | 24 | Here’s a simple example of using the Runhouse Image abstraction in your 25 | cluster setup: 26 | 27 | .. code:: ipython3 28 | 29 | import runhouse as rh 30 | 31 | image = ( 32 | rh.Image(name="sample_image") 33 | .from_docker("python:3.12.8-bookworm") 34 | .pip_install(["numpy", "pandas"]) 35 | .sync_secrets(["huggingface"]) 36 | .set_env_vars({"RH_LOG_LEVEL": "debug"}) 37 | ) 38 | 39 | cluster = rh.cluster(name="ml_ready_cluster", image=image, instance_type="CPU:2+", provider="aws").up_if_not() 40 | 41 | 42 | .. parsed-literal:: 43 | :class: code-output 44 | 45 | I 12-17 12:04:55 provisioner.py:560] Successfully provisioned cluster: ml_ready_cluster 46 | I 12-17 12:04:57 cloud_vm_ray_backend.py:3402] Run commands not specified or empty. 47 | Clusters 48 | AWS: Fetching availability zones mapping...NAME LAUNCHED RESOURCES STATUS AUTOSTOP COMMAND 49 | ml_ready_cluster a few secs ago 1x AWS(m6i.large, image_id={'us-east-1': 'docker:python:3.12.8-bookwor... UP (down) /Users/rohinbhasin/minico... 50 | 51 | [?25h 52 | 53 | The growing listing of setup steps available for Runhouse images is 54 | available in the :ref:`API reference docs `. 
55 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Runhouse Examples 2 | 3 | These directories contain self-contained examples that use Runhouse for various use cases. Each example has 4 | several comments that contain Markdown. These are rendered as examples on 5 | [our site](https://www.run.house/examples). To add to these, make a new directory and example file, and follow 6 | the Markdown-in-comments format that the rest of the examples follow. 7 | -------------------------------------------------------------------------------- /examples/dask-basic/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/examples/dask-basic/__init__.py -------------------------------------------------------------------------------- /examples/dask-preprocessing-and-training/dask_on_ray.py: -------------------------------------------------------------------------------- 1 | import kubetorch as kt 2 | 3 | # ## Using Dask on Ray for data processing 4 | # Dask on Ray works out of the box when KT sets up the Ray cluster; simply use enable_dask_on_ray() 5 | def read_taxi_df_dask(dataset_path, X_vars, y_vars): 6 | import dask.dataframe as dd 7 | from ray.util.dask import disable_dask_on_ray, enable_dask_on_ray 8 | 9 | enable_dask_on_ray() 10 | 11 | # Read the dataset 12 | df = dd.read_parquet(dataset_path) 13 | print(df.head()) 14 | 15 | X = df[X_vars].to_dask_array(lengths=True) 16 | y = df[y_vars].to_dask_array(lengths=True) 17 | 18 | from dask_ml.model_selection import train_test_split 19 | 20 | X_train, X_test, y_train, y_test = train_test_split( 21 | X, y, test_size=0.2, random_state=42 22 | ) 23 | 24 | print("First few rows of X_train:") 25 | print( 26 | X_train[:5].compute() 27 | ) # Limit to first 5 rows and 
compute to bring it to memory 28 | 29 | disable_dask_on_ray() 30 | 31 | 32 | if __name__ == "__main__": 33 | img = ( 34 | kt.images.ray() 35 | .pip_install( 36 | [ 37 | "dask-ml", 38 | "dask[distributed]", 39 | "dask[dataframe]", 40 | "boto3", 41 | "s3fs", 42 | "xgboost", 43 | ] 44 | ) 45 | .sync_secrets(["aws"]) 46 | ) 47 | compute = kt.Compute(cpus="4+", image=img) 48 | 49 | remote_read_taxi_df_dask = ( 50 | kt.fn(read_taxi_df_dask).to(compute).distribute("ray", num_nodes=4) 51 | ) 52 | 53 | # ## Example of using Dask on Ray to read data and minimally preprocess the data 54 | # Use one slice of the NYC taxi data as an example 55 | remote_read_taxi_df_dask( 56 | dataset_path="s3://rh-demo-external/taxi/yellow_tripdata_2024-01.parquet", 57 | X_vars=["passenger_count", "trip_distance", "fare_amount"], 58 | y_var=["tip_amount"], 59 | ) 60 | -------------------------------------------------------------------------------- /examples/dask-preprocessing-and-training/requirements.txt: -------------------------------------------------------------------------------- 1 | runhouse 2 | dask[distributed] 3 | dask-ml 4 | -------------------------------------------------------------------------------- /examples/dlrm-movielens/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11-slim 2 | 3 | ENV HOME /root 4 | 5 | RUN apt-get update && apt-get install -y git 6 | 7 | # Install required Python packages 8 | RUN pip install --no-cache-dir skypilot[aws] awscli runhouse torch "ray[data, train]" 9 | RUN apt-get update && apt-get install -y rsync openssh-client 10 | 11 | # Copy your custom Python module 12 | COPY /dlrm_data_prepoc.py /root/code/dlrm_data_preproc.py 13 | COPY /dlrm_training.py /root/code/dlrm_training.py 14 | COPY /dlrm_inference.py /root/code/dlrm_inference.py 15 | COPY /requirements.txt /root/code/requirements.txt 16 | COPY /__init__.py /root/code/__init__.py 17 | 18 | RUN mkdir -p ~/.ssh 19 | 
-------------------------------------------------------------------------------- /examples/dlrm-movielens/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/examples/dlrm-movielens/__init__.py -------------------------------------------------------------------------------- /examples/dlrm-movielens/requirements.txt: -------------------------------------------------------------------------------- 1 | runhouse 2 | ray[data,train] 3 | torch 4 | boto3 5 | -------------------------------------------------------------------------------- /examples/fastapi-embeddings-rag/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | 3 | WORKDIR /code 4 | 5 | # Copy only the file with the requirements first, not the rest of the code. 6 | # As this file doesn't change often, Docker will detect it and use the cache for this step, 7 | # enabling the cache for the next step too. 8 | COPY ./requirements.txt /code/requirements.txt 9 | 10 | # The --no-cache-dir option tells pip to not save the downloaded packages locally, 11 | # as that is only if pip was going to be run again to install the same packages, 12 | # but that's not the case when working with containers. 13 | RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt 14 | 15 | # Rsync is required to run sky commands on the container 16 | RUN apt-get update && apt-get -y install rsync 17 | 18 | # As this has all the code which is what changes most frequently the Docker 19 | # cache won't be used for this or any following steps easily. 20 | # So, it's important to put this near the end of the Dockerfile, 21 | # to optimize the container image build times. 22 | COPY ./app /code/app 23 | 24 | # Set the command to use fastapi run, which uses Uvicorn underneath. 
25 | # CMD takes a list of strings, each of these strings is 26 | # what you would type in the command line separated by spaces. 27 | # This command will be run from the current working directory, 28 | # the same /code directory you set above with WORKDIR /code. 29 | CMD ["fastapi", "run", "app/main.py", "--port", "80"] 30 | -------------------------------------------------------------------------------- /examples/fastapi-embeddings-rag/app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/examples/fastapi-embeddings-rag/app/__init__.py -------------------------------------------------------------------------------- /examples/fastapi-embeddings-rag/app/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/examples/fastapi-embeddings-rag/app/modules/__init__.py -------------------------------------------------------------------------------- /examples/fastapi-embeddings-rag/app/modules/embedding.py: -------------------------------------------------------------------------------- 1 | from lancedb.pydantic import LanceModel, Vector 2 | 3 | 4 | class Item(LanceModel): 5 | url: str 6 | page_content: str 7 | vector: Vector(1024) 8 | 9 | 10 | class URLEmbedder: 11 | def __init__(self, **model_kwargs): 12 | import torch 13 | from sentence_transformers import SentenceTransformer 14 | 15 | self.model = torch.compile(SentenceTransformer(**model_kwargs)) 16 | 17 | def encode_text(self, text: str, **embed_kwargs): 18 | embeddings = self.model.encode([text], **embed_kwargs) 19 | 20 | return embeddings[0] 21 | 22 | def embed_docs(self, paths: str, **embed_kwargs): 23 | from langchain_community.document_loaders import WebBaseLoader 24 | from langchain_text_splitters import RecursiveCharacterTextSplitter 25 
| 26 | docs = WebBaseLoader( 27 | web_paths=paths, 28 | ).load() 29 | split_docs = RecursiveCharacterTextSplitter( 30 | chunk_size=250, chunk_overlap=50 31 | ).split_documents(docs) 32 | splits_as_str = [doc.page_content for doc in split_docs] 33 | embeddings = self.model.encode(splits_as_str, **embed_kwargs) 34 | items = [ 35 | { 36 | "url": doc.metadata["source"], 37 | "page_content": doc.page_content, 38 | "vector": embeddings[index], 39 | } 40 | for index, doc in enumerate(split_docs) 41 | ] 42 | 43 | return items 44 | -------------------------------------------------------------------------------- /examples/fastapi-embeddings-rag/app/modules/llm.py: -------------------------------------------------------------------------------- 1 | class LlamaModel: 2 | def __init__(self, model_id="meta-llama/Meta-Llama-3-8B-Instruct", **model_kwargs): 3 | super().__init__() 4 | self.model_id, self.model_kwargs = model_id, model_kwargs 5 | self.engine = None 6 | 7 | def load_engine(self): 8 | import gc 9 | 10 | import torch 11 | from vllm.distributed.parallel_state import ( 12 | destroy_distributed_environment, 13 | destroy_model_parallel, 14 | ) 15 | from vllm.engine.arg_utils import AsyncEngineArgs 16 | from vllm.engine.async_llm_engine import AsyncLLMEngine 17 | 18 | # This vLLM function resets the global variables, which enables initializing models 19 | destroy_model_parallel() 20 | # Cleanup methods in case vLLM is reloaded in a new LlamaModel instance 21 | destroy_distributed_environment() 22 | gc.collect() 23 | torch.cuda.empty_cache() 24 | 25 | args = AsyncEngineArgs( 26 | model=self.model_id, # Hugging Face Model ID 27 | tensor_parallel_size=1, # Increase if using additional GPUs 28 | trust_remote_code=True, # Trust remote code from Hugging Face 29 | enforce_eager=True, # Set to False in production to improve performance 30 | max_model_len=7056, # 31 | ) 32 | self.engine = AsyncLLMEngine.from_engine_args(args) 33 | 34 | async def generate(self, prompt: str, 
**sampling_params): 35 | from vllm.sampling_params import SamplingParams 36 | from vllm.utils import random_uuid 37 | 38 | if not self.engine: 39 | self.load_engine() 40 | 41 | sampling_params = SamplingParams(**sampling_params) 42 | request_id = random_uuid() 43 | results_generator = self.engine.generate(prompt, sampling_params, request_id) 44 | 45 | async for output in results_generator: 46 | final_output = output 47 | responses = [] 48 | for output in final_output.outputs: 49 | responses.append(output.text) 50 | return responses 51 | -------------------------------------------------------------------------------- /examples/fastapi-embeddings-rag/requirements.txt: -------------------------------------------------------------------------------- 1 | asyncio 2 | fastapi[standard] 3 | lancedb==0.11.0 4 | runhouse[aws]==0.0.32 5 | -------------------------------------------------------------------------------- /examples/flux/flux.py: -------------------------------------------------------------------------------- 1 | import kubetorch as kt 2 | 3 | # ## Create Flux Pipeline with Kubetorch 4 | # First, we define a class that will hold the model and allow us to send prompts to it. 5 | # To deploy it as a service, we simply decorate the class to send it to our cluster 6 | # when we call `kubetorch deploy` in the CLI. 
7 | img = ( 8 | kt.images.pytorch() 9 | .pip_install( 10 | [ 11 | "diffusers", 12 | "transformers[sentencepiece]", 13 | "accelerate", 14 | ] 15 | ) 16 | .sync_secrets(["huggingface"]) 17 | ) 18 | 19 | 20 | @kt.compute( 21 | gpus="A10G:1", memory="64", image=img 22 | ) # Send to compute with an A10 GPU and 64GB of memory 23 | @kt.distribute("auto", num_replicas=(1, 4)) # Autoscale between 1 and 4 replicas 24 | class FluxPipeline: 25 | def __init__( 26 | self, 27 | model_id: str = "black-forest-labs/FLUX.1-schnell", # Schenll is smaller and faster while dev is more powerful but slower 28 | ): 29 | super().__init__() 30 | self.model_id = model_id 31 | self.pipeline = None 32 | 33 | def _load_pipeline(self): 34 | import torch 35 | from diffusers import FluxPipeline 36 | 37 | if not self.pipeline: 38 | self.pipeline = FluxPipeline.from_pretrained( 39 | self.model_id, torch_dtype=torch.bfloat16, use_safetensors=True 40 | ) 41 | self.pipeline.enable_sequential_cpu_offload() # Optimizes memory usage to allow the model to fit and inference on an A10 which has 24GB of memory 42 | 43 | def generate(self, input_prompt: str, **parameters): 44 | import torch 45 | 46 | torch.cuda.empty_cache() 47 | 48 | if not self.pipeline: 49 | self._load_pipeline() 50 | 51 | image = self.pipeline( 52 | input_prompt, 53 | guidance_scale=0.0, 54 | num_inference_steps=4, 55 | max_sequence_length=256, 56 | generator=torch.Generator("cpu").manual_seed(0), 57 | ).images[0] 58 | 59 | return image 60 | 61 | 62 | if __name__ == "__main__": 63 | # We can load the remote model from anywhere that has access to the cluster 64 | flux_pipeline = FluxPipeline.from_name("flux") 65 | 66 | # We can call the `generate` method on the model class instance if it were running locally. 67 | # This will run the function on the remote cluster and return the response to our local machine automatically. 68 | # We can also call this from a different machine or script and create composite ML systems. 
69 | prompt = "A woman runs through a large, grassy field towards a house." 70 | response = flux_pipeline.generate(prompt) 71 | response.save("flux-schnell.png") 72 | response.show() 73 | -------------------------------------------------------------------------------- /examples/flux/readme.md: -------------------------------------------------------------------------------- 1 | # Deploy Flux1 Schnell on AWS EC2 2 | 3 | See a more [rich explanation](https://www.run.house/guides/host-and-run-flux1-image-genai-aws) 4 | of this example on our site. 5 | 6 | This example demonstrates how to deploy a 7 | [Flux.1 model from Hugging Face](https://huggingface.co/black-forest-labs/FLUX.1-schnell) 8 | on AWS EC2 using Runhouse. Schnell is smaller than their Dev version, but fits easily onto a single A10G. 9 | 10 | ## Setup credentials and dependencies 11 | 12 | Optionally, set up a virtual environment: 13 | ```shell 14 | $ conda create -n rh-flux python=3.11 15 | $ conda activate rh-flux 16 | ``` 17 | Install the few required dependencies: 18 | ```shell 19 | $ pip install -r requirements.txt 20 | ``` 21 | 22 | We'll be launching an AWS EC2 instance via [SkyPilot](https://github.com/skypilot-org/skypilot), so we need to 23 | make sure our AWS credentials are set up: 24 | ```shell 25 | $ aws configure 26 | $ sky check 27 | ``` 28 | 29 | After that, you can just run the example: 30 | ```shell 31 | $ python flux.py 32 | ``` 33 | -------------------------------------------------------------------------------- /examples/flux/requirements.txt: -------------------------------------------------------------------------------- 1 | runhouse 2 | -------------------------------------------------------------------------------- /examples/hello-world/hello_world.py: -------------------------------------------------------------------------------- 1 | import kubetorch as kt 2 | 3 | # ## Write your code 4 | # This regular Python code is developed locally, and then 5 | # deployed to Kubernetes with 
Kubetorch. On first execution, it 6 | # may take a little time to allocate compute; subsequently, changes to this function 7 | # will hot sync instantaneously for interactive development. Then, the dispatch 8 | # can be scheduled or put into CI as-is to reach production. 9 | def hello_world(num_prints=1): 10 | for print_num in range(num_prints): 11 | print("Hello world ", print_num) 12 | 13 | 14 | # ## Define compute, deploy, and call 15 | # You define compute with kt.Compute(), and then send the `hello_world` 16 | # function to that compute to run. You can see that you get back a callable 17 | # with the same function signature as the original, and you can call it identically. 18 | if __name__ == "__main__": 19 | compute = kt.Compute(cpus=1) 20 | 21 | remote_hello = kt.fn(hello_world).to(compute) 22 | 23 | results = remote_hello(5) 24 | -------------------------------------------------------------------------------- /examples/hello-world/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/examples/hello-world/requirements.txt -------------------------------------------------------------------------------- /examples/hpo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/examples/hpo/__init__.py -------------------------------------------------------------------------------- /examples/hpo/hpo.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import time 3 | 4 | import numpy as np 5 | 6 | import runhouse as rh 7 | 8 | NUM_WORKERS = 8 9 | NUM_JOBS = 30 10 | 11 | 12 | def train_fn(step, width, height): 13 | time.sleep(5) 14 | return (0.1 + width * step / 100) ** (-1) + height * 0.1 15 | 16 | 17 | def generate_params(): 18 | return 
{"width": np.random.uniform(0, 1), "height": np.random.uniform(0, 1)} 19 | 20 | 21 | async def find_best_params(): 22 | cluster = rh.compute( 23 | name="rh-4x16-cpu", instance_type="CPU:16", num_nodes=4, provider="aws" 24 | ).up_if_not() 25 | 26 | remote_train_fn = rh.function(train_fn).to(cluster) 27 | available_worker_fns = [remote_train_fn] + remote_train_fn.replicate( 28 | NUM_WORKERS - 1 29 | ) 30 | 31 | async def run_job(step): 32 | while not available_worker_fns: 33 | await asyncio.sleep(1) 34 | worker_fn = available_worker_fns.pop(0) 35 | next_point_to_probe = generate_params() 36 | 37 | print(f"Calling step {step} on point {next_point_to_probe}") 38 | target = await worker_fn(step=step, **next_point_to_probe, run_async=True) 39 | print(f"Returned step {step} with value {target}") 40 | 41 | available_worker_fns.append(worker_fn) 42 | return next_point_to_probe, target 43 | 44 | results = await asyncio.gather( 45 | *[run_job(counter) for counter in range(NUM_JOBS)], return_exceptions=True 46 | ) 47 | 48 | max_result = max(results, key=lambda x: x[1]) 49 | print(f"Optimization finished. 
Best parameters found: {max_result}") 50 | 51 | 52 | if __name__ == "__main__": 53 | asyncio.run(find_best_params()) 54 | -------------------------------------------------------------------------------- /examples/hpo/hpo_bayes_opt.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import runhouse as rh 4 | 5 | from bayes_opt import BayesianOptimization 6 | 7 | NUM_WORKERS = 8 8 | NUM_JOBS = 30 9 | 10 | 11 | def train_fn(x, y): 12 | return -(x**2) - (y - 1) ** 2 + 1 13 | 14 | 15 | if __name__ == "__main__": 16 | img = rh.Image("worker_image").pip_install(["bayesian-optimization"]) 17 | 18 | cluster = rh.compute( 19 | name="rh-4x16-cpu", 20 | instance_type="CPU:4+", 21 | num_nodes=2, 22 | provider="kubernetes", 23 | image=img, 24 | ).up_if_not() 25 | 26 | remote_train_fn = rh.function(train_fn).to(cluster) 27 | train_fn_pool = remote_train_fn.distribute( 28 | "pool", num_replicas=NUM_WORKERS, replicas_per_node=NUM_WORKERS // 2 29 | ) 30 | 31 | optimizer = BayesianOptimization( 32 | f=partial(train_fn_pool, stream_logs=False), 33 | pbounds={"x": (-2, 2), "y": (-3, 3)}, 34 | verbose=2, 35 | random_state=1, 36 | ) 37 | optimizer.maximize(init_points=NUM_WORKERS, n_iter=NUM_JOBS) 38 | print(f"Optimization finished. 
Best parameters found: {optimizer.max}") 39 | -------------------------------------------------------------------------------- /examples/hpo/hpo_bayes_opt_low_level.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import time 3 | 4 | import runhouse as rh 5 | 6 | NUM_WORKERS = 8 7 | NUM_JOBS = 30 8 | 9 | 10 | def train_fn(step, width, height): 11 | time.sleep(5) 12 | return (0.1 + width * step / 100) ** (-1) + height * 0.1 13 | 14 | 15 | async def find_best_params(): 16 | from bayes_opt import BayesianOptimization, UtilityFunction 17 | 18 | img = rh.Image("worker_image").pip_install(["bayesian-optimization"]) 19 | 20 | cluster = rh.compute( 21 | name="rh-4x16-cpu", 22 | instance_type="CPU:16", 23 | num_nodes=4, 24 | provider="aws", 25 | image=img, 26 | ).up_if_not() 27 | 28 | worker_fns = rh.function(train_fn).to(cluster).replicate(replicas=NUM_WORKERS) 29 | 30 | optimizer = BayesianOptimization( 31 | f=None, 32 | pbounds={"width": (0, 20), "height": (-100, 100)}, 33 | verbose=2, 34 | random_state=1, 35 | ) 36 | utility = UtilityFunction(kind="ucb", kappa=2.5, xi=0.0) 37 | 38 | async def run_job(step): 39 | while not worker_fns: 40 | await asyncio.sleep(1) 41 | worker_fn = worker_fns.pop(0) 42 | hyperparams = optimizer.suggest(utility) 43 | 44 | print(f"Calling step {step} on point {hyperparams}") 45 | target = await worker_fn(step=step, **hyperparams, run_async=True) 46 | print(f"Returned step {step} with value {target}") 47 | 48 | optimizer.register(hyperparams, target) 49 | utility.update_params() 50 | 51 | worker_fns.append(worker_fn) 52 | 53 | futs = [run_job(counter) for counter in range(NUM_JOBS)] 54 | await asyncio.gather(*futs, return_exceptions=True) 55 | 56 | print(f"Optimization finished. 
Best parameters found: {optimizer.max}") 57 | 58 | 59 | if __name__ == "__main__": 60 | asyncio.run(find_best_params()) 61 | -------------------------------------------------------------------------------- /examples/inference_llama70b/llama70b_vllm.py: -------------------------------------------------------------------------------- 1 | import kubetorch as kt 2 | from vllm import LLM, SamplingParams 3 | 4 | 5 | img = ( 6 | kt.images.pytorch() 7 | .pip_install(["transformers", "vllm"]) 8 | .sync_secrets(["huggingface"]) 9 | ) 10 | 11 | 12 | @kt.compute(gpus="L4:8", image=img, name="llama70b") 13 | @kt.distribute("auto", num_replicas=(0, 4)) 14 | class Llama70B_vLLM: 15 | def __init__(self, num_gpus, model_id="meta-llama/Llama-3.3-70B-Instruct"): 16 | self.model_id = model_id 17 | self.model = None 18 | self.sampling_params = None 19 | self.num_gpus = num_gpus 20 | 21 | def load_model(self, temperature=1, top_p=0.9, max_tokens=256, min_tokens=32): 22 | self.sampling_params = SamplingParams( 23 | temperature=temperature, 24 | top_p=top_p, 25 | max_tokens=max_tokens, 26 | min_tokens=min_tokens, 27 | ) 28 | print("loading model") 29 | self.model = LLM( 30 | self.model_id, 31 | tensor_parallel_size=self.num_gpus, 32 | dtype="bfloat16", 33 | trust_remote_code=True, 34 | max_model_len=8192, 35 | ) 36 | print("model loaded") 37 | 38 | def generate(self, queries, temperature=1, top_p=0.95): 39 | if self.model is None: 40 | self.load_model(temperature, top_p) 41 | 42 | outputs = self.model.generate(queries, self.sampling_params) 43 | return outputs 44 | 45 | 46 | if __name__ == "__main__": 47 | llama = Llama70B_vLLM.from_name("llama70b") 48 | 49 | queries = [ 50 | "What is the best type of bread in the world?", 51 | "What are some cheeses that go with bread?", 52 | "What is the best way to make a sandwich?", 53 | ] 54 | outputs = llama.generate(queries) 55 | for output in outputs: 56 | prompt = output.prompt 57 | generated_text = output.outputs[0].text 58 | print(f"Prompt: 
{prompt}, Generated text: {generated_text}") 59 | -------------------------------------------------------------------------------- /examples/langchain-rag-ec2/README.md: -------------------------------------------------------------------------------- 1 | # Deploy a Langchain RAG as a service on AWS EC2 2 | 3 | This is an example of easily deploying [Langchain's Quickstart RAG app](https://python.langchain.com/docs/use_cases/question_answering/quickstart) 4 | as a service on AWS EC2 using Runhouse. 5 | 6 | ## Setup credentials and dependencies 7 | 8 | Optionally, set up a virtual environment: 9 | ```shell 10 | $ conda create -n langchain-rag python=3.9.15 11 | $ conda activate langchain-rag 12 | ``` 13 | Install Runhouse, the only library needed to run this script locally: 14 | ```shell 15 | $ pip install "runhouse[aws]" 16 | ``` 17 | 18 | We'll be launching an AWS EC2 instance via [SkyPilot](https://github.com/skypilot-org/skypilot), so we need to 19 | make sure our AWS credentials are set up: 20 | ```shell 21 | $ aws configure 22 | $ sky check 23 | ``` 24 | 25 | We'll be hitting OpenAI's API, so we need to set up our OpenAI API key: 26 | ```shell 27 | $ export OPENAI_API_KEY= 28 | ``` 29 | 30 | After that, you can just run the example: 31 | ```shell 32 | $ python langchain_rag.py 33 | ``` 34 | -------------------------------------------------------------------------------- /examples/lightning-resnet/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | kubetorch 3 | boto3 4 | lightning 5 | datasets 6 | torchvision 7 | -------------------------------------------------------------------------------- /examples/llama2-13b-ec2/README.md: -------------------------------------------------------------------------------- 1 | # Deploy Llama2 13B Chat Model Inference on AWS EC2 2 | 3 | See a more [rich explanation](https://www.run.house/examples/llama2-chat-model-inference-aws-ec2) 4 | of this example on our site. 
5 | 6 | This example demonstrates how to deploy a 7 | [LLama2 13B model from Hugging Face](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) 8 | on AWS EC2 using Runhouse. 9 | 10 | ## Setup credentials and dependencies 11 | 12 | Optionally, set up a virtual environment: 13 | ```shell 14 | $ conda create -n llama-demo-apps python=3.8 15 | $ conda activate llama-demo-apps 16 | ``` 17 | Install the few required dependencies: 18 | ```shell 19 | $ pip install -r requirements.txt 20 | ``` 21 | 22 | We'll be launching an AWS EC2 instance via [SkyPilot](https://github.com/skypilot-org/skypilot), so we need to 23 | make sure our AWS credentials are set up: 24 | ```shell 25 | $ aws configure 26 | $ sky check 27 | ``` 28 | We'll be downloading the Llama2 model from Hugging Face, so we need to set up our Hugging Face token: 29 | ```shell 30 | $ export HF_TOKEN= 31 | ``` 32 | 33 | After that, you can just run the example: 34 | ```shell 35 | $ python llama2_ec2.py 36 | ``` 37 | -------------------------------------------------------------------------------- /examples/llama2-13b-ec2/requirements.txt: -------------------------------------------------------------------------------- 1 | runhouse[aws] 2 | torch 3 | -------------------------------------------------------------------------------- /examples/llama2-fine-tuning-with-lora/README.md: -------------------------------------------------------------------------------- 1 | # Fine Tune Llama 2 with LoRA on AWS EC2 2 | 3 | See a more [rich explanation](https://www.run.house/examples/llama2-fine-tuning-with-lora) 4 | of this example on our site. 5 | 6 | This example demonstrates how to fine tune a model using 7 | [Llama 2](https://huggingface.co/NousResearch/Llama-2-7b-chat-hf) and 8 | [LoRA](https://huggingface.co/docs/peft/main/en/conceptual_guides/lora) on AWS EC2 using Runhouse. 
9 | 10 | ## Setup credentials and dependencies 11 | 12 | Install the few required dependencies: 13 | ```shell 14 | $ pip install -r requirements.txt 15 | ``` 16 | 17 | We'll be launching an AWS EC2 instance via [SkyPilot](https://github.com/skypilot-org/skypilot), so we need to 18 | make sure our AWS credentials are set up: 19 | ```shell 20 | $ aws configure 21 | $ sky check 22 | ``` 23 | 24 | After that, you can just run the example: 25 | ```shell 26 | $ python llama2_fine_tuning.py 27 | ``` 28 | -------------------------------------------------------------------------------- /examples/llama2-fine-tuning-with-lora/requirements.txt: -------------------------------------------------------------------------------- 1 | runhouse[aws] 2 | -------------------------------------------------------------------------------- /examples/llama2-with-tgi-aws-inferentia2/README.md: -------------------------------------------------------------------------------- 1 | # Deploy Llama 2 7B Model with TGI on AWS Inferentia 2 | 3 | See a more [rich explanation](https://www.run.house/examples/llama-tgi-inference-on-aws-inferentia) 4 | of this example on our site. 5 | 6 | This example demonstrates how to deploy a [Llama 7B model](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) using 7 | [TGI](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference) on AWS Inferentia 8 | using Runhouse, specifically with the [AWS Neuron SDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/). 
9 | 10 | ## Setup credentials and dependencies 11 | Install the required dependencies: 12 | ```shell 13 | $ pip install -r requirements.txt 14 | ``` 15 | 16 | We'll be launching an AWS Inferentia instance via [SkyPilot](https://github.com/skypilot-org/skypilot), so we need to 17 | make sure our AWS credentials are set up: 18 | ```shell 19 | $ aws configure 20 | $ sky check 21 | ``` 22 | -------------------------------------------------------------------------------- /examples/llama2-with-tgi-aws-inferentia2/requirements.txt: -------------------------------------------------------------------------------- 1 | runhouse[aws] 2 | -------------------------------------------------------------------------------- /examples/llama2-with-tgi-ec2/README.md: -------------------------------------------------------------------------------- 1 | # Deploy Llama 2 7B Model with TGI on AWS EC2 2 | 3 | See a more [rich explanation](https://www.run.house/examples/llama-tgi-inference-on-aws-ec2) 4 | of this example on our site. 5 | 6 | This example demonstrates how to deploy a 7 | [Llama 7B model](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) using 8 | [TGI](https://huggingface.co/docs/text-generation-inference/messages_api) on AWS EC2 using Runhouse. 
9 | 10 | ## Setup credentials and dependencies 11 | Install the required dependencies: 12 | ```shell 13 | $ pip install -r requirements.txt 14 | ``` 15 | 16 | We'll be launching an AWS EC2 instance via [SkyPilot](https://github.com/skypilot-org/skypilot), so we need to make 17 | sure our AWS credentials are set up: 18 | ```shell 19 | $ aws configure 20 | $ sky check 21 | ``` 22 | -------------------------------------------------------------------------------- /examples/llama2-with-tgi-ec2/requirements.txt: -------------------------------------------------------------------------------- 1 | runhouse[aws] 2 | -------------------------------------------------------------------------------- /examples/llama3-8b-ec2/README.md: -------------------------------------------------------------------------------- 1 | # Deploy Llama3 8B Chat Model Inference on AWS EC2 2 | 3 | See a more [rich explanation](https://www.run.house/examples/llama3-8b-chat-model-inference-aws-ec2) 4 | of this example on our site. 5 | 6 | This example demonstrates how to deploy a 7 | [LLama2 13B model from Hugging Face](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) 8 | on AWS EC2 using Runhouse. 9 | 10 | Make sure to sign the waiver on the model page so that you can access it. 
11 | 12 | ## Setup credentials and dependencies 13 | 14 | Optionally, set up a virtual environment: 15 | ```shell 16 | $ conda create -n llama3-rh python=3.9.15 17 | $ conda activate llama3-rh 18 | ``` 19 | Install the few required dependencies: 20 | ```shell 21 | $ pip install -r requirements.txt 22 | ``` 23 | 24 | We'll be launching an AWS EC2 instance via [SkyPilot](https://github.com/skypilot-org/skypilot), so we need to 25 | make sure our AWS credentials are set up: 26 | ```shell 27 | $ aws configure 28 | $ sky check 29 | ``` 30 | We'll be downloading the Llama2 model from Hugging Face, so we need to set up our Hugging Face token: 31 | ```shell 32 | $ export HF_TOKEN= 33 | ``` 34 | 35 | After that, you can just run the example: 36 | ```shell 37 | $ python llama3_ec2.py 38 | ``` 39 | -------------------------------------------------------------------------------- /examples/llama3-8b-ec2/requirements.txt: -------------------------------------------------------------------------------- 1 | runhouse[aws] 2 | torch 3 | -------------------------------------------------------------------------------- /examples/llama3-8b-tgi-ec2/README.md: -------------------------------------------------------------------------------- 1 | # Deploy Llama 3 8B with TGI on AWS EC2 2 | This example demonstrates how to deploy a Meta Llama 3 8B model from Hugging Face with 3 | [TGI](https://huggingface.co/docs/text-generation-inference/messages_api) on AWS EC2 using Runhouse. 4 | 5 | 6 | Make sure to sign the waiver on the [Hugging Face model page](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) so that you can access it. 
7 | 8 | ## Setup credentials and dependencies 9 | Install the required dependencies: 10 | ```shell 11 | $ pip install -r requirements.txt 12 | ``` 13 | 14 | We'll be launching an AWS EC2 instance via [SkyPilot](https://github.com/skypilot-org/skypilot), so we need to make sure our AWS credentials are set up: 15 | ```shell 16 | $ aws configure 17 | $ sky check 18 | ``` 19 | -------------------------------------------------------------------------------- /examples/llama3-8b-tgi-ec2/requirements.txt: -------------------------------------------------------------------------------- 1 | docker 2 | runhouse[aws] 3 | -------------------------------------------------------------------------------- /examples/llama3-fine-tuning-lora/README.md: -------------------------------------------------------------------------------- 1 | # Fine-Tune Llama 3 with LoRA on AWS EC2 2 | 3 | This example demonstrates how to fine-tune a Llama 3 8B model using 4 | [LoRA](https://huggingface.co/docs/peft/main/en/conceptual_guides/lora) on AWS EC2 using Runhouse. 5 | 6 | Make sure to sign the waiver on the [Hugging Face model page](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) 7 | so that you can access it. 
8 | 9 | ## Setup credentials and dependencies 10 | 11 | Install the few required dependencies: 12 | ```shell 13 | $ pip install -r requirements.txt 14 | ``` 15 | 16 | We'll be launching an AWS EC2 instance via [SkyPilot](https://github.com/skypilot-org/skypilot), so we need to 17 | make sure our AWS credentials are set up: 18 | ```shell 19 | $ aws configure 20 | $ sky check 21 | ``` 22 | 23 | After that, you can just run the example: 24 | ```shell 25 | $ python llama3_fine_tuning.py 26 | ``` 27 | -------------------------------------------------------------------------------- /examples/llama3-fine-tuning-lora/requirements.txt: -------------------------------------------------------------------------------- 1 | runhouse[aws] 2 | torch 3 | datasets 4 | peft 5 | transformers 6 | trl 7 | -------------------------------------------------------------------------------- /examples/llama3-vllm-gcp/README.md: -------------------------------------------------------------------------------- 1 | # Run Llama 3 8B Model Inference with vLLM on GCP 2 | 3 | This example demonstrates how to run a Llama 3 8B model from Hugging Face with vLLM using Runhouse. 4 | 5 | Make sure to sign the waiver on the [Hugging Face model page](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) 6 | so that you can access it. 7 | 8 | ## Setup credentials and dependencies 9 | 10 | Optionally, set up a virtual environment: 11 | ```shell 12 | $ conda create -n llama3-rh python=3.9.15 13 | $ conda activate llama3-rh 14 | ``` 15 | 16 | Install the required dependencies: 17 | 18 | ```shell 19 | $ pip install -r requirements.txt 20 | ``` 21 | 22 | If you do not have a Runhouse account and want to launch an instance via [SkyPilot](https://github.com/skypilot-org/skypilot), make sure your credentials are set up. You may be prompted to pick a cloud project to use after running `gcloud init`. 
"""Check the training status of a fine-tuner instance already running on a cluster.

Looks up (or launches) the shared GPU cluster and queries it for a previously
created remote fine-tuner object by name.
"""
import runhouse as rh

# We check if we have already created a "rh_finetuner" on the remote which is an *instance* of the remote fine tuner class
cluster = rh.compute(
    name="rh-a10x",
    instance_type="A10G:1",
    memory="32+",
    provider="aws",
).up_if_not()

# Name under which the fine-tuner instance was registered on the cluster
# (presumably by the companion LoraFineTuner.py script — confirm there).
fine_tuner_remote_name = "rh_finetuner"
# NOTE(review): remote=True appears to return a reference/stub to the remote
# object rather than copying it back locally — verify against Runhouse docs.
fine_tuner_remote = cluster.get(fine_tuner_remote_name, default=None, remote=True)

# Check what the training status is on remote
if fine_tuner_remote is not None:
    print(fine_tuner_remote.get_training_status())
(LoraFineTuner.py) in **regular Python** and launch remote GPU compute to do the fine-tuning. 3 | 4 | In particular, we show how you can start the fine tuning and interact with the fine-tuning class (a remote object) through regular Python or a Notebook. Runhouse lets you work *locally* with *remote objects* defined by regular code and edited locally, compared to tooling like hosted notebooks which let you *work locally while SSH'ed into a remote setting.* This offers a few distinct advantages: 5 | * **Real compute and real data:** ML Engineers and data scientists do not need to launch projects on toy compute offered in a research environment. 6 | * **Real code:** Rather than working on Notebooks (because they have to), your team is writing code and developing locally just like a normal software team. The only difference is dispatching the work for remote computation since the local machine doesn't have the right hardware. 7 | * **Fast research to production:** The work done while writing and testing the class is essentially enough to bring the work to production as well. There is no costly rebuilding of the same code a second time to work in a Pipeline. 8 | -------------------------------------------------------------------------------- /examples/mistral-with-tgi-ec2/README.md: -------------------------------------------------------------------------------- 1 | # Deploy Mistral's 7B Model with TGI on AWS EC2 2 | 3 | See a more [rich explanation](https://www.run.house/examples/mistral-tgi-inference-on-aws-ec2) 4 | of this example on our site. 5 | 6 | This example demonstrates how to deploy a 7 | [TGI model](https://huggingface.co/docs/text-generation-inference/messages_api) on AWS EC2 using Runhouse. 8 | This example draws inspiration from 9 | [Huggingface's tutorial on AWS SageMaker](https://huggingface.co/blog/text-generation-inference-on-inferentia2). 
10 | Zephyr is a 7B fine-tuned version of [Mistral's 7B-v0.1 model](https://huggingface.co/mistralai/Mistral-7B-v0.1). 11 | 12 | ## Setup credentials and dependencies 13 | Install the required dependencies: 14 | ```shell 15 | $ pip install -r requirements.txt 16 | ``` 17 | 18 | We'll be launching an AWS EC2 instance via [SkyPilot](https://github.com/skypilot-org/skypilot), so we need to make 19 | sure our AWS credentials are set up: 20 | ```shell 21 | $ aws configure 22 | $ sky check 23 | ``` 24 | -------------------------------------------------------------------------------- /examples/mistral-with-tgi-ec2/requirements.txt: -------------------------------------------------------------------------------- 1 | openai 2 | runhouse[aws] 3 | -------------------------------------------------------------------------------- /examples/parallel-hf-embedding/README.md: -------------------------------------------------------------------------------- 1 | # An embarrassingly parallel embedding task with Hugging Face models on AWS EC2 2 | 3 | This example demonstrates how to use Runhouse primitives to embed a large number of websites in parallel. 4 | We use a [BGE large model from Hugging Face](https://huggingface.co/BAAI/bge-large-en-v1.5) and load it via 5 | the `SentenceTransformer` class from the `huggingface` library. 
6 | 7 | ## Setup credentials and dependencies 8 | 9 | Optionally, set up a virtual environment: 10 | ```shell 11 | $ conda create -n parallel-embed python=3.9.15 12 | $ conda activate parallel-embed 13 | ``` 14 | Install the few required dependencies: 15 | ```shell 16 | $ pip install -r requirements.txt 17 | ``` 18 | We'll be launching an AWS EC2 instance via [SkyPilot](https://github.com/skypilot-org/skypilot), so we need to 19 | make sure our AWS credentials are set up: 20 | ```shell 21 | $ aws configure 22 | $ sky check 23 | ``` 24 | 25 | ## Some utility functions 26 | 27 | We import `runhouse` and other utility libraries; only the ones that are needed to run the script locally. 28 | Imports of libraries that are needed on the remote machine (in this case, the `huggingface` dependencies) 29 | can happen within the functions that will be sent to the Runhouse cluster. 30 | -------------------------------------------------------------------------------- /examples/parallel-hf-embedding/requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4 2 | runhouse[aws] 3 | torch 4 | tqdm 5 | -------------------------------------------------------------------------------- /examples/pytorch-distributed-basic/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch Multi-node Distributed Training 2 | 3 | A basic example showing how to use Runhouse to Pythonically run a PyTorch distributed training script on a 4 | cluster of GPUs. Often distributed training is launched from multiple parallel CLI commands 5 | (`python -m torch.distributed.launch ...`), each spawning separate training processes (ranks). 
6 | Here, we're creating each process as a separate worker on the cluster, sending our training function 7 | into each worker, and calling the replicas concurrently to trigger coordinated multi-node training 8 | (`torch.distributed.init_process_group` causes each to wait for all to connect, and sets up the distributed 9 | communication). We're using two single-GPU instances (and therefore two ranks) for simplicity, but we've included 10 | the basic logic to handle multi-GPU nodes as well, where you'd add more worker processes per node and set `device_ids` 11 | accordingly. 12 | 13 | Despite it being common to use a launcher script to start distributed training, this approach is more flexible and 14 | allows for more complex orchestration, such as running multiple training jobs concurrently, handling exceptions, 15 | running distributed training alongside other tasks on the same cluster. It's also significantly easier to debug 16 | and monitor, as you can see the output of each rank in real-time and get stack traces if a worker fails. 
17 | -------------------------------------------------------------------------------- /examples/pytorch-distributed-basic/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/examples/pytorch-distributed-basic/__init__.py -------------------------------------------------------------------------------- /examples/pytorch-distributed-basic/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | kubetorch 3 | -------------------------------------------------------------------------------- /examples/pytorch-resnet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/examples/pytorch-resnet/__init__.py -------------------------------------------------------------------------------- /examples/pytorch-resnet/requirements.txt: -------------------------------------------------------------------------------- 1 | runhouse 2 | torch 3 | torchvision 4 | datasets 5 | -------------------------------------------------------------------------------- /examples/pytorch-torchvision-mnist-training/README.md: -------------------------------------------------------------------------------- 1 | # Deploy and Train a Model with Torch 2 | This example demonstrates how to use the `SimpleTrainer` class to train and test a machine learning model using PyTorch and the MNIST dataset. The `SimpleTrainer` class handles model training, evaluation, and prediction tasks and shows you how you can send model classes to train and execute on remote compute. 
import os
import posixpath

import boto3


# Download data from S3
def download_folder_from_s3(bucket_name, s3_folder_prefix, local_folder_path):
    """Download every object under ``s3_folder_prefix`` in ``bucket_name``
    into ``local_folder_path``, recreating the folder structure locally.

    Uses a paginator so prefixes containing more than 1000 objects are
    fully listed.
    """
    s3 = boto3.client("s3")

    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket_name, Prefix=s3_folder_prefix):
        # Pages with no matching objects carry no "Contents" key.
        if "Contents" in page:
            for obj in page["Contents"]:
                s3_key = obj["Key"]
                # S3 keys always use "/" separators, so compute the relative
                # key with posixpath (os.path would mishandle it on Windows),
                # then map its segments onto the local OS separator.
                relative_key = posixpath.relpath(s3_key, s3_folder_prefix)
                local_path = os.path.join(local_folder_path, *relative_key.split("/"))

                # dirname can be "" for a top-level file under a relative
                # destination; os.makedirs("") raises, so guard it.
                local_dir = os.path.dirname(local_path)
                if local_dir:
                    os.makedirs(local_dir, exist_ok=True)
                s3.download_file(bucket_name, s3_key, local_path)
                print(f"Downloaded {s3_key} to {local_path}")


# download_folder_from_s3('rh-demo-external', 'your/s3/folder/prefix', '/path/to/local/folder', 'your-access-key-id', 'your-secret-access-key')


# Upload data to S3 bucket
def upload_folder_to_s3(local_folder_path, bucket_name, s3_folder_prefix):
    """Upload the directory tree rooted at ``local_folder_path`` to
    ``s3://bucket_name/s3_folder_prefix``, mirroring the local layout."""
    s3 = boto3.client("s3")

    for root, dirs, files in os.walk(local_folder_path):
        for file in files:
            local_path = os.path.join(root, file)
            relative_path = os.path.relpath(local_path, local_folder_path)
            # Build the object key with "/" separators regardless of OS;
            # os.path.join would emit "\" on Windows, which S3 treats as a
            # literal character in the key, not a folder delimiter.
            s3_path = posixpath.join(s3_folder_prefix, *relative_path.split(os.sep))

            s3.upload_file(local_path, bucket_name, s3_path)
            print(f"Uploaded {local_path} to s3://{bucket_name}/{s3_path}")


# upload_folder_to_s3('/path/to/local/folder', 'rh-demo-external', 'your/s3/folder/prefix', 'your-access-key-id', 'your-secret-access-key')
"""Run each step of the multicloud Airflow pipeline locally, without Airflow.

Because execution is offloaded to remote compute, every callable can be
invoked directly from a local machine or notebook for interactive debugging;
the Airflow DAG only needs to orchestrate these already-working steps.
"""

import logging

from airflow_multicloud_torch_train import (
    access_data_callable,
    bring_up_cluster_callable,
    down_cluster,
    download_s3_data_callable,
    preprocess_data_callable,
    train_model_callable,
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

if __name__ == "__main__":
    logger.info("Starting the pipeline...")

    logger.info("Step 1: Bring up cluster")
    # CPU box (AWS) handles data access/preprocessing; GPU box (GCP) trains.
    cpu_cluster_config = {
        "cluster_name": "cpu-cluster",
        "instance_type": "r6i.xlarge",
        "provider": "aws",
    }
    gpu_cluster_config = {
        "cluster_name": "gpu-cluster",
        "gpus": "L4:1",
        "provider": "gcp",
    }

    bring_up_cluster_callable(**cpu_cluster_config)
    logger.info("Step 2: Access data")
    access_data_callable(**cpu_cluster_config)

    logger.info("Step 3: Preprocess data")
    preprocess_data_callable(**cpu_cluster_config)

    logger.info("Step 4: Train model")
    bring_up_cluster_callable(**gpu_cluster_config)
    download_s3_data_callable(**gpu_cluster_config)
    train_model_callable(**gpu_cluster_config)

    logger.info("Pipeline completed.")

    # Tear both clusters down so nothing is left running (and billing).
    down_cluster(**gpu_cluster_config)
    down_cluster(**cpu_cluster_config)
    logger.info("Cluster successfully downed.")
"""Run each step of the Airflow pipeline locally, without Airflow.

Because execution is offloaded to remote compute, every callable can be
invoked directly from a local machine or notebook for interactive debugging;
the Airflow DAG only needs to orchestrate these already-working steps.
"""

import logging

from airflow_example_torch_train import (
    access_data_callable,
    bring_up_cluster_callable,
    down_cluster,
    train_model_callable,
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

if __name__ == "__main__":
    logger.info("Starting the pipeline...")

    logger.info("Step 1: Bring up cluster")
    bring_up_cluster_callable()

    logger.info("Step 2: Access data")
    access_data_callable()

    logger.info("Step 3: Train model")
    train_model_callable()

    logger.info("Pipeline completed.")

    # Tear the cluster down so nothing is left running (and billing).
    down_cluster()
    logger.info("Cluster successfully downed.")
11 | * Send the code for remote execution with Runhouse, and figure out whether the code works, debugging it interactively. Runhouse lets you send the code in seconds, and streams logs back. You can work on remote as if it were local. 12 | * Once you are satisfied with your code, you can write the callables for an Airflow PythonOperator. The code that is actually in the Airflow DAG is the **minimal code** to call out to already working Classes and Functions, defining the order of the steps (or you can even have a one-step Airflow DAG, making Airflow purely for scheduling and observability) 13 | * And you can easily iterate further on your code, or test the pipeline end-to-end from local with no Airflow participation 14 | 15 | 16 | **Examples** 17 | * **TorchBasicExample.py:** Normally written Python code with no DSL, defining a simple neural network, in the parent folder. 18 | * **local_run_of_callables.py:** An example of how Runhouse lets you test your functions and Airflow callables from local, since it's all happening on "remote" execution. You can update code, and experiment with calling just that step. 19 | * **airflow_example_torch_train.py:** The Airflow DAG, which simply orchestrates the pipeline. 20 | -------------------------------------------------------------------------------- /examples/pytorch-torchvision-mnist-training/airflow/requirements.txt: -------------------------------------------------------------------------------- 1 | runhouse[aws] 2 | torch 3 | torchvision 4 | airflow 5 | -------------------------------------------------------------------------------- /examples/pytorch-torchvision-mnist-training/my_simple_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | # We define a model class. We define a very basic feedforward neural network with three fully connected layers. 
class SimpleNN(nn.Module):
    """A minimal feedforward MNIST classifier.

    Three fully connected layers map a flattened 28x28 image
    (784 features) through 128 and 64 hidden units to 10 class logits.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28 * 28, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)
        # Record the best available device so callers can move the model.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def forward(self, x):
        """Return class logits for a batch of 28x28 images (any layout)."""
        flat = x.view(-1, 28 * 28)  # collapse each image to a 784-vector
        hidden = F.relu(self.fc2(F.relu(self.fc1(flat))))
        return self.fc3(hidden)
6 | # You can use any cloud you want, or existing compute 7 | cluster = rh.compute( 8 | name="a10g-cluster", instance_type="A10G:1", provider="aws" 9 | ).up_if_not() 10 | 11 | # Get our remote TorchTrainer by name 12 | model = cluster.get("torch_model", default=None, remote=True) 13 | 14 | # Get the training status of the model 15 | print(model.return_status()) 16 | 17 | # Make a prediction with the model, which we can do even when training is happening in a different thread. 18 | from torchvision import datasets, transforms 19 | 20 | transform = transforms.Compose( 21 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 22 | ) 23 | 24 | local_dataset = datasets.MNIST( 25 | "./data", train=False, download=True, transform=transform 26 | ) 27 | example_data, example_target = local_dataset[0][0].unsqueeze(0), local_dataset[0][1] 28 | prediction = model.predict(example_data) 29 | print(f"Predicted: {prediction}, Actual: {example_target}") 30 | -------------------------------------------------------------------------------- /examples/stable-diffusion-xl-ec2/README.md: -------------------------------------------------------------------------------- 1 | # Deploy Stable Diffusion XL 1.0 on AWS EC2 2 | 3 | See a more [rich explanation](https://www.run.house/examples/stable-diffusion-xl-on-aws-ec2) 4 | of this example on our site. 5 | 6 | This example demonstrates how to deploy a 7 | [Stable Diffusion XL model from Hugging Face](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) 8 | on AWS EC2 using Runhouse. 
9 | 10 | ## Setup credentials and dependencies 11 | 12 | Optionally, set up a virtual environment: 13 | ```shell 14 | $ conda create -n rh-sdxl python=3.9.15 15 | $ conda activate rh-sdxl 16 | ``` 17 | Install the few required dependencies: 18 | ```shell 19 | $ pip install -r requirements.txt 20 | ``` 21 | 22 | We'll be launching an AWS EC2 instance via [SkyPilot](https://github.com/skypilot-org/skypilot), so we need to 23 | make sure our AWS credentials are set up: 24 | ```shell 25 | $ aws configure 26 | $ sky check 27 | ``` 28 | We'll be downloading the model from Hugging Face, so we need to set up our Hugging Face token: 29 | ```shell 30 | $ export HF_TOKEN= 31 | ``` 32 | 33 | After that, you can just run the example: 34 | ```shell 35 | $ python sdxl.py 36 | ``` 37 | -------------------------------------------------------------------------------- /examples/stable-diffusion-xl-ec2/requirements.txt: -------------------------------------------------------------------------------- 1 | runhouse[aws] 2 | Pillow 3 | -------------------------------------------------------------------------------- /examples/tensorflow-distributed/README.md: -------------------------------------------------------------------------------- 1 | # TensorFlow Multi-node Distributed Training 2 | 3 | A basic example showing how to use Runhouse to Pythonically run a TensorFlow distributed training script on a 4 | cluster of GPUs. We use the `TF_CONFIG` environment variable to set up the distributed training environment, and 5 | create a separate worker for each rank. We then call the replicas concurrently to trigger coordinated 6 | multi-node training. We're using two single-GPU instances (and therefore two ranks) with the 7 | MultiWorkerMirroredStrategy, but this same strategy could be used for other TensorFlow distributed strategies. 
8 | 9 | Despite it being common to use a launcher script to start distributed training, this approach is more flexible and 10 | allows for more complex orchestration, such as running multiple training jobs concurrently, handling exceptions, 11 | running distributed training alongside other tasks on the same cluster. It's also significantly easier to debug 12 | and monitor, as you can see the output of each rank in real-time and get stack traces if a worker fails. 13 | -------------------------------------------------------------------------------- /examples/tensorflow-distributed/requirements.txt: -------------------------------------------------------------------------------- 1 | runhouse[aws] 2 | tensorflow 3 | -------------------------------------------------------------------------------- /examples/tensorflow-distributed/tensorflow_distributed.py: -------------------------------------------------------------------------------- 1 | # # TensorFlow Multi-node Distributed Training 2 | # A basic example showing how to use Kubetorch to Pythonically run a TensorFlow distributed training script on 3 | # multiple GPUs. We use the TF_CONFIG environment variable to set up the distributed training environment, and 4 | # create a separate worker (env) for each rank. We then call the replicas concurrently to trigger coordinated 5 | # multi-node training. We're using two single-GPU instances (and therefore two ranks) with the 6 | # MultiWorkerMirroredStrategy, but this same strategy could be used for other TensorFlow distributed strategies. 7 | # 8 | # Despite it being common to use a launcher script to start distributed training, this approach is more flexible and 9 | # allows for more complex orchestration, such as running multiple training jobs concurrently, handling exceptions, 10 | # running distributed training alongside other tasks on the same cluster. 
# # TensorFlow Multi-node Distributed Training
# Each worker runs train_process(); TF_CONFIG tells each worker its rank and
# the cluster layout, and MultiWorkerMirroredStrategy coordinates them.

import json
import os

import kubetorch as kt
import tensorflow as tf


# ## Define the TensorFlow distributed training logic
# This is the function that will be run on each worker. It initializes the
# distributed training environment, creates a simple model and optimizer,
# and runs a training loop.
def train_process():
    """Run one worker's share of MultiWorkerMirroredStrategy training.

    Expects the ``TF_CONFIG`` environment variable to be set by the launcher
    with this worker's cluster spec and task index, per
    https://www.tensorflow.org/tutorials/distribute/multi_worker_with_keras
    """
    # Initialize the distributed training environment; the strategy
    # constructor blocks until all workers in TF_CONFIG have connected.
    tf_config = json.loads(os.environ["TF_CONFIG"])
    strategy = tf.distribute.MultiWorkerMirroredStrategy()
    num_workers = strategy.num_replicas_in_sync
    print(f"Worker {tf_config['task']['index']} of {num_workers} initialized")

    # Create a simple model and optimizer.
    # NOTE(review): the Keras multi-worker tutorial builds the model *inside*
    # strategy.scope() so its variables are created as distributed variables;
    # here only compile() is inside the scope — confirm this is intended.
    model = tf.keras.Sequential([tf.keras.layers.Dense(10, activation="relu")])
    optimizer = tf.keras.optimizers.SGD(0.01)

    with strategy.scope():
        model.compile(optimizer=optimizer, loss="mse")

    # Train on random synthetic data: 1000 samples of 10 features, batch 32.
    model.fit(
        tf.data.Dataset.from_tensor_slices(
            (tf.random.normal([1000, 10]), tf.random.normal([1000, 1]))
        ).batch(32)
    )

    print(f"Worker {tf_config['task']['index']} finished")


if __name__ == "__main__":
    # Dispatch the training function to a multi-node cluster with 4 nodes, each with 1 GPU
    gpus = kt.Compute(gpus="A10G:1", image=kt.images.tensorflow())
    remote_train = kt.fn(train_process).to(gpus).distribute("tensorflow", num_nodes=4)

    remote_train()
-------------------------------------------------------------------------------- /examples/yolo-fastapi/requirements.txt: -------------------------------------------------------------------------------- 1 | # Usage: pip install -r requirements.txt 2 | 3 | # Base ---------------------------------------- 4 | runhouse 5 | matplotlib>=3.2.2 6 | numpy>=1.18.5,<1.24.0 7 | opencv-python>=4.1.1 8 | Pillow>=7.1.2 9 | PyYAML>=5.3.1 10 | requests>=2.23.0 11 | scipy>=1.4.1 12 | torch>=1.7.0,!=1.12.0 13 | torchvision>=0.8.1,!=0.13.0 14 | tqdm>=4.41.0 15 | protobuf<4.21.3 16 | 17 | # Logging ------------------------------------- 18 | tensorboard>=2.4.1 19 | # wandb 20 | 21 | # Plotting ------------------------------------ 22 | pandas>=1.1.4 23 | seaborn>=0.11.0 24 | 25 | # Export -------------------------------------- 26 | # coremltools>=4.1 # CoreML export 27 | # onnx>=1.9.0 # ONNX export 28 | # onnx-simplifier>=0.3.6 # ONNX simplifier 29 | # scikit-learn==0.19.2 # CoreML quantization 30 | # tensorflow>=2.4.1 # TFLite export 31 | # tensorflowjs>=3.9.0 # TF.js export 32 | # openvino-dev # OpenVINO export 33 | 34 | # Extras -------------------------------------- 35 | ipython # interactive notebook 36 | psutil # system utilization 37 | thop # FLOPs computation 38 | # albumentations>=1.0.3 39 | # pycocotools>=2.0 # COCO mAP 40 | # roboflow 41 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=58.0, < 70"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.ruff] 6 | extend-exclude = ["runhouse/resources/hardware/sky/"] 7 | 8 | [tool.ruff.lint.per-file-ignores] 9 | "__init__.py" = ["F401"] 10 | "examples/*" = ["E501"] 11 | 12 | [tool.pytest.ini_options] 13 | asyncio_mode = "auto" 14 | -------------------------------------------------------------------------------- /pytest.ini: 
-------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = -s -v 3 | markers = 4 | servertest: all tests in the tests/test_servers/ directory, for filtering out 5 | secrettest: all tests in tests/test_resources/test_secrets/, for filtering out 6 | moduletest: all tests in TestModule, for filtering out 7 | functiontest: all tests in TestFunction, for filtering out 8 | clustertest: all tests in TestCluster, for filtering out 9 | level: mark tests with a given level that will be used when selecting tests to run 10 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | python-dotenv 2 | pexpect 3 | pyopenssl>=23.3.0 4 | rich 5 | setuptools < 70.0.0 6 | typer 7 | uvicorn 8 | wheel 9 | apispec 10 | httpx 11 | pydantic >=2.5.0 12 | -------------------------------------------------------------------------------- /runhouse/__init__.py: -------------------------------------------------------------------------------- 1 | import runhouse.resources.images.builtin_images as images 2 | 3 | from runhouse.exceptions import InsufficientDiskError 4 | from runhouse.resources.asgi import Asgi, asgi 5 | from runhouse.resources.folders import Folder, folder, GCSFolder, S3Folder 6 | from runhouse.resources.functions.function import Function 7 | from runhouse.resources.functions.function_factory import function 8 | 9 | from runhouse.resources.hardware import ( 10 | cluster, 11 | Cluster, 12 | DockerCluster, 13 | ondemand_cluster, 14 | OnDemandCluster, 15 | ) 16 | from runhouse.resources.images import Image 17 | 18 | # WARNING: Any built-in module that is imported here must be capitalized followed by all lowercase, or we will 19 | # will not find the module class when attempting to reconstruct it from a config. 
from runhouse.resources.module import Module, module
from runhouse.resources.packages import CodeSyncError, package, Package
from runhouse.resources.resource import Resource
from runhouse.resources.secrets import provider_secret, ProviderSecret, Secret, secret

from runhouse.rns.top_level_rns_fns import (
    as_caller,
    current_folder,
    exists,
    get_local_cluster_object,
    ipython,
    load,
    locate,
    set_folder,
    unset_folder,
)
from runhouse.utils import sync_function

# Note these are global variables that are instantiated within globals.py:
from .globals import configs, obj_store

from .rns.login import login, logout

# Syntactic sugar: shorter aliases for the most common factory functions.
fn = function
compute = cluster
cls = module


# PEP 562 module-level __getattr__: lets `runhouse.here` be re-resolved on every
# access instead of being bound once at import time.
def __getattr__(name):
    if name == "here":
        # If it's either the first time or the cluster was not initialized before, attempt to retrieve the cluster again
        return sync_function(get_local_cluster_object)()

    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


__version__ = "0.0.43"
from runhouse.resources.hardware.on_demand_cluster import OnDemandCluster

# One-off maintenance script that (re)generates the builtin cluster resources
# whose JSON configs live under runhouse/builtins/.
rh_cpu = OnDemandCluster(name="^rh-cpu", instance_type="CPU:1", dryrun=False)
rh_8_cpu = OnDemandCluster(name="^rh-8-cpu", instance_type="CPU:8", dryrun=False)
rh_32_cpu = OnDemandCluster(name="^rh-32-cpu", instance_type="CPU:32", dryrun=False)
rh_gpu = OnDemandCluster(name="^rh-gpu", instance_type="K80:1", dryrun=False)
rh_4_gpu = OnDemandCluster(name="^rh-4-gpu", instance_type="K80:4", dryrun=False)
rh_8_gpu = OnDemandCluster(name="^rh-8-gpu", instance_type="K80:8", dryrun=False)
rh_v100 = OnDemandCluster(name="^rh-v100", instance_type="V100:1", dryrun=False)
rh_4_v100 = OnDemandCluster(name="^rh-4-v100", instance_type="V100:4", dryrun=False)
rh_8_v100 = OnDemandCluster(name="^rh-8-v100", instance_type="V100:8", dryrun=False)

for cluster in [
    rh_cpu,
    rh_8_cpu,
    rh_32_cpu,
    rh_gpu,
    rh_4_gpu,
    rh_8_gpu,
    rh_v100,
    rh_4_v100,
    rh_8_v100,
]:
    # Clear provider/autostop so the saved builtin configs leave them null,
    # matching the shipped config.json files (users fill these in at launch time).
    cluster.autostop_mins = None
    cluster.provider = None
    # Need to manually move into builtins because we can't save there
    cluster.save(name=f"~/{cluster.name}")
"OnDemandCluster", 5 | "rns_address": "/builtins/rh-4-gpu", 6 | "instance_type": "K80:4", 7 | "num_nodes": null, 8 | "provider": null, 9 | "autostop_mins": null, 10 | "use_spot": false 11 | } 12 | -------------------------------------------------------------------------------- /runhouse/builtins/rh-4-v100/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "resource_type": "cluster", 3 | "name": "/builtins/rh-4-v100", 4 | "resource_subtype": "OnDemandCluster", 5 | "rns_address": "/builtins/rh-4-v100", 6 | "instance_type": "V100:4", 7 | "num_nodes": null, 8 | "provider": null, 9 | "autostop_mins": null, 10 | "use_spot": false 11 | } 12 | -------------------------------------------------------------------------------- /runhouse/builtins/rh-8-cpu/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "resource_type": "cluster", 3 | "name": "/builtins/rh-8-cpu", 4 | "resource_subtype": "OnDemandCluster", 5 | "rns_address": "/builtins/rh-8-cpu", 6 | "instance_type": "CPU:8+", 7 | "num_nodes": null, 8 | "provider": null, 9 | "autostop_mins": null, 10 | "use_spot": false 11 | } 12 | -------------------------------------------------------------------------------- /runhouse/builtins/rh-8-gpu/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "resource_type": "cluster", 3 | "name": "/builtins/rh-8-gpu", 4 | "resource_subtype": "OnDemandCluster", 5 | "rns_address": "/builtins/rh-8-gpu", 6 | "instance_type": "K80:8", 7 | "num_nodes": null, 8 | "provider": null, 9 | "autostop_mins": null, 10 | "use_spot": false 11 | } 12 | -------------------------------------------------------------------------------- /runhouse/builtins/rh-8-v100/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "resource_type": "cluster", 3 | "name": "/builtins/rh-8-v100", 4 | "resource_subtype": 
"OnDemandCluster", 5 | "rns_address": "/builtins/rh-8-v100", 6 | "instance_type": "V100:8", 7 | "num_nodes": null, 8 | "provider": null, 9 | "autostop_mins": null, 10 | "use_spot": false 11 | } 12 | -------------------------------------------------------------------------------- /runhouse/builtins/rh-cpu/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "resource_type": "cluster", 3 | "name": "/builtins/rh-cpu", 4 | "resource_subtype": "OnDemandCluster", 5 | "rns_address": "/builtins/rh-cpu", 6 | "instance_type": "CPU:2+", 7 | "num_nodes": null, 8 | "provider": null, 9 | "autostop_mins": null, 10 | "use_spot": false 11 | } 12 | -------------------------------------------------------------------------------- /runhouse/builtins/rh-gpu/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "resource_type": "cluster", 3 | "name": "/builtins/rh-gpu", 4 | "resource_subtype": "OnDemandCluster", 5 | "rns_address": "/builtins/rh-gpu", 6 | "instance_type": "K80:1", 7 | "num_nodes": null, 8 | "provider": null, 9 | "autostop_mins": null, 10 | "use_spot": false 11 | } 12 | -------------------------------------------------------------------------------- /runhouse/builtins/rh-v100/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "resource_type": "cluster", 3 | "name": "/builtins/rh-v100", 4 | "resource_subtype": "OnDemandCluster", 5 | "rns_address": "/builtins/rh-v100", 6 | "instance_type": "V100:1", 7 | "num_nodes": null, 8 | "provider": null, 9 | "autostop_mins": null, 10 | "use_spot": false 11 | } 12 | -------------------------------------------------------------------------------- /runhouse/exceptions.py: -------------------------------------------------------------------------------- 1 | # Runhouse exceptions 2 | 3 | 4 | class InsufficientDiskError(Exception): 5 | """Raised when a process on the cluster fails due to 
class InsufficientDiskError(Exception):
    """Raised when a process on the cluster fails due to lack of disk space.

    Args:
        command: The command / process that was run.
        error_msg: The error message to print.
    """

    def __init__(
        self,
        error_msg: str = None,
        command: str = None,
    ) -> None:
        self.command = command
        self.error_msg = error_msg
        self.default_error_msg = "Cluster is out of disk space"
        # Pick the most specific message available: an explicit error message,
        # then the failing command, then the generic fallback.
        if self.error_msg:
            base = self.error_msg
        elif self.command:
            base = f"Command {command} failed"
        else:
            base = self.default_error_msg
        super().__init__(
            f"{base}. To resolve it, teardown the cluster and re-launch it with larger disk size."
        )
class NewLineFormatter(logging.Formatter):
    """Adds logging prefix to newlines to align multi-line messages."""

    def __init__(self, fmt, datefmt=None):
        super().__init__(fmt, datefmt)

    def format(self, record):
        # Render the record normally, then repeat the prefix (everything the
        # format string puts before the message) after each newline so wrapped
        # lines stay aligned with the first one.
        rendered = super().format(record)
        if record.message == "":
            return rendered
        prefix, _, _ = rendered.partition(record.message)
        return rendered.replace("\n", "\r\n" + prefix)
from runhouse.constants import DEFAULT_DASK_PORT
from runhouse.resources.distributed.supervisor import Supervisor

from runhouse.resources.module import Module


class DaskDistributed(Supervisor):
    """Supervisor that forwards method calls to a wrapped module, lazily opening
    a Dask client connection to the cluster's scheduler before the first call."""

    def __init__(
        self,
        name,
        module: Module = None,
        port: int = DEFAULT_DASK_PORT,
        client_timeout="3s",
        **kwargs
    ):
        # module: the wrapped Module whose methods are proxied by forward().
        # port: Dask scheduler port on the cluster.
        # client_timeout: timeout passed through to the cluster's connect_dask().
        super().__init__(name=name, **kwargs)
        self._module = module
        self._dask_port = port
        self._dask_client = None  # created lazily on first forward() call
        self._client_timeout = client_timeout

    def _compute_signature(self, rich=False):
        # Expose the wrapped module's signature as this supervisor's own.
        return self.local._module.signature(rich=rich)

    def forward(self, item, *args, **kwargs):
        # Connect the Dask client once, then call the requested method on the
        # wrapped module. NOTE(review): the client object is never handed to the
        # method — presumably connecting is needed only for its side effect of
        # establishing the scheduler connection in this process; confirm.
        if not self._dask_client:
            self._dask_client = self.system.connect_dask(
                port=self._dask_port, client_timeout=self._client_timeout
            )
        method = getattr(self._module, item)
        return method(*args, **kwargs)

    def __call__(self, *args, **kwargs):
        # Calling the supervisor itself delegates to its `call` entry point.
        return self.call(*args, **kwargs)

    def __getstate__(self):
        state = super().__getstate__()
        # Dask client can't be serialized
        state["_dask_client"] = None
        return state
import time
from typing import List, Optional

from runhouse.resources.distributed.supervisor import Supervisor

from runhouse.resources.module import Module


class DistributedPool(Supervisor):
    """Supervisor that load-balances calls across a pool of module replicas.

    Each replica contributes ``max_concurrency`` slots; a slot index maps back
    to its replica via ``slot // max_concurrency``.
    """

    def __init__(
        self, name, replicas: List[Module] = None, max_concurrency: int = 1, **kwargs
    ):
        super().__init__(name=name, **kwargs)
        self._replicas = replicas or []
        self._max_concurrency = max_concurrency
        self._available_replicas = list(
            range(len(self._replicas) * self._max_concurrency)
        )

    def _compute_signature(self, rich=False):
        # The pool's signature is that of any replica; use the first.
        return self.local._replicas[0].signature(rich=rich)

    def forward(self, item, timeout: Optional[int] = None, *args, **kwargs):
        """Dispatch method `item` to the next free replica slot.

        Waits up to `timeout` seconds (polling every 0.25s) for a slot;
        `timeout=0` fails immediately, `timeout=None` waits forever.
        Raises TimeoutError if no slot frees up in time.
        """
        time_waited = 0
        while not self._available_replicas:
            if timeout == 0:
                raise TimeoutError("No available replicas.")
            if timeout is not None and time_waited >= timeout:
                raise TimeoutError("Timed out waiting for a replica to be available.")
            time.sleep(0.25)
            time_waited += 0.25
        worker_idx = self._available_replicas.pop(0)
        worker = self._replicas[worker_idx // self._max_concurrency]
        try:
            method = getattr(worker, item)
            return method(*args, **kwargs)
        finally:
            # Fix: return the slot even when the call raises. Previously a
            # failing call leaked its slot, permanently shrinking pool capacity
            # until no replicas appeared available at all.
            self._available_replicas.append(worker_idx)

    def __call__(self, *args, **kwargs):
        return self.call(*args, **kwargs)
from concurrent.futures.thread import ThreadPoolExecutor
from typing import List

from runhouse.resources.distributed.supervisor import Supervisor

from runhouse.resources.hardware import Cluster, OnDemandCluster

from runhouse.resources.module import Module


class PyTorchDistributed(Supervisor):
    """Supervisor that fans a method call out to all replicas in parallel, setting
    the torch.distributed env-var rendezvous (MASTER_ADDR/PORT, RANK, WORLD_SIZE,
    LOCAL_RANK) on each replica's process before the call."""

    def __init__(self, name, replicas: List[Module] = None, port=None, **kwargs):
        # replicas: one Module per rank; replica index == global rank.
        # port: fixed rendezvous port; if None, a free port is found on the head node.
        super().__init__(name=name, **kwargs)
        self._replicas = replicas or []
        self._port = port

    def _compute_signature(self, rich=False):
        # Expose the signature of any replica (they are identical); use the first.
        return self.local._replicas[0].signature(rich=rich)

    def _find_available_port_on_head_node(self):
        # Ask the OS on the head node for an ephemeral free port by binding to port 0.
        # NOTE(review): run_bash stdout may carry a trailing newline — confirm
        # downstream env-var consumers tolerate it.
        find_available_port_cmd = "python3 -c \"import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()\""
        status_code, stdout, _ = self._replicas[0].system.run_bash(
            find_available_port_cmd,
            node=self._replicas[0].system.head_ip,
            require_outputs=True,
        )

        if status_code != 0:
            raise RuntimeError(f"Failed to find available port on head rank: {stdout}")
        return stdout

    def forward(self, item, *args, **kwargs):
        # Resolve the rendezvous port once, shared by all ranks.
        port = self._port or self._find_available_port_on_head_node()

        def run_on_replica(replica, rank):
            # Per https://pytorch.org/docs/stable/distributed.html#environment-variable-initialization
            # Pick the master address depending on cluster type: internal IP for
            # on-demand clusters, head IP for static clusters, localhost otherwise.
            master_addr = (
                self.system.internal_ips[0]
                if isinstance(self.system, OnDemandCluster)
                else self.system.head_ip
                if isinstance(self.system, Cluster)
                else "localhost"
            )

            # Replicas are spread evenly across nodes; used to derive LOCAL_RANK.
            processes_per_node = len(self._replicas) // len(self.system.ips)

            dist_config = {
                "MASTER_ADDR": master_addr,
                "MASTER_PORT": port,
                "RANK": str(rank),
                "WORLD_SIZE": str(len(self._replicas)),
                "LOCAL_RANK": str(rank % processes_per_node),
            }

            # Inject the rendezvous env vars into the replica's process, then call.
            replica.system.set_process_env_vars(replica.process, dist_config)
            method = getattr(replica, item)
            return method(*args, **kwargs)

        # One thread per rank so all replicas start (and rendezvous) concurrently;
        # results come back in rank order.
        with ThreadPoolExecutor(max_workers=len(self._replicas)) as executor:
            res = executor.map(
                run_on_replica, self._replicas, range(len(self._replicas))
            )
            res = list(res)

        return res

    def __call__(self, *args, **kwargs):
        return self.call(*args, **kwargs)
import multiprocessing
import sys

from runhouse.resources.distributed.supervisor import Supervisor

from runhouse.resources.module import Module


class SparkDistributed(Supervisor):
    """Supervisor that runs a wrapped function through RayDP/Spark in a spawned
    subprocess, streaming the child's stdout/stderr back over a pipe."""

    def __init__(
        self,
        name,
        module: Module = None,
        ray_init_options=None,
        spark_init_options=None,
        **kwargs
    ):
        # Only function modules (which expose fn_pointers) can be shipped to the
        # subprocess helper below.
        if not hasattr(module, "fn_pointers"):
            raise ValueError(
                "Spark Distributed requires a Runhouse Function object to distribute."
            )

        # NOTE(review): stray debug print — probably should be a logger call or
        # removed; left as-is to preserve behavior.
        print("initializing spark distribution")
        super().__init__(name=name, **kwargs)
        self._module = module
        self._ray_init_options = ray_init_options or {}
        self._spark_init_options = spark_init_options or {}

    def _compute_signature(self, rich=False):
        # Expose the wrapped module's signature as this supervisor's own.
        return self.local._module.signature(rich=rich)

    def forward(self, item, *args, **kwargs):
        from runhouse.resources.distributed.utils import subprocess_raydp_fn_call_helper

        # TODO replace this with passing the filepath that this module is already writing to!
        parent_conn, child_conn = multiprocessing.Pipe()
        # NOTE: the pop() below runs while this tuple is being built; since the
        # tuple holds a reference to the same kwargs dict, "spark_init_options"
        # is removed from the kwargs forwarded to the function and passed as its
        # own argument instead.
        subproc_args = (
            self._module.fn_pointers,
            args,
            kwargs,
            child_conn,
            self._ray_init_options,
            kwargs.pop("spark_init_options", self._spark_init_options),
        )

        # Check if start method is already spawn, because set_start_method will error if called again
        if multiprocessing.get_start_method(allow_none=True) != "spawn":
            multiprocessing.set_start_method("spawn")
        with multiprocessing.Pool(processes=1) as pool:
            result = pool.apply_async(
                subprocess_raydp_fn_call_helper, args=subproc_args
            )
            # Relay the child's output until it signals completion with an
            # EOFError sentinel (or the pipe itself hits EOF).
            while True:
                try:
                    (msg, output_stream) = parent_conn.recv()
                    if msg == EOFError:
                        break
                    print(
                        msg,
                        end="",
                        file=sys.stdout if output_stream == "stdout" else sys.stderr,
                    )
                except EOFError:
                    break
            res = result.get()
        return res

    def __call__(self, *args, **kwargs):
        return self.call(*args, **kwargs)
-------------------------------------------------------------------------------- /runhouse/resources/functions/__init__.py: -------------------------------------------------------------------------------- 1 | from .function import Function 2 | from .function_factory import function 3 | -------------------------------------------------------------------------------- /runhouse/resources/hardware/__init__.py: -------------------------------------------------------------------------------- 1 | from .cluster import Cluster 2 | from .cluster_factory import cluster, ondemand_cluster 3 | from .docker_cluster import DockerCluster 4 | from .on_demand_cluster import OnDemandCluster 5 | from .ray_utils import check_for_existing_ray_instance, kill_actors, list_actor_states 6 | from .utils import ( 7 | _current_cluster, 8 | _get_cluster_from, 9 | cluster_config_file_exists, 10 | ClusterStatus, 11 | get_all_sky_clusters, 12 | load_cluster_config_from_file, 13 | SSEClient, 14 | ) 15 | -------------------------------------------------------------------------------- /runhouse/resources/hardware/constants.py: -------------------------------------------------------------------------------- 1 | STATIC_CLUSTER_ARGS = { 2 | "host", 3 | "ssh_creds", 4 | } 5 | 6 | ONDEMAND_COMPUTE_ARGS = { 7 | "instance_type", 8 | "num_nodes", 9 | "provider", 10 | "pool", 11 | "use_spot", 12 | "region", 13 | "memory", 14 | "disk_size", 15 | "vpc_name", 16 | "num_cpus", 17 | "gpus", 18 | "sky_kwargs", 19 | "launcher", 20 | "autostop_mins", 21 | } 22 | 23 | KUBERNETES_CLUSTER_ARGS = { 24 | "kube_context", 25 | "kube_namespace", 26 | "kube_config_path", 27 | } 28 | 29 | RH_SERVER_ARGS = { 30 | "server_port", 31 | "server_host", 32 | "ssh_port", 33 | "open_ports", # ondemand only 34 | "server_connection_type", 35 | "ssl_keyfile", 36 | "ssl_certfile", 37 | "domain", 38 | "image", 39 | } 40 | -------------------------------------------------------------------------------- 
def kill_actors(
    actor_name: Optional[str] = None,
    actor_class_name: Optional[str] = None,
    namespace: Optional[str] = None,
    gracefully: bool = True,
):
    """Kill Ray actors matching the given name / class / namespace filters.

    The "cluster_servlet" actor is deliberately killed last so it stays alive
    while the other actors shut down. With gracefully=True each actor is asked
    to terminate itself via __ray_terminate__; otherwise ray.kill is used.
    """
    import ray

    cluster_servlet_actor = None
    for actor in list_actor_states(actor_name, actor_class_name, namespace):
        actor_handle_to_kill = ray.get_actor(actor["name"])
        # Defer the cluster servlet until every other actor is handled.
        if actor["name"] == "cluster_servlet":
            cluster_servlet_actor = actor_handle_to_kill
            continue
        logger.info(f"Killing actor {actor['name']}")
        if gracefully:
            actor_handle_to_kill.__ray_terminate__.remote()
        else:
            ray.kill(actor_handle_to_kill)

    # Make sure to kill cluster_servlet last
    if cluster_servlet_actor:
        logger.info("Killing actor cluster_servlet")
        if gracefully:
            cluster_servlet_actor.__ray_terminate__.remote()
        else:
            ray.kill(cluster_servlet_actor)
# Vendored from SkyPilot (Apache 2.0) — behavior intentionally unchanged.
import argparse
import sys
import time

if __name__ == '__main__':
    import psutil

    parser = argparse.ArgumentParser()
    parser.add_argument('--parent-pid', type=int, required=True)
    parser.add_argument('--proc-pid', type=int, required=True)
    args = parser.parse_args()

    process = None
    parent_process = None
    try:
        process = psutil.Process(args.proc_pid)
        parent_process = psutil.Process(args.parent_pid)
    except psutil.NoSuchProcess:
        # Either process may already be gone; handled below.
        pass

    if process is None:
        # Target already exited — nothing to clean up.
        sys.exit()

    if parent_process is not None:
        # Wait for either parent or target process to exit.
        while process.is_running() and parent_process.is_running():
            time.sleep(1)

    try:
        # Collect the target and all of its descendants so none are orphaned.
        children = process.children(recursive=True)
        children.append(process)
    except psutil.NoSuchProcess:
        sys.exit()

    # First pass: polite SIGTERM to every process in the tree.
    for pid in children:
        try:
            pid.terminate()
        except psutil.NoSuchProcess:
            pass

    # Wait 30s for the processes to exit gracefully.
    time.sleep(30)

    # SIGKILL if they're still running.
    for pid in children:
        try:
            pid.kill()
        except psutil.NoSuchProcess:
            pass
class AnthropicSecret(ApiKeySecret):
    """
    .. note::
        To create an AnthropicSecret, please use the factory method :func:`provider_secret`
        with ``provider="anthropic"``.
    """

    # Provider key used by the secret factory, and the env var the API key
    # value maps to when written as an environment variable.
    _PROVIDER = "anthropic"
    _DEFAULT_ENV_VARS = {"api_key": "ANTHROPIC_API_KEY"}

    @staticmethod
    def from_config(config: dict, dryrun: bool = False, _resolve_children: bool = True):
        """Instantiate an AnthropicSecret from a config dict.

        ``_resolve_children`` is accepted (and ignored) for signature
        consistency with every other provider secret's ``from_config``;
        previously callers passing it positionally or by keyword would raise
        a TypeError for this provider only.
        """
        return AnthropicSecret(**config, dryrun=dryrun)
13 | """ 14 | 15 | def write( 16 | self, 17 | file: bool = False, 18 | env: bool = False, 19 | path: str = None, 20 | env_vars: Dict = None, 21 | overwrite: bool = False, 22 | write_config: bool = True, 23 | ): 24 | if not file or path: 25 | env = True 26 | super().write( 27 | file=file, 28 | env=env, 29 | path=path, 30 | env_vars=env_vars, 31 | overwrite=overwrite, 32 | write_config=write_config, 33 | ) 34 | 35 | def to( 36 | self, 37 | system: Union[str, Cluster], 38 | path: str = None, 39 | process: Optional[str] = None, 40 | values: bool = True, 41 | name: Optional[str] = None, 42 | ): 43 | return super().to( 44 | system=system, path=path, process=process, values=values, name=name 45 | ) 46 | -------------------------------------------------------------------------------- /runhouse/resources/secrets/provider_secrets/aws_secret.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import copy 3 | import os 4 | 5 | from typing import Dict 6 | 7 | from runhouse.resources.secrets.provider_secrets.provider_secret import ProviderSecret 8 | from runhouse.resources.secrets.utils import _check_file_for_mismatches 9 | from runhouse.utils import create_local_dir 10 | 11 | 12 | class AWSSecret(ProviderSecret): 13 | """ 14 | .. note:: 15 | To create an AWSSecret, please use the factory method :func:`provider_secret` with ``provider="aws"``. 
16 | """ 17 | 18 | _PROVIDER = "aws" 19 | _DEFAULT_CREDENTIALS_PATH = "~/.aws/credentials" 20 | _DEFAULT_ENV_VARS = { 21 | "access_key": "AWS_ACCESS_KEY_ID", 22 | "secret_key": "AWS_SECRET_ACCESS_KEY", 23 | } 24 | 25 | @staticmethod 26 | def from_config(config: dict, dryrun: bool = False, _resolve_children: bool = True): 27 | return AWSSecret(**config, dryrun=dryrun) 28 | 29 | def _write_to_file( 30 | self, 31 | path: str, 32 | values: Dict, 33 | overwrite: bool = False, 34 | write_config: bool = True, 35 | ): 36 | new_secret = copy.deepcopy(self) 37 | 38 | if not _check_file_for_mismatches( 39 | path, self._from_path(path), values, overwrite 40 | ): 41 | 42 | parser = configparser.ConfigParser() 43 | section_name = "default" 44 | parser.add_section(section_name) 45 | parser.set( 46 | section=section_name, 47 | option="aws_access_key_id", 48 | value=values["access_key"], 49 | ) 50 | parser.set( 51 | section=section_name, 52 | option="aws_secret_access_key", 53 | value=values["secret_key"], 54 | ) 55 | 56 | full_path = create_local_dir(path) 57 | with open(full_path, "w+") as f: 58 | parser.write(f) 59 | 60 | if write_config: 61 | new_secret._add_to_rh_config(path) 62 | 63 | new_secret._values = None 64 | new_secret.path = path 65 | return new_secret 66 | 67 | def _from_path(self, path: str): 68 | config = configparser.ConfigParser() 69 | if path and os.path.exists(os.path.expanduser(path)): 70 | config.read(os.path.expanduser(path)) 71 | else: 72 | return {} 73 | 74 | section_name = "default" 75 | access_key = config[section_name]["aws_access_key_id"] 76 | secret_key = config[section_name]["aws_secret_access_key"] 77 | 78 | return { 79 | "access_key": access_key, 80 | "secret_key": secret_key, 81 | } 82 | -------------------------------------------------------------------------------- /runhouse/resources/secrets/provider_secrets/azure_secret.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import copy 
class AzureSecret(ProviderSecret):
    """
    .. note::
        To create an AzureSecret, please use the factory method :func:`provider_secret` with ``provider="azure"``.
    """

    # values format: {"subscription_id": subscription_id}
    _PROVIDER = "azure"
    _DEFAULT_CREDENTIALS_PATH = "~/.azure/clouds.config"
    _DEFAULT_ENV_VARS = {"subscription_id": "AZURE_SUBSCRIPTION_ID"}

    @staticmethod
    def from_config(config: dict, dryrun: bool = False, _resolve_children: bool = True):
        """Instantiate an AzureSecret from a config dict."""
        return AzureSecret(**config, dryrun=dryrun)

    def _write_to_file(
        self,
        path: str = None,
        values: Dict = None,
        overwrite: bool = False,
        write_config: bool = True,
    ):
        """Write ``values`` to an Azure ``clouds.config``-style INI file at ``path``.

        Returns a copy of this secret that references ``path`` and has its
        in-memory values cleared.
        """
        new_secret = copy.deepcopy(self)
        # Only write when the file doesn't already hold matching values
        # (or overwrite was requested).
        if not _check_file_for_mismatches(
            path, self._from_path(path), values, overwrite
        ):
            subscription_id = values["subscription_id"]

            parser = configparser.ConfigParser()
            section_name = "AzureCloud"
            parser.add_section(section_name)
            parser.set(
                section=section_name,
                option="subscription",
                value=subscription_id,
            )

            # create_local_dir presumably expands/creates the target directory
            # and returns the resolved path -- confirm in runhouse.utils.
            full_path = create_local_dir(path)
            with open(full_path, "w") as f:
                parser.write(f)

            if write_config:
                new_secret._add_to_rh_config(path)

        # The returned copy references the file rather than in-memory values.
        new_secret._values = None
        new_secret.path = path
        return new_secret

    def _from_path(self, path: str = None):
        """Read the subscription id from the INI file at ``path``.

        Returns ``{"subscription_id": ...}`` when present, else ``{}`` (also
        for a missing file or a file lacking the [AzureCloud] section).
        """
        config = configparser.ConfigParser()
        if path and os.path.exists(os.path.expanduser(path)):
            path = os.path.expanduser(path)
            config.read(path)
            if config and "AzureCloud" in config.sections():
                subscription_id = config["AzureCloud"]["subscription"]
                return {"subscription_id": subscription_id}
        return {}
9 | """ 10 | 11 | _PROVIDER = "docker" 12 | _DEFAULT_ENV_VARS = { 13 | "username": "SKYPILOT_DOCKER_USERNAME", 14 | "password": "SKYPILOT_DOCKER_PASSWORD", 15 | "server": "SKYPILOT_DOCKER_SERVER", 16 | } 17 | 18 | @staticmethod 19 | def from_config(config: dict, dryrun: bool = False, _resolve_children: bool = True): 20 | return DockerRegistrySecret(**config, dryrun=dryrun) 21 | -------------------------------------------------------------------------------- /runhouse/resources/secrets/provider_secrets/gcp_secret.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import json 3 | import os 4 | from pathlib import Path 5 | 6 | from typing import Dict 7 | 8 | from runhouse.resources.secrets.provider_secrets.provider_secret import ProviderSecret 9 | from runhouse.resources.secrets.utils import _check_file_for_mismatches 10 | from runhouse.utils import create_local_dir 11 | 12 | 13 | class GCPSecret(ProviderSecret): 14 | """ 15 | .. note:: 16 | To create a GCPSecret, please use the factory method :func:`provider_secret` with ``provider="gcp"``. 
17 | """ 18 | 19 | _PROVIDER = "gcp" 20 | _DEFAULT_CREDENTIALS_PATH = "~/.config/gcloud/application_default_credentials.json" 21 | _DEFAULT_ENV_VARS = { 22 | "client_id": "CLIENT_ID", 23 | "client_secret": "CLIENT_SECRET", 24 | } 25 | 26 | @staticmethod 27 | def from_config(config: dict, dryrun: bool = False, _resolve_children: bool = True): 28 | return GCPSecret(**config, dryrun=dryrun) 29 | 30 | def _write_to_file( 31 | self, 32 | path: str, 33 | values: Dict = None, 34 | overwrite: bool = False, 35 | write_config: bool = True, 36 | ): 37 | new_secret = copy.deepcopy(self) 38 | if not _check_file_for_mismatches( 39 | path, self._from_path(path), values, overwrite 40 | ): 41 | Path(path).parent.mkdir(parents=True, exist_ok=True) 42 | 43 | full_path = create_local_dir(path) 44 | with open(full_path, "w+") as f: 45 | json.dump(values, f, indent=4) 46 | 47 | if write_config: 48 | new_secret._add_to_rh_config(path) 49 | 50 | new_secret._values = None 51 | new_secret.path = path 52 | return new_secret 53 | 54 | def _from_path(self, path: str = None): 55 | config = {} 56 | if path and os.path.exists(os.path.expanduser(path)): 57 | with open(os.path.expanduser(path), "r") as config_file: 58 | config = json.load(config_file) 59 | return config 60 | -------------------------------------------------------------------------------- /runhouse/resources/secrets/provider_secrets/github_secret.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import os 3 | from pathlib import Path 4 | 5 | from typing import Dict 6 | 7 | import yaml 8 | 9 | from runhouse.resources.secrets.provider_secrets.provider_secret import ProviderSecret 10 | from runhouse.resources.secrets.utils import _check_file_for_mismatches 11 | 12 | 13 | class GitHubSecret(ProviderSecret): 14 | """ 15 | .. note:: 16 | To create a GitHubSecret, please use the factory method :func:`provider_secret` with ``provider="github"``. 
17 | """ 18 | 19 | # values format: {"oauth_token": oath_token} 20 | _PROVIDER = "github" 21 | _DEFAULT_CREDENTIALS_PATH = "~/.config/gh/hosts.yml" 22 | 23 | @staticmethod 24 | def from_config(config: dict, dryrun: bool = False, _resolve_children: bool = True): 25 | return GitHubSecret(**config, dryrun=dryrun) 26 | 27 | def _write_to_file( 28 | self, 29 | path: str, 30 | values: Dict = None, 31 | overwrite: bool = False, 32 | write_config: bool = True, 33 | ): 34 | new_secret = copy.deepcopy(self) 35 | if not _check_file_for_mismatches( 36 | path, self._from_path(path), values, overwrite 37 | ): 38 | config = {} 39 | 40 | full_path = os.path.expanduser(path) 41 | if Path(full_path).exists(): 42 | with open(full_path, "r") as stream: 43 | config = yaml.safe_load(stream) 44 | config["github.com"] = values 45 | 46 | Path(full_path).parent.mkdir(parents=True, exist_ok=True) 47 | with open(full_path, "w") as yaml_file: 48 | yaml.dump(config, yaml_file, default_flow_style=False) 49 | 50 | if write_config: 51 | new_secret._add_to_rh_config(path) 52 | 53 | new_secret._values = None 54 | new_secret.path = path 55 | return new_secret 56 | 57 | def _from_path(self, path: str = None): 58 | config = {} 59 | if path and os.path.exists(os.path.expanduser(path)): 60 | with open(os.path.expanduser(path), "r") as stream: 61 | config = yaml.safe_load(stream) 62 | return config["github.com"] if config else {} 63 | -------------------------------------------------------------------------------- /runhouse/resources/secrets/provider_secrets/huggingface_secret.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import os 3 | from pathlib import Path 4 | 5 | from typing import Dict 6 | 7 | from runhouse.resources.secrets.provider_secrets.provider_secret import ProviderSecret 8 | from runhouse.resources.secrets.utils import _check_file_for_mismatches 9 | from runhouse.utils import create_local_dir 10 | 11 | 12 | class 
class HuggingFaceSecret(ProviderSecret):
    """
    .. note::
        To create a HuggingFaceSecret, please use the factory method :func:`provider_secret` with
        ``provider="huggingface"``.
    """

    # values format: {"token": hf_token}
    _PROVIDER = "huggingface"
    # Hugging Face CLI token file (holds just the raw token).
    _DEFAULT_CREDENTIALS_PATH = "~/.cache/huggingface/token"
    _DEFAULT_ENV_VARS = {"token": "HF_TOKEN"}

    @staticmethod
    def from_config(config: dict, dryrun: bool = False, _resolve_children: bool = True):
        """Instantiate a HuggingFaceSecret from a config dict."""
        return HuggingFaceSecret(**config, dryrun=dryrun)

    def _write_to_file(
        self,
        path: str,
        values: Dict = None,
        overwrite: bool = False,
        write_config: bool = True,
    ):
        """Write the token to ``path``.

        Returns a copy of this secret that references ``path`` and has its
        in-memory values cleared.
        """
        new_secret = copy.deepcopy(self)
        if not _check_file_for_mismatches(
            path, self._from_path(path), values, overwrite
        ):
            token = values["token"]
            full_path = create_local_dir(path)
            # Bug fix: open with "w" rather than "a". Append mode concatenated
            # the new token onto an existing (stale) one when overwriting,
            # leaving a corrupt token file; the token file must contain only
            # the single token.
            with open(full_path, "w") as f:
                f.write(token)

            if write_config:
                new_secret._add_to_rh_config(path)

        new_secret._values = None
        new_secret.path = path
        return new_secret

    def _from_path(self, path: str = None):
        """Read the token from ``path``; returns ``{}`` if missing or empty."""
        token = None
        if path and os.path.exists(os.path.expanduser(path)):
            token = Path(os.path.expanduser(path)).read_text().strip("\n")
        if token:
            return {"token": token}
        return {}
class KubeConfigSecret(ProviderSecret):
    """
    .. note::
        To create a KubeConfigSecret, please use the factory method :func:`provider_secret` with
        ``provider="kubernetes"``.
    """

    _PROVIDER = "kubernetes"
    _DEFAULT_CREDENTIALS_PATH = "~/.kube/config"

    @staticmethod
    def from_config(config: dict, dryrun: bool = False, _resolve_children: bool = True):
        """Instantiate a KubeConfigSecret from a config dict."""
        return KubeConfigSecret(**config, dryrun=dryrun)

    def _from_path(self, path: str = None):
        """Load the kubeconfig YAML at ``path`` (or this secret's own path).

        Returns ``{}`` when no path is set, the file is missing/empty, or the
        file cannot be read or parsed.
        """
        path = path or self.path
        if not path:
            return {}

        path = os.path.expanduser(path)
        if os.path.exists(path):
            try:
                with open(path) as f:
                    # Empty YAML loads as None; normalize to {}.
                    contents = yaml.safe_load(f) or {}
            except (OSError, yaml.YAMLError):
                # Unreadable or malformed kubeconfig: treat as empty.
                # (Previously a bare `except`, which also swallowed
                # KeyboardInterrupt/SystemExit.)
                contents = {}
            return contents
        return {}

    def _write_to_file(
        self,
        path: str,
        values: Dict,
        overwrite: bool = False,
        write_config: bool = True,
    ):
        """Write ``values`` as YAML to ``path`` (or this secret's own path).

        Returns a copy of this secret that references ``path`` and has its
        in-memory values cleared.
        """
        new_secret = copy.deepcopy(self)
        path = path or self.path
        if not _check_file_for_mismatches(
            path, self._from_path(path), values, overwrite
        ):
            full_path = create_local_dir(path)
            with open(full_path, "w") as f:
                yaml.safe_dump(values, f)

            if write_config:
                # Record on the returned copy, consistent with every other
                # provider secret (was `self._add_to_rh_config`).
                new_secret._add_to_rh_config(path)

        new_secret._values = None
        new_secret.path = path
        return new_secret
class LambdaSecret(ProviderSecret):
    """
    .. note::
        To create a LambdaSecret, please use the factory method :func:`provider_secret` with ``provider="lambda"``.
    """

    # values format: {"api_key": api_key}
    _DEFAULT_CREDENTIALS_PATH = "~/.lambda_cloud/lambda_keys"
    _PROVIDER = "lambda"

    @staticmethod
    def from_config(config: dict, dryrun: bool = False, _resolve_children: bool = True):
        """Instantiate a LambdaSecret from a config dict."""
        return LambdaSecret(**config, dryrun=dryrun)

    def _write_to_file(
        self,
        path: str,
        values: Dict = None,
        overwrite: bool = False,
        write_config: bool = True,
    ):
        """Write the api key to the Lambda keys file at ``path``.

        Returns a copy of this secret that references ``path`` and has its
        in-memory values cleared.
        """
        new_secret = copy.deepcopy(self)
        if not _check_file_for_mismatches(
            path, self._from_path(path), values, overwrite
        ):
            data = f'api_key = {values["api_key"]}\n'
            full_path = create_local_dir(path)
            with open(full_path, "w+") as f:
                f.write(data)

            if write_config:
                new_secret._add_to_rh_config(path)

        new_secret._values = None
        new_secret.path = path
        return new_secret

    def _from_path(self, path: str = None):
        """Parse the api key from the Lambda keys file at ``path``.

        Returns ``{"api_key": ...}`` or ``{}`` when the file is missing or
        contains no ``api_key`` line.
        """
        lines = None
        if path and os.path.exists(os.path.expanduser(path)):
            with open(os.path.expanduser(path), "r") as f:
                lines = f.readlines()
        if lines:
            for line in lines:
                split = line.split()
                # Guard against blank lines, which produce an empty split and
                # previously raised IndexError on `split[0]`.
                if split and split[0] == "api_key":
                    return {"api_key": split[-1]}
        return {}
9 | """ 10 | 11 | _PROVIDER = "langchain" 12 | _DEFAULT_ENV_VARS = {"api_key": "LANGCHAIN_API_KEY"} 13 | 14 | @staticmethod 15 | def from_config(config: dict, dryrun: bool = False, _resolve_children: bool = True): 16 | return LangChainSecret(**config, dryrun=dryrun) 17 | -------------------------------------------------------------------------------- /runhouse/resources/secrets/provider_secrets/openai_secret.py: -------------------------------------------------------------------------------- 1 | from runhouse.resources.secrets.provider_secrets.api_key_secret import ApiKeySecret 2 | 3 | 4 | class OpenAISecret(ApiKeySecret): 5 | """ 6 | .. note:: 7 | To create an OpenAISecret, please use the factory method :func:`provider_secret` with ``provider="openai"``. 8 | """ 9 | 10 | _PROVIDER = "openai" 11 | _DEFAULT_ENV_VARS = {"api_key": "OPENAI_API_KEY"} 12 | 13 | @staticmethod 14 | def from_config(config: dict, dryrun: bool = False, _resolve_children: bool = True): 15 | return OpenAISecret(**config, dryrun=dryrun) 16 | -------------------------------------------------------------------------------- /runhouse/resources/secrets/provider_secrets/pinecone_secret.py: -------------------------------------------------------------------------------- 1 | from runhouse.resources.secrets.provider_secrets.api_key_secret import ApiKeySecret 2 | 3 | 4 | class PineconeSecret(ApiKeySecret): 5 | """ 6 | .. note:: 7 | To create an PineconeSecret, please use the factory method :func:`provider_secret` 8 | with ``provider="pinecone"``. 
9 | """ 10 | 11 | _PROVIDER = "pinecone" 12 | _DEFAULT_ENV_VARS = {"api_key": "PINECONE_API_KEY"} 13 | 14 | @staticmethod 15 | def from_config(config: dict, dryrun: bool = False, _resolve_children: bool = True): 16 | return PineconeSecret(**config, dryrun=dryrun) 17 | -------------------------------------------------------------------------------- /runhouse/resources/secrets/provider_secrets/providers.py: -------------------------------------------------------------------------------- 1 | from runhouse.resources.secrets.provider_secrets.anthropic_secret import AnthropicSecret 2 | from runhouse.resources.secrets.provider_secrets.aws_secret import AWSSecret 3 | from runhouse.resources.secrets.provider_secrets.azure_secret import AzureSecret 4 | from runhouse.resources.secrets.provider_secrets.cohere_secret import CohereSecret 5 | from runhouse.resources.secrets.provider_secrets.docker_secret import ( 6 | DockerRegistrySecret, 7 | ) 8 | from runhouse.resources.secrets.provider_secrets.gcp_secret import GCPSecret 9 | from runhouse.resources.secrets.provider_secrets.github_secret import GitHubSecret 10 | from runhouse.resources.secrets.provider_secrets.huggingface_secret import ( 11 | HuggingFaceSecret, 12 | ) 13 | from runhouse.resources.secrets.provider_secrets.kubeconfig_secret import ( 14 | KubeConfigSecret, 15 | ) 16 | from runhouse.resources.secrets.provider_secrets.lambda_secret import LambdaSecret 17 | from runhouse.resources.secrets.provider_secrets.langchain_secret import LangChainSecret 18 | from runhouse.resources.secrets.provider_secrets.openai_secret import OpenAISecret 19 | from runhouse.resources.secrets.provider_secrets.pinecone_secret import PineconeSecret 20 | from runhouse.resources.secrets.provider_secrets.provider_secret import ProviderSecret 21 | from runhouse.resources.secrets.provider_secrets.sky_secret import SkySecret 22 | from runhouse.resources.secrets.provider_secrets.ssh_secret import SSHSecret 23 | from 
# Registry mapping factory provider strings to their secret classes.
_str_to_provider_class = {
    # File and/or Env secrets
    "aws": AWSSecret,
    "azure": AzureSecret,
    "gcp": GCPSecret,
    "github": GitHubSecret,
    "huggingface": HuggingFaceSecret,
    "kubernetes": KubeConfigSecret,
    "lambda": LambdaSecret,
    "docker": DockerRegistrySecret,
    # SSH secrets
    "ssh": SSHSecret,
    "sky": SkySecret,
    # API key secrets
    "anthropic": AnthropicSecret,
    "cohere": CohereSecret,
    "langchain": LangChainSecret,
    "openai": OpenAISecret,
    "pinecone": PineconeSecret,
    "wandb": WandBSecret,
}


def _get_provider_class(provider_str):
    """Return the secret class registered for ``provider_str``, falling back
    to the generic ``ProviderSecret`` for unknown providers."""
    return _str_to_provider_class.get(provider_str, ProviderSecret)
10 | """ 11 | 12 | _PROVIDER = "sky" 13 | _DEFAULT_KEY = "sky-key" 14 | 15 | def __init__( 16 | self, 17 | name: Optional[str] = None, 18 | provider: Optional[str] = None, 19 | values: Dict = {}, 20 | path: str = None, 21 | dryrun: bool = True, 22 | **kwargs, 23 | ): 24 | super().__init__( 25 | name=name, provider=provider, values=values, path=path, dryrun=dryrun 26 | ) 27 | 28 | @staticmethod 29 | def from_config(config: dict, dryrun: bool = False, _resolve_children: bool = True): 30 | return SkySecret(**config, dryrun=dryrun) 31 | -------------------------------------------------------------------------------- /runhouse/resources/secrets/provider_secrets/wandb_secret.py: -------------------------------------------------------------------------------- 1 | from runhouse.resources.secrets.provider_secrets.api_key_secret import ApiKeySecret 2 | 3 | 4 | class WandBSecret(ApiKeySecret): 5 | """ 6 | .. note:: 7 | To create an WandBSecret, please use the factory method :func:`provider_secret` with ``provider="wandb"``. 
8 | """ 9 | 10 | _PROVIDER = "wandb" 11 | _DEFAULT_ENV_VARS = {"api_key": "WANDB_API_KEY"} 12 | 13 | @staticmethod 14 | def from_config(config: dict, dryrun: bool = False, _resolve_children: bool = True): 15 | return WandBSecret(**config, dryrun=dryrun) 16 | -------------------------------------------------------------------------------- /runhouse/rns/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/runhouse/rns/__init__.py -------------------------------------------------------------------------------- /runhouse/rns/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/runhouse/rns/utils/__init__.py -------------------------------------------------------------------------------- /runhouse/rns/utils/api.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import datetime 3 | import json 4 | import os 5 | import uuid 6 | from enum import Enum 7 | 8 | from requests import Response 9 | 10 | 11 | def timing(func): 12 | def wrapper(*args, **kwargs): 13 | import time 14 | 15 | start = time.time() 16 | result = func(*args, **kwargs) 17 | end = time.time() 18 | print(f"Finished {func.__name__.title()} in {int((end - start))} seconds") 19 | return result 20 | 21 | return wrapper 22 | 23 | 24 | def remove_null_values_from_dict(source_dic: dict) -> dict: 25 | return {k: v for k, v in source_dic.items() if v is not None} 26 | 27 | 28 | def load_resp_content(resp: Response) -> dict: 29 | return json.loads(resp.content) 30 | 31 | 32 | def read_resp_data(resp: Response): 33 | return load_resp_content(resp).get("data", {}) 34 | 35 | 36 | def to_bool(value): 37 | try: 38 | return ast.literal_eval(value) 39 | except: 40 | return value 41 | 42 | 43 | def 
def relative_file_path(file_path: str):
    """Convert to a relative path if it is not already one."""
    # Already home-relative -- nothing to do.
    if file_path.startswith("~"):
        return file_path

    # Express the path relative to the home directory, normalizing Windows
    # separators, and prefix with "~/".
    home = os.path.expanduser("~")
    rel = os.path.relpath(file_path, home).replace("\\", "/")
    return rel if rel.startswith("~") else f"~/{rel}"
def _generate_default_path(cls, name, system):
    """Generate a default path for a data resource. Logic is as follows:
    1. If the system is a local file system, save to the current working directory
    2. If the system is a remote file system, save to the default cache folder
    3. If the system is a remote object store, save to the default object store folder
    """

    from runhouse.resources.hardware import Cluster

    system = _get_cluster_from(system)

    name = name or generate_default_name(prefix=cls.RESOURCE_TYPE)
    # Bug fix: the original condition `system == rns_client.DEFAULT_FS or "here"`
    # parsed as `(system == rns_client.DEFAULT_FS) or "here"`, and the non-empty
    # string "here" is always truthy -- so EVERY system (clusters and blob
    # storage included) took the local-folder branch and the cluster/blob
    # branches below were unreachable.
    if system in (rns_client.DEFAULT_FS, "here"):
        base_folder = DEFAULT_LOCAL_FOLDER
    elif isinstance(system, Cluster):
        if system.on_this_cluster():
            base_folder = DEFAULT_LOCAL_FOLDER
        else:
            # Lands in the home directory when sent to a remote cluster.
            base_folder = DEFAULT_CLUSTER_FS_FOLDER
    else:
        base_folder = DEFAULT_BLOB_STORAGE_FOLDER
    return f"{base_folder}{cls.RESOURCE_TYPE}/{name}"
https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/runhouse/servers/caddy/__init__.py -------------------------------------------------------------------------------- /runhouse/servers/http/__init__.py: -------------------------------------------------------------------------------- 1 | from .http_client import HTTPClient 2 | -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/scripts/__init__.py -------------------------------------------------------------------------------- /scripts/build_package.sh: -------------------------------------------------------------------------------- 1 | # Delete dist directory if exists 2 | rm -r dist 3 | # Run from base directory of runhouse project 4 | python3 -m build --sdist --wheel 5 | #twine upload --repository testpypi dist/* 6 | twine upload dist/* 7 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/README.md: -------------------------------------------------------------------------------- 1 | 2 | The guides in this folder can be used to setup Kubernetes clusters on EKS, GKE, or AKS. 3 | 4 | When using Kubernetes please make sure you have the following: 5 | - `pip install kubernetes` 6 | - `kubectl` access 7 | - Ensure you have the `AWSKeyManagementServicePowerUser` IAM policy enabled. 
(For EKS only) 8 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-aks/0-locals.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | env = "dev" 3 | region = "eastus2" 4 | resource_group_name = "skyakstestrg" 5 | eks_name = "skyakstest" # Note: AKS cluster name will be dev-{eks_name} 6 | eks_version = "1.28" 7 | } 8 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-aks/1-provider.tf: -------------------------------------------------------------------------------- 1 | provider "azurerm" { 2 | features {} 3 | } 4 | 5 | terraform { 6 | required_providers { 7 | azurerm = { 8 | source = "hashicorp/azurerm" 9 | version = "3.75.0" 10 | } 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-aks/2-resource-group.tf: -------------------------------------------------------------------------------- 1 | resource "azurerm_resource_group" "this" { 2 | name = local.resource_group_name 3 | location = local.region 4 | } 5 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-aks/3-vpc.tf: -------------------------------------------------------------------------------- 1 | resource "azurerm_virtual_network" "this" { 2 | name = "main" 3 | address_space = ["10.0.0.0/16"] 4 | location = azurerm_resource_group.this.location 5 | resource_group_name = azurerm_resource_group.this.name 6 | 7 | tags = { 8 | env = local.env 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-aks/4-subnets.tf: -------------------------------------------------------------------------------- 1 | resource "azurerm_subnet" "subnet1" { 2 | name = "subnet1" 3 | address_prefixes = ["10.0.0.0/19"] 4 | resource_group_name = 
azurerm_resource_group.this.name 5 | virtual_network_name = azurerm_virtual_network.this.name 6 | } 7 | 8 | resource "azurerm_subnet" "subnet2" { 9 | name = "subnet2" 10 | address_prefixes = ["10.0.32.0/19"] 11 | resource_group_name = azurerm_resource_group.this.name 12 | virtual_network_name = azurerm_virtual_network.this.name 13 | } 14 | 15 | # If you want to use existing subnet 16 | # data "azurerm_subnet" "subnet1" { 17 | # name = "subnet1" 18 | # virtual_network_name = "main" 19 | # resource_group_name = "tutorial" 20 | # } 21 | 22 | # output "subnet_id" { 23 | # value = data.azurerm_subnet.subnet1.id 24 | # } 25 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-aks/5-aks.tf: -------------------------------------------------------------------------------- 1 | resource "azurerm_user_assigned_identity" "base" { 2 | name = "base" 3 | location = azurerm_resource_group.this.location 4 | resource_group_name = azurerm_resource_group.this.name 5 | } 6 | 7 | resource "azurerm_role_assignment" "base" { 8 | scope = azurerm_resource_group.this.id 9 | role_definition_name = "Network Contributor" 10 | principal_id = azurerm_user_assigned_identity.base.principal_id 11 | } 12 | 13 | resource "azurerm_kubernetes_cluster" "this" { 14 | name = "${local.env}-${local.eks_name}" # AKS cluster name gets set here 15 | location = azurerm_resource_group.this.location 16 | resource_group_name = azurerm_resource_group.this.name 17 | dns_prefix = "devaks1" 18 | 19 | kubernetes_version = local.eks_version 20 | automatic_channel_upgrade = "stable" 21 | private_cluster_enabled = false 22 | node_resource_group = "${local.resource_group_name}-${local.env}-${local.eks_name}" 23 | 24 | # It's in Preview 25 | # api_server_access_profile { 26 | # vnet_integration_enabled = true 27 | # subnet_id = azurerm_subnet.subnet1.id 28 | # } 29 | 30 | # For production change to "Standard" 31 | sku_tier = "Standard" # Can also be set to Free 32 | 
33 | oidc_issuer_enabled = true 34 | workload_identity_enabled = true 35 | 36 | network_profile { 37 | network_plugin = "azure" 38 | dns_service_ip = "10.0.64.10" 39 | service_cidr = "10.0.64.0/19" 40 | } 41 | 42 | default_node_pool { 43 | name = "general" 44 | vm_size = "Standard_D3_v2" # 4 vCPU, 14 GiB Memory 45 | vnet_subnet_id = azurerm_subnet.subnet1.id 46 | orchestrator_version = local.eks_version 47 | type = "VirtualMachineScaleSets" 48 | enable_auto_scaling = true 49 | node_count = 1 50 | min_count = 1 51 | max_count = 10 52 | 53 | node_labels = { 54 | role = "general" 55 | } 56 | } 57 | 58 | identity { 59 | type = "UserAssigned" 60 | identity_ids = [azurerm_user_assigned_identity.base.id] 61 | } 62 | 63 | tags = { 64 | env = local.env 65 | } 66 | 67 | lifecycle { 68 | ignore_changes = [default_node_pool[0].node_count] 69 | } 70 | 71 | depends_on = [ 72 | azurerm_role_assignment.base 73 | ] 74 | } 75 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-aks/README.md: -------------------------------------------------------------------------------- 1 | To spin up an AKS cluster in Azure, please follow the below steps. 2 | 3 | Clone down this repository and modify locals.tf to contain your desired region, resource group name, and cluster name. Optionally modify the K8s version to reflect the 4 | latest one. 5 | 6 | In subnets.tf, you can use your existing subnets or create new ones. If you do not modify it, it will create new subnets in Azure. 7 | 8 | Lastly, you can also modify some settings in aks.tf to reflect your need. The area of most interest is the `default node pool` where you may adjust the VM type, auto scaling, 9 | and min / max node count. 10 | 11 | Once you are ready with your TF scripts, you will begin by logging into Azure via the CLI. 12 | 13 | Open a terminal and run `brew install azure-cli`. Then, run `az login`. 
14 | NOTE: You may need to add a `TENANT_ID` argument to the `az login` command. 15 | 16 | Authenticate with `az login`, making sure your terminal has access to your Azure Cloud account. 17 | 18 | Next, find your subscription ID by running `az account list`. Copy your subscription's ID and then run 19 | 20 | `az account set --subscription SUBSCRIPTION_ID` 21 | 22 | Finally, run your standard TF commands to deploy this AKS cluster. 23 | 24 | `terraform init` 25 | 26 | `terraform validate` 27 | 28 | `terraform plan -out tf_plan` 29 | 30 | `terraform apply "tf_plan"` 31 | 32 | To get access to your AKS cluster, you will need its kubeconfig locally. To obtain this run, 33 | 34 | `az aks get-credentials --resource-group RESOURCE_GROUP_NAME --name AKS_CLUSTER_NAME`. 35 | 36 | Note now that ~/.kube/config's contents will be updated with the kubeconfig of your AKS cluster. 37 | 38 | Finally, test your connection by running `kubectl get nodes` 39 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-eks/README.md: -------------------------------------------------------------------------------- 1 | To spin up an EKS cluster in AWS, 2 | 3 | Simply change the commented fields in main.tf and run the standard TF commands. Ensure that you have the AWS CLI setup with the correct permissions and access keys, etc. 4 | 5 | `terraform init` 6 | 7 | `terraform validate` 8 | 9 | `terraform plan -out eks_plan` 10 | 11 | `terraform apply "eks_plan"` 12 | 13 | You should also run `aws eks update-kubeconfig --region us-east-1 --name NAME_OF_EKS_CLUSTER` to update your kubeconfig. 
14 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-eks/main.tf: -------------------------------------------------------------------------------- 1 | # Script to spin up a quick EKS cluster that can be used for testing Runhouse Kubernetes support 2 | 3 | provider "aws" { 4 | region = local.region 5 | } 6 | 7 | locals { 8 | name = "eks-clus-1" # change this field 9 | region = "us-east-1" 10 | 11 | vpc_cidr = "10.123.0.0/16" 12 | azs = ["us-east-1a", "us-east-1b"] 13 | 14 | public_subnets = ["10.123.1.0/24", "10.123.2.0/24"] 15 | private_subnets = ["10.123.3.0/24", "10.123.4.0/24"] 16 | intra_subnets = ["10.123.5.0/24", "10.123.6.0/24"] 17 | 18 | tags = { 19 | Example = local.name 20 | } 21 | } 22 | 23 | module "vpc" { 24 | source = "terraform-aws-modules/vpc/aws" 25 | version = "~> 4.0" 26 | 27 | name = local.name 28 | cidr = local.vpc_cidr 29 | 30 | azs = local.azs 31 | private_subnets = local.private_subnets 32 | public_subnets = local.public_subnets 33 | intra_subnets = local.intra_subnets 34 | 35 | enable_nat_gateway = true 36 | 37 | public_subnet_tags = { 38 | "kubernetes.io/role/elb" = 1 39 | } 40 | 41 | private_subnet_tags = { 42 | "kubernetes.io/role/internal-elb" = 1 43 | } 44 | } 45 | 46 | module "eks" { 47 | source = "terraform-aws-modules/eks/aws" 48 | version = "19.16.0" 49 | 50 | cluster_name = local.name 51 | cluster_endpoint_public_access = true 52 | 53 | cluster_addons = { 54 | coredns = { 55 | most_recent = true 56 | } 57 | kube-proxy = { 58 | most_recent = true 59 | } 60 | vpc-cni = { 61 | most_recent = true 62 | } 63 | } 64 | 65 | vpc_id = module.vpc.vpc_id 66 | subnet_ids = module.vpc.private_subnets 67 | control_plane_subnet_ids = module.vpc.intra_subnets 68 | 69 | # EKS Managed Node Group(s) 70 | eks_managed_node_group_defaults = { 71 | ami_type = "AL2_x86_64" 72 | instance_types = ["m6i.xlarge"] 73 | 74 | attach_cluster_primary_security_group = true 75 | } 76 | 77 | 
eks_managed_node_groups = { 78 | eks-clus-1-wg = { # change this field 79 | min_size = 2 80 | max_size = 10 81 | desired_size = 4 82 | 83 | instance_types = ["m6i.xlarge"] # 4 vCPU 16 GB Memory 84 | capacity_type = "ON_DEMAND" 85 | 86 | tags = { 87 | ExtraTag = "helloworld" 88 | } 89 | } 90 | } 91 | 92 | tags = local.tags 93 | } 94 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-gke/1-provider.tf: -------------------------------------------------------------------------------- 1 | # https://registry.terraform.io/providers/hashicorp/google/latest/docs 2 | provider "google" { 3 | project = "runhouse-prod" # project name needs to exist in GCP already 4 | region = "us-east1" 5 | } 6 | 7 | # https://www.terraform.io/language/settings/backends/gcs 8 | terraform { 9 | required_providers { 10 | google = { 11 | source = "hashicorp/google" 12 | version = "~> 4.0" 13 | } 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-gke/2-vpc.tf: -------------------------------------------------------------------------------- 1 | # https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/google_project_service 2 | resource "google_project_service" "compute" { 3 | service = "compute.googleapis.com" 4 | } 5 | 6 | resource "google_project_service" "container" { 7 | service = "container.googleapis.com" 8 | } 9 | 10 | # https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_network 11 | resource "google_compute_network" "main" { 12 | name = "main" 13 | routing_mode = "REGIONAL" 14 | auto_create_subnetworks = false 15 | mtu = 1460 16 | delete_default_routes_on_create = false 17 | 18 | depends_on = [ 19 | google_project_service.compute, 20 | google_project_service.container 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- 
/scripts/kubernetes_cluster/tf-gke/3-subnets.tf: -------------------------------------------------------------------------------- 1 | # https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_subnetwork 2 | resource "google_compute_subnetwork" "private" { 3 | name = "private" 4 | ip_cidr_range = "10.0.0.0/18" 5 | region = "us-east1" 6 | network = google_compute_network.main.id 7 | private_ip_google_access = true 8 | 9 | secondary_ip_range { # CIDR for pods 10 | range_name = "k8s-pod-range" 11 | ip_cidr_range = "10.48.0.0/14" 12 | } 13 | secondary_ip_range { # CIDR for services 14 | range_name = "k8s-service-range" 15 | ip_cidr_range = "10.52.0.0/20" 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-gke/4-router.tf: -------------------------------------------------------------------------------- 1 | # https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_router 2 | resource "google_compute_router" "router" { 3 | name = "router" 4 | region = "us-east1" 5 | network = google_compute_network.main.id 6 | } 7 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-gke/5-nat.tf: -------------------------------------------------------------------------------- 1 | # https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_router_nat 2 | resource "google_compute_router_nat" "nat" { 3 | name = "nat" 4 | router = google_compute_router.router.name 5 | region = "us-east1" 6 | 7 | source_subnetwork_ip_ranges_to_nat = "LIST_OF_SUBNETWORKS" 8 | nat_ip_allocate_option = "MANUAL_ONLY" 9 | 10 | subnetwork { 11 | name = google_compute_subnetwork.private.id 12 | source_ip_ranges_to_nat = ["ALL_IP_RANGES"] 13 | } 14 | 15 | nat_ips = [google_compute_address.nat.self_link] 16 | } 17 | 18 | # 
https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_address 19 | resource "google_compute_address" "nat" { 20 | name = "nat" 21 | address_type = "EXTERNAL" 22 | network_tier = "STANDARD" # can also be set to PREMIUM 23 | 24 | depends_on = [google_project_service.compute] 25 | } 26 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-gke/6-firewalls.tf: -------------------------------------------------------------------------------- 1 | # https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_firewall 2 | resource "google_compute_firewall" "allow-ssh" { 3 | name = "allow-ssh" 4 | network = google_compute_network.main.name 5 | 6 | allow { 7 | protocol = "tcp" 8 | ports = ["22"] 9 | } 10 | 11 | source_ranges = ["0.0.0.0/0"] # (warning) allows any IP to connect over SSH 12 | } 13 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-gke/7-kubernetes.tf: -------------------------------------------------------------------------------- 1 | # https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster 2 | resource "google_container_cluster" "primary" { 3 | name = "primary" 4 | location = "us-east1" # can deploy to a region or multiple AZs 5 | remove_default_node_pool = true 6 | initial_node_count = 1 7 | network = google_compute_network.main.self_link 8 | subnetwork = google_compute_subnetwork.private.self_link 9 | logging_service = "logging.googleapis.com/kubernetes" 10 | monitoring_service = "monitoring.googleapis.com/kubernetes" 11 | networking_mode = "VPC_NATIVE" 12 | 13 | # Optional, if you want multi-zonal cluster 14 | # node_locations = [ 15 | # "us-east1-b" 16 | # ] 17 | 18 | addons_config { 19 | horizontal_pod_autoscaling { 20 | disabled = false 21 | } 22 | } 23 | 24 | release_channel { 25 | channel = "REGULAR" 26 | } 27 | 28 | 
workload_identity_config { 29 | workload_pool = "runhouse-prod.svc.id.goog" 30 | } 31 | 32 | ip_allocation_policy { 33 | cluster_secondary_range_name = "k8s-pod-range" 34 | services_secondary_range_name = "k8s-service-range" 35 | } 36 | 37 | private_cluster_config { 38 | enable_private_nodes = true 39 | enable_private_endpoint = false 40 | master_ipv4_cidr_block = "172.16.0.0/28" 41 | } 42 | 43 | # Jenkins use case 44 | # master_authorized_networks_config { 45 | # cidr_blocks { 46 | # cidr_block = "10.0.0.0/18" 47 | # display_name = "private-subnet-w-jenkins" 48 | # } 49 | # } 50 | } 51 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-gke/8-node-pools.tf: -------------------------------------------------------------------------------- 1 | # https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/google_service_account 2 | resource "google_service_account" "kubernetes" { 3 | account_id = "kubernetes" 4 | } 5 | 6 | # https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_node_pool 7 | resource "google_container_node_pool" "general" { 8 | name = "general" 9 | cluster = google_container_cluster.primary.id 10 | node_count = 1 11 | 12 | management { 13 | auto_repair = true 14 | auto_upgrade = true 15 | } 16 | 17 | node_config { 18 | preemptible = false 19 | machine_type = "e2-small" 20 | 21 | labels = { 22 | role = "general" 23 | } 24 | 25 | service_account = google_service_account.kubernetes.email 26 | oauth_scopes = [ 27 | "https://www.googleapis.com/auth/cloud-platform" 28 | ] 29 | } 30 | } 31 | 32 | resource "google_container_node_pool" "regular" { 33 | name = "regular" 34 | cluster = google_container_cluster.primary.id 35 | 36 | management { 37 | auto_repair = true 38 | auto_upgrade = true 39 | } 40 | 41 | autoscaling { 42 | min_node_count = 1 43 | max_node_count = 10 44 | } 45 | 46 | node_config { 47 | preemptible = true 48 | machine_type = 
"c3-standard-4" # 4 vCPU 16 GB Memory 49 | 50 | service_account = google_service_account.kubernetes.email 51 | oauth_scopes = [ 52 | "https://www.googleapis.com/auth/cloud-platform" 53 | ] 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-gke/README.md: -------------------------------------------------------------------------------- 1 | Basic GKE cluster setup guide in Terraform 2 | 3 | To spin up a GKE cluster in GCP, please follow the below steps: 4 | 5 | `brew install --cask google-cloud-sdk` 6 | 7 | Ensure your have GCP access first with the appropriate permissions level. 8 | 9 | `gcloud auth application-default login` This will prompt you to login via browser, using your gmail account. 10 | 11 | `gcloud auth application-default set-quota-project runhouse-prod` 12 | 13 | `terraform init` 14 | 15 | `terraform validate` 16 | 17 | `terraform plan -out gke_plan` 18 | 19 | `terraform apply "gke_plan"` 20 | 21 | 22 | `gcloud config set project runhouse-prod` 23 | 24 | `gcloud components install gke-gcloud-auth-plugin` This is neccesary for kubectl with GKE to work 25 | 26 | Finally, go to your GKE cluster in the GCP console and copy the command found by pressing the `Connect` tab. Run this command. 
27 | 28 | Test your access to the GKE cluster by running `kubectl get nodes` 29 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/tests/__init__.py -------------------------------------------------------------------------------- /tests/constants.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | TEST_ORG = "test-org" 4 | TESTING_LOG_LEVEL = "DEBUG" 5 | TESTING_AUTOSTOP_INTERVAL = 15 6 | 7 | TEST_ENV_VARS = { 8 | "var1": "val1", 9 | "var2": "val2", 10 | "RH_LOG_LEVEL": os.getenv("RH_LOG_LEVEL") or TESTING_LOG_LEVEL, 11 | "RH_AUTOSTOP_INTERVAL": str( 12 | os.getenv("RH_AUTOSTOP_INTERVAL") or TESTING_AUTOSTOP_INTERVAL 13 | ), 14 | } 15 | 16 | TEST_REQS = [ 17 | "pytest", 18 | "httpx", 19 | "pytest_asyncio", 20 | "pandas", 21 | "numpy<=1.26.4", 22 | ] 23 | 24 | DEFAULT_KEYPAIR_KEYPATH = "~/.ssh/sky-key" 25 | -------------------------------------------------------------------------------- /tests/fixtures/test_fake_package/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="test_fake_package", 5 | version="0.1", 6 | packages=find_packages(), 7 | install_requires=[], 8 | author="Rohin Bhasin", 9 | author_email="bhasin.rohin@gmail.com", 10 | description="A simple example package", 11 | python_requires=">=3.6", 12 | ) 13 | -------------------------------------------------------------------------------- /tests/fixtures/test_fake_package/test_fake_package/__init__.py: -------------------------------------------------------------------------------- 1 | from .function_to_import import editable_package_function 2 | from .module_to_import import TestModuleFromPackage 3 | 
-------------------------------------------------------------------------------- /tests/fixtures/test_fake_package/test_fake_package/function_to_import.py: -------------------------------------------------------------------------------- 1 | def editable_package_function(): 2 | print("Hello from the editable package!") 3 | return "Hello from the editable package!" 4 | -------------------------------------------------------------------------------- /tests/fixtures/test_fake_package/test_fake_package/module_to_import.py: -------------------------------------------------------------------------------- 1 | class TestModuleFromPackage: 2 | @staticmethod 3 | def hello_world(): 4 | print("Hello from the editable package module!") 5 | return "Hello from the editable package module!" 6 | -------------------------------------------------------------------------------- /tests/fixtures/test_fake_package_copy/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="test_fake_package_copy", 5 | version="0.1", 6 | packages=find_packages(), 7 | install_requires=[], 8 | author="Rohin Bhasin", 9 | author_email="bhasin.rohin@gmail.com", 10 | description="A simple example package", 11 | python_requires=">=3.6", 12 | ) 13 | -------------------------------------------------------------------------------- /tests/fixtures/test_fake_package_copy/test_fake_package_copy/__init__.py: -------------------------------------------------------------------------------- 1 | from .function_to_import import editable_package_function 2 | from .module_to_import import TestModuleFromPackage 3 | -------------------------------------------------------------------------------- /tests/fixtures/test_fake_package_copy/test_fake_package_copy/function_to_import.py: -------------------------------------------------------------------------------- 1 | def editable_package_function(): 2 | print("Hello from the 
editable package!") 3 | return "Hello from the editable package!" 4 | -------------------------------------------------------------------------------- /tests/fixtures/test_fake_package_copy/test_fake_package_copy/module_to_import.py: -------------------------------------------------------------------------------- 1 | class TestModuleFromPackage: 2 | @staticmethod 3 | def hello_world(): 4 | print("Hello from the editable package module!") 5 | return "Hello from the editable package module!" 6 | -------------------------------------------------------------------------------- /tests/fixtures/utils.py: -------------------------------------------------------------------------------- 1 | def create_s3_bucket(bucket_name: str): 2 | """Create bucket in S3 if it does not already exist.""" 3 | from sky.data.storage import S3Store 4 | 5 | s3_store = S3Store(name=bucket_name, source="") 6 | return s3_store 7 | 8 | 9 | def create_gcs_bucket(bucket_name: str): 10 | """Create bucket in GS if it does not already exist.""" 11 | from sky.data.storage import GcsStore 12 | 13 | gcs_store = GcsStore(name=bucket_name, source="") 14 | return gcs_store 15 | -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-mock 3 | httpx < 0.28.0 4 | pytest_asyncio 5 | datasets 6 | dask 7 | tqdm 8 | fastapi 9 | ray[default]>=2.9.0 10 | 11 | # packages for local and unit tests 12 | boto3 13 | google-cloud-storage 14 | docker 15 | pandas 16 | numpy<=1.26.4 17 | openapi-core==0.19.1 18 | plotly 19 | 20 | # packages for minimal+ tests 21 | skypilot==0.7.0 22 | 23 | # requests must be lowered 24 | requests<2.32.0 25 | -------------------------------------------------------------------------------- /tests/test_den/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/tests/test_den/__init__.py -------------------------------------------------------------------------------- /tests/test_den/test_defaults.py: -------------------------------------------------------------------------------- 1 | import runhouse as rh 2 | 3 | 4 | def test_download_defaults(): 5 | rh.globals.configs.defaults_cache["default_folder"] = "nonsense" 6 | local_defaults = rh.configs.load_defaults_from_file() 7 | local_defaults.pop("secrets") 8 | rh.configs.upload_defaults(defaults=local_defaults) 9 | loaded_defaults = rh.configs.load_defaults_from_den() 10 | assert local_defaults == loaded_defaults 11 | assert rh.globals.rns_client.default_folder == "nonsense" 12 | -------------------------------------------------------------------------------- /tests/test_login.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import runhouse as rh 4 | import sky 5 | from runhouse.rns.login import _login_download_secrets 6 | 7 | 8 | def add_secrets_to_vault(headers): 9 | """Add some test secrets to Vault""" 10 | # Add real credentials for AWS and SKY to test sky status 11 | rh.provider_secret( 12 | name="/aws", # add backslash / to name to force it to be vault secret 13 | provider="aws", 14 | values={ 15 | "access_key": os.getenv("TEST_AWS_ACCESS_KEY"), 16 | "secret_key": os.getenv("TEST_AWS_SECRET_KEY"), 17 | }, 18 | ).save(headers=headers) 19 | 20 | rh.provider_secret( 21 | name="/sky", 22 | provider="sky", 23 | values={ 24 | "private_key": os.getenv("TEST_SKY_PRIVATE_KEY"), 25 | "public_key": os.getenv("TEST_SKY_PUBLIC_KEY"), 26 | }, 27 | ).save(headers=headers) 28 | 29 | rh.provider_secret( 30 | name="/snowflake", 31 | provider="snowflake", 32 | values={"token": "ABCD1234"}, 33 | ).save(headers=headers) 34 | 35 | 36 | def test_login_flow_in_new_env(): 37 | token = os.getenv("KITCHEN_TESTER_TOKEN") 38 | headers = 
{"Authorization": f"Bearer {token}"} 39 | 40 | add_secrets_to_vault(headers) 41 | 42 | secrets_in_vault = rh.Secret.vault_secrets(headers=headers) 43 | assert secrets_in_vault, "No secrets found in Vault" 44 | 45 | # Run login download secrets stored in Vault into the new env 46 | _login_download_secrets(headers=headers) 47 | 48 | # Once secrets are saved down to their local config, confirm we have sky enabled 49 | sky.check.check(quiet=True) 50 | clouds = sky.global_user_state.get_enabled_clouds() 51 | cloud_names = [str(c).lower() for c in clouds] 52 | assert "aws" in cloud_names 53 | 54 | for secret in secrets_in_vault.values(): 55 | secret.delete(headers=headers) 56 | 57 | secrets_in_vault = rh.Secret.vault_secrets(headers=headers) 58 | assert not secrets_in_vault 59 | -------------------------------------------------------------------------------- /tests/test_performance.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import requests 4 | 5 | from runhouse.globals import rns_client 6 | from runhouse.logger import get_logger 7 | 8 | logger = get_logger(__name__) 9 | 10 | 11 | def profile(func, reps=10): 12 | times = [] 13 | for _ in range(reps): 14 | start = time.time() 15 | assert func() 16 | times.append(round((time.time() - start) * 1000, 2)) 17 | return times, sum(times) / len(times) 18 | 19 | 20 | def run_performance_tests(summer_func): 21 | cluster = summer_func.system 22 | times_list, avg_time = profile(lambda: summer_func.system.keys() is not None) 23 | print(f"Listing keys took {round(avg_time, 2)} ms: {times_list}") 24 | 25 | times_list, avg_time = profile(lambda: summer_func(1, 5) == 6) 26 | print(f"Call with logs took {round(avg_time, 2)} ms: {times_list}") 27 | 28 | times_list, avg_time = profile(lambda: summer_func(1, 5, stream_logs=False) == 6) 29 | print(f"Call without logs took {round(avg_time, 2)} ms: {times_list}") 30 | 31 | port = cluster.client.port 32 | suffix = "https" if 
cluster._use_https else "http" 33 | address = cluster.server_address 34 | 35 | call_url = f"{suffix}://{address}:{port}/summer_func/call/?serialization=None" 36 | logger.info(f"Call url: {call_url}") 37 | times_list, avg_time = profile( 38 | lambda: requests.post( 39 | call_url, 40 | json={"args": [1, 2]}, 41 | headers=rns_client.request_headers(cluster.rns_address) 42 | if cluster.den_auth 43 | else None, 44 | verify=cluster.client.verify, 45 | ).json() 46 | == 3 47 | ) 48 | print(f"{suffix} call took {round(avg_time, 2)} ms: {times_list}") 49 | 50 | 51 | def test_roundtrip_performance(summer_func): 52 | run_performance_tests(summer_func) 53 | 54 | 55 | def test_https_roundtrip_performance(summer_func_with_auth): 56 | run_performance_tests(summer_func_with_auth) 57 | -------------------------------------------------------------------------------- /tests/test_requirements/aws_test_requirements.txt: -------------------------------------------------------------------------------- 1 | awscli==1.29.17 2 | boto3==1.28.17 3 | pycryptodome==3.12.0 4 | -------------------------------------------------------------------------------- /tests/test_requirements/google_tests_requirements.txt: -------------------------------------------------------------------------------- 1 | google-api-python-client 2 | google-cloud-storage 3 | gcsfs 4 | -------------------------------------------------------------------------------- /tests/test_requirements/tutorial_requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | diffusers 3 | transformers 4 | -------------------------------------------------------------------------------- /tests/test_resources/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/tests/test_resources/__init__.py 
def get_uname():
    # Simple payload executed remotely to verify basic function dispatch;
    # os.uname() proves the call actually ran inside the container's OS.
    return os.uname()


@pytest.mark.level("release")
def test_docker_cluster():
    """End-to-end test of `rh.DockerCluster` against a locally built image.

    If the cluster isn't up: builds the slim Runhouse image when missing,
    starts a container exposing the Runhouse server port (32300), and
    restarts the server inside it. Then checks that functions can be sent
    to and called on the cluster.
    """
    import docker

    client = docker.from_env()

    cluster = rh.DockerCluster(
        name="test-cluster",
        container_name="runhouse-test-container",
    )
    if not cluster.is_up():
        # Locate the repo root via the installed runhouse package so the
        # Dockerfile path resolves regardless of the test's working directory.
        rh_parent_path = Path(importlib.util.find_spec("runhouse").origin).parent.parent
        dockerfile_path = rh_parent_path / "docker/slim"
        # Rebuild the image if not already built
        if not client.images.list(name="runhouse-slim"):
            client.images.build(
                path=".",
                dockerfile=str(dockerfile_path),
                tag="runhouse-slim",
            )
        container = client.containers.run(
            "runhouse-slim",
            command="tail -f /dev/null",  # no-op command keeps the container alive
            detach=True,
            ports={"32300": 32300},
            shm_size="3gb",  # Needed by Ray
            name="runhouse-test-container",
        )
        container.start()
        # Installs the local runhouse version inside the container and starts the server,
        # skip if you've pre-installed runhouse[server] in the image and started the server in the docker CMD
        cluster.restart_server()

    cluster.install_packages(["pytest"])

    # sync_local=False: rely on the runhouse already installed in the image
    # rather than syncing the local checkout over.
    ray_resources = rh.function(ray.available_resources).to(cluster, sync_local=False)
    assert ray_resources()

    get_uname_dc = rh.function(get_uname).to(cluster)
    assert get_uname_dc()


class ExceptionModule(rh.Module):
    """Trivial rh.Module whose source file imports plotly at load time.

    NOTE(review): presumably used to exercise error handling when a module's
    import-time dependency (plotly) is missing on the cluster — confirm
    against the tests that instantiate it.
    """

    def __init__(self):
        super().__init__()

    def test_fn(self):
        # Intentionally trivial; the interesting behavior is the module's
        # import side effect, not this method.
        return None
import pytest


@pytest.fixture(scope="session")
def package(request):
    """Indirect fixture: resolve whichever package fixture was parametrized in.

    Lets the same test run against multiple storage-backed package types.
    """
    return request.getfixturevalue(request.param)


@pytest.fixture
def local_package(local_folder):
    """Package backed by a local folder; recorded in init_args for tracking."""
    import runhouse as rh

    from tests.conftest import init_args

    kwargs = {"path": local_folder.path, "install_method": "local"}
    pkg = rh.package(**kwargs)
    init_args[id(pkg)] = kwargs
    return pkg


@pytest.fixture
def s3_package(s3_folder):
    """Package backed by an S3 folder."""
    import runhouse as rh

    return rh.package(
        path=s3_folder.path, system=s3_folder.system, install_method="local"
    )


def summer(a: int, b: int):
    """Add two ints; the canonical remote-callable test payload."""
    print("Running summer function")
    return a + b


def save_and_load_artifacts():
    """Save a builtin cluster config, reload it by name, return its name."""
    import runhouse as rh

    saved = rh.ondemand_cluster("^rh-cpu").save()
    return rh.load(name=saved.name).name


def slow_running_func(a, b):
    """Like summer, but stalls 20s — used to test long-running calls."""
    from time import sleep

    sleep(20)
    return a + b


@pytest.fixture(scope="session")
def summer_func(local_launched_ondemand_aws_docker_cluster):
    """summer deployed to the Docker-launched AWS cluster; args tracked."""
    import runhouse as rh

    from tests.conftest import init_args

    fn_args = {"name": "summer_func", "fn": summer}
    remote_fn = rh.function(**fn_args).to(local_launched_ondemand_aws_docker_cluster)
    init_args[id(remote_fn)] = fn_args
    return remote_fn


@pytest.fixture(scope="session")
def summer_func_with_auth(ondemand_aws_https_cluster_with_auth):
    """summer deployed to an HTTPS cluster with Den auth enabled."""
    import runhouse as rh

    return rh.function(summer, name="summer_func").to(
        ondemand_aws_https_cluster_with_auth
    )


@pytest.fixture(scope="session")
def summer_func_shared(shared_cluster):
    """summer deployed to a cluster shared with another account."""
    import runhouse as rh

    return rh.function(summer, name="summer_func").to(shared_cluster)


@pytest.fixture(scope="session")
def func_with_artifacts(local_launched_ondemand_aws_docker_cluster):
    """save_and_load_artifacts deployed remotely; exercises Den save/load."""
    import runhouse as rh

    return rh.function(save_and_load_artifacts, name="artifacts_func").to(
        local_launched_ondemand_aws_docker_cluster
    )


@pytest.fixture(scope="session")
def slow_func(local_launched_ondemand_aws_docker_cluster):
    """slow_running_func deployed remotely; used for timeout/streaming tests."""
    import runhouse as rh

    return rh.function(slow_running_func, name="slow_func").to(
        local_launched_ondemand_aws_docker_cluster
    )
import fastapi

# Minimal FastAPI app used as a fixture for wrapping arbitrary ASGI apps
# as Runhouse server modules.
app = fastapi.FastAPI()


@app.get("/summer/{a}")
def summer(a: int, b: int):
    # `a` is bound from the path; `b` has no default so FastAPI reads it
    # from the query string.
    return a + b


@app.post("/my/deeply/{arg1}/nested/endpoint/{arg2}")
async def my_deeply_nested_async_endpoint(arg1: str, arg2: int, arg3: float):
    # arg1/arg2 from the path, arg3 from the query; echoes all three back
    # to verify multi-segment path routing survives the wrapping.
    return arg1, arg2, arg3


@app.get("/my/streaming/endpoint")
def my_streaming_endpoint():
    # Bare generator endpoint. NOTE(review): presumably here to test
    # streaming through the server-module wrapper — plain FastAPI would not
    # stream a bare generator without StreamingResponse; confirm the wrapper
    # handles this case.
    for i in range(10):
        yield i


@app.get("/my/endpoint/with/optional/body/params/and/header")
def my_endpoint_with_optional_body_params_and_header(
    a: int = fastapi.Body(None),
    b: int = fastapi.Body(None),
    c: int = fastapi.Header(None),
):
    # All three are optional (default None): a/b from the JSON body on a GET,
    # c from a request header — exercises unusual parameter sources.
    return a, b, c


if __name__ == "__main__":
    import uvicorn

    # Run standalone on port 8000 for local manual testing.
    uvicorn.run(app, port=8000)
import runhouse as rh


def sd_generate(
    prompt,
    num_images=1,
    steps=100,
    guidance_scale=7.5,
    model_id="stabilityai/stable-diffusion-2-base",
):
    """Generate `num_images` Stable Diffusion images for `prompt` on CUDA.

    Imports are deferred into the body so the function can be serialized and
    shipped to a GPU cluster where torch/diffusers are installed, without
    requiring them locally.
    """
    import torch
    from diffusers import StableDiffusionPipeline

    # NOTE(review): `revision="fp16"` is the legacy way to select fp16
    # weights; newer diffusers releases prefer `variant="fp16"` — confirm
    # the pinned diffusers version before changing.
    pipe = StableDiffusionPipeline.from_pretrained(
        model_id, torch_dtype=torch.float16, revision="fp16"
    ).to("cuda")
    return pipe(
        [prompt] * num_images, num_inference_steps=steps, guidance_scale=guidance_scale
    ).images


def test_sd_generate(a10g_gpu_cluster):
    """Deploy sd_generate to an A10G GPU cluster and check images come back."""
    generate_gpu = rh.function(fn=sd_generate).to(
        a10g_gpu_cluster, reqs=["pytest", "diffusers", "torch", "transformers"]
    )

    images = generate_gpu(
        prompt="A hot dog made of matcha powder.", num_images=4, steps=50
    )
    # Truthiness check: a non-empty list of PIL images.
    assert images