├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── custom.md │ └── feature-request.md └── workflows │ ├── build_docs.yaml │ ├── cluster_tests.yaml │ ├── copy_docs.yaml │ ├── generate_docs_for_tag.yaml │ ├── local_den_unit_tests.yaml │ ├── local_tests.yaml │ ├── local_tests_den_dev.yaml │ ├── nightly_release_testing.yaml │ ├── precommit.yaml │ ├── push_to_ecr_rh_all.yaml │ ├── release_precheck.yaml │ ├── setup_release_testing │ └── action.yaml │ ├── setup_rh_config │ └── action.yaml │ ├── setup_runhouse │ └── action.yaml │ └── unit_tests.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── collect_env.py ├── docker ├── cuda │ └── Dockerfile ├── slim └── testing │ ├── password-file-auth │ └── Dockerfile │ ├── public-key-auth-conda │ └── Dockerfile │ └── public-key-auth │ ├── Dockerfile │ └── instructions.md ├── docs ├── Makefile ├── _ext │ └── json_globaltoc.py ├── _static │ ├── favicon.ico │ └── rh_1.png ├── _templates │ └── layout.html ├── api │ ├── cli.rst │ ├── python.rst │ └── python │ │ ├── cluster.rst │ │ ├── folder.rst │ │ ├── function.rst │ │ ├── image.rst │ │ ├── login.rst │ │ ├── module.rst │ │ ├── package.rst │ │ ├── resource.rst │ │ └── secrets.rst ├── assets │ ├── img.png │ └── img_1.png ├── conf.py ├── debugging-logging.rst ├── development-guide.rst ├── docker-setup.rst ├── docker-workflows.rst ├── index.rst ├── installation-setup.rst ├── installation.rst ├── make.bat ├── requirements.txt ├── runhouse-in-your-stack.rst ├── security-and-authentication.rst └── tutorials │ ├── api-clusters.rst │ ├── api-folders.rst │ ├── api-images.rst │ ├── api-modules.rst │ ├── api-process.rst │ ├── api-resources.rst │ ├── api-secrets.rst │ ├── async.rst │ ├── quick-start-cloud.rst │ └── quick-start-local.rst ├── examples ├── README.md ├── dask-basic │ ├── __init__.py │ └── lgbm_train.py ├── dask-preprocessing-and-training │ ├── dask_on_ray.py │ ├── lightgbm_dask.py │ ├── notebook - 
interact with remote objects.ipynb │ └── requirements.txt ├── deepseek_inference │ ├── deepseek_llama_70b_vllm.py │ └── deepseek_qwen_32b_vllm.py ├── dlrm-movielens │ ├── Dockerfile │ ├── __init__.py │ ├── dlrm_data_prepoc.py │ ├── dlrm_inference.py │ ├── dlrm_training.py │ ├── my_pipeline.yaml │ └── requirements.txt ├── embedding-batch-inference │ └── embedding_batch_inference.py ├── fastapi-embeddings-rag │ ├── Dockerfile │ ├── README.md │ ├── app │ │ ├── __init__.py │ │ ├── main.py │ │ └── modules │ │ │ ├── __init__.py │ │ │ ├── embedding.py │ │ │ └── llm.py │ └── requirements.txt ├── flux │ ├── flux.py │ ├── readme.md │ └── requirements.txt ├── hello-world │ ├── hello_world.py │ └── requirements.txt ├── hpo │ ├── __init__.py │ ├── hpo.py │ ├── hpo_bayes_opt.py │ ├── hpo_bayes_opt_low_level.py │ ├── hpo_pytorch_distributed.py │ └── hpo_ray_tune_remote.py ├── inference_llama70b │ ├── llama70b_hf_accelerate.py │ └── llama70b_vllm.py ├── langchain-rag-ec2 │ ├── README.md │ └── langchain_rag.py ├── lightning-resnet │ ├── requirements.txt │ └── resnet_training.py ├── llama2-13b-ec2 │ ├── README.md │ ├── llama2_ec2.py │ └── requirements.txt ├── llama2-fine-tuning-with-lora │ ├── README.md │ ├── llama2_fine_tuning.py │ └── requirements.txt ├── llama2-with-tgi-aws-inferentia2 │ ├── README.md │ ├── requirements.txt │ └── tgi_llama2_inferentia.py ├── llama2-with-tgi-ec2 │ ├── README.md │ ├── requirements.txt │ └── tgi_llama_ec2.py ├── llama3-8b-ec2 │ ├── README.md │ ├── llama3_ec2.py │ └── requirements.txt ├── llama3-8b-tgi-ec2 │ ├── README.md │ ├── llama3_tgi_ec2.py │ └── requirements.txt ├── llama3-fine-tuning-lora │ ├── README.md │ ├── llama3_fine_tuning.py │ ├── llama3_fine_tuning_distributed.py │ ├── requirements.txt │ └── runhouse_marimo.py ├── llama3-vllm-gcp │ ├── README.md │ ├── llama3_vllm_aws.py │ ├── llama3_vllm_gcp.py │ └── requirements.txt ├── lora-example-with-notebook │ ├── Lora Fine Tuning Notebook.ipynb │ ├── LoraFineTuner.py │ ├── 
LoraFineTuner_check_status.py │ └── readme.md ├── mistral-with-tgi-ec2 │ ├── README.md │ ├── requirements.txt │ └── tgi_mistral_ec2.py ├── parallel-hf-embedding │ ├── README.md │ ├── parallel_hf_embedding.py │ └── requirements.txt ├── pytorch-distributed-basic │ ├── README.md │ ├── __init__.py │ ├── pytorch_distributed.py │ └── requirements.txt ├── pytorch-resnet │ ├── __init__.py │ ├── imagenet_preproc.py │ ├── requirements.txt │ ├── resnet_training.py │ └── resnet_training_full.py ├── pytorch-torchvision-mnist-training │ ├── README.md │ ├── airflow-multicloud │ │ ├── DataProcessing.py │ │ ├── airflow_multicloud_torch_train.py │ │ └── local_run_of_callables.py │ ├── airflow │ │ ├── airflow_example_torch_train.py │ │ ├── local_run_of_callables.py │ │ ├── readme.md │ │ └── requirements.txt │ ├── kfp_training.py │ ├── my_simple_model.py │ ├── my_transforms.py │ ├── requirements.txt │ ├── torch_basic_example.py │ └── work_with_remote_TorchTrainer.py ├── ray-data-lightgbm │ ├── lightgbm_ray_fns.py │ └── ray_6_nodes_lightgbm.ipynb ├── spark-basic │ └── spark_taxi_preprocess.py ├── stable-diffusion-xl-ec2 │ ├── README.md │ ├── requirements.txt │ └── sdxl.py ├── tensorflow-distributed │ ├── README.md │ ├── requirements.txt │ └── tensorflow_distributed.py ├── xgboost-gpu │ ├── requirements.txt │ ├── xgboost_fashionmnist_training.py │ └── xgboost_training_hpo.py └── yolo-fastapi │ ├── requirements.txt │ └── yolo_fastapi.py ├── pyproject.toml ├── pytest.ini ├── requirements.txt ├── runhouse ├── __init__.py ├── builtins │ ├── __init__.py │ ├── config.json │ ├── generate_builtins.py │ ├── rh-32-cpu │ │ └── config.json │ ├── rh-4-gpu │ │ └── config.json │ ├── rh-4-v100 │ │ └── config.json │ ├── rh-8-cpu │ │ └── config.json │ ├── rh-8-gpu │ │ └── config.json │ ├── rh-8-v100 │ │ └── config.json │ ├── rh-cpu │ │ └── config.json │ ├── rh-gpu │ │ └── config.json │ └── rh-v100 │ │ └── config.json ├── cli_utils.py ├── constants.py ├── exceptions.py ├── globals.py ├── logger.py ├── 
main.py ├── resources │ ├── __init__.py │ ├── asgi.py │ ├── distributed │ │ ├── __init__.py │ │ ├── dask_distributed.py │ │ ├── distributed_pool.py │ │ ├── pytorch_distributed.py │ │ ├── ray_distributed.py │ │ ├── spark_distributed.py │ │ ├── supervisor.py │ │ └── utils.py │ ├── folders │ │ ├── __init__.py │ │ ├── folder.py │ │ ├── folder_factory.py │ │ ├── gcs_folder.py │ │ └── s3_folder.py │ ├── functions │ │ ├── __init__.py │ │ ├── function.py │ │ └── function_factory.py │ ├── future_module.py │ ├── hardware │ │ ├── __init__.py │ │ ├── cluster.py │ │ ├── cluster_factory.py │ │ ├── constants.py │ │ ├── docker_cluster.py │ │ ├── kubernetes │ │ │ └── rsync_helper.sh │ │ ├── launcher_utils.py │ │ ├── on_demand_cluster.py │ │ ├── ray_utils.py │ │ ├── sky │ │ │ ├── __init__.py │ │ │ ├── command_runner.py │ │ │ ├── common_utils.py │ │ │ ├── constants.py │ │ │ ├── log_lib.py │ │ │ ├── subprocess_daemon.py │ │ │ └── subprocess_utils.py │ │ ├── sky_command_runner.py │ │ ├── ssh_tunnel.py │ │ └── utils.py │ ├── images │ │ ├── __init__.py │ │ ├── builtin_images.py │ │ └── image.py │ ├── module.py │ ├── packages │ │ ├── __init__.py │ │ └── package.py │ ├── resource.py │ └── secrets │ │ ├── __init__.py │ │ ├── provider_secrets │ │ ├── __init__.py │ │ ├── anthropic_secret.py │ │ ├── api_key_secret.py │ │ ├── aws_secret.py │ │ ├── azure_secret.py │ │ ├── cohere_secret.py │ │ ├── docker_secret.py │ │ ├── gcp_secret.py │ │ ├── github_secret.py │ │ ├── huggingface_secret.py │ │ ├── kubeconfig_secret.py │ │ ├── lambda_secret.py │ │ ├── langchain_secret.py │ │ ├── openai_secret.py │ │ ├── pinecone_secret.py │ │ ├── provider_secret.py │ │ ├── providers.py │ │ ├── sky_secret.py │ │ ├── ssh_secret.py │ │ └── wandb_secret.py │ │ ├── secret.py │ │ ├── secret_factory.py │ │ └── utils.py ├── rns │ ├── __init__.py │ ├── defaults.py │ ├── login.py │ ├── rns_client.py │ ├── top_level_rns_fns.py │ └── utils │ │ ├── __init__.py │ │ ├── api.py │ │ └── names.py ├── servers │ ├── __init__.py │ ├── 
autostop_helper.py │ ├── caddy │ │ ├── __init__.py │ │ └── config.py │ ├── cluster_servlet.py │ ├── http │ │ ├── __init__.py │ │ ├── auth.py │ │ ├── certs.py │ │ ├── http_client.py │ │ ├── http_server.py │ │ └── http_utils.py │ ├── node_servlet.py │ ├── obj_store.py │ └── servlet.py └── utils.py ├── scripts ├── __init__.py ├── build_package.sh ├── generating_docs.py └── kubernetes_cluster │ ├── README.md │ ├── tf-aks │ ├── 0-locals.tf │ ├── 1-provider.tf │ ├── 2-resource-group.tf │ ├── 3-vpc.tf │ ├── 4-subnets.tf │ ├── 5-aks.tf │ └── README.md │ ├── tf-eks │ ├── README.md │ └── main.tf │ └── tf-gke │ ├── 1-provider.tf │ ├── 2-vpc.tf │ ├── 3-subnets.tf │ ├── 4-router.tf │ ├── 5-nat.tf │ ├── 6-firewalls.tf │ ├── 7-kubernetes.tf │ ├── 8-node-pools.tf │ └── README.md ├── setup.py └── tests ├── README.md ├── __init__.py ├── conftest.py ├── constants.py ├── fixtures ├── docker_cluster_fixtures.py ├── folder_fixtures.py ├── on_demand_cluster_fixtures.py ├── package_fixtures.py ├── resource_fixtures.py ├── secret_fixtures.py ├── static_cluster_fixtures.py ├── test_fake_package │ ├── setup.py │ └── test_fake_package │ │ ├── __init__.py │ │ ├── function_to_import.py │ │ └── module_to_import.py ├── test_fake_package_copy │ ├── setup.py │ └── test_fake_package_copy │ │ ├── __init__.py │ │ ├── function_to_import.py │ │ └── module_to_import.py └── utils.py ├── requirements.txt ├── test_den ├── __init__.py ├── test_defaults.py └── test_rns.py ├── test_login.py ├── test_obj_store.py ├── test_performance.py ├── test_requirements ├── aws_test_requirements.txt ├── google_tests_requirements.txt └── tutorial_requirements.txt ├── test_resources ├── __init__.py ├── test_clusters │ ├── __init__.py │ ├── cluster_tests.py │ ├── test_cluster.py │ ├── test_docker_cluster.py │ ├── test_multinode_cluster.py │ └── test_on_demand_cluster.py ├── test_data │ ├── test_folder.py │ └── test_package.py ├── test_modules │ ├── __init__.py │ ├── exception_module.py │ ├── test_folder.py │ ├── test_folders 
│ │ ├── __init__.py │ │ └── test_packages │ │ │ ├── __init__.py │ │ │ ├── conftest.py │ │ │ └── test_package.py │ ├── test_functions │ │ ├── __init__.py │ │ ├── conftest.py │ │ └── test_function.py │ ├── test_module.py │ └── test_server_modules │ │ ├── __init__.py │ │ ├── assets │ │ ├── __init__.py │ │ └── sample_fastapi_app.py │ │ └── dont_test_server_module.py ├── test_resource.py ├── test_resource_sharing.py └── test_secrets │ ├── __init__.py │ └── test_secret.py ├── test_servers ├── __init__.py ├── conftest.py ├── test_caddy.py ├── test_certs.py ├── test_http_client.py ├── test_http_server.py ├── test_nginx.py ├── test_server_obj_store.py └── test_servlet.py ├── test_tutorials.py └── utils.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us reproduce and fix the bug. 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | Please provide a clear and concise description of what the bug and the expected behavior is. 12 | 13 | If relevant, include the steps or code snippet to reproduce the error. 14 | 15 | **Versions** 16 | Please run the following and paste the output below. 17 | ``` 18 | wget https://raw.githubusercontent.com/run-house/runhouse/main/collect_env.py 19 | # For security purposes, please check the contents of collect_env.py before running it. 20 | python collect_env.py 21 | ``` 22 | 23 | **Additional context** 24 | Add any other context about the problem here. 25 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/custom.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Custom issue template 3 | about: Describe this issue template's purpose here. 
4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Submit a proposal or request for a new Runhouse feature. 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **The feature** 11 | A clear and concise description of the feature proposal. 12 | 13 | **Motivation** 14 | What is the motivation for the feature request? Is it related to a problem you're running into? 15 | 16 | **What the ideal solution looks like** 17 | A clear and concise description of what you want to happen. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/workflows/build_docs.yaml: -------------------------------------------------------------------------------- 1 | name: Build docs 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | 8 | jobs: 9 | build_docs: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Check out repository code 13 | uses: actions/checkout@v3 14 | - name: Install 15 | run: python -m pip install --upgrade pip && pip install -e . 16 | - name: Install docs requirements 17 | run: pip install -r docs/requirements.txt 18 | - name: Build docs 19 | run: cd docs && make html && cd .. 
20 | - name: Upload artifacts 21 | uses: actions/upload-artifact@v4 22 | with: 23 | name: docs 24 | path: docs/_build/html 25 | -------------------------------------------------------------------------------- /.github/workflows/cluster_tests.yaml: -------------------------------------------------------------------------------- 1 | name: cluster-tests 2 | 3 | on: workflow_dispatch 4 | 5 | jobs: 6 | cluster-tests: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - name: Check out repository code 10 | uses: actions/checkout@v3 11 | 12 | - name: setup python 13 | uses: actions/setup-python@v4 14 | with: 15 | python-version: '3.10' 16 | 17 | - name: Configure aws 18 | run: | 19 | aws configure set aws_access_key_id ${{ secrets.DEV_AWS_ACCESS_KEY }} 20 | aws configure set aws_secret_access_key ${{ secrets.DEV_AWS_SECRET_KEY }} 21 | aws configure set default.region us-east-1 22 | 23 | - name: Install & check skypilot configuration 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install skypilot 27 | sky check 28 | sky status 29 | 30 | - name: Install python packages & dependencies 31 | run: | 32 | pip install runhouse[aws] 33 | pip install -r tests/requirements.txt 34 | 35 | - name: Run all cluster unit tests tests 36 | env: 37 | KITCHEN_TESTER_TOKEN: ${{ secrets.KITCHEN_TESTER_PROD_TOKEN }} 38 | run: pytest -v tests/test_resources/test_cluster.py --level unit 39 | 40 | - name: Teardown all test clusters 41 | run: | 42 | sky status 43 | sky down --all -y 44 | sky status 45 | -------------------------------------------------------------------------------- /.github/workflows/copy_docs.yaml: -------------------------------------------------------------------------------- 1 | name: Generate docs for runhouse-docs 2 | # https://cpina.github.io/push-to-another-repository-docs/index.html 3 | 4 | on: 5 | push: 6 | branches: 7 | - '*' 8 | 9 | jobs: 10 | generate-docs: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Check out repository code 14 | uses: 
actions/checkout@v3 15 | 16 | - name: Install 17 | run: python -m pip install --upgrade pip && pip install -e . 18 | 19 | - name: Install docs requirements 20 | run: pip install -r docs/requirements.txt 21 | 22 | - name: Build docs 23 | run: cd docs && make json 24 | 25 | - name: Get current branch name 26 | run: echo "CURRENT_BRANCH=${GITHUB_REF#refs/heads/}" >> $GITHUB_ENV 27 | 28 | - name: Set target branch name 29 | run: echo "BRANCH_NAME=${CURRENT_BRANCH}" >> $GITHUB_ENV 30 | 31 | - name: Push directory to another repository 32 | uses: cpina/github-action-push-to-another-repository@v1.7.2 33 | env: 34 | SSH_DEPLOY_KEY: ${{ secrets.SSH_DEPLOY_KEY }} 35 | with: 36 | source-directory: 'docs/_build/json/' 37 | destination-github-username: 'run-house' 38 | destination-repository-name: 'runhouse-docs' 39 | target-branch: ${{ env.BRANCH_NAME }} 40 | create-target-branch-if-needed: true 41 | -------------------------------------------------------------------------------- /.github/workflows/generate_docs_for_tag.yaml: -------------------------------------------------------------------------------- 1 | name: Generate Docs for Tag 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' # Triggers on any tag push 7 | workflow_dispatch: 8 | inputs: 9 | tag-name: 10 | description: 'Tag Name (ex: v0.0.32)' 11 | required: false # Allow empty for cases where the release event provides the tag 12 | default: '' 13 | 14 | jobs: 15 | build-docs-for-tag: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Checkout code 19 | uses: actions/checkout@v2 20 | 21 | - name: Set up Python 22 | uses: actions/setup-python@v2 23 | with: 24 | python-version: '3.8' 25 | 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install -r docs/requirements.txt 30 | pip install runhouse sshtunnel python-dotenv gitpython 31 | 32 | - name: Determine Tag Name 33 | id: determine-tag 34 | run: | 35 | if [[ "${{ github.event_name }}" == "release" ]]; then 36 | echo "Tag name 
from release: ${{ github.event.release.tag_name }}" 37 | echo "tag_name=${{ github.event.release.tag_name }}" >> $GITHUB_ENV 38 | elif [[ -n "${{ github.event.inputs.tag-name }}" ]]; then 39 | echo "Tag name from manual input: ${{ github.event.inputs.tag-name }}" 40 | echo "tag_name=${{ github.event.inputs.tag-name }}" >> $GITHUB_ENV 41 | else 42 | echo "Error: No tag name provided" 43 | exit 1 44 | fi 45 | shell: bash 46 | 47 | - name: Run docs build script for specific tag 48 | env: 49 | GH_TOKEN: ${{ secrets.GH_TOKEN }} 50 | TAG_NAME: ${{ env.tag_name }} 51 | run: | 52 | python scripts/generating_docs.py --tag-name "${{ env.TAG_NAME }}" 53 | -------------------------------------------------------------------------------- /.github/workflows/precommit.yaml: -------------------------------------------------------------------------------- 1 | name: Run pre-commit 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | 8 | jobs: 9 | linting: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Check out repository code 13 | uses: actions/checkout@v3 14 | - name: Install pre-commit 15 | run: python -m pip install --upgrade pip && pip install pre-commit 16 | - name: Lint code 17 | run: pre-commit run --all-files 18 | - name: Show diff 19 | run: git --no-pager diff --color=always 20 | -------------------------------------------------------------------------------- /.github/workflows/push_to_ecr_rh_all.yaml: -------------------------------------------------------------------------------- 1 | name: Push to ECR 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | workflow_dispatch: 8 | 9 | jobs: 10 | push_to_ecr: 11 | name: Build and Push Image 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - name: Check out code 16 | uses: actions/checkout@v2 17 | 18 | - name: Configure AWS credentials 19 | uses: aws-actions/configure-aws-credentials@v1 20 | with: 21 | aws-access-key-id: ${{ secrets.DEV_AWS_ACCESS_KEY }} 22 | aws-secret-access-key: ${{ 
secrets.DEV_AWS_SECRET_KEY }} 23 | aws-region: us-east-1 24 | 25 | - name: Login to Amazon ECR Public 26 | id: login-ecr-public 27 | uses: aws-actions/amazon-ecr-login@v2 28 | with: 29 | registry-type: public 30 | 31 | - name: Set the environment 32 | id: set-image-tag 33 | run: | 34 | 35 | BRANCH_NAME="${GITHUB_REF#refs/heads/}" # Extract branch name from refs/heads/ 36 | PR_BRANCH_NAME="${GITHUB_HEAD_REF:-}" # For pull requests, GITHUB_HEAD_REF contains the branch name 37 | 38 | if [[ -n "$PR_BRANCH_NAME" ]]; then 39 | # If it's a pull request, use the PR branch name instead 40 | BRANCH_NAME="$PR_BRANCH_NAME" 41 | fi 42 | 43 | if [[ "$BRANCH_NAME" != "main" ]]; then 44 | # Build a dev image for pull requests or feature branches 45 | IMAGE_TAG="rh-all-${BRANCH_NAME}-${GITHUB_SHA::8}" 46 | else 47 | # Build an image for main branch pushes 48 | IMAGE_TAG="rh-all-main-${GITHUB_SHA::8}" 49 | fi 50 | 51 | # Replace "/" with "-" in the image tag 52 | IMAGE_TAG="${IMAGE_TAG//\//-}" 53 | 54 | # Save environment variables 55 | echo "BRANCH_NAME=$BRANCH_NAME" >> $GITHUB_ENV 56 | echo "IMAGE_TAG=$IMAGE_TAG" >> $GITHUB_ENV 57 | echo "GITHUB_REF=$GITHUB_REF" >> $GITHUB_ENV 58 | 59 | - name: Build, tag, and push image to Amazon ECR 60 | env: 61 | ECR_REGISTRY: public.ecr.aws/a9j3d7s3 62 | ECR_REPOSITORY: run-house/runhouse 63 | run: | 64 | docker build --platform linux/amd64 -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG -f ./docker/slim . 
--build-arg RUNHOUSE_EXTRAS=all --build-arg BRANCH_NAME=$BRANCH_NAME 65 | docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG 66 | -------------------------------------------------------------------------------- /.github/workflows/release_precheck.yaml: -------------------------------------------------------------------------------- 1 | name: Conda Environment Setup and Test 2 | 3 | on: 4 | release: 5 | types: [ created ] 6 | workflow_dispatch: 7 | 8 | 9 | jobs: 10 | build-and-test: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] 15 | 16 | steps: 17 | - name: Checkout repository 18 | uses: actions/checkout@v2 19 | 20 | - name: Setup Miniconda 21 | uses: conda-incubator/setup-miniconda@v2 22 | with: 23 | auto-update-conda: true 24 | python-version: ${{ matrix.python-version }} 25 | 26 | - name: Create Conda environment 27 | run: conda create --yes --name test-env python=${{ matrix.python-version }} 28 | 29 | - name: Install current package in editable mode 30 | run: | 31 | source $CONDA/etc/profile.d/conda.sh 32 | conda activate test-env 33 | pip install -e . 
34 | 35 | - name: Test package import 36 | run: | 37 | source $CONDA/etc/profile.d/conda.sh 38 | conda activate test-env 39 | python -c "import runhouse" 40 | -------------------------------------------------------------------------------- /.github/workflows/setup_rh_config/action.yaml: -------------------------------------------------------------------------------- 1 | name: Setup an RH config 2 | 3 | description: Reusable short flow for setting up a fake ~/.rh/config.yaml 4 | 5 | inputs: 6 | username: 7 | description: 'The username to log in with' 8 | required: true 9 | 10 | token: 11 | description: 'The token of the logged in username' 12 | required: true 13 | 14 | api_server_url: 15 | description: 'The den api server to send the requests to' 16 | required: true 17 | 18 | runs: 19 | using: composite 20 | steps: 21 | - name: Setup ~/.rh/config.yaml 22 | shell: bash 23 | run: | 24 | mkdir ~/.rh && touch ~/.rh/config.yaml 25 | echo "default_folder: /${{ inputs.username }}" > ~/.rh/config.yaml 26 | echo "token: ${{ inputs.token }}" >> ~/.rh/config.yaml 27 | echo "username: ${{ inputs.username }}" >> ~/.rh/config.yaml 28 | echo "api_server_url: ${{ inputs.api_server_url }}" >> ~/.rh/config.yaml 29 | echo "default_ssh_key: ssh-sky-key" >> ~/.rh/config.yaml 30 | echo "autosave: false" >> ~/.rh/config.yaml 31 | echo "disable_observability: false" >> ~/.rh/config.yaml 32 | -------------------------------------------------------------------------------- /.github/workflows/setup_runhouse/action.yaml: -------------------------------------------------------------------------------- 1 | name: Setup Runhouse 2 | 3 | description: Reusable steps for setting up Runhouse 4 | 5 | inputs: 6 | den_tester_ssh_private_key: 7 | description: 'SSH private key value' 8 | required: true 9 | 10 | den_tester_ssh_public_key: 11 | description: 'SSH public key value' 12 | required: true 13 | 14 | runs: 15 | using: composite 16 | steps: 17 | - name: Setup python 18 | uses: actions/setup-python@v4 
19 | with: 20 | python-version: '3.10' 21 | 22 | # Note: using the default SSH keys stored for Den tester 23 | - name: Set up local default SSH keys 24 | shell: bash 25 | run: | 26 | mkdir -p ~/.ssh 27 | echo "${{ inputs.den_tester_ssh_private_key }}" > ~/.ssh/sky-key 28 | echo "${{ inputs.den_tester_ssh_public_key }}" > ~/.ssh/sky-key.pub 29 | chmod 600 ~/.ssh/sky-key 30 | chmod 644 ~/.ssh/sky-key.pub 31 | echo "password" > $GITHUB_WORKSPACE/../docker_user_passwd 32 | 33 | - name: Install runhouse from source code 34 | shell: bash 35 | run: | 36 | pip install -e $GITHUB_WORKSPACE 37 | 38 | - name: Install python packages & dependencies for unit and local tests 39 | shell: bash 40 | run: 41 | pip install -r tests/requirements.txt scipy boto3 google-cloud-storage 42 | -------------------------------------------------------------------------------- /.github/workflows/unit_tests.yaml: -------------------------------------------------------------------------------- 1 | name: Tests with level "unit" 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | 8 | jobs: 9 | all-tests-logged-out-level-unit: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Check out repository code 13 | uses: actions/checkout@v3 14 | 15 | - name: Setup Runhouse 16 | uses: ./.github/workflows/setup_runhouse 17 | with: 18 | den_tester_ssh_private_key: ${{ secrets.DEN_TESTER_SSH_PRIVATE_KEY }} 19 | den_tester_ssh_public_key: ${{ secrets.DEN_TESTER_SSH_PUBLIC_KEY }} 20 | 21 | - name: pytest -v --level unit -k "not secrettest" 22 | run: pytest -v --level unit -k "not secrettest" 23 | timeout-minutes: 20 24 | 25 | # all-tests-logged-in-level-unit: 26 | # runs-on: ubuntu-latest 27 | # steps: 28 | # - name: Check out repository code 29 | # uses: actions/checkout@v3 30 | # 31 | # - name: Setup Runhouse 32 | # uses: ./.github/workflows/setup_runhouse 33 | # 34 | # - name: Setup ~/.rh/config.yaml 35 | # uses: ./.github/workflows/setup_rh_config 36 | # with: 37 | # username: ${{ 
secrets.CI_ACCOUNT_USERNAME }} 38 | # token: ${{ secrets.CI_ACCOUNT_TOKEN }} 39 | # 40 | # - name: pytest -v --level unit -k "not den_auth" 41 | # env: 42 | # TEST_TOKEN: ${{ secrets.TEST_TOKEN }} 43 | # TEST_USERNAME: ${{ secrets.TEST_USERNAME }} 44 | # run: pytest -v --level unit 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | .DS_Store 132 | 133 | # IDE project files 134 | .idea/ 135 | .vscode/ 136 | 137 | # Config files 138 | rh/ 139 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.0.1 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: end-of-file-fixer 7 | 8 | - repo: https://github.com/omnilib/ufmt 9 | rev: v1.3.2 10 | hooks: 11 | - id: ufmt 12 | exclude: (runhouse/servers/grpc/unary_pb2.py|runhouse/servers/grpc/unary_pb2_grpc.py|runhouse/resources/hardware/sky/) 13 | additional_dependencies: 14 | - black == 22.6.0 15 | - usort == 1.0.4 16 | 17 | - repo: https://github.com/astral-sh/ruff-pre-commit 18 | rev: v0.3.0 19 | hooks: 20 | - id: ruff 21 | args: [--line-length=120, '--ignore=E402,E721,E722,E731,F821'] 22 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-20.04 11 | tools: 
12 | python: "3.9" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | 18 | python: 19 | install: 20 | - method: pip 21 | path: . 22 | - requirements: docs/requirements.txt 23 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Runhouse 2 | Please file an [issue](https://github.com/run-house/runhouse/issues) if you encounter a bug. 3 | 4 | If you would like to submit a bug-fix or improve an existing feature, please submit a pull request following the 5 | process outlined below. 6 | 7 | If you would like to contribute, but don't know what to add, you can look for open issues labeled 8 | `good first issue`, or take a look at the [funhouse repo](https://github.com/run-house/funhouse) to 9 | create and add your own ML application using Runhouse! 10 | 11 | ## Development Process 12 | If you want to modify code, please follow the instructions for creating a Pull Request. 13 | 14 | 1. Fork the Github repository, and then clone the forked repo to local. 15 | ``` 16 | git clone git@github.com:/runhouse.git 17 | cd runhouse 18 | git remote add upstream https://github.com/run-house/runhouse.git 19 | ``` 20 | 21 | 2. Create a new branch for your development changes: 22 | ``` 23 | git checkout -b branch-name 24 | ``` 25 | 26 | 3. Install Runhouse 27 | ``` 28 | pip install -e . 29 | ``` 30 | 31 | 4. Develop your features 32 | 33 | 5. Download and run pre-commit to automatically format your code using black and ruff. 34 | 35 | ``` 36 | pip install pre-commit 37 | pre-commit run --files [FILES [FILES ...]] 38 | ``` 39 | 40 | 6. Add, commit, and push your changes. Create a "Pull Request" on GitHub to submit the changes for review. 
41 | 42 | ``` 43 | git push -u origin branch-name 44 | ``` 45 | 46 | ## Testing 47 | 48 | To run tests, please install test/requirements.txt. 49 | ``` 50 | pip install -r tests/requirements.txt 51 | ``` 52 | 53 | Additional optional packages to install to run related tests: 54 | 55 | aws related tests 56 | ``` 57 | pip install -r tests/test_requirements/aws_test_requirements.txt 58 | ``` 59 | 60 | google related tests 61 | ``` 62 | pip install -r tests/test_requirements/google_tests_requirements.txt 63 | ``` 64 | 65 | 66 | 67 | ## Documentation 68 | Docs source code is located in `docs/`. To build and review docs locally: 69 | 70 | ``` 71 | pip install -r docs/requirements.txt 72 | cd docs/ 73 | make clean html 74 | ``` 75 | 76 | ### Tutorials and Examples 77 | Notebook (`.ipynb`) code lives in [run-house/notebooks](https://github.com/run-house/notebooks). If modifying 78 | a tutorial or example involving a `.ipynb` file, please refer to these 79 | [instructions](https://github.com/run-house/notebooks?tab=readme-ov-file#syncing-docs-to-run-houserunhouse) for 80 | how to upload your notebook to the notebooks repo and sync the rendered `.rst` file over to the runhouse repo. 81 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include runhouse/builtins/* 2 | include runhouse/resources/hardware/kubernetes/* 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🏃‍♀️Runhouse🏠 is now 📦Kubetorch🔥 2 | 3 | This repository is no longer supported as we privately beta 📦Kubetorch🔥, the next generation of this project. 4 | Kubetorch is for production people that like Kubernetes and ML people that don't. 5 | It's a Pythonic, debuggable successor to Kubeflow just as PyTorch succeeded Tensorflow. 
"""Collect environment info (Python version, platform, relevant pip packages, sky status)
for bug reports and debugging."""
import os
import platform
import sys

try:
    from pip._internal.operations import freeze
except ImportError:  # pip < 10.0 kept the freeze op in a public path
    from pip.operations import freeze

# Single-line Python version string (sys.version embeds a newline).
py_version = sys.version.replace("\n", " ")
py_platform = platform.platform()

pkgs = freeze.freeze()
# Keep only the frozen requirement lines that mention a package we care about.
pip_pkgs = "\n".join(
    pkg
    for pkg in pkgs
    if any(
        name in pkg
        for name in {
            # runhouse
            "runhouse",
            # required installs
            "wheel",
            "rich",
            "typer",
            "skypilot",
            "fastapi",
            "uvicorn",
            # NOTE: comma was previously missing here, which silently
            # concatenated "pyOpenSSL" + "awscli" into one useless entry.
            "pyOpenSSL",
            # aws
            "awscli",
            "boto3",
            "pycryptodome",
            "s3fs",
            "sshtunnel",
            # azure
            "azure-cli",
            "azure-core",
            # gcp
            "google-api-python-client",
            "google-cloud-storage",
            "gcsfs",
            # docker
            "docker",
        }
    )
)

print(f"Python Platform: {py_platform}")
print(f"Python Version: {py_version}")
print()
print(f"Relevant packages: \n{pip_pkgs}")
print()
# Shell out to skypilot to report cloud credential and cluster state.
os.system("sky check")
os.system("sky status --refresh")
$DOCKER_USER_PASSWORD_FILE /app/ssh/docker_user_password_file 18 | 19 | RUN pip install runhouse 20 | RUN pip install -e . 21 | 22 | # Create the privilege separation directory required by sshd 23 | RUN mkdir -p /run/sshd 24 | 25 | # Create a user for SSH access (using password from $DOCKER_USER_PASSWORD_FILE) 26 | RUN useradd -m rh-docker-user && \ 27 | echo "rh-docker-user:$(cat /app/ssh/docker_user_password_file)" | chpasswd && \ 28 | echo "PermitRootLogin no" >> /etc/ssh/sshd_config && \ 29 | echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config 30 | 31 | # Create supervisord configuration file 32 | RUN echo "[supervisord]" > /etc/supervisor/conf.d/supervisord.conf && \ 33 | echo "nodaemon=true" >> /etc/supervisor/conf.d/supervisord.conf && \ 34 | echo "user=root" >> /etc/supervisor/conf.d/supervisord.conf && \ 35 | echo "[program:sshd]" >> /etc/supervisor/conf.d/supervisord.conf && \ 36 | echo "command=/usr/sbin/sshd -D" >> /etc/supervisor/conf.d/supervisord.conf && \ 37 | echo "stdout_logfile=/var/log/sshd.log" >> /etc/supervisor/conf.d/supervisord.conf && \ 38 | echo "stderr_logfile=/var/log/sshd.err" >> /etc/supervisor/conf.d/supervisord.conf && \ 39 | echo "[program:runhouse]" >> /etc/supervisor/conf.d/supervisord.conf && \ 40 | echo "command=runhouse server start --host "0.0.0.0"" >> /etc/supervisor/conf.d/supervisord.conf && \ 41 | echo "stdout_logfile=/var/log/runhouse.log" >> /etc/supervisor/conf.d/supervisord.conf && \ 42 | echo "stderr_logfile=/var/log/runhouse.err" >> /etc/supervisor/conf.d/supervisord.conf 43 | 44 | # Runhouse server port 45 | EXPOSE 32300 46 | # HTTPS port 47 | EXPOSE 443 48 | # HTTP port 49 | EXPOSE 80 50 | # SSH port 51 | EXPOSE 22 52 | 53 | # Run supervisord as the main process to manage the others 54 | CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"] 55 | -------------------------------------------------------------------------------- /docker/slim: 
# A simple container to use as a local runhouse cluster
# docker build -t runhouse -f ./runhouse/docker/slim ./runhouse

# Use the official Python image as a parent image
FROM python:3.11-slim

ARG RUNHOUSE_EXTRAS=server
ARG BRANCH_NAME=None
RUN echo "RUNHOUSE_EXTRAS is: $RUNHOUSE_EXTRAS"
# Fix: previously echoed the literal text "BRANCH_NAME" instead of the build arg's value.
RUN echo "BRANCH_NAME is: $BRANCH_NAME"

# get the correct runhouse version based on BRANCH_NAME, install screen and Runhouse
RUN if [ "$BRANCH_NAME" = "None" ]; then \
        export rh_version="runhouse[${RUNHOUSE_EXTRAS}]"; \
    else \
        export rh_version="runhouse[${RUNHOUSE_EXTRAS}] @ git+https://github.com/run-house/runhouse.git@$BRANCH_NAME"; \
    fi && \
    echo "install_pkg is: $rh_version" && \
    apt-get update && apt-get install -y screen procps rsync ssh netcat-traditional git && \
    python -m pip install --upgrade pip && \
    python -m pip install "$rh_version"

# Alias python3 as python
RUN ln -s /usr/bin/python3 /usr/bin/python

# Make port 32300 available to the world outside this container
EXPOSE 32300

# Start ray and the runhouse server
CMD ["runhouse", "server", "start", "--no-restart-ray", "--host", "0.0.0.0", "--port", "32300", "--no-screen", "--no-nohup"]
--no-install-recommends gcc python3-dev openssh-server rsync supervisor screen wget curl sudo ufw git awscli && \ 16 | apt-get clean && \ 17 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 18 | 19 | # Copy the password file into the image 20 | COPY $DOCKER_USER_PASSWORD_FILE /app/ssh/docker_user_password_file 21 | 22 | # COPY local Runhouse package into the image if provided 23 | COPY $RUNHOUSE_PATH /app/runhouse 24 | 25 | # If using a local version of runhouse, install it from the local directory 26 | RUN if [ -d "/app/runhouse" ]; then pip install -U -e /app/runhouse; else pip install -U runhouse==$RUNHOUSE_VERSION; fi 27 | 28 | # Create the privilege separation directory required by sshd 29 | RUN mkdir -p /run/sshd 30 | 31 | # Create a user for SSH access (using password from $DOCKER_USER_PASSWORD_FILE) 32 | RUN useradd -m rh-docker-user && \ 33 | echo "rh-docker-user:$(cat /app/ssh/docker_user_password_file)" | chpasswd && \ 34 | echo "PermitRootLogin no" >> /etc/ssh/sshd_config && \ 35 | echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config 36 | 37 | # Create supervisord configuration file 38 | RUN echo "[supervisord]" > /etc/supervisor/conf.d/supervisord.conf && \ 39 | echo "nodaemon=true" >> /etc/supervisor/conf.d/supervisord.conf && \ 40 | echo "user=root" >> /etc/supervisor/conf.d/supervisord.conf && \ 41 | echo "[program:sshd]" >> /etc/supervisor/conf.d/supervisord.conf && \ 42 | echo "command=/usr/sbin/sshd -D" >> /etc/supervisor/conf.d/supervisord.conf && \ 43 | echo "stdout_logfile=/var/log/sshd.log" >> /etc/supervisor/conf.d/supervisord.conf && \ 44 | echo "stderr_logfile=/var/log/sshd.err" >> /etc/supervisor/conf.d/supervisord.conf 45 | 46 | # Runhouse server port 47 | EXPOSE 32300 48 | # HTTPS port 49 | EXPOSE 443 50 | # HTTP port 51 | EXPOSE 80 52 | # SSH port 53 | EXPOSE 22 54 | 55 | # Run supervisord as the main process to manage the others 56 | CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"] 57 | 
-------------------------------------------------------------------------------- /docker/testing/public-key-auth/instructions.md: -------------------------------------------------------------------------------- 1 | How to start a local Docker container with public key based authentication 2 | 3 | 1. Configure Docker to use secrets in the build process 4 | ``` 5 | echo "DOCKER_BUILDKIT=1" >> ~/.docker/config.json 6 | ``` 7 | 8 | or edit the file manually to make sure it includes 9 | ``` 10 | { 11 | "features": { 12 | "buildkit": true 13 | } 14 | } 15 | ``` 16 | 17 | 2. Generate a public private key pair 18 | ``` 19 | mkdir -p ~/.ssh/runhouse/docker 20 | ssh-keygen -t rsa -b 4096 -C "your_email@example.com" -f ~/.ssh/runhouse/docker/id_rsa 21 | ``` 22 | 23 | 3. The Dockerfile in the current directory should support public key based authentication using Docker Secrets for its build process 24 | 25 | 4. Build the Docker container 26 | ``` 27 | docker build --no-cache --pull --rm -f "docker/testing/public-key-auth/Dockerfile" --secret id=ssh_key,src=$HOME/.ssh/runhouse/docker/id_rsa.pub -t runhouse:start . 28 | ``` 29 | 30 | 5. Run the Docker container 31 | ``` 32 | docker run --rm --shm-size=4gb -it -p 32300:32300 -p 6379:6379 -p 52365:52365 -p 22:22 -p 443:443 -p 80:80 runhouse:start 33 | ``` 34 | 35 | 6. Verify via SSH 36 | ``` 37 | ssh -i ~/.ssh/runhouse/docker/id_rsa rh-docker-user@localhost 38 | ``` 39 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | json: 16 | @$(SPHINXBUILD) -M json "$(SOURCEDIR)" "$(BUILDDIR)" -b json -t json 17 | 18 | .PHONY: help Makefile 19 | 20 | # Catch-all target: route all unknown targets to Sphinx using the new 21 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 22 | %: Makefile 23 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 24 | -------------------------------------------------------------------------------- /docs/_ext/json_globaltoc.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict 2 | 3 | from sphinx.application import Sphinx 4 | from sphinx.environment.adapters.toctree import TocTree 5 | from sphinxcontrib.serializinghtml import JSONHTMLBuilder 6 | 7 | __version__ = "0.0.1" 8 | 9 | 10 | def setup(app: Sphinx) -> Dict[str, Any]: 11 | app.add_builder(SphinxGlobalTOCJSONHTMLBuilder, override=True) 12 | 13 | return {"version": __version__, "parallel_read_safe": True} 14 | 15 | 16 | class SphinxGlobalTOCJSONHTMLBuilder(JSONHTMLBuilder): 17 | 18 | name: str = "json" 19 | 20 | def get_doc_context(self, docname: str, body: str, metatags: str) -> Dict[str, Any]: 21 | """ 22 | Extends :py:class:`sphinxcontrib.serializinghtml.JSONHTMLBuilder`. 23 | 24 | Add a ``globaltoc`` key to our document that contains the HTML for the 25 | global table of contents. 26 | 27 | Note: 28 | 29 | We're rendering the **full global toc** for the entire documentation 30 | set into every page. We do this to easily render the toc on each 31 | page and allow for a unique toc for each branch and repo version. 32 | """ 33 | doc = super().get_doc_context(docname, body, metatags) 34 | # Get the entire doctree. It is the 3rd argument (``collapse``) that 35 | # does this. If you set that to ``True`` you will only get the submenu 36 | # HTML included if you are on a page that is within that submenu. 
37 | self_toctree = TocTree(self.env).get_toctree_for( 38 | "index", self, False, titles_only=True, includehidden=False, maxdepth=2 39 | ) 40 | toctree = self.render_partial(self_toctree)["fragment"] 41 | doc["globaltoc"] = toctree 42 | return doc 43 | -------------------------------------------------------------------------------- /docs/_static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/docs/_static/favicon.ico -------------------------------------------------------------------------------- /docs/_static/rh_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/docs/_static/rh_1.png -------------------------------------------------------------------------------- /docs/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "!layout.html" %} 2 | {% block extrahead %} 3 | 4 | {% endblock %} 5 | -------------------------------------------------------------------------------- /docs/api/cli.rst: -------------------------------------------------------------------------------- 1 | Command Line Interface 2 | ------------------------------------ 3 | Runhouse provides CLI commands for the following use cases: 4 | 5 | * logging in and out (``runhouse login/logout``) 6 | * interacting with or retrieving information about clusters (``runhouse cluster ``) 7 | * interacting with the Runhouse server (``runhouse server ``) 8 | 9 | The commands can be run using either ``runhouse`` or the ``rh``` alias 10 | 11 | .. 
automodule:: runhouse.main 12 | :members: login, logout, cluster_ssh, server_start, server_restart, server_stop, server_status, cluster_status, cluster_list, cluster_keep_warm, cluster_up, cluster_down, cluster_logs 13 | :undoc-members: 14 | :show-inheritance: 15 | -------------------------------------------------------------------------------- /docs/api/python.rst: -------------------------------------------------------------------------------- 1 | Python API 2 | ==================================== 3 | Runhouse offers a programmatic API in Python to manage your account and resources. 4 | 5 | 6 | Resources 7 | ------------------------------------ 8 | Resources are the Runhouse abstraction for objects that can be saved, shared, and reused. 9 | This includes both compute abstractions (clusters, functions, packages, environments) and 10 | data abstractions (folders). 11 | 12 | .. toctree:: 13 | :maxdepth: 1 14 | 15 | python/resource 16 | 17 | 18 | Compute Abstractions 19 | ------------------------------------ 20 | The Function, Cluster, Env, Package, and Module APIs allow a seamless flow of code and execution across local and remote compute. 21 | They blur the line between program execution and deployment, providing both a path of least resistence for running 22 | a sub-routine on specific hardware, while unceremoniously turning that sub-routine into a reusable service. 23 | They also provide convenient dependency isolation and management, provider-agnostic provisioning and termination, 24 | and rich debugging and accessibility interfaces built-in. 25 | 26 | .. toctree:: 27 | :maxdepth: 1 28 | 29 | python/function 30 | 31 | .. toctree:: 32 | :maxdepth: 1 33 | 34 | python/cluster 35 | 36 | .. toctree:: 37 | :maxdepth: 1 38 | 39 | python/image 40 | 41 | .. toctree:: 42 | :maxdepth: 1 43 | 44 | python/package 45 | 46 | .. 
toctree:: 47 | :maxdepth: 1 48 | 49 | python/module 50 | 51 | 52 | Data Abstractions 53 | ------------------------------------ 54 | The Folder APIs provide a simple interface for storing, recalling, and moving data between 55 | the user's laptop, remote compute, and cloud storage (currently we support `S3` and `GCS`). They provide 56 | least-common-denominator APIs across providers, allowing users to easily specify the actions 57 | they want to take on the data without needed to dig into provider-specific APIs. 58 | 59 | .. toctree:: 60 | :maxdepth: 1 61 | 62 | python/folder 63 | 64 | 65 | 66 | Secrets 67 | ------------------------------------ 68 | Runhouse provides a convenient interface for managing your secrets in a secure manner. 69 | Secrets are stored in `Vault `__, an industry standard for 70 | secrets management, and never touches Runhouse servers. Please see 71 | :ref:`Security and Authentication` for more information on security. 72 | 73 | .. toctree:: 74 | :maxdepth: 1 75 | 76 | python/secrets 77 | 78 | .. toctree:: 79 | :maxdepth: 1 80 | 81 | python/login 82 | -------------------------------------------------------------------------------- /docs/api/python/folder.rst: -------------------------------------------------------------------------------- 1 | Folder 2 | ==================================== 3 | A Folder represents a specified location for organizing and storing other Runhouse primitives 4 | across various systems. 5 | 6 | 7 | Folder Factory Method 8 | ~~~~~~~~~~~~~~~~~~~~~ 9 | 10 | .. autofunction:: runhouse.folder 11 | 12 | 13 | Folder Class 14 | ~~~~~~~~~~~~ 15 | 16 | .. autoclass:: runhouse.Folder 17 | :members: 18 | :exclude-members: 19 | 20 | .. 
automethod:: __init__ 21 | -------------------------------------------------------------------------------- /docs/api/python/function.rst: -------------------------------------------------------------------------------- 1 | Function 2 | ==================================== 3 | 4 | A Function is a portable code block that can be sent to remote hardware to run as a subroutine or service. 5 | It is comprised of the entrypoint, system (:ref:`Cluster`), and requirements necessary to run it. 6 | 7 | 8 | Function Factory Methods 9 | ~~~~~~~~~~~~~~~~~~~~~~~~ 10 | 11 | .. autofunction:: runhouse.function 12 | 13 | Function Class 14 | ~~~~~~~~~~~~~~ 15 | 16 | .. autoclass:: runhouse.Function 17 | :members: 18 | :exclude-members: map, starmap, get_or_call 19 | 20 | .. automethod:: __init__ 21 | -------------------------------------------------------------------------------- /docs/api/python/image.rst: -------------------------------------------------------------------------------- 1 | Image 2 | ===== 3 | A Runhouse image allows you to easily encapsulate various setup steps to take across each node on the cluster before 4 | it is launched. See the :ref:`Images` section for a more in-depth explanation. 5 | 6 | Image Class 7 | ~~~~~~~~~~~ 8 | 9 | .. autoclass:: runhouse.Image 10 | :members: 11 | :exclude-members: 12 | 13 | .. automethod:: __init__ 14 | 15 | ImageSteupStepType 16 | ~~~~~~~~~~~~~~~~~~ 17 | 18 | .. autoclass:: runhouse.resources.images.ImageSetupStepType 19 | 20 | .. autoattribute:: PACKAGES 21 | .. autoattribute:: CMD_RUN 22 | .. autoattribute:: SETUP_CONDA_ENV 23 | .. autoattribute:: RSYNC 24 | .. autoattribute:: SYNC_SECRETS 25 | .. autoattribute:: SET_ENV_VARS 26 | .. autoattribute:: PIP_INSTALL 27 | .. autoattribute:: CONDA_INSTALL 28 | .. autoattribute:: UV_INSTALL 29 | .. autoattribute:: SYNC_PACKAGE 30 | .. autoattribute:: SET_VENV 31 | 32 | ImageSetupStep 33 | ~~~~~~~~~~~~~~ 34 | 35 | .. 
autoclass:: runhouse.resources.images.ImageSetupStep 36 | :members: 37 | :exclude-members: 38 | 39 | .. automethod:: __init__ 40 | -------------------------------------------------------------------------------- /docs/api/python/login.rst: -------------------------------------------------------------------------------- 1 | Login/Logout 2 | ==================================== 3 | Functions for logging in and out of your Runhouse account. 4 | 5 | .. autofunction:: runhouse.login 6 | 7 | .. autofunction:: runhouse.logout 8 | -------------------------------------------------------------------------------- /docs/api/python/module.rst: -------------------------------------------------------------------------------- 1 | Module 2 | ==================================== 3 | 4 | A Module represents a class that can be sent to and used on remote clusters and environments. Modules can live on remote hardware and its class methods called remotely. 5 | 6 | 7 | Module Factory Method 8 | ~~~~~~~~~~~~~~~~~~~~~ 9 | 10 | .. autofunction:: runhouse.module 11 | 12 | Module Class 13 | ~~~~~~~~~~~~ 14 | 15 | .. autoclass:: runhouse.Module 16 | :members: 17 | :exclude-members: 18 | 19 | .. automethod:: __init__ 20 | -------------------------------------------------------------------------------- /docs/api/python/package.rst: -------------------------------------------------------------------------------- 1 | Package 2 | ==================================== 3 | A Package is a Runhouse primitive for sharing code between various systems (ex: s3, cluster, local). 4 | 5 | 6 | Package Factory Method 7 | ~~~~~~~~~~~~~~~~~~~~~~ 8 | 9 | .. autofunction:: runhouse.package 10 | 11 | 12 | Package Class 13 | ~~~~~~~~~~~~~ 14 | 15 | .. autoclass:: runhouse.Package 16 | :members: 17 | :exclude-members: 18 | 19 | .. 
automethod:: __init__ 20 | -------------------------------------------------------------------------------- /docs/api/python/resource.rst: -------------------------------------------------------------------------------- 1 | Resource 2 | ======== 3 | Resources are the Runhouse abstraction for objects that can be saved, shared, and reused. 4 | 5 | 6 | Resource Class 7 | ~~~~~~~~~~~~~~ 8 | .. autoclass:: runhouse.resources.resource.Resource 9 | :members: 10 | :exclude-members: 11 | 12 | .. automethod:: __init__ 13 | -------------------------------------------------------------------------------- /docs/assets/img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/docs/assets/img.png -------------------------------------------------------------------------------- /docs/assets/img_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/docs/assets/img_1.png -------------------------------------------------------------------------------- /docs/debugging-logging.rst: -------------------------------------------------------------------------------- 1 | Debugging and Logging 2 | ===================== 3 | 4 | Below, we describe how to access log outputs and show a sample debugging flow. 5 | 6 | 7 | Logging 8 | ~~~~~~~ 9 | 10 | There are three main ways to access logs: 11 | 12 | (1) **On the cluster** 13 | 14 | Logs are automatically output onto the cluster, in the file ``~/.rh/server.log``. You can ssh 15 | into the cluster with ``runhouse cluster ssh cluster-name`` to view these logs. 16 | 17 | (2) **Streaming** 18 | 19 | To see logs on your local machine while running a remote function, you can add the ``stream_logs=True`` 20 | argument to your function call. 21 | 22 | .. 
code:: ipython3 23 | 24 | remote_fn = rh.function(fn) 25 | fn(fn_args, stream_logs=True) 26 | 27 | (3) **Runhouse CLI** 28 | 29 | You can view the latest logs by running the command: ``runhouse cluster logs cluster-name``. 30 | 31 | Log Levels 32 | ---------- 33 | You can set the log level to control the verbosity of the Runhouse logs. You can adjust the log level by setting the 34 | environment variable ``RH_LOG_LEVEL`` to your desired level. 35 | 36 | Debugging 37 | ~~~~~~~~~ 38 | 39 | For general debugging that doesn't occur within remote function calls, you can add ``breakpoint()`` wherever you want 40 | to set your debugging session. If the code is being run locally at the point of the debugger, you'll be able to access 41 | the session from your local machine. If the code is being run remotely on a cluster, you will need to ssh into the 42 | cluster with ``runhouse cluster ssh cluster-name``, and then run ``screen -r`` inside the cluster. 43 | From there, you will see the RPC logs being printed out, and can debug normally inside the ``screen``. 44 | 45 | .. note:: 46 | 47 | When debugging inside ``screen``, please use ``Ctrl A+D`` to exit out of the screen. Do NOT use ``Ctrl C``, 48 | which will terminate the RPC server. 49 | 50 | If you accidentally terminate the RPC server, you can run ``cluster.restart_server()`` to restart the 51 | server. 52 | 53 | For debugging remote functions, which are launched using ``ray``, we can utilize Ray's debugger. Add a ``breakpoint()`` 54 | call inside the function where you want to start the debugging session, then ssh into the cluster with 55 | ``runhouse cluster ssh cluster-name``, and call ``ray debug`` to view select the breakpoint to enter. 56 | You can run normal ``pdb`` commands within the debugging session, and can refer to `Ray Debugger 57 | `__ for more information. 
58 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.https://www.sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | chardet==4.0.0 2 | myst-parser==2.0.0 3 | pint==0.20.1 4 | pydata-sphinx-theme==0.13.3 5 | ray>=2.2.0 6 | sphinx-book-theme==1.0.1 7 | sphinx-click==4.3.0 8 | sphinx-copybutton==0.5.1 9 | sphinx-thebe==0.2.1 10 | sphinx==6.2.1 11 | sphinx_autodoc_typehints==1.17.0 12 | sphinxcontrib-serializinghtml==1.1.5 13 | -------------------------------------------------------------------------------- /docs/security-and-authentication.rst: -------------------------------------------------------------------------------- 1 | Security and Authentication 2 | =========================== 3 | By default, Runhouse collects metadata from provisioned clusters and data relating to performance and error monitoring. 
4 | This data will only be used by Runhouse to improve the product. 5 | 6 | Cluster Metadata Collection 7 | --------------------------- 8 | We collect non-sensitive data on the cluster that helps us understand how Runhouse is being used. This data includes: 9 | 10 | - Python version 11 | - Resources (cpus, gpus, memory) 12 | - Cloud provider 13 | - Region 14 | - Instance type 15 | 16 | 17 | Cluster Observability 18 | --------------------------------------- 19 | Runhouse collects various telemetry data by default on clusters. This data will be used to provide better observability 20 | into logs, traces, and metrics associated with clusters. We will not sell data or buy any observability data collected. 21 | 22 | To disable observability globally for all clusters, set the environment variable :code:`disable_observability` 23 | to :code:`True`. Alternatively, set :code:`disable_observability` to :code:`true` in your 24 | local Runhouse config (:code:`~/.rh/config.yaml`), or in Python: 25 | 26 | .. code-block:: python 27 | 28 | import runhouse as rh 29 | rh.configs.disable_observability() 30 | -------------------------------------------------------------------------------- /docs/tutorials/api-images.rst: -------------------------------------------------------------------------------- 1 | Images 2 | ====== 3 | 4 | .. raw:: html 5 | 6 |

7 | Open In Colab

8 | 9 | Runhouse clusters expose various functions that allow you to set up 10 | state, dependencies, and whatnot on all nodes of your cluster. These 11 | include: 12 | 13 | - ``cluster.pip_install(...)`` 14 | - ``cluster.rsync(...)`` 15 | - ``cluster.set_env_vars(...)`` 16 | - ``cluster.run_bash(...)`` 17 | 18 | A Runhouse “Image” is simply an abstraction that allows you to run 19 | several setup steps *before* we install ``runhouse`` and bring up the 20 | Runhouse daemon and initial set up on your cluster’s nodes. You can also 21 | specify a Docker ``image_id`` as the “base image” of your Runhouse 22 | image. 23 | 24 | Here’s a simple example of using the Runhouse Image abstraction in your 25 | cluster setup: 26 | 27 | .. code:: ipython3 28 | 29 | import runhouse as rh 30 | 31 | image = ( 32 | rh.Image(name="sample_image") 33 | .from_docker("python:3.12.8-bookworm") 34 | .pip_install(["numpy", "pandas"]) 35 | .sync_secrets(["huggingface"]) 36 | .set_env_vars({"RH_LOG_LEVEL": "debug"}) 37 | ) 38 | 39 | cluster = rh.cluster(name="ml_ready_cluster", image=image, instance_type="CPU:2+", provider="aws").up_if_not() 40 | 41 | 42 | .. parsed-literal:: 43 | :class: code-output 44 | 45 | I 12-17 12:04:55 provisioner.py:560] Successfully provisioned cluster: ml_ready_cluster 46 | I 12-17 12:04:57 cloud_vm_ray_backend.py:3402] Run commands not specified or empty. 47 | Clusters 48 | AWS: Fetching availability zones mapping...NAME LAUNCHED RESOURCES STATUS AUTOSTOP COMMAND 49 | ml_ready_cluster a few secs ago 1x AWS(m6i.large, image_id={'us-east-1': 'docker:python:3.12.8-bookwor... UP (down) /Users/rohinbhasin/minico... 50 | 51 | [?25h 52 | 53 | The growing listing of setup steps available for Runhouse images is 54 | available in the :ref:`API reference docs `. 
55 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Runhouse Examples 2 | 3 | These directories contain self-contained examples that use Runhouse for various use cases. Each example has 4 | several comments that contain Markdown. These are rendered as examples on 5 | [our site](https://www.run.house/examples). To add to these, make a new directory and example file, and follow 6 | the Markdown-in-comments format that the rest of the examples follow. 7 | -------------------------------------------------------------------------------- /examples/dask-basic/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/examples/dask-basic/__init__.py -------------------------------------------------------------------------------- /examples/dask-preprocessing-and-training/dask_on_ray.py: -------------------------------------------------------------------------------- 1 | import kubetorch as kt 2 | 3 | # ## Using Dask on Ray for data processing 4 | # Dask on Ray works out of the box when KT sets up the Ray cluster; simply use enable_dask_on_ray() 5 | def read_taxi_df_dask(dataset_path, X_vars, y_vars): 6 | import dask.dataframe as dd 7 | from ray.util.dask import disable_dask_on_ray, enable_dask_on_ray 8 | 9 | enable_dask_on_ray() 10 | 11 | # Read the dataset 12 | df = dd.read_parquet(dataset_path) 13 | print(df.head()) 14 | 15 | X = df[X_vars].to_dask_array(lengths=True) 16 | y = df[y_vars].to_dask_array(lengths=True) 17 | 18 | from dask_ml.model_selection import train_test_split 19 | 20 | X_train, X_test, y_train, y_test = train_test_split( 21 | X, y, test_size=0.2, random_state=42 22 | ) 23 | 24 | print("First few rows of X_train:") 25 | print( 26 | X_train[:5].compute() 27 | ) # Limit to first 5 rows and 
compute to bring it to memory 28 | 29 | disable_dask_on_ray() 30 | 31 | 32 | if __name__ == "__main__": 33 | img = ( 34 | kt.images.ray() 35 | .pip_install( 36 | [ 37 | "dask-ml", 38 | "dask[distributed]", 39 | "dask[dataframe]", 40 | "boto3", 41 | "s3fs", 42 | "xgboost", 43 | ] 44 | ) 45 | .sync_secrets(["aws"]) 46 | ) 47 | compute = kt.Compute(cpus="4+", image=img) 48 | 49 | remote_read_taxi_df_dask = ( 50 | kt.fn(read_taxi_df_dask).to(compute).distribute("ray", num_nodes=4) 51 | ) 52 | 53 | # ## Example of using Dask on Ray to read data and minimally preprocess the data 54 | # Use one slice of the NYC taxi data as an example 55 | remote_read_taxi_df_dask( 56 | dataset_path="s3://rh-demo-external/taxi/yellow_tripdata_2024-01.parquet", 57 | X_vars=["passenger_count", "trip_distance", "fare_amount"], 58 | y_var=["tip_amount"], 59 | ) 60 | -------------------------------------------------------------------------------- /examples/dask-preprocessing-and-training/requirements.txt: -------------------------------------------------------------------------------- 1 | runhouse 2 | dask[distributed] 3 | dask-ml 4 | -------------------------------------------------------------------------------- /examples/dlrm-movielens/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11-slim 2 | 3 | ENV HOME /root 4 | 5 | RUN apt-get update && apt-get install -y git 6 | 7 | # Install required Python packages 8 | RUN pip install --no-cache-dir skypilot[aws] awscli runhouse torch "ray[data, train]" 9 | RUN apt-get update && apt-get install -y rsync openssh-client 10 | 11 | # Copy your custom Python module 12 | COPY /dlrm_data_prepoc.py /root/code/dlrm_data_preproc.py 13 | COPY /dlrm_training.py /root/code/dlrm_training.py 14 | COPY /dlrm_inference.py /root/code/dlrm_inference.py 15 | COPY /requirements.txt /root/code/requirements.txt 16 | COPY /__init__.py /root/code/__init__.py 17 | 18 | RUN mkdir -p ~/.ssh 19 | 
-------------------------------------------------------------------------------- /examples/dlrm-movielens/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/examples/dlrm-movielens/__init__.py -------------------------------------------------------------------------------- /examples/dlrm-movielens/requirements.txt: -------------------------------------------------------------------------------- 1 | runhouse 2 | ray[data,train] 3 | torch 4 | boto3 5 | -------------------------------------------------------------------------------- /examples/fastapi-embeddings-rag/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | 3 | WORKDIR /code 4 | 5 | # Copy only the file with the requirements first, not the rest of the code. 6 | # As this file doesn't change often, Docker will detect it and use the cache for this step, 7 | # enabling the cache for the next step too. 8 | COPY ./requirements.txt /code/requirements.txt 9 | 10 | # The --no-cache-dir option tells pip to not save the downloaded packages locally, 11 | # as that is only if pip was going to be run again to install the same packages, 12 | # but that's not the case when working with containers. 13 | RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt 14 | 15 | # Rsync is required to run sky commands on the container 16 | RUN apt-get update && apt-get -y install rsync 17 | 18 | # As this has all the code which is what changes most frequently the Docker 19 | # cache won't be used for this or any following steps easily. 20 | # So, it's important to put this near the end of the Dockerfile, 21 | # to optimize the container image build times. 22 | COPY ./app /code/app 23 | 24 | # Set the command to use fastapi run, which uses Uvicorn underneath. 
25 | # CMD takes a list of strings, each of these strings is 26 | # what you would type in the command line separated by spaces. 27 | # This command will be run from the current working directory, 28 | # the same /code directory you set above with WORKDIR /code. 29 | CMD ["fastapi", "run", "app/main.py", "--port", "80"] 30 | -------------------------------------------------------------------------------- /examples/fastapi-embeddings-rag/app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/examples/fastapi-embeddings-rag/app/__init__.py -------------------------------------------------------------------------------- /examples/fastapi-embeddings-rag/app/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/examples/fastapi-embeddings-rag/app/modules/__init__.py -------------------------------------------------------------------------------- /examples/fastapi-embeddings-rag/app/modules/embedding.py: -------------------------------------------------------------------------------- 1 | from lancedb.pydantic import LanceModel, Vector 2 | 3 | 4 | class Item(LanceModel): 5 | url: str 6 | page_content: str 7 | vector: Vector(1024) 8 | 9 | 10 | class URLEmbedder: 11 | def __init__(self, **model_kwargs): 12 | import torch 13 | from sentence_transformers import SentenceTransformer 14 | 15 | self.model = torch.compile(SentenceTransformer(**model_kwargs)) 16 | 17 | def encode_text(self, text: str, **embed_kwargs): 18 | embeddings = self.model.encode([text], **embed_kwargs) 19 | 20 | return embeddings[0] 21 | 22 | def embed_docs(self, paths: str, **embed_kwargs): 23 | from langchain_community.document_loaders import WebBaseLoader 24 | from langchain_text_splitters import RecursiveCharacterTextSplitter 25 
| 26 | docs = WebBaseLoader( 27 | web_paths=paths, 28 | ).load() 29 | split_docs = RecursiveCharacterTextSplitter( 30 | chunk_size=250, chunk_overlap=50 31 | ).split_documents(docs) 32 | splits_as_str = [doc.page_content for doc in split_docs] 33 | embeddings = self.model.encode(splits_as_str, **embed_kwargs) 34 | items = [ 35 | { 36 | "url": doc.metadata["source"], 37 | "page_content": doc.page_content, 38 | "vector": embeddings[index], 39 | } 40 | for index, doc in enumerate(split_docs) 41 | ] 42 | 43 | return items 44 | -------------------------------------------------------------------------------- /examples/fastapi-embeddings-rag/app/modules/llm.py: -------------------------------------------------------------------------------- 1 | class LlamaModel: 2 | def __init__(self, model_id="meta-llama/Meta-Llama-3-8B-Instruct", **model_kwargs): 3 | super().__init__() 4 | self.model_id, self.model_kwargs = model_id, model_kwargs 5 | self.engine = None 6 | 7 | def load_engine(self): 8 | import gc 9 | 10 | import torch 11 | from vllm.distributed.parallel_state import ( 12 | destroy_distributed_environment, 13 | destroy_model_parallel, 14 | ) 15 | from vllm.engine.arg_utils import AsyncEngineArgs 16 | from vllm.engine.async_llm_engine import AsyncLLMEngine 17 | 18 | # This vLLM function resets the global variables, which enables initializing models 19 | destroy_model_parallel() 20 | # Cleanup methods in case vLLM is reloaded in a new LlamaModel instance 21 | destroy_distributed_environment() 22 | gc.collect() 23 | torch.cuda.empty_cache() 24 | 25 | args = AsyncEngineArgs( 26 | model=self.model_id, # Hugging Face Model ID 27 | tensor_parallel_size=1, # Increase if using additional GPUs 28 | trust_remote_code=True, # Trust remote code from Hugging Face 29 | enforce_eager=True, # Set to False in production to improve performance 30 | max_model_len=7056, # 31 | ) 32 | self.engine = AsyncLLMEngine.from_engine_args(args) 33 | 34 | async def generate(self, prompt: str, 
**sampling_params): 35 | from vllm.sampling_params import SamplingParams 36 | from vllm.utils import random_uuid 37 | 38 | if not self.engine: 39 | self.load_engine() 40 | 41 | sampling_params = SamplingParams(**sampling_params) 42 | request_id = random_uuid() 43 | results_generator = self.engine.generate(prompt, sampling_params, request_id) 44 | 45 | async for output in results_generator: 46 | final_output = output 47 | responses = [] 48 | for output in final_output.outputs: 49 | responses.append(output.text) 50 | return responses 51 | -------------------------------------------------------------------------------- /examples/fastapi-embeddings-rag/requirements.txt: -------------------------------------------------------------------------------- 1 | asyncio 2 | fastapi[standard] 3 | lancedb==0.11.0 4 | runhouse[aws]==0.0.32 5 | -------------------------------------------------------------------------------- /examples/flux/flux.py: -------------------------------------------------------------------------------- 1 | import kubetorch as kt 2 | 3 | # ## Create Flux Pipeline with Kubetorch 4 | # First, we define a class that will hold the model and allow us to send prompts to it. 5 | # To deploy it as a service, we simply decorate the class to send it to our cluster 6 | # when we call `kubetorch deploy` in the CLI. 
7 | img = ( 8 | kt.images.pytorch() 9 | .pip_install( 10 | [ 11 | "diffusers", 12 | "transformers[sentencepiece]", 13 | "accelerate", 14 | ] 15 | ) 16 | .sync_secrets(["huggingface"]) 17 | ) 18 | 19 | 20 | @kt.compute( 21 | gpus="A10G:1", memory="64", image=img 22 | ) # Send to compute with an A10 GPU and 64GB of memory 23 | @kt.distribute("auto", num_replicas=(1, 4)) # Autoscale between 1 and 4 replicas 24 | class FluxPipeline: 25 | def __init__( 26 | self, 27 | model_id: str = "black-forest-labs/FLUX.1-schnell", # Schenll is smaller and faster while dev is more powerful but slower 28 | ): 29 | super().__init__() 30 | self.model_id = model_id 31 | self.pipeline = None 32 | 33 | def _load_pipeline(self): 34 | import torch 35 | from diffusers import FluxPipeline 36 | 37 | if not self.pipeline: 38 | self.pipeline = FluxPipeline.from_pretrained( 39 | self.model_id, torch_dtype=torch.bfloat16, use_safetensors=True 40 | ) 41 | self.pipeline.enable_sequential_cpu_offload() # Optimizes memory usage to allow the model to fit and inference on an A10 which has 24GB of memory 42 | 43 | def generate(self, input_prompt: str, **parameters): 44 | import torch 45 | 46 | torch.cuda.empty_cache() 47 | 48 | if not self.pipeline: 49 | self._load_pipeline() 50 | 51 | image = self.pipeline( 52 | input_prompt, 53 | guidance_scale=0.0, 54 | num_inference_steps=4, 55 | max_sequence_length=256, 56 | generator=torch.Generator("cpu").manual_seed(0), 57 | ).images[0] 58 | 59 | return image 60 | 61 | 62 | if __name__ == "__main__": 63 | # We can load the remote model from anywhere that has access to the cluster 64 | flux_pipeline = FluxPipeline.from_name("flux") 65 | 66 | # We can call the `generate` method on the model class instance if it were running locally. 67 | # This will run the function on the remote cluster and return the response to our local machine automatically. 68 | # We can also call this from a different machine or script and create composite ML systems. 
69 | prompt = "A woman runs through a large, grassy field towards a house." 70 | response = flux_pipeline.generate(prompt) 71 | response.save("flux-schnell.png") 72 | response.show() 73 | -------------------------------------------------------------------------------- /examples/flux/readme.md: -------------------------------------------------------------------------------- 1 | # Deploy Flux1 Schnell on AWS EC2 2 | 3 | See a more [rich explanation](https://www.run.house/guides/host-and-run-flux1-image-genai-aws) 4 | of this example on our site. 5 | 6 | This example demonstrates how to deploy a 7 | [Flux.1 model from Hugging Face](https://huggingface.co/black-forest-labs/FLUX.1-schnell) 8 | on AWS EC2 using Runhouse. Schnell is smaller than their Dev version, but fits easily onto a single A10G. 9 | 10 | ## Setup credentials and dependencies 11 | 12 | Optionally, set up a virtual environment: 13 | ```shell 14 | $ conda create -n rh-flux python=3.11 15 | $ conda activate rh-flux 16 | ``` 17 | Install the few required dependencies: 18 | ```shell 19 | $ pip install -r requirements.txt 20 | ``` 21 | 22 | We'll be launching an AWS EC2 instance via [SkyPilot](https://github.com/skypilot-org/skypilot), so we need to 23 | make sure our AWS credentials are set up: 24 | ```shell 25 | $ aws configure 26 | $ sky check 27 | ``` 28 | 29 | After that, you can just run the example: 30 | ```shell 31 | $ python flux.py 32 | ``` 33 | -------------------------------------------------------------------------------- /examples/flux/requirements.txt: -------------------------------------------------------------------------------- 1 | runhouse 2 | -------------------------------------------------------------------------------- /examples/hello-world/hello_world.py: -------------------------------------------------------------------------------- 1 | import kubetorch as kt 2 | 3 | # ## Write your code 4 | # This regular Python code is developed locally, and then 5 | # deployed to Kubernetes with 
Kubetorch. On first execution, it 6 | # may take a little time to allocate compute; subsequently, changes to this function 7 | # will hot sync instantaneously for interactive development. Then, the dispatch 8 | # can be scheduled or put into CI as-is to reach production. 9 | def hello_world(num_prints=1): 10 | for print_num in range(num_prints): 11 | print("Hello world ", print_num) 12 | 13 | 14 | # ## Define compute, deploy, and call 15 | # You define compute with kt.Compute(), and then send the `hello_world` 16 | # function to that compute to run. You can see that you get back a callable 17 | # with the same function signature as the original, and you can call it identically. 18 | if __name__ == "__main__": 19 | compute = kt.Compute(cpus=1) 20 | 21 | remote_hello = kt.fn(hello_world).to(compute) 22 | 23 | results = remote_hello(5) 24 | -------------------------------------------------------------------------------- /examples/hello-world/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/examples/hello-world/requirements.txt -------------------------------------------------------------------------------- /examples/hpo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/examples/hpo/__init__.py -------------------------------------------------------------------------------- /examples/hpo/hpo.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import time 3 | 4 | import numpy as np 5 | 6 | import runhouse as rh 7 | 8 | NUM_WORKERS = 8 9 | NUM_JOBS = 30 10 | 11 | 12 | def train_fn(step, width, height): 13 | time.sleep(5) 14 | return (0.1 + width * step / 100) ** (-1) + height * 0.1 15 | 16 | 17 | def generate_params(): 18 | return 
{"width": np.random.uniform(0, 1), "height": np.random.uniform(0, 1)} 19 | 20 | 21 | async def find_best_params(): 22 | cluster = rh.compute( 23 | name="rh-4x16-cpu", instance_type="CPU:16", num_nodes=4, provider="aws" 24 | ).up_if_not() 25 | 26 | remote_train_fn = rh.function(train_fn).to(cluster) 27 | available_worker_fns = [remote_train_fn] + remote_train_fn.replicate( 28 | NUM_WORKERS - 1 29 | ) 30 | 31 | async def run_job(step): 32 | while not available_worker_fns: 33 | await asyncio.sleep(1) 34 | worker_fn = available_worker_fns.pop(0) 35 | next_point_to_probe = generate_params() 36 | 37 | print(f"Calling step {step} on point {next_point_to_probe}") 38 | target = await worker_fn(step=step, **next_point_to_probe, run_async=True) 39 | print(f"Returned step {step} with value {target}") 40 | 41 | available_worker_fns.append(worker_fn) 42 | return next_point_to_probe, target 43 | 44 | results = await asyncio.gather( 45 | *[run_job(counter) for counter in range(NUM_JOBS)], return_exceptions=True 46 | ) 47 | 48 | max_result = max(results, key=lambda x: x[1]) 49 | print(f"Optimization finished. 
Best parameters found: {max_result}") 50 | 51 | 52 | if __name__ == "__main__": 53 | asyncio.run(find_best_params()) 54 | -------------------------------------------------------------------------------- /examples/hpo/hpo_bayes_opt.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import runhouse as rh 4 | 5 | from bayes_opt import BayesianOptimization 6 | 7 | NUM_WORKERS = 8 8 | NUM_JOBS = 30 9 | 10 | 11 | def train_fn(x, y): 12 | return -(x**2) - (y - 1) ** 2 + 1 13 | 14 | 15 | if __name__ == "__main__": 16 | img = rh.Image("worker_image").pip_install(["bayesian-optimization"]) 17 | 18 | cluster = rh.compute( 19 | name="rh-4x16-cpu", 20 | instance_type="CPU:4+", 21 | num_nodes=2, 22 | provider="kubernetes", 23 | image=img, 24 | ).up_if_not() 25 | 26 | remote_train_fn = rh.function(train_fn).to(cluster) 27 | train_fn_pool = remote_train_fn.distribute( 28 | "pool", num_replicas=NUM_WORKERS, replicas_per_node=NUM_WORKERS // 2 29 | ) 30 | 31 | optimizer = BayesianOptimization( 32 | f=partial(train_fn_pool, stream_logs=False), 33 | pbounds={"x": (-2, 2), "y": (-3, 3)}, 34 | verbose=2, 35 | random_state=1, 36 | ) 37 | optimizer.maximize(init_points=NUM_WORKERS, n_iter=NUM_JOBS) 38 | print(f"Optimization finished. 
Best parameters found: {optimizer.max}") 39 | -------------------------------------------------------------------------------- /examples/hpo/hpo_bayes_opt_low_level.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import time 3 | 4 | import runhouse as rh 5 | 6 | NUM_WORKERS = 8 7 | NUM_JOBS = 30 8 | 9 | 10 | def train_fn(step, width, height): 11 | time.sleep(5) 12 | return (0.1 + width * step / 100) ** (-1) + height * 0.1 13 | 14 | 15 | async def find_best_params(): 16 | from bayes_opt import BayesianOptimization, UtilityFunction 17 | 18 | img = rh.Image("worker_image").pip_install(["bayesian-optimization"]) 19 | 20 | cluster = rh.compute( 21 | name="rh-4x16-cpu", 22 | instance_type="CPU:16", 23 | num_nodes=4, 24 | provider="aws", 25 | image=img, 26 | ).up_if_not() 27 | 28 | worker_fns = rh.function(train_fn).to(cluster).replicate(replicas=NUM_WORKERS) 29 | 30 | optimizer = BayesianOptimization( 31 | f=None, 32 | pbounds={"width": (0, 20), "height": (-100, 100)}, 33 | verbose=2, 34 | random_state=1, 35 | ) 36 | utility = UtilityFunction(kind="ucb", kappa=2.5, xi=0.0) 37 | 38 | async def run_job(step): 39 | while not worker_fns: 40 | await asyncio.sleep(1) 41 | worker_fn = worker_fns.pop(0) 42 | hyperparams = optimizer.suggest(utility) 43 | 44 | print(f"Calling step {step} on point {hyperparams}") 45 | target = await worker_fn(step=step, **hyperparams, run_async=True) 46 | print(f"Returned step {step} with value {target}") 47 | 48 | optimizer.register(hyperparams, target) 49 | utility.update_params() 50 | 51 | worker_fns.append(worker_fn) 52 | 53 | futs = [run_job(counter) for counter in range(NUM_JOBS)] 54 | await asyncio.gather(*futs, return_exceptions=True) 55 | 56 | print(f"Optimization finished. 
Best parameters found: {optimizer.max}") 57 | 58 | 59 | if __name__ == "__main__": 60 | asyncio.run(find_best_params()) 61 | -------------------------------------------------------------------------------- /examples/inference_llama70b/llama70b_vllm.py: -------------------------------------------------------------------------------- 1 | import kubetorch as kt 2 | from vllm import LLM, SamplingParams 3 | 4 | 5 | img = ( 6 | kt.images.pytorch() 7 | .pip_install(["transformers", "vllm"]) 8 | .sync_secrets(["huggingface"]) 9 | ) 10 | 11 | 12 | @kt.compute(gpus="L4:8", image=img, name="llama70b") 13 | @kt.distribute("auto", num_replicas=(0, 4)) 14 | class Llama70B_vLLM: 15 | def __init__(self, num_gpus, model_id="meta-llama/Llama-3.3-70B-Instruct"): 16 | self.model_id = model_id 17 | self.model = None 18 | self.sampling_params = None 19 | self.num_gpus = num_gpus 20 | 21 | def load_model(self, temperature=1, top_p=0.9, max_tokens=256, min_tokens=32): 22 | self.sampling_params = SamplingParams( 23 | temperature=temperature, 24 | top_p=top_p, 25 | max_tokens=max_tokens, 26 | min_tokens=min_tokens, 27 | ) 28 | print("loading model") 29 | self.model = LLM( 30 | self.model_id, 31 | tensor_parallel_size=self.num_gpus, 32 | dtype="bfloat16", 33 | trust_remote_code=True, 34 | max_model_len=8192, 35 | ) 36 | print("model loaded") 37 | 38 | def generate(self, queries, temperature=1, top_p=0.95): 39 | if self.model is None: 40 | self.load_model(temperature, top_p) 41 | 42 | outputs = self.model.generate(queries, self.sampling_params) 43 | return outputs 44 | 45 | 46 | if __name__ == "__main__": 47 | llama = Llama70B_vLLM.from_name("llama70b") 48 | 49 | queries = [ 50 | "What is the best type of bread in the world?", 51 | "What are some cheeses that go with bread?", 52 | "What is the best way to make a sandwich?", 53 | ] 54 | outputs = llama.generate(queries) 55 | for output in outputs: 56 | prompt = output.prompt 57 | generated_text = output.outputs[0].text 58 | print(f"Prompt: 
{prompt}, Generated text: {generated_text}") 59 | -------------------------------------------------------------------------------- /examples/langchain-rag-ec2/README.md: -------------------------------------------------------------------------------- 1 | # Deploy a Langchain RAG as a service on AWS EC2 2 | 3 | This is an example of easily deploying [Langchain's Quickstart RAG app](https://python.langchain.com/docs/use_cases/question_answering/quickstart) 4 | as a service on AWS EC2 using Runhouse. 5 | 6 | ## Setup credentials and dependencies 7 | 8 | Optionally, set up a virtual environment: 9 | ```shell 10 | $ conda create -n langchain-rag python=3.9.15 11 | $ conda activate langchain-rag 12 | ``` 13 | Install Runhouse, the only library needed to run this script locally: 14 | ```shell 15 | $ pip install "runhouse[aws]" 16 | ``` 17 | 18 | We'll be launching an AWS EC2 instance via [SkyPilot](https://github.com/skypilot-org/skypilot), so we need to 19 | make sure our AWS credentials are set up: 20 | ```shell 21 | $ aws configure 22 | $ sky check 23 | ``` 24 | 25 | We'll be hitting OpenAI's API, so we need to set up our OpenAI API key: 26 | ```shell 27 | $ export OPENAI_API_KEY= 28 | ``` 29 | 30 | After that, you can just run the example: 31 | ```shell 32 | $ python langchain_rag.py 33 | ``` 34 | -------------------------------------------------------------------------------- /examples/lightning-resnet/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | kubetorch 3 | boto3 4 | lightning 5 | datasets 6 | torchvision 7 | -------------------------------------------------------------------------------- /examples/llama2-13b-ec2/README.md: -------------------------------------------------------------------------------- 1 | # Deploy Llama2 13B Chat Model Inference on AWS EC2 2 | 3 | See a more [rich explanation](https://www.run.house/examples/llama2-chat-model-inference-aws-ec2) 4 | of this example on our site. 
5 | 6 | This example demonstrates how to deploy a 7 | [LLama2 13B model from Hugging Face](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) 8 | on AWS EC2 using Runhouse. 9 | 10 | ## Setup credentials and dependencies 11 | 12 | Optionally, set up a virtual environment: 13 | ```shell 14 | $ conda create -n llama-demo-apps python=3.8 15 | $ conda activate llama-demo-apps 16 | ``` 17 | Install the few required dependencies: 18 | ```shell 19 | $ pip install -r requirements.txt 20 | ``` 21 | 22 | We'll be launching an AWS EC2 instance via [SkyPilot](https://github.com/skypilot-org/skypilot), so we need to 23 | make sure our AWS credentials are set up: 24 | ```shell 25 | $ aws configure 26 | $ sky check 27 | ``` 28 | We'll be downloading the Llama2 model from Hugging Face, so we need to set up our Hugging Face token: 29 | ```shell 30 | $ export HF_TOKEN= 31 | ``` 32 | 33 | After that, you can just run the example: 34 | ```shell 35 | $ python llama2_ec2.py 36 | ``` 37 | -------------------------------------------------------------------------------- /examples/llama2-13b-ec2/requirements.txt: -------------------------------------------------------------------------------- 1 | runhouse[aws] 2 | torch 3 | -------------------------------------------------------------------------------- /examples/llama2-fine-tuning-with-lora/README.md: -------------------------------------------------------------------------------- 1 | # Fine Tune Llama 2 with LoRA on AWS EC2 2 | 3 | See a more [rich explanation](https://www.run.house/examples/llama2-fine-tuning-with-lora) 4 | of this example on our site. 5 | 6 | This example demonstrates how to fine tune a model using 7 | [Llama 2](https://huggingface.co/NousResearch/Llama-2-7b-chat-hf) and 8 | [LoRA](https://huggingface.co/docs/peft/main/en/conceptual_guides/lora) on AWS EC2 using Runhouse. 
9 | 10 | ## Setup credentials and dependencies 11 | 12 | Install the few required dependencies: 13 | ```shell 14 | $ pip install -r requirements.txt 15 | ``` 16 | 17 | We'll be launching an AWS EC2 instance via [SkyPilot](https://github.com/skypilot-org/skypilot), so we need to 18 | make sure our AWS credentials are set up: 19 | ```shell 20 | $ aws configure 21 | $ sky check 22 | ``` 23 | 24 | After that, you can just run the example: 25 | ```shell 26 | $ python llama2_fine_tuning.py 27 | ``` 28 | -------------------------------------------------------------------------------- /examples/llama2-fine-tuning-with-lora/requirements.txt: -------------------------------------------------------------------------------- 1 | runhouse[aws] 2 | -------------------------------------------------------------------------------- /examples/llama2-with-tgi-aws-inferentia2/README.md: -------------------------------------------------------------------------------- 1 | # Deploy Llama 2 7B Model with TGI on AWS Inferentia 2 | 3 | See a more [rich explanation](https://www.run.house/examples/llama-tgi-inference-on-aws-inferentia) 4 | of this example on our site. 5 | 6 | This example demonstrates how to deploy a [Llama 7B model](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) using 7 | [TGI](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference) on AWS Inferentia 8 | using Runhouse, specifically with the [AWS Neuron SDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/). 
9 | 10 | ## Setup credentials and dependencies 11 | Install the required dependencies: 12 | ```shell 13 | $ pip install -r requirements.txt 14 | ``` 15 | 16 | We'll be launching an AWS Inferentia instance via [SkyPilot](https://github.com/skypilot-org/skypilot), so we need to 17 | make sure our AWS credentials are set up: 18 | ```shell 19 | $ aws configure 20 | $ sky check 21 | ``` 22 | -------------------------------------------------------------------------------- /examples/llama2-with-tgi-aws-inferentia2/requirements.txt: -------------------------------------------------------------------------------- 1 | runhouse[aws] 2 | -------------------------------------------------------------------------------- /examples/llama2-with-tgi-ec2/README.md: -------------------------------------------------------------------------------- 1 | # Deploy Llama 2 7B Model with TGI on AWS EC2 2 | 3 | See a more [rich explanation](https://www.run.house/examples/llama-tgi-inference-on-aws-ec2) 4 | of this example on our site. 5 | 6 | This example demonstrates how to deploy a 7 | [Llama 7B model](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) using 8 | [TGI](https://huggingface.co/docs/text-generation-inference/messages_api) on AWS EC2 using Runhouse. 
9 | 10 | ## Setup credentials and dependencies 11 | Install the required dependencies: 12 | ```shell 13 | $ pip install -r requirements.txt 14 | ``` 15 | 16 | We'll be launching an AWS EC2 instance via [SkyPilot](https://github.com/skypilot-org/skypilot), so we need to make 17 | sure our AWS credentials are set up: 18 | ```shell 19 | $ aws configure 20 | $ sky check 21 | ``` 22 | -------------------------------------------------------------------------------- /examples/llama2-with-tgi-ec2/requirements.txt: -------------------------------------------------------------------------------- 1 | runhouse[aws] 2 | -------------------------------------------------------------------------------- /examples/llama3-8b-ec2/README.md: -------------------------------------------------------------------------------- 1 | # Deploy Llama3 8B Chat Model Inference on AWS EC2 2 | 3 | See a more [rich explanation](https://www.run.house/examples/llama3-8b-chat-model-inference-aws-ec2) 4 | of this example on our site. 5 | 6 | This example demonstrates how to deploy a 7 | [LLama2 13B model from Hugging Face](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) 8 | on AWS EC2 using Runhouse. 9 | 10 | Make sure to sign the waiver on the model page so that you can access it. 
11 | 12 | ## Setup credentials and dependencies 13 | 14 | Optionally, set up a virtual environment: 15 | ```shell 16 | $ conda create -n llama3-rh python=3.9.15 17 | $ conda activate llama3-rh 18 | ``` 19 | Install the few required dependencies: 20 | ```shell 21 | $ pip install -r requirements.txt 22 | ``` 23 | 24 | We'll be launching an AWS EC2 instance via [SkyPilot](https://github.com/skypilot-org/skypilot), so we need to 25 | make sure our AWS credentials are set up: 26 | ```shell 27 | $ aws configure 28 | $ sky check 29 | ``` 30 | We'll be downloading the Llama2 model from Hugging Face, so we need to set up our Hugging Face token: 31 | ```shell 32 | $ export HF_TOKEN= 33 | ``` 34 | 35 | After that, you can just run the example: 36 | ```shell 37 | $ python llama3_ec2.py 38 | ``` 39 | -------------------------------------------------------------------------------- /examples/llama3-8b-ec2/requirements.txt: -------------------------------------------------------------------------------- 1 | runhouse[aws] 2 | torch 3 | -------------------------------------------------------------------------------- /examples/llama3-8b-tgi-ec2/README.md: -------------------------------------------------------------------------------- 1 | # Deploy Llama 3 8B with TGI on AWS EC2 2 | This example demonstrates how to deploy a Meta Llama 3 8B model from Hugging Face with 3 | [TGI](https://huggingface.co/docs/text-generation-inference/messages_api) on AWS EC2 using Runhouse. 4 | 5 | 6 | Make sure to sign the waiver on the [Hugging Face model page](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) so that you can access it. 
7 | 8 | ## Setup credentials and dependencies 9 | Install the required dependencies: 10 | ```shell 11 | $ pip install -r requirements.txt 12 | ``` 13 | 14 | We'll be launching an AWS EC2 instance via [SkyPilot](https://github.com/skypilot-org/skypilot), so we need to make sure our AWS credentials are set up: 15 | ```shell 16 | $ aws configure 17 | $ sky check 18 | ``` 19 | -------------------------------------------------------------------------------- /examples/llama3-8b-tgi-ec2/requirements.txt: -------------------------------------------------------------------------------- 1 | docker 2 | runhouse[aws] 3 | -------------------------------------------------------------------------------- /examples/llama3-fine-tuning-lora/README.md: -------------------------------------------------------------------------------- 1 | # Fine-Tune Llama 3 with LoRA on AWS EC2 2 | 3 | This example demonstrates how to fine-tune a Llama 3 8B model using 4 | [LoRA](https://huggingface.co/docs/peft/main/en/conceptual_guides/lora) on AWS EC2 using Runhouse. 5 | 6 | Make sure to sign the waiver on the [Hugging Face model page](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) 7 | so that you can access it. 
8 | 9 | ## Setup credentials and dependencies 10 | 11 | Install the few required dependencies: 12 | ```shell 13 | $ pip install -r requirements.txt 14 | ``` 15 | 16 | We'll be launching an AWS EC2 instance via [SkyPilot](https://github.com/skypilot-org/skypilot), so we need to 17 | make sure our AWS credentials are set up: 18 | ```shell 19 | $ aws configure 20 | $ sky check 21 | ``` 22 | 23 | After that, you can just run the example: 24 | ```shell 25 | $ python llama3_fine_tuning.py 26 | ``` 27 | -------------------------------------------------------------------------------- /examples/llama3-fine-tuning-lora/requirements.txt: -------------------------------------------------------------------------------- 1 | runhouse[aws] 2 | torch 3 | datasets 4 | peft 5 | transformers 6 | trl 7 | -------------------------------------------------------------------------------- /examples/llama3-vllm-gcp/README.md: -------------------------------------------------------------------------------- 1 | # Run Llama 3 8B Model Inference with vLLM on GCP 2 | 3 | This example demonstrates how to run a Llama 3 8B model from Hugging Face with vLLM using Runhouse. 4 | 5 | Make sure to sign the waiver on the [Hugging Face model page](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) 6 | so that you can access it. 7 | 8 | ## Setup credentials and dependencies 9 | 10 | Optionally, set up a virtual environment: 11 | ```shell 12 | $ conda create -n llama3-rh python=3.9.15 13 | $ conda activate llama3-rh 14 | ``` 15 | 16 | Install the required dependencies: 17 | 18 | ```shell 19 | $ pip install -r requirements.txt 20 | ``` 21 | 22 | If you do not have a Runhouse account and want to launch an instance via [SkyPilot](https://github.com/skypilot-org/skypilot), make sure your credentials are set up. You may be prompted to pick a cloud project to use after running `gcloud init`. 
"""Check the training status of a fine-tuner instance already running on a cluster.

Looks up (or launches) the shared GPU cluster and queries it for a previously
created remote fine-tuner object by name.
"""
import runhouse as rh

# We check if we have already created a "rh_finetuner" on the remote which is an *instance* of the remote fine tuner class
cluster = rh.compute(
    name="rh-a10x",
    instance_type="A10G:1",
    memory="32+",
    provider="aws",
).up_if_not()

# Name under which the fine-tuner instance was registered on the cluster
# (presumably by the companion LoraFineTuner.py script — confirm there).
fine_tuner_remote_name = "rh_finetuner"
# NOTE(review): remote=True appears to return a reference/stub to the remote
# object rather than copying it back locally — verify against Runhouse docs.
fine_tuner_remote = cluster.get(fine_tuner_remote_name, default=None, remote=True)

# Check what the training status is on remote
if fine_tuner_remote is not None:
    print(fine_tuner_remote.get_training_status())
(LoraFineTuner.py) in **regular Python** and launch remote GPU compute to do the fine-tuning. 3 | 4 | In particular, we show how you can start the fine tuning and interact with the fine-tuning class (a remote object) through regular Python or a Notebook. Runhouse lets you work *locally* with *remote objects* defined by regular code and edited locally, compared to tooling like hosted notebooks which let you *work locally while SSH'ed into a remote setting.* This offers a few distinct advantages: 5 | * **Real compute and real data:** ML Engineers and data scientists do not need to launch projects on toy compute offered in a research environment. 6 | * **Real code:** Rather than working on Notebooks (because they have to), your team is writing code and developing locally just like a normal software team. The only difference is dispatching the work for remote computation since the local machine doesn't have the right hardware. 7 | * **Fast research to production:** The work done while writing and testing the class is essentially enough to bring the work to production as well. There is no costly rebuilding of the same code a second time to work in a Pipeline. 8 | -------------------------------------------------------------------------------- /examples/mistral-with-tgi-ec2/README.md: -------------------------------------------------------------------------------- 1 | # Deploy Mistral's 7B Model with TGI on AWS EC2 2 | 3 | See a more [rich explanation](https://www.run.house/examples/mistral-tgi-inference-on-aws-ec2) 4 | of this example on our site. 5 | 6 | This example demonstrates how to deploy a 7 | [TGI model](https://huggingface.co/docs/text-generation-inference/messages_api) on AWS EC2 using Runhouse. 8 | This example draws inspiration from 9 | [Huggingface's tutorial on AWS SageMaker](https://huggingface.co/blog/text-generation-inference-on-inferentia2). 
10 | Zephyr is a 7B fine-tuned version of [Mistral's 7B-v0.1 model](https://huggingface.co/mistralai/Mistral-7B-v0.1). 11 | 12 | ## Setup credentials and dependencies 13 | Install the required dependencies: 14 | ```shell 15 | $ pip install -r requirements.txt 16 | ``` 17 | 18 | We'll be launching an AWS EC2 instance via [SkyPilot](https://github.com/skypilot-org/skypilot), so we need to make 19 | sure our AWS credentials are set up: 20 | ```shell 21 | $ aws configure 22 | $ sky check 23 | ``` 24 | -------------------------------------------------------------------------------- /examples/mistral-with-tgi-ec2/requirements.txt: -------------------------------------------------------------------------------- 1 | openai 2 | runhouse[aws] 3 | -------------------------------------------------------------------------------- /examples/parallel-hf-embedding/README.md: -------------------------------------------------------------------------------- 1 | # An embarrassingly parallel embedding task with Hugging Face models on AWS EC2 2 | 3 | This example demonstrates how to use Runhouse primitives to embed a large number of websites in parallel. 4 | We use a [BGE large model from Hugging Face](https://huggingface.co/BAAI/bge-large-en-v1.5) and load it via 5 | the `SentenceTransformer` class from the `huggingface` library. 
6 | 7 | ## Setup credentials and dependencies 8 | 9 | Optionally, set up a virtual environment: 10 | ```shell 11 | $ conda create -n parallel-embed python=3.9.15 12 | $ conda activate parallel-embed 13 | ``` 14 | Install the few required dependencies: 15 | ```shell 16 | $ pip install -r requirements.txt 17 | ``` 18 | We'll be launching an AWS EC2 instance via [SkyPilot](https://github.com/skypilot-org/skypilot), so we need to 19 | make sure our AWS credentials are set up: 20 | ```shell 21 | $ aws configure 22 | $ sky check 23 | ``` 24 | 25 | ## Some utility functions 26 | 27 | We import `runhouse` and other utility libraries; only the ones that are needed to run the script locally. 28 | Imports of libraries that are needed on the remote machine (in this case, the `huggingface` dependencies) 29 | can happen within the functions that will be sent to the Runhouse cluster. 30 | -------------------------------------------------------------------------------- /examples/parallel-hf-embedding/requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4 2 | runhouse[aws] 3 | torch 4 | tqdm 5 | -------------------------------------------------------------------------------- /examples/pytorch-distributed-basic/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch Multi-node Distributed Training 2 | 3 | A basic example showing how to use Runhouse to Pythonically run a PyTorch distributed training script on a 4 | cluster of GPUs. Often distributed training is launched from multiple parallel CLI commands 5 | (`python -m torch.distributed.launch ...`), each spawning separate training processes (ranks). 
6 | Here, we're creating each process as a separate worker on the cluster, sending our training function 7 | into each worker, and calling the replicas concurrently to trigger coordinated multi-node training 8 | (`torch.distributed.init_process_group` causes each to wait for all to connect, and sets up the distributed 9 | communication). We're using two single-GPU instances (and therefore two ranks) for simplicity, but we've included 10 | the basic logic to handle multi-GPU nodes as well, where you'd add more worker processes per node and set `device_ids` 11 | accordingly. 12 | 13 | Despite it being common to use a launcher script to start distributed training, this approach is more flexible and 14 | allows for more complex orchestration, such as running multiple training jobs concurrently, handling exceptions, 15 | running distributed training alongside other tasks on the same cluster. It's also significantly easier to debug 16 | and monitor, as you can see the output of each rank in real-time and get stack traces if a worker fails. 
17 | -------------------------------------------------------------------------------- /examples/pytorch-distributed-basic/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/examples/pytorch-distributed-basic/__init__.py -------------------------------------------------------------------------------- /examples/pytorch-distributed-basic/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | kubetorch 3 | -------------------------------------------------------------------------------- /examples/pytorch-resnet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/examples/pytorch-resnet/__init__.py -------------------------------------------------------------------------------- /examples/pytorch-resnet/requirements.txt: -------------------------------------------------------------------------------- 1 | runhouse 2 | torch 3 | torchvision 4 | datasets 5 | -------------------------------------------------------------------------------- /examples/pytorch-torchvision-mnist-training/README.md: -------------------------------------------------------------------------------- 1 | # Deploy and Train a Model with Torch 2 | This example demonstrates how to use the `SimpleTrainer` class to train and test a machine learning model using PyTorch and the MNIST dataset. The `SimpleTrainer` class handles model training, evaluation, and prediction tasks and shows you how you can send model classes to train and execute on remote compute. 
import os
import posixpath

import boto3


# Download data from S3
def download_folder_from_s3(bucket_name, s3_folder_prefix, local_folder_path):
    """Download every object under ``s3_folder_prefix`` in ``bucket_name``
    into ``local_folder_path``, recreating the folder structure locally.

    Uses a paginator so prefixes containing more than 1000 objects are
    fully listed.
    """
    s3 = boto3.client("s3")

    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket_name, Prefix=s3_folder_prefix):
        # Pages with no matching objects carry no "Contents" key.
        if "Contents" in page:
            for obj in page["Contents"]:
                s3_key = obj["Key"]
                # S3 keys always use "/" separators, so compute the relative
                # key with posixpath (os.path would mishandle it on Windows),
                # then map its segments onto the local OS separator.
                relative_key = posixpath.relpath(s3_key, s3_folder_prefix)
                local_path = os.path.join(local_folder_path, *relative_key.split("/"))

                # dirname can be "" for a top-level file under a relative
                # destination; os.makedirs("") raises, so guard it.
                local_dir = os.path.dirname(local_path)
                if local_dir:
                    os.makedirs(local_dir, exist_ok=True)
                s3.download_file(bucket_name, s3_key, local_path)
                print(f"Downloaded {s3_key} to {local_path}")


# download_folder_from_s3('rh-demo-external', 'your/s3/folder/prefix', '/path/to/local/folder', 'your-access-key-id', 'your-secret-access-key')


# Upload data to S3 bucket
def upload_folder_to_s3(local_folder_path, bucket_name, s3_folder_prefix):
    """Upload the directory tree rooted at ``local_folder_path`` to
    ``s3://bucket_name/s3_folder_prefix``, mirroring the local layout."""
    s3 = boto3.client("s3")

    for root, dirs, files in os.walk(local_folder_path):
        for file in files:
            local_path = os.path.join(root, file)
            relative_path = os.path.relpath(local_path, local_folder_path)
            # Build the object key with "/" separators regardless of OS;
            # os.path.join would emit "\" on Windows, which S3 treats as a
            # literal character in the key, not a folder delimiter.
            s3_path = posixpath.join(s3_folder_prefix, *relative_path.split(os.sep))

            s3.upload_file(local_path, bucket_name, s3_path)
            print(f"Uploaded {local_path} to s3://{bucket_name}/{s3_path}")


# upload_folder_to_s3('/path/to/local/folder', 'rh-demo-external', 'your/s3/folder/prefix', 'your-access-key-id', 'your-secret-access-key')
"""Run each step of the multicloud Airflow pipeline locally, without Airflow.

Because execution is offloaded to remote compute, every callable can be
invoked directly from a local machine or notebook for interactive debugging;
the Airflow DAG only needs to orchestrate these already-working steps.
"""

import logging

from airflow_multicloud_torch_train import (
    access_data_callable,
    bring_up_cluster_callable,
    down_cluster,
    download_s3_data_callable,
    preprocess_data_callable,
    train_model_callable,
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

if __name__ == "__main__":
    logger.info("Starting the pipeline...")

    logger.info("Step 1: Bring up cluster")
    # CPU box (AWS) handles data access/preprocessing; GPU box (GCP) trains.
    cpu_cluster_config = {
        "cluster_name": "cpu-cluster",
        "instance_type": "r6i.xlarge",
        "provider": "aws",
    }
    gpu_cluster_config = {
        "cluster_name": "gpu-cluster",
        "gpus": "L4:1",
        "provider": "gcp",
    }

    bring_up_cluster_callable(**cpu_cluster_config)
    logger.info("Step 2: Access data")
    access_data_callable(**cpu_cluster_config)

    logger.info("Step 3: Preprocess data")
    preprocess_data_callable(**cpu_cluster_config)

    logger.info("Step 4: Train model")
    bring_up_cluster_callable(**gpu_cluster_config)
    download_s3_data_callable(**gpu_cluster_config)
    train_model_callable(**gpu_cluster_config)

    logger.info("Pipeline completed.")

    # Tear both clusters down so nothing is left running (and billing).
    down_cluster(**gpu_cluster_config)
    down_cluster(**cpu_cluster_config)
    logger.info("Cluster successfully downed.")
"""Run each step of the Airflow pipeline locally, without Airflow.

Because execution is offloaded to remote compute, every callable can be
invoked directly from a local machine or notebook for interactive debugging;
the Airflow DAG only needs to orchestrate these already-working steps.
"""

import logging

from airflow_example_torch_train import (
    access_data_callable,
    bring_up_cluster_callable,
    down_cluster,
    train_model_callable,
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

if __name__ == "__main__":
    logger.info("Starting the pipeline...")

    logger.info("Step 1: Bring up cluster")
    bring_up_cluster_callable()

    logger.info("Step 2: Access data")
    access_data_callable()

    logger.info("Step 3: Train model")
    train_model_callable()

    logger.info("Pipeline completed.")

    # Tear the cluster down so nothing is left running (and billing).
    down_cluster()
    logger.info("Cluster successfully downed.")
11 | * Send the code for remote execution with Runhouse, and figure out whether the code works, debugging it interactively. Runhouse lets you send the code in seconds, and streams logs back. You can work on remote as if it were local. 12 | * Once you are satisfied with your code, you can write the callables for an Airflow PythonOperator. The code that is actually in the Airflow DAG is the **minimal code** to call out to already working Classes and Functions, defining the order of the steps (or you can even have a one-step Airflow DAG, making Airflow purely for scheduling and observability) 13 | * And you can easily iterate further on your code, or test the pipeline end-to-end from local with no Airflow participation 14 | 15 | 16 | **Examples** 17 | * **TorchBasicExample.py:** Normally written Python code with no DSL, defining a simple neural network, in the parent folder. 18 | * **local_run_of_callables.py:** An example of how Runhouse lets you test your functions and Airflow callables from local, since it's all happening on "remote" execution. You can update code, and experiment with calling just that step. 19 | * **airflow_example_torch_train.py:** The Airflow DAG, which simply orchestrates the pipeline. 20 | -------------------------------------------------------------------------------- /examples/pytorch-torchvision-mnist-training/airflow/requirements.txt: -------------------------------------------------------------------------------- 1 | runhouse[aws] 2 | torch 3 | torchvision 4 | airflow 5 | -------------------------------------------------------------------------------- /examples/pytorch-torchvision-mnist-training/my_simple_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | # We define a model class. We define a very basic feedforward neural network with three fully connected layers. 
class SimpleNN(nn.Module):
    """A minimal feedforward MNIST classifier.

    Three fully connected layers map a flattened 28x28 image
    (784 features) through 128 and 64 hidden units to 10 class logits.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28 * 28, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)
        # Record the best available device so callers can move the model.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def forward(self, x):
        """Return class logits for a batch of 28x28 images (any layout)."""
        flat = x.view(-1, 28 * 28)  # collapse each image to a 784-vector
        hidden = F.relu(self.fc2(F.relu(self.fc1(flat))))
        return self.fc3(hidden)
6 | # You can use any cloud you want, or existing compute 7 | cluster = rh.compute( 8 | name="a10g-cluster", instance_type="A10G:1", provider="aws" 9 | ).up_if_not() 10 | 11 | # Get our remote TorchTrainer by name 12 | model = cluster.get("torch_model", default=None, remote=True) 13 | 14 | # Get the training status of the model 15 | print(model.return_status()) 16 | 17 | # Make a prediction with the model, which we can do even when training is happening in a different thread. 18 | from torchvision import datasets, transforms 19 | 20 | transform = transforms.Compose( 21 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 22 | ) 23 | 24 | local_dataset = datasets.MNIST( 25 | "./data", train=False, download=True, transform=transform 26 | ) 27 | example_data, example_target = local_dataset[0][0].unsqueeze(0), local_dataset[0][1] 28 | prediction = model.predict(example_data) 29 | print(f"Predicted: {prediction}, Actual: {example_target}") 30 | -------------------------------------------------------------------------------- /examples/stable-diffusion-xl-ec2/README.md: -------------------------------------------------------------------------------- 1 | # Deploy Stable Diffusion XL 1.0 on AWS EC2 2 | 3 | See a more [rich explanation](https://www.run.house/examples/stable-diffusion-xl-on-aws-ec2) 4 | of this example on our site. 5 | 6 | This example demonstrates how to deploy a 7 | [Stable Diffusion XL model from Hugging Face](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) 8 | on AWS EC2 using Runhouse. 
9 | 10 | ## Setup credentials and dependencies 11 | 12 | Optionally, set up a virtual environment: 13 | ```shell 14 | $ conda create -n rh-sdxl python=3.9.15 15 | $ conda activate rh-sdxl 16 | ``` 17 | Install the few required dependencies: 18 | ```shell 19 | $ pip install -r requirements.txt 20 | ``` 21 | 22 | We'll be launching an AWS EC2 instance via [SkyPilot](https://github.com/skypilot-org/skypilot), so we need to 23 | make sure our AWS credentials are set up: 24 | ```shell 25 | $ aws configure 26 | $ sky check 27 | ``` 28 | We'll be downloading the model from Hugging Face, so we need to set up our Hugging Face token: 29 | ```shell 30 | $ export HF_TOKEN= 31 | ``` 32 | 33 | After that, you can just run the example: 34 | ```shell 35 | $ python sdxl.py 36 | ``` 37 | -------------------------------------------------------------------------------- /examples/stable-diffusion-xl-ec2/requirements.txt: -------------------------------------------------------------------------------- 1 | runhouse[aws] 2 | Pillow 3 | -------------------------------------------------------------------------------- /examples/tensorflow-distributed/README.md: -------------------------------------------------------------------------------- 1 | # TensorFlow Multi-node Distributed Training 2 | 3 | A basic example showing how to use Runhouse to Pythonically run a TensorFlow distributed training script on a 4 | cluster of GPUs. We use the `TF_CONFIG` environment variable to set up the distributed training environment, and 5 | create a separate worker for each rank. We then call the replicas concurrently to trigger coordinated 6 | multi-node training. We're using two single-GPU instances (and therefore two ranks) with the 7 | MultiWorkerMirroredStrategy, but this same strategy could be used for other TensorFlow distributed strategies. 
8 | 9 | Despite it being common to use a launcher script to start distributed training, this approach is more flexible and 10 | allows for more complex orchestration, such as running multiple training jobs concurrently, handling exceptions, 11 | running distributed training alongside other tasks on the same cluster. It's also significantly easier to debug 12 | and monitor, as you can see the output of each rank in real-time and get stack traces if a worker fails. 13 | -------------------------------------------------------------------------------- /examples/tensorflow-distributed/requirements.txt: -------------------------------------------------------------------------------- 1 | runhouse[aws] 2 | tensorflow 3 | -------------------------------------------------------------------------------- /examples/tensorflow-distributed/tensorflow_distributed.py: -------------------------------------------------------------------------------- 1 | # # TensorFlow Multi-node Distributed Training 2 | # A basic example showing how to use Kubetorch to Pythonically run a TensorFlow distributed training script on 3 | # multiple GPUs. We use the TF_CONFIG environment variable to set up the distributed training environment, and 4 | # create a separate worker (env) for each rank. We then call the replicas concurrently to trigger coordinated 5 | # multi-node training. We're using two single-GPU instances (and therefore two ranks) with the 6 | # MultiWorkerMirroredStrategy, but this same strategy could be used for other TensorFlow distributed strategies. 7 | # 8 | # Despite it being common to use a launcher script to start distributed training, this approach is more flexible and 9 | # allows for more complex orchestration, such as running multiple training jobs concurrently, handling exceptions, 10 | # running distributed training alongside other tasks on the same cluster. 
# # TensorFlow Multi-node Distributed Training
# Each worker runs train_process(); TF_CONFIG tells each worker its rank and
# the cluster layout, and MultiWorkerMirroredStrategy coordinates them.

import json
import os

import kubetorch as kt
import tensorflow as tf


# ## Define the TensorFlow distributed training logic
# This is the function that will be run on each worker. It initializes the
# distributed training environment, creates a simple model and optimizer,
# and runs a training loop.
def train_process():
    """Run one worker's share of MultiWorkerMirroredStrategy training.

    Expects the ``TF_CONFIG`` environment variable to be set by the launcher
    with this worker's cluster spec and task index, per
    https://www.tensorflow.org/tutorials/distribute/multi_worker_with_keras
    """
    # Initialize the distributed training environment; the strategy
    # constructor blocks until all workers in TF_CONFIG have connected.
    tf_config = json.loads(os.environ["TF_CONFIG"])
    strategy = tf.distribute.MultiWorkerMirroredStrategy()
    num_workers = strategy.num_replicas_in_sync
    print(f"Worker {tf_config['task']['index']} of {num_workers} initialized")

    # Create a simple model and optimizer.
    # NOTE(review): the Keras multi-worker tutorial builds the model *inside*
    # strategy.scope() so its variables are created as distributed variables;
    # here only compile() is inside the scope — confirm this is intended.
    model = tf.keras.Sequential([tf.keras.layers.Dense(10, activation="relu")])
    optimizer = tf.keras.optimizers.SGD(0.01)

    with strategy.scope():
        model.compile(optimizer=optimizer, loss="mse")

    # Train on random synthetic data: 1000 samples of 10 features, batch 32.
    model.fit(
        tf.data.Dataset.from_tensor_slices(
            (tf.random.normal([1000, 10]), tf.random.normal([1000, 1]))
        ).batch(32)
    )

    print(f"Worker {tf_config['task']['index']} finished")


if __name__ == "__main__":
    # Dispatch the training function to a multi-node cluster with 4 nodes, each with 1 GPU
    gpus = kt.Compute(gpus="A10G:1", image=kt.images.tensorflow())
    remote_train = kt.fn(train_process).to(gpus).distribute("tensorflow", num_nodes=4)

    remote_train()
-------------------------------------------------------------------------------- /examples/yolo-fastapi/requirements.txt: -------------------------------------------------------------------------------- 1 | # Usage: pip install -r requirements.txt 2 | 3 | # Base ---------------------------------------- 4 | runhouse 5 | matplotlib>=3.2.2 6 | numpy>=1.18.5,<1.24.0 7 | opencv-python>=4.1.1 8 | Pillow>=7.1.2 9 | PyYAML>=5.3.1 10 | requests>=2.23.0 11 | scipy>=1.4.1 12 | torch>=1.7.0,!=1.12.0 13 | torchvision>=0.8.1,!=0.13.0 14 | tqdm>=4.41.0 15 | protobuf<4.21.3 16 | 17 | # Logging ------------------------------------- 18 | tensorboard>=2.4.1 19 | # wandb 20 | 21 | # Plotting ------------------------------------ 22 | pandas>=1.1.4 23 | seaborn>=0.11.0 24 | 25 | # Export -------------------------------------- 26 | # coremltools>=4.1 # CoreML export 27 | # onnx>=1.9.0 # ONNX export 28 | # onnx-simplifier>=0.3.6 # ONNX simplifier 29 | # scikit-learn==0.19.2 # CoreML quantization 30 | # tensorflow>=2.4.1 # TFLite export 31 | # tensorflowjs>=3.9.0 # TF.js export 32 | # openvino-dev # OpenVINO export 33 | 34 | # Extras -------------------------------------- 35 | ipython # interactive notebook 36 | psutil # system utilization 37 | thop # FLOPs computation 38 | # albumentations>=1.0.3 39 | # pycocotools>=2.0 # COCO mAP 40 | # roboflow 41 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=58.0, < 70"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.ruff] 6 | extend-exclude = ["runhouse/resources/hardware/sky/"] 7 | 8 | [tool.ruff.lint.per-file-ignores] 9 | "__init__.py" = ["F401"] 10 | "examples/*" = ["E501"] 11 | 12 | [tool.pytest.ini_options] 13 | asyncio_mode = "auto" 14 | -------------------------------------------------------------------------------- /pytest.ini: 
-------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = -s -v 3 | markers = 4 | servertest: all tests in the tests/test_servers/ directory, for filtering out 5 | secrettest: all tests in tests/test_resources/test_secrets/, for filtering out 6 | moduletest: all tests in TestModule, for filtering out 7 | functiontest: all tests in TestFunction, for filtering out 8 | clustertest: all tests in TestCluster, for filtering out 9 | level: mark tests with a given level that will be used when selecting tests to run 10 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | python-dotenv 2 | pexpect 3 | pyopenssl>=23.3.0 4 | rich 5 | setuptools < 70.0.0 6 | typer 7 | uvicorn 8 | wheel 9 | apispec 10 | httpx 11 | pydantic >=2.5.0 12 | -------------------------------------------------------------------------------- /runhouse/__init__.py: -------------------------------------------------------------------------------- 1 | import runhouse.resources.images.builtin_images as images 2 | 3 | from runhouse.exceptions import InsufficientDiskError 4 | from runhouse.resources.asgi import Asgi, asgi 5 | from runhouse.resources.folders import Folder, folder, GCSFolder, S3Folder 6 | from runhouse.resources.functions.function import Function 7 | from runhouse.resources.functions.function_factory import function 8 | 9 | from runhouse.resources.hardware import ( 10 | cluster, 11 | Cluster, 12 | DockerCluster, 13 | ondemand_cluster, 14 | OnDemandCluster, 15 | ) 16 | from runhouse.resources.images import Image 17 | 18 | # WARNING: Any built-in module that is imported here must be capitalized followed by all lowercase, or we will 19 | # will not find the module class when attempting to reconstruct it from a config. 
from runhouse.resources.module import Module, module
from runhouse.resources.packages import CodeSyncError, package, Package
from runhouse.resources.resource import Resource
from runhouse.resources.secrets import provider_secret, ProviderSecret, Secret, secret

from runhouse.rns.top_level_rns_fns import (
    as_caller,
    current_folder,
    exists,
    get_local_cluster_object,
    ipython,
    load,
    locate,
    set_folder,
    unset_folder,
)
from runhouse.utils import sync_function

# Note these are global variables that are instantiated within globals.py:
from .globals import configs, obj_store

from .rns.login import login, logout

# Syntactic sugar: shorter aliases for the most common factory functions.
fn = function
compute = cluster
cls = module


# PEP 562 module-level __getattr__: lets `runhouse.here` be re-resolved on every
# access instead of being bound once at import time.
def __getattr__(name):
    if name == "here":
        # If it's either the first time or the cluster was not initialized before, attempt to retrieve the cluster again
        return sync_function(get_local_cluster_object)()

    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


__version__ = "0.0.43"
from runhouse.resources.hardware.on_demand_cluster import OnDemandCluster

# One-off maintenance script that (re)generates the builtin cluster resources
# whose JSON configs live under runhouse/builtins/.
rh_cpu = OnDemandCluster(name="^rh-cpu", instance_type="CPU:1", dryrun=False)
rh_8_cpu = OnDemandCluster(name="^rh-8-cpu", instance_type="CPU:8", dryrun=False)
rh_32_cpu = OnDemandCluster(name="^rh-32-cpu", instance_type="CPU:32", dryrun=False)
rh_gpu = OnDemandCluster(name="^rh-gpu", instance_type="K80:1", dryrun=False)
rh_4_gpu = OnDemandCluster(name="^rh-4-gpu", instance_type="K80:4", dryrun=False)
rh_8_gpu = OnDemandCluster(name="^rh-8-gpu", instance_type="K80:8", dryrun=False)
rh_v100 = OnDemandCluster(name="^rh-v100", instance_type="V100:1", dryrun=False)
rh_4_v100 = OnDemandCluster(name="^rh-4-v100", instance_type="V100:4", dryrun=False)
rh_8_v100 = OnDemandCluster(name="^rh-8-v100", instance_type="V100:8", dryrun=False)

for cluster in [
    rh_cpu,
    rh_8_cpu,
    rh_32_cpu,
    rh_gpu,
    rh_4_gpu,
    rh_8_gpu,
    rh_v100,
    rh_4_v100,
    rh_8_v100,
]:
    # Clear provider/autostop so the saved builtin configs leave them null,
    # matching the shipped config.json files (users fill these in at launch time).
    cluster.autostop_mins = None
    cluster.provider = None
    # Need to manually move into builtins because we can't save there
    cluster.save(name=f"~/{cluster.name}")
"OnDemandCluster", 5 | "rns_address": "/builtins/rh-4-gpu", 6 | "instance_type": "K80:4", 7 | "num_nodes": null, 8 | "provider": null, 9 | "autostop_mins": null, 10 | "use_spot": false 11 | } 12 | -------------------------------------------------------------------------------- /runhouse/builtins/rh-4-v100/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "resource_type": "cluster", 3 | "name": "/builtins/rh-4-v100", 4 | "resource_subtype": "OnDemandCluster", 5 | "rns_address": "/builtins/rh-4-v100", 6 | "instance_type": "V100:4", 7 | "num_nodes": null, 8 | "provider": null, 9 | "autostop_mins": null, 10 | "use_spot": false 11 | } 12 | -------------------------------------------------------------------------------- /runhouse/builtins/rh-8-cpu/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "resource_type": "cluster", 3 | "name": "/builtins/rh-8-cpu", 4 | "resource_subtype": "OnDemandCluster", 5 | "rns_address": "/builtins/rh-8-cpu", 6 | "instance_type": "CPU:8+", 7 | "num_nodes": null, 8 | "provider": null, 9 | "autostop_mins": null, 10 | "use_spot": false 11 | } 12 | -------------------------------------------------------------------------------- /runhouse/builtins/rh-8-gpu/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "resource_type": "cluster", 3 | "name": "/builtins/rh-8-gpu", 4 | "resource_subtype": "OnDemandCluster", 5 | "rns_address": "/builtins/rh-8-gpu", 6 | "instance_type": "K80:8", 7 | "num_nodes": null, 8 | "provider": null, 9 | "autostop_mins": null, 10 | "use_spot": false 11 | } 12 | -------------------------------------------------------------------------------- /runhouse/builtins/rh-8-v100/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "resource_type": "cluster", 3 | "name": "/builtins/rh-8-v100", 4 | "resource_subtype": 
"OnDemandCluster", 5 | "rns_address": "/builtins/rh-8-v100", 6 | "instance_type": "V100:8", 7 | "num_nodes": null, 8 | "provider": null, 9 | "autostop_mins": null, 10 | "use_spot": false 11 | } 12 | -------------------------------------------------------------------------------- /runhouse/builtins/rh-cpu/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "resource_type": "cluster", 3 | "name": "/builtins/rh-cpu", 4 | "resource_subtype": "OnDemandCluster", 5 | "rns_address": "/builtins/rh-cpu", 6 | "instance_type": "CPU:2+", 7 | "num_nodes": null, 8 | "provider": null, 9 | "autostop_mins": null, 10 | "use_spot": false 11 | } 12 | -------------------------------------------------------------------------------- /runhouse/builtins/rh-gpu/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "resource_type": "cluster", 3 | "name": "/builtins/rh-gpu", 4 | "resource_subtype": "OnDemandCluster", 5 | "rns_address": "/builtins/rh-gpu", 6 | "instance_type": "K80:1", 7 | "num_nodes": null, 8 | "provider": null, 9 | "autostop_mins": null, 10 | "use_spot": false 11 | } 12 | -------------------------------------------------------------------------------- /runhouse/builtins/rh-v100/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "resource_type": "cluster", 3 | "name": "/builtins/rh-v100", 4 | "resource_subtype": "OnDemandCluster", 5 | "rns_address": "/builtins/rh-v100", 6 | "instance_type": "V100:1", 7 | "num_nodes": null, 8 | "provider": null, 9 | "autostop_mins": null, 10 | "use_spot": false 11 | } 12 | -------------------------------------------------------------------------------- /runhouse/exceptions.py: -------------------------------------------------------------------------------- 1 | # Runhouse exceptions 2 | 3 | 4 | class InsufficientDiskError(Exception): 5 | """Raised when a process on the cluster fails due to 
class InsufficientDiskError(Exception):
    """Raised when a process on the cluster fails due to lack of disk space.

    Args:
        command: The command / process that was run.
        error_msg: The error message to print.
    """

    def __init__(
        self,
        error_msg: str = None,
        command: str = None,
    ) -> None:
        self.command = command
        self.error_msg = error_msg
        self.default_error_msg = "Cluster is out of disk space"
        # Pick the most specific message available: an explicit error message,
        # then the failing command, then the generic fallback.
        if self.error_msg:
            base = self.error_msg
        elif self.command:
            base = f"Command {command} failed"
        else:
            base = self.default_error_msg
        super().__init__(
            f"{base}. To resolve it, teardown the cluster and re-launch it with larger disk size."
        )
class NewLineFormatter(logging.Formatter):
    """Adds logging prefix to newlines to align multi-line messages."""

    def __init__(self, fmt, datefmt=None):
        super().__init__(fmt, datefmt)

    def format(self, record):
        # Render the record normally, then repeat the prefix (everything the
        # format string puts before the message) after each newline so wrapped
        # lines stay aligned with the first one.
        rendered = super().format(record)
        if record.message == "":
            return rendered
        prefix, _, _ = rendered.partition(record.message)
        return rendered.replace("\n", "\r\n" + prefix)
from runhouse.constants import DEFAULT_DASK_PORT
from runhouse.resources.distributed.supervisor import Supervisor

from runhouse.resources.module import Module


class DaskDistributed(Supervisor):
    """Supervisor that forwards method calls to a wrapped module, lazily opening
    a Dask client connection to the cluster's scheduler before the first call."""

    def __init__(
        self,
        name,
        module: Module = None,
        port: int = DEFAULT_DASK_PORT,
        client_timeout="3s",
        **kwargs
    ):
        # module: the wrapped Module whose methods are proxied by forward().
        # port: Dask scheduler port on the cluster.
        # client_timeout: timeout passed through to the cluster's connect_dask().
        super().__init__(name=name, **kwargs)
        self._module = module
        self._dask_port = port
        self._dask_client = None  # created lazily on first forward() call
        self._client_timeout = client_timeout

    def _compute_signature(self, rich=False):
        # Expose the wrapped module's signature as this supervisor's own.
        return self.local._module.signature(rich=rich)

    def forward(self, item, *args, **kwargs):
        # Connect the Dask client once, then call the requested method on the
        # wrapped module. NOTE(review): the client object is never handed to the
        # method — presumably connecting is needed only for its side effect of
        # establishing the scheduler connection in this process; confirm.
        if not self._dask_client:
            self._dask_client = self.system.connect_dask(
                port=self._dask_port, client_timeout=self._client_timeout
            )
        method = getattr(self._module, item)
        return method(*args, **kwargs)

    def __call__(self, *args, **kwargs):
        # Calling the supervisor itself delegates to its `call` entry point.
        return self.call(*args, **kwargs)

    def __getstate__(self):
        state = super().__getstate__()
        # Dask client can't be serialized
        state["_dask_client"] = None
        return state
import time
from typing import List, Optional

from runhouse.resources.distributed.supervisor import Supervisor

from runhouse.resources.module import Module


class DistributedPool(Supervisor):
    """Supervisor that load-balances calls across a pool of module replicas.

    Each replica contributes ``max_concurrency`` slots; a slot index maps back
    to its replica via ``slot // max_concurrency``.
    """

    def __init__(
        self, name, replicas: List[Module] = None, max_concurrency: int = 1, **kwargs
    ):
        super().__init__(name=name, **kwargs)
        self._replicas = replicas or []
        self._max_concurrency = max_concurrency
        self._available_replicas = list(
            range(len(self._replicas) * self._max_concurrency)
        )

    def _compute_signature(self, rich=False):
        # The pool's signature is that of any replica; use the first.
        return self.local._replicas[0].signature(rich=rich)

    def forward(self, item, timeout: Optional[int] = None, *args, **kwargs):
        """Dispatch method `item` to the next free replica slot.

        Waits up to `timeout` seconds (polling every 0.25s) for a slot;
        `timeout=0` fails immediately, `timeout=None` waits forever.
        Raises TimeoutError if no slot frees up in time.
        """
        time_waited = 0
        while not self._available_replicas:
            if timeout == 0:
                raise TimeoutError("No available replicas.")
            if timeout is not None and time_waited >= timeout:
                raise TimeoutError("Timed out waiting for a replica to be available.")
            time.sleep(0.25)
            time_waited += 0.25
        worker_idx = self._available_replicas.pop(0)
        worker = self._replicas[worker_idx // self._max_concurrency]
        try:
            method = getattr(worker, item)
            return method(*args, **kwargs)
        finally:
            # Fix: return the slot even when the call raises. Previously a
            # failing call leaked its slot, permanently shrinking pool capacity
            # until no replicas appeared available at all.
            self._available_replicas.append(worker_idx)

    def __call__(self, *args, **kwargs):
        return self.call(*args, **kwargs)
from concurrent.futures.thread import ThreadPoolExecutor
from typing import List

from runhouse.resources.distributed.supervisor import Supervisor

from runhouse.resources.hardware import Cluster, OnDemandCluster

from runhouse.resources.module import Module


class PyTorchDistributed(Supervisor):
    """Supervisor that fans a method call out to all replicas in parallel, setting
    the torch.distributed env-var rendezvous (MASTER_ADDR/PORT, RANK, WORLD_SIZE,
    LOCAL_RANK) on each replica's process before the call."""

    def __init__(self, name, replicas: List[Module] = None, port=None, **kwargs):
        # replicas: one Module per rank; replica index == global rank.
        # port: fixed rendezvous port; if None, a free port is found on the head node.
        super().__init__(name=name, **kwargs)
        self._replicas = replicas or []
        self._port = port

    def _compute_signature(self, rich=False):
        # Expose the signature of any replica (they are identical); use the first.
        return self.local._replicas[0].signature(rich=rich)

    def _find_available_port_on_head_node(self):
        # Ask the OS on the head node for an ephemeral free port by binding to port 0.
        # NOTE(review): run_bash stdout may carry a trailing newline — confirm
        # downstream env-var consumers tolerate it.
        find_available_port_cmd = "python3 -c \"import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()\""
        status_code, stdout, _ = self._replicas[0].system.run_bash(
            find_available_port_cmd,
            node=self._replicas[0].system.head_ip,
            require_outputs=True,
        )

        if status_code != 0:
            raise RuntimeError(f"Failed to find available port on head rank: {stdout}")
        return stdout

    def forward(self, item, *args, **kwargs):
        # Resolve the rendezvous port once, shared by all ranks.
        port = self._port or self._find_available_port_on_head_node()

        def run_on_replica(replica, rank):
            # Per https://pytorch.org/docs/stable/distributed.html#environment-variable-initialization
            # Pick the master address depending on cluster type: internal IP for
            # on-demand clusters, head IP for static clusters, localhost otherwise.
            master_addr = (
                self.system.internal_ips[0]
                if isinstance(self.system, OnDemandCluster)
                else self.system.head_ip
                if isinstance(self.system, Cluster)
                else "localhost"
            )

            # Replicas are spread evenly across nodes; used to derive LOCAL_RANK.
            processes_per_node = len(self._replicas) // len(self.system.ips)

            dist_config = {
                "MASTER_ADDR": master_addr,
                "MASTER_PORT": port,
                "RANK": str(rank),
                "WORLD_SIZE": str(len(self._replicas)),
                "LOCAL_RANK": str(rank % processes_per_node),
            }

            # Inject the rendezvous env vars into the replica's process, then call.
            replica.system.set_process_env_vars(replica.process, dist_config)
            method = getattr(replica, item)
            return method(*args, **kwargs)

        # One thread per rank so all replicas start (and rendezvous) concurrently;
        # results come back in rank order.
        with ThreadPoolExecutor(max_workers=len(self._replicas)) as executor:
            res = executor.map(
                run_on_replica, self._replicas, range(len(self._replicas))
            )
            res = list(res)

        return res

    def __call__(self, *args, **kwargs):
        return self.call(*args, **kwargs)
import multiprocessing
import sys

from runhouse.resources.distributed.supervisor import Supervisor

from runhouse.resources.module import Module


class SparkDistributed(Supervisor):
    """Supervisor that runs a wrapped function through RayDP/Spark in a spawned
    subprocess, streaming the child's stdout/stderr back over a pipe."""

    def __init__(
        self,
        name,
        module: Module = None,
        ray_init_options=None,
        spark_init_options=None,
        **kwargs
    ):
        # Only function modules (which expose fn_pointers) can be shipped to the
        # subprocess helper below.
        if not hasattr(module, "fn_pointers"):
            raise ValueError(
                "Spark Distributed requires a Runhouse Function object to distribute."
            )

        # NOTE(review): stray debug print — probably should be a logger call or
        # removed; left as-is to preserve behavior.
        print("initializing spark distribution")
        super().__init__(name=name, **kwargs)
        self._module = module
        self._ray_init_options = ray_init_options or {}
        self._spark_init_options = spark_init_options or {}

    def _compute_signature(self, rich=False):
        # Expose the wrapped module's signature as this supervisor's own.
        return self.local._module.signature(rich=rich)

    def forward(self, item, *args, **kwargs):
        from runhouse.resources.distributed.utils import subprocess_raydp_fn_call_helper

        # TODO replace this with passing the filepath that this module is already writing to!
        parent_conn, child_conn = multiprocessing.Pipe()
        # NOTE: the pop() below runs while this tuple is being built; since the
        # tuple holds a reference to the same kwargs dict, "spark_init_options"
        # is removed from the kwargs forwarded to the function and passed as its
        # own argument instead.
        subproc_args = (
            self._module.fn_pointers,
            args,
            kwargs,
            child_conn,
            self._ray_init_options,
            kwargs.pop("spark_init_options", self._spark_init_options),
        )

        # Check if start method is already spawn, because set_start_method will error if called again
        if multiprocessing.get_start_method(allow_none=True) != "spawn":
            multiprocessing.set_start_method("spawn")
        with multiprocessing.Pool(processes=1) as pool:
            result = pool.apply_async(
                subprocess_raydp_fn_call_helper, args=subproc_args
            )
            # Relay the child's output until it signals completion with an
            # EOFError sentinel (or the pipe itself hits EOF).
            while True:
                try:
                    (msg, output_stream) = parent_conn.recv()
                    if msg == EOFError:
                        break
                    print(
                        msg,
                        end="",
                        file=sys.stdout if output_stream == "stdout" else sys.stderr,
                    )
                except EOFError:
                    break
            res = result.get()
        return res

    def __call__(self, *args, **kwargs):
        return self.call(*args, **kwargs)
-------------------------------------------------------------------------------- /runhouse/resources/functions/__init__.py: -------------------------------------------------------------------------------- 1 | from .function import Function 2 | from .function_factory import function 3 | -------------------------------------------------------------------------------- /runhouse/resources/hardware/__init__.py: -------------------------------------------------------------------------------- 1 | from .cluster import Cluster 2 | from .cluster_factory import cluster, ondemand_cluster 3 | from .docker_cluster import DockerCluster 4 | from .on_demand_cluster import OnDemandCluster 5 | from .ray_utils import check_for_existing_ray_instance, kill_actors, list_actor_states 6 | from .utils import ( 7 | _current_cluster, 8 | _get_cluster_from, 9 | cluster_config_file_exists, 10 | ClusterStatus, 11 | get_all_sky_clusters, 12 | load_cluster_config_from_file, 13 | SSEClient, 14 | ) 15 | -------------------------------------------------------------------------------- /runhouse/resources/hardware/constants.py: -------------------------------------------------------------------------------- 1 | STATIC_CLUSTER_ARGS = { 2 | "host", 3 | "ssh_creds", 4 | } 5 | 6 | ONDEMAND_COMPUTE_ARGS = { 7 | "instance_type", 8 | "num_nodes", 9 | "provider", 10 | "pool", 11 | "use_spot", 12 | "region", 13 | "memory", 14 | "disk_size", 15 | "vpc_name", 16 | "num_cpus", 17 | "gpus", 18 | "sky_kwargs", 19 | "launcher", 20 | "autostop_mins", 21 | } 22 | 23 | KUBERNETES_CLUSTER_ARGS = { 24 | "kube_context", 25 | "kube_namespace", 26 | "kube_config_path", 27 | } 28 | 29 | RH_SERVER_ARGS = { 30 | "server_port", 31 | "server_host", 32 | "ssh_port", 33 | "open_ports", # ondemand only 34 | "server_connection_type", 35 | "ssl_keyfile", 36 | "ssl_certfile", 37 | "domain", 38 | "image", 39 | } 40 | -------------------------------------------------------------------------------- 
def kill_actors(
    actor_name: Optional[str] = None,
    actor_class_name: Optional[str] = None,
    namespace: Optional[str] = None,
    gracefully: bool = True,
):
    """Kill Ray actors matching the given name / class / namespace filters.

    The "cluster_servlet" actor is deliberately killed last so it stays alive
    while the other actors shut down. With gracefully=True each actor is asked
    to terminate itself via __ray_terminate__; otherwise ray.kill is used.
    """
    import ray

    cluster_servlet_actor = None
    for actor in list_actor_states(actor_name, actor_class_name, namespace):
        actor_handle_to_kill = ray.get_actor(actor["name"])
        # Defer the cluster servlet until every other actor is handled.
        if actor["name"] == "cluster_servlet":
            cluster_servlet_actor = actor_handle_to_kill
            continue
        logger.info(f"Killing actor {actor['name']}")
        if gracefully:
            actor_handle_to_kill.__ray_terminate__.remote()
        else:
            ray.kill(actor_handle_to_kill)

    # Make sure to kill cluster_servlet last
    if cluster_servlet_actor:
        logger.info("Killing actor cluster_servlet")
        if gracefully:
            cluster_servlet_actor.__ray_terminate__.remote()
        else:
            ray.kill(cluster_servlet_actor)
# Vendored from SkyPilot (Apache 2.0) — behavior intentionally unchanged.
import argparse
import sys
import time

if __name__ == '__main__':
    import psutil

    parser = argparse.ArgumentParser()
    parser.add_argument('--parent-pid', type=int, required=True)
    parser.add_argument('--proc-pid', type=int, required=True)
    args = parser.parse_args()

    process = None
    parent_process = None
    try:
        process = psutil.Process(args.proc_pid)
        parent_process = psutil.Process(args.parent_pid)
    except psutil.NoSuchProcess:
        # Either process may already be gone; handled below.
        pass

    if process is None:
        # Target already exited — nothing to clean up.
        sys.exit()

    if parent_process is not None:
        # Wait for either parent or target process to exit.
        while process.is_running() and parent_process.is_running():
            time.sleep(1)

    try:
        # Collect the target and all of its descendants so none are orphaned.
        children = process.children(recursive=True)
        children.append(process)
    except psutil.NoSuchProcess:
        sys.exit()

    # First pass: polite SIGTERM to every process in the tree.
    for pid in children:
        try:
            pid.terminate()
        except psutil.NoSuchProcess:
            pass

    # Wait 30s for the processes to exit gracefully.
    time.sleep(30)

    # SIGKILL if they're still running.
    for pid in children:
        try:
            pid.kill()
        except psutil.NoSuchProcess:
            pass
class AnthropicSecret(ApiKeySecret):
    """
    .. note::
        To create an AnthropicSecret, please use the factory method :func:`provider_secret`
        with ``provider="anthropic"``.
    """

    # Provider key used by the secret factory, and the env var the API key
    # value maps to when written as an environment variable.
    _PROVIDER = "anthropic"
    _DEFAULT_ENV_VARS = {"api_key": "ANTHROPIC_API_KEY"}

    @staticmethod
    def from_config(config: dict, dryrun: bool = False, _resolve_children: bool = True):
        """Instantiate an AnthropicSecret from a config dict.

        ``_resolve_children`` is accepted (and ignored) for signature
        consistency with every other provider secret's ``from_config``;
        previously callers passing it positionally or by keyword would raise
        a TypeError for this provider only.
        """
        return AnthropicSecret(**config, dryrun=dryrun)
13 | """ 14 | 15 | def write( 16 | self, 17 | file: bool = False, 18 | env: bool = False, 19 | path: str = None, 20 | env_vars: Dict = None, 21 | overwrite: bool = False, 22 | write_config: bool = True, 23 | ): 24 | if not file or path: 25 | env = True 26 | super().write( 27 | file=file, 28 | env=env, 29 | path=path, 30 | env_vars=env_vars, 31 | overwrite=overwrite, 32 | write_config=write_config, 33 | ) 34 | 35 | def to( 36 | self, 37 | system: Union[str, Cluster], 38 | path: str = None, 39 | process: Optional[str] = None, 40 | values: bool = True, 41 | name: Optional[str] = None, 42 | ): 43 | return super().to( 44 | system=system, path=path, process=process, values=values, name=name 45 | ) 46 | -------------------------------------------------------------------------------- /runhouse/resources/secrets/provider_secrets/aws_secret.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import copy 3 | import os 4 | 5 | from typing import Dict 6 | 7 | from runhouse.resources.secrets.provider_secrets.provider_secret import ProviderSecret 8 | from runhouse.resources.secrets.utils import _check_file_for_mismatches 9 | from runhouse.utils import create_local_dir 10 | 11 | 12 | class AWSSecret(ProviderSecret): 13 | """ 14 | .. note:: 15 | To create an AWSSecret, please use the factory method :func:`provider_secret` with ``provider="aws"``. 
16 | """ 17 | 18 | _PROVIDER = "aws" 19 | _DEFAULT_CREDENTIALS_PATH = "~/.aws/credentials" 20 | _DEFAULT_ENV_VARS = { 21 | "access_key": "AWS_ACCESS_KEY_ID", 22 | "secret_key": "AWS_SECRET_ACCESS_KEY", 23 | } 24 | 25 | @staticmethod 26 | def from_config(config: dict, dryrun: bool = False, _resolve_children: bool = True): 27 | return AWSSecret(**config, dryrun=dryrun) 28 | 29 | def _write_to_file( 30 | self, 31 | path: str, 32 | values: Dict, 33 | overwrite: bool = False, 34 | write_config: bool = True, 35 | ): 36 | new_secret = copy.deepcopy(self) 37 | 38 | if not _check_file_for_mismatches( 39 | path, self._from_path(path), values, overwrite 40 | ): 41 | 42 | parser = configparser.ConfigParser() 43 | section_name = "default" 44 | parser.add_section(section_name) 45 | parser.set( 46 | section=section_name, 47 | option="aws_access_key_id", 48 | value=values["access_key"], 49 | ) 50 | parser.set( 51 | section=section_name, 52 | option="aws_secret_access_key", 53 | value=values["secret_key"], 54 | ) 55 | 56 | full_path = create_local_dir(path) 57 | with open(full_path, "w+") as f: 58 | parser.write(f) 59 | 60 | if write_config: 61 | new_secret._add_to_rh_config(path) 62 | 63 | new_secret._values = None 64 | new_secret.path = path 65 | return new_secret 66 | 67 | def _from_path(self, path: str): 68 | config = configparser.ConfigParser() 69 | if path and os.path.exists(os.path.expanduser(path)): 70 | config.read(os.path.expanduser(path)) 71 | else: 72 | return {} 73 | 74 | section_name = "default" 75 | access_key = config[section_name]["aws_access_key_id"] 76 | secret_key = config[section_name]["aws_secret_access_key"] 77 | 78 | return { 79 | "access_key": access_key, 80 | "secret_key": secret_key, 81 | } 82 | -------------------------------------------------------------------------------- /runhouse/resources/secrets/provider_secrets/azure_secret.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import copy 
class AzureSecret(ProviderSecret):
    """
    .. note::
        To create an AzureSecret, please use the factory method :func:`provider_secret` with ``provider="azure"``.
    """

    # values format: {"subscription_id": subscription_id}
    _PROVIDER = "azure"
    _DEFAULT_CREDENTIALS_PATH = "~/.azure/clouds.config"
    _DEFAULT_ENV_VARS = {"subscription_id": "AZURE_SUBSCRIPTION_ID"}

    @staticmethod
    def from_config(config: dict, dryrun: bool = False, _resolve_children: bool = True):
        """Instantiate an AzureSecret from a config dict."""
        return AzureSecret(**config, dryrun=dryrun)

    def _write_to_file(
        self,
        path: str = None,
        values: Dict = None,
        overwrite: bool = False,
        write_config: bool = True,
    ):
        """Write ``values`` to an Azure ``clouds.config``-style INI file at ``path``.

        Returns a copy of this secret that references ``path`` and has its
        in-memory values cleared.
        """
        new_secret = copy.deepcopy(self)
        # Only write when the file doesn't already hold matching values
        # (or overwrite was requested).
        if not _check_file_for_mismatches(
            path, self._from_path(path), values, overwrite
        ):
            subscription_id = values["subscription_id"]

            parser = configparser.ConfigParser()
            section_name = "AzureCloud"
            parser.add_section(section_name)
            parser.set(
                section=section_name,
                option="subscription",
                value=subscription_id,
            )

            # create_local_dir presumably expands/creates the target directory
            # and returns the resolved path -- confirm in runhouse.utils.
            full_path = create_local_dir(path)
            with open(full_path, "w") as f:
                parser.write(f)

            if write_config:
                new_secret._add_to_rh_config(path)

        # The returned copy references the file rather than in-memory values.
        new_secret._values = None
        new_secret.path = path
        return new_secret

    def _from_path(self, path: str = None):
        """Read the subscription id from the INI file at ``path``.

        Returns ``{"subscription_id": ...}`` when present, else ``{}`` (also
        for a missing file or a file lacking the [AzureCloud] section).
        """
        config = configparser.ConfigParser()
        if path and os.path.exists(os.path.expanduser(path)):
            path = os.path.expanduser(path)
            config.read(path)
            if config and "AzureCloud" in config.sections():
                subscription_id = config["AzureCloud"]["subscription"]
                return {"subscription_id": subscription_id}
        return {}
9 | """ 10 | 11 | _PROVIDER = "docker" 12 | _DEFAULT_ENV_VARS = { 13 | "username": "SKYPILOT_DOCKER_USERNAME", 14 | "password": "SKYPILOT_DOCKER_PASSWORD", 15 | "server": "SKYPILOT_DOCKER_SERVER", 16 | } 17 | 18 | @staticmethod 19 | def from_config(config: dict, dryrun: bool = False, _resolve_children: bool = True): 20 | return DockerRegistrySecret(**config, dryrun=dryrun) 21 | -------------------------------------------------------------------------------- /runhouse/resources/secrets/provider_secrets/gcp_secret.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import json 3 | import os 4 | from pathlib import Path 5 | 6 | from typing import Dict 7 | 8 | from runhouse.resources.secrets.provider_secrets.provider_secret import ProviderSecret 9 | from runhouse.resources.secrets.utils import _check_file_for_mismatches 10 | from runhouse.utils import create_local_dir 11 | 12 | 13 | class GCPSecret(ProviderSecret): 14 | """ 15 | .. note:: 16 | To create a GCPSecret, please use the factory method :func:`provider_secret` with ``provider="gcp"``. 
17 | """ 18 | 19 | _PROVIDER = "gcp" 20 | _DEFAULT_CREDENTIALS_PATH = "~/.config/gcloud/application_default_credentials.json" 21 | _DEFAULT_ENV_VARS = { 22 | "client_id": "CLIENT_ID", 23 | "client_secret": "CLIENT_SECRET", 24 | } 25 | 26 | @staticmethod 27 | def from_config(config: dict, dryrun: bool = False, _resolve_children: bool = True): 28 | return GCPSecret(**config, dryrun=dryrun) 29 | 30 | def _write_to_file( 31 | self, 32 | path: str, 33 | values: Dict = None, 34 | overwrite: bool = False, 35 | write_config: bool = True, 36 | ): 37 | new_secret = copy.deepcopy(self) 38 | if not _check_file_for_mismatches( 39 | path, self._from_path(path), values, overwrite 40 | ): 41 | Path(path).parent.mkdir(parents=True, exist_ok=True) 42 | 43 | full_path = create_local_dir(path) 44 | with open(full_path, "w+") as f: 45 | json.dump(values, f, indent=4) 46 | 47 | if write_config: 48 | new_secret._add_to_rh_config(path) 49 | 50 | new_secret._values = None 51 | new_secret.path = path 52 | return new_secret 53 | 54 | def _from_path(self, path: str = None): 55 | config = {} 56 | if path and os.path.exists(os.path.expanduser(path)): 57 | with open(os.path.expanduser(path), "r") as config_file: 58 | config = json.load(config_file) 59 | return config 60 | -------------------------------------------------------------------------------- /runhouse/resources/secrets/provider_secrets/github_secret.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import os 3 | from pathlib import Path 4 | 5 | from typing import Dict 6 | 7 | import yaml 8 | 9 | from runhouse.resources.secrets.provider_secrets.provider_secret import ProviderSecret 10 | from runhouse.resources.secrets.utils import _check_file_for_mismatches 11 | 12 | 13 | class GitHubSecret(ProviderSecret): 14 | """ 15 | .. note:: 16 | To create a GitHubSecret, please use the factory method :func:`provider_secret` with ``provider="github"``. 
17 | """ 18 | 19 | # values format: {"oauth_token": oath_token} 20 | _PROVIDER = "github" 21 | _DEFAULT_CREDENTIALS_PATH = "~/.config/gh/hosts.yml" 22 | 23 | @staticmethod 24 | def from_config(config: dict, dryrun: bool = False, _resolve_children: bool = True): 25 | return GitHubSecret(**config, dryrun=dryrun) 26 | 27 | def _write_to_file( 28 | self, 29 | path: str, 30 | values: Dict = None, 31 | overwrite: bool = False, 32 | write_config: bool = True, 33 | ): 34 | new_secret = copy.deepcopy(self) 35 | if not _check_file_for_mismatches( 36 | path, self._from_path(path), values, overwrite 37 | ): 38 | config = {} 39 | 40 | full_path = os.path.expanduser(path) 41 | if Path(full_path).exists(): 42 | with open(full_path, "r") as stream: 43 | config = yaml.safe_load(stream) 44 | config["github.com"] = values 45 | 46 | Path(full_path).parent.mkdir(parents=True, exist_ok=True) 47 | with open(full_path, "w") as yaml_file: 48 | yaml.dump(config, yaml_file, default_flow_style=False) 49 | 50 | if write_config: 51 | new_secret._add_to_rh_config(path) 52 | 53 | new_secret._values = None 54 | new_secret.path = path 55 | return new_secret 56 | 57 | def _from_path(self, path: str = None): 58 | config = {} 59 | if path and os.path.exists(os.path.expanduser(path)): 60 | with open(os.path.expanduser(path), "r") as stream: 61 | config = yaml.safe_load(stream) 62 | return config["github.com"] if config else {} 63 | -------------------------------------------------------------------------------- /runhouse/resources/secrets/provider_secrets/huggingface_secret.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import os 3 | from pathlib import Path 4 | 5 | from typing import Dict 6 | 7 | from runhouse.resources.secrets.provider_secrets.provider_secret import ProviderSecret 8 | from runhouse.resources.secrets.utils import _check_file_for_mismatches 9 | from runhouse.utils import create_local_dir 10 | 11 | 12 | class 
class HuggingFaceSecret(ProviderSecret):
    """
    .. note::
        To create a HuggingFaceSecret, please use the factory method :func:`provider_secret` with
        ``provider="huggingface"``.
    """

    # values format: {"token": hf_token}
    _PROVIDER = "huggingface"
    # Hugging Face CLI token file (holds just the raw token).
    _DEFAULT_CREDENTIALS_PATH = "~/.cache/huggingface/token"
    _DEFAULT_ENV_VARS = {"token": "HF_TOKEN"}

    @staticmethod
    def from_config(config: dict, dryrun: bool = False, _resolve_children: bool = True):
        """Instantiate a HuggingFaceSecret from a config dict."""
        return HuggingFaceSecret(**config, dryrun=dryrun)

    def _write_to_file(
        self,
        path: str,
        values: Dict = None,
        overwrite: bool = False,
        write_config: bool = True,
    ):
        """Write the token to ``path``.

        Returns a copy of this secret that references ``path`` and has its
        in-memory values cleared.
        """
        new_secret = copy.deepcopy(self)
        if not _check_file_for_mismatches(
            path, self._from_path(path), values, overwrite
        ):
            token = values["token"]
            full_path = create_local_dir(path)
            # Bug fix: open with "w" rather than "a". Append mode concatenated
            # the new token onto an existing (stale) one when overwriting,
            # leaving a corrupt token file; the token file must contain only
            # the single token.
            with open(full_path, "w") as f:
                f.write(token)

            if write_config:
                new_secret._add_to_rh_config(path)

        new_secret._values = None
        new_secret.path = path
        return new_secret

    def _from_path(self, path: str = None):
        """Read the token from ``path``; returns ``{}`` if missing or empty."""
        token = None
        if path and os.path.exists(os.path.expanduser(path)):
            token = Path(os.path.expanduser(path)).read_text().strip("\n")
        if token:
            return {"token": token}
        return {}
class KubeConfigSecret(ProviderSecret):
    """
    .. note::
        To create a KubeConfigSecret, please use the factory method :func:`provider_secret` with
        ``provider="kubernetes"``.
    """

    _PROVIDER = "kubernetes"
    _DEFAULT_CREDENTIALS_PATH = "~/.kube/config"

    @staticmethod
    def from_config(config: dict, dryrun: bool = False, _resolve_children: bool = True):
        """Instantiate a KubeConfigSecret from a config dict."""
        return KubeConfigSecret(**config, dryrun=dryrun)

    def _from_path(self, path: str = None):
        """Load the kubeconfig YAML at ``path`` (or this secret's own path).

        Returns ``{}`` when no path is set, the file is missing/empty, or the
        file cannot be read or parsed.
        """
        path = path or self.path
        if not path:
            return {}

        path = os.path.expanduser(path)
        if os.path.exists(path):
            try:
                with open(path) as f:
                    # Empty YAML loads as None; normalize to {}.
                    contents = yaml.safe_load(f) or {}
            except (OSError, yaml.YAMLError):
                # Unreadable or malformed kubeconfig: treat as empty.
                # (Previously a bare `except`, which also swallowed
                # KeyboardInterrupt/SystemExit.)
                contents = {}
            return contents
        return {}

    def _write_to_file(
        self,
        path: str,
        values: Dict,
        overwrite: bool = False,
        write_config: bool = True,
    ):
        """Write ``values`` as YAML to ``path`` (or this secret's own path).

        Returns a copy of this secret that references ``path`` and has its
        in-memory values cleared.
        """
        new_secret = copy.deepcopy(self)
        path = path or self.path
        if not _check_file_for_mismatches(
            path, self._from_path(path), values, overwrite
        ):
            full_path = create_local_dir(path)
            with open(full_path, "w") as f:
                yaml.safe_dump(values, f)

            if write_config:
                # Record on the returned copy, consistent with every other
                # provider secret (was `self._add_to_rh_config`).
                new_secret._add_to_rh_config(path)

        new_secret._values = None
        new_secret.path = path
        return new_secret
class LambdaSecret(ProviderSecret):
    """
    .. note::
        To create a LambdaSecret, please use the factory method :func:`provider_secret` with ``provider="lambda"``.
    """

    # values format: {"api_key": api_key}
    _DEFAULT_CREDENTIALS_PATH = "~/.lambda_cloud/lambda_keys"
    _PROVIDER = "lambda"

    @staticmethod
    def from_config(config: dict, dryrun: bool = False, _resolve_children: bool = True):
        """Instantiate a LambdaSecret from a config dict."""
        return LambdaSecret(**config, dryrun=dryrun)

    def _write_to_file(
        self,
        path: str,
        values: Dict = None,
        overwrite: bool = False,
        write_config: bool = True,
    ):
        """Write the api key to the Lambda keys file at ``path``.

        Returns a copy of this secret that references ``path`` and has its
        in-memory values cleared.
        """
        new_secret = copy.deepcopy(self)
        if not _check_file_for_mismatches(
            path, self._from_path(path), values, overwrite
        ):
            data = f'api_key = {values["api_key"]}\n'
            full_path = create_local_dir(path)
            with open(full_path, "w+") as f:
                f.write(data)

            if write_config:
                new_secret._add_to_rh_config(path)

        new_secret._values = None
        new_secret.path = path
        return new_secret

    def _from_path(self, path: str = None):
        """Parse the api key from the Lambda keys file at ``path``.

        Returns ``{"api_key": ...}`` or ``{}`` when the file is missing or
        contains no ``api_key`` line.
        """
        lines = None
        if path and os.path.exists(os.path.expanduser(path)):
            with open(os.path.expanduser(path), "r") as f:
                lines = f.readlines()
        if lines:
            for line in lines:
                split = line.split()
                # Guard against blank lines, which produce an empty split and
                # previously raised IndexError on `split[0]`.
                if split and split[0] == "api_key":
                    return {"api_key": split[-1]}
        return {}
9 | """ 10 | 11 | _PROVIDER = "langchain" 12 | _DEFAULT_ENV_VARS = {"api_key": "LANGCHAIN_API_KEY"} 13 | 14 | @staticmethod 15 | def from_config(config: dict, dryrun: bool = False, _resolve_children: bool = True): 16 | return LangChainSecret(**config, dryrun=dryrun) 17 | -------------------------------------------------------------------------------- /runhouse/resources/secrets/provider_secrets/openai_secret.py: -------------------------------------------------------------------------------- 1 | from runhouse.resources.secrets.provider_secrets.api_key_secret import ApiKeySecret 2 | 3 | 4 | class OpenAISecret(ApiKeySecret): 5 | """ 6 | .. note:: 7 | To create an OpenAISecret, please use the factory method :func:`provider_secret` with ``provider="openai"``. 8 | """ 9 | 10 | _PROVIDER = "openai" 11 | _DEFAULT_ENV_VARS = {"api_key": "OPENAI_API_KEY"} 12 | 13 | @staticmethod 14 | def from_config(config: dict, dryrun: bool = False, _resolve_children: bool = True): 15 | return OpenAISecret(**config, dryrun=dryrun) 16 | -------------------------------------------------------------------------------- /runhouse/resources/secrets/provider_secrets/pinecone_secret.py: -------------------------------------------------------------------------------- 1 | from runhouse.resources.secrets.provider_secrets.api_key_secret import ApiKeySecret 2 | 3 | 4 | class PineconeSecret(ApiKeySecret): 5 | """ 6 | .. note:: 7 | To create an PineconeSecret, please use the factory method :func:`provider_secret` 8 | with ``provider="pinecone"``. 
9 | """ 10 | 11 | _PROVIDER = "pinecone" 12 | _DEFAULT_ENV_VARS = {"api_key": "PINECONE_API_KEY"} 13 | 14 | @staticmethod 15 | def from_config(config: dict, dryrun: bool = False, _resolve_children: bool = True): 16 | return PineconeSecret(**config, dryrun=dryrun) 17 | -------------------------------------------------------------------------------- /runhouse/resources/secrets/provider_secrets/providers.py: -------------------------------------------------------------------------------- 1 | from runhouse.resources.secrets.provider_secrets.anthropic_secret import AnthropicSecret 2 | from runhouse.resources.secrets.provider_secrets.aws_secret import AWSSecret 3 | from runhouse.resources.secrets.provider_secrets.azure_secret import AzureSecret 4 | from runhouse.resources.secrets.provider_secrets.cohere_secret import CohereSecret 5 | from runhouse.resources.secrets.provider_secrets.docker_secret import ( 6 | DockerRegistrySecret, 7 | ) 8 | from runhouse.resources.secrets.provider_secrets.gcp_secret import GCPSecret 9 | from runhouse.resources.secrets.provider_secrets.github_secret import GitHubSecret 10 | from runhouse.resources.secrets.provider_secrets.huggingface_secret import ( 11 | HuggingFaceSecret, 12 | ) 13 | from runhouse.resources.secrets.provider_secrets.kubeconfig_secret import ( 14 | KubeConfigSecret, 15 | ) 16 | from runhouse.resources.secrets.provider_secrets.lambda_secret import LambdaSecret 17 | from runhouse.resources.secrets.provider_secrets.langchain_secret import LangChainSecret 18 | from runhouse.resources.secrets.provider_secrets.openai_secret import OpenAISecret 19 | from runhouse.resources.secrets.provider_secrets.pinecone_secret import PineconeSecret 20 | from runhouse.resources.secrets.provider_secrets.provider_secret import ProviderSecret 21 | from runhouse.resources.secrets.provider_secrets.sky_secret import SkySecret 22 | from runhouse.resources.secrets.provider_secrets.ssh_secret import SSHSecret 23 | from 
# Registry mapping factory provider strings to their secret classes.
_str_to_provider_class = {
    # File and/or Env secrets
    "aws": AWSSecret,
    "azure": AzureSecret,
    "gcp": GCPSecret,
    "github": GitHubSecret,
    "huggingface": HuggingFaceSecret,
    "kubernetes": KubeConfigSecret,
    "lambda": LambdaSecret,
    "docker": DockerRegistrySecret,
    # SSH secrets
    "ssh": SSHSecret,
    "sky": SkySecret,
    # API key secrets
    "anthropic": AnthropicSecret,
    "cohere": CohereSecret,
    "langchain": LangChainSecret,
    "openai": OpenAISecret,
    "pinecone": PineconeSecret,
    "wandb": WandBSecret,
}


def _get_provider_class(provider_str):
    """Return the secret class registered for ``provider_str``, falling back
    to the generic ``ProviderSecret`` for unknown providers."""
    return _str_to_provider_class.get(provider_str, ProviderSecret)
10 | """ 11 | 12 | _PROVIDER = "sky" 13 | _DEFAULT_KEY = "sky-key" 14 | 15 | def __init__( 16 | self, 17 | name: Optional[str] = None, 18 | provider: Optional[str] = None, 19 | values: Dict = {}, 20 | path: str = None, 21 | dryrun: bool = True, 22 | **kwargs, 23 | ): 24 | super().__init__( 25 | name=name, provider=provider, values=values, path=path, dryrun=dryrun 26 | ) 27 | 28 | @staticmethod 29 | def from_config(config: dict, dryrun: bool = False, _resolve_children: bool = True): 30 | return SkySecret(**config, dryrun=dryrun) 31 | -------------------------------------------------------------------------------- /runhouse/resources/secrets/provider_secrets/wandb_secret.py: -------------------------------------------------------------------------------- 1 | from runhouse.resources.secrets.provider_secrets.api_key_secret import ApiKeySecret 2 | 3 | 4 | class WandBSecret(ApiKeySecret): 5 | """ 6 | .. note:: 7 | To create an WandBSecret, please use the factory method :func:`provider_secret` with ``provider="wandb"``. 
8 | """ 9 | 10 | _PROVIDER = "wandb" 11 | _DEFAULT_ENV_VARS = {"api_key": "WANDB_API_KEY"} 12 | 13 | @staticmethod 14 | def from_config(config: dict, dryrun: bool = False, _resolve_children: bool = True): 15 | return WandBSecret(**config, dryrun=dryrun) 16 | -------------------------------------------------------------------------------- /runhouse/rns/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/runhouse/rns/__init__.py -------------------------------------------------------------------------------- /runhouse/rns/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/runhouse/rns/utils/__init__.py -------------------------------------------------------------------------------- /runhouse/rns/utils/api.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import datetime 3 | import json 4 | import os 5 | import uuid 6 | from enum import Enum 7 | 8 | from requests import Response 9 | 10 | 11 | def timing(func): 12 | def wrapper(*args, **kwargs): 13 | import time 14 | 15 | start = time.time() 16 | result = func(*args, **kwargs) 17 | end = time.time() 18 | print(f"Finished {func.__name__.title()} in {int((end - start))} seconds") 19 | return result 20 | 21 | return wrapper 22 | 23 | 24 | def remove_null_values_from_dict(source_dic: dict) -> dict: 25 | return {k: v for k, v in source_dic.items() if v is not None} 26 | 27 | 28 | def load_resp_content(resp: Response) -> dict: 29 | return json.loads(resp.content) 30 | 31 | 32 | def read_resp_data(resp: Response): 33 | return load_resp_content(resp).get("data", {}) 34 | 35 | 36 | def to_bool(value): 37 | try: 38 | return ast.literal_eval(value) 39 | except: 40 | return value 41 | 42 | 43 | def 
def relative_file_path(file_path: str):
    """Convert to a relative path if it is not already one."""
    # Already home-relative -- nothing to do.
    if file_path.startswith("~"):
        return file_path

    # Express the path relative to the home directory, normalizing Windows
    # separators, and prefix with "~/".
    home = os.path.expanduser("~")
    rel = os.path.relpath(file_path, home).replace("\\", "/")
    return rel if rel.startswith("~") else f"~/{rel}"
def _generate_default_path(cls, name, system):
    """Generate a default path for a data resource. Logic is as follows:
    1. If the system is a local file system, save to the current working directory
    2. If the system is a remote file system, save to the default cache folder
    3. If the system is a remote object store, save to the default object store folder
    """

    from runhouse.resources.hardware import Cluster

    system = _get_cluster_from(system)

    name = name or generate_default_name(prefix=cls.RESOURCE_TYPE)
    # Bug fix: the original condition `system == rns_client.DEFAULT_FS or "here"`
    # parsed as `(system == rns_client.DEFAULT_FS) or "here"`, and the non-empty
    # string "here" is always truthy -- so EVERY system (clusters and blob
    # storage included) took the local-folder branch and the cluster/blob
    # branches below were unreachable.
    if system in (rns_client.DEFAULT_FS, "here"):
        base_folder = DEFAULT_LOCAL_FOLDER
    elif isinstance(system, Cluster):
        if system.on_this_cluster():
            base_folder = DEFAULT_LOCAL_FOLDER
        else:
            # Lands in the home directory when sent to a remote cluster.
            base_folder = DEFAULT_CLUSTER_FS_FOLDER
    else:
        base_folder = DEFAULT_BLOB_STORAGE_FOLDER
    return f"{base_folder}{cls.RESOURCE_TYPE}/{name}"
https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/runhouse/servers/caddy/__init__.py -------------------------------------------------------------------------------- /runhouse/servers/http/__init__.py: -------------------------------------------------------------------------------- 1 | from .http_client import HTTPClient 2 | -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/scripts/__init__.py -------------------------------------------------------------------------------- /scripts/build_package.sh: -------------------------------------------------------------------------------- 1 | # Delete dist directory if exists 2 | rm -r dist 3 | # Run from base directory of runhouse project 4 | python3 -m build --sdist --wheel 5 | #twine upload --repository testpypi dist/* 6 | twine upload dist/* 7 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/README.md: -------------------------------------------------------------------------------- 1 | 2 | The guides in this folder can be used to setup Kubernetes clusters on EKS, GKE, or AKS. 3 | 4 | When using Kubernetes please make sure you have the following: 5 | - `pip install kubernetes` 6 | - `kubectl` access 7 | - Ensure you have the `AWSKeyManagementServicePowerUser` IAM policy enabled. 
(For EKS only) 8 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-aks/0-locals.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | env = "dev" 3 | region = "eastus2" 4 | resource_group_name = "skyakstestrg" 5 | eks_name = "skyakstest" # Note: AKS cluster name will be dev-{eks_name} 6 | eks_version = "1.28" 7 | } 8 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-aks/1-provider.tf: -------------------------------------------------------------------------------- 1 | provider "azurerm" { 2 | features {} 3 | } 4 | 5 | terraform { 6 | required_providers { 7 | azurerm = { 8 | source = "hashicorp/azurerm" 9 | version = "3.75.0" 10 | } 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-aks/2-resource-group.tf: -------------------------------------------------------------------------------- 1 | resource "azurerm_resource_group" "this" { 2 | name = local.resource_group_name 3 | location = local.region 4 | } 5 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-aks/3-vpc.tf: -------------------------------------------------------------------------------- 1 | resource "azurerm_virtual_network" "this" { 2 | name = "main" 3 | address_space = ["10.0.0.0/16"] 4 | location = azurerm_resource_group.this.location 5 | resource_group_name = azurerm_resource_group.this.name 6 | 7 | tags = { 8 | env = local.env 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-aks/4-subnets.tf: -------------------------------------------------------------------------------- 1 | resource "azurerm_subnet" "subnet1" { 2 | name = "subnet1" 3 | address_prefixes = ["10.0.0.0/19"] 4 | resource_group_name = 
azurerm_resource_group.this.name 5 | virtual_network_name = azurerm_virtual_network.this.name 6 | } 7 | 8 | resource "azurerm_subnet" "subnet2" { 9 | name = "subnet2" 10 | address_prefixes = ["10.0.32.0/19"] 11 | resource_group_name = azurerm_resource_group.this.name 12 | virtual_network_name = azurerm_virtual_network.this.name 13 | } 14 | 15 | # If you want to use existing subnet 16 | # data "azurerm_subnet" "subnet1" { 17 | # name = "subnet1" 18 | # virtual_network_name = "main" 19 | # resource_group_name = "tutorial" 20 | # } 21 | 22 | # output "subnet_id" { 23 | # value = data.azurerm_subnet.subnet1.id 24 | # } 25 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-aks/5-aks.tf: -------------------------------------------------------------------------------- 1 | resource "azurerm_user_assigned_identity" "base" { 2 | name = "base" 3 | location = azurerm_resource_group.this.location 4 | resource_group_name = azurerm_resource_group.this.name 5 | } 6 | 7 | resource "azurerm_role_assignment" "base" { 8 | scope = azurerm_resource_group.this.id 9 | role_definition_name = "Network Contributor" 10 | principal_id = azurerm_user_assigned_identity.base.principal_id 11 | } 12 | 13 | resource "azurerm_kubernetes_cluster" "this" { 14 | name = "${local.env}-${local.eks_name}" # AKS cluster name gets set here 15 | location = azurerm_resource_group.this.location 16 | resource_group_name = azurerm_resource_group.this.name 17 | dns_prefix = "devaks1" 18 | 19 | kubernetes_version = local.eks_version 20 | automatic_channel_upgrade = "stable" 21 | private_cluster_enabled = false 22 | node_resource_group = "${local.resource_group_name}-${local.env}-${local.eks_name}" 23 | 24 | # It's in Preview 25 | # api_server_access_profile { 26 | # vnet_integration_enabled = true 27 | # subnet_id = azurerm_subnet.subnet1.id 28 | # } 29 | 30 | # For production change to "Standard" 31 | sku_tier = "Standard" # Can also be set to Free 32 | 
33 | oidc_issuer_enabled = true 34 | workload_identity_enabled = true 35 | 36 | network_profile { 37 | network_plugin = "azure" 38 | dns_service_ip = "10.0.64.10" 39 | service_cidr = "10.0.64.0/19" 40 | } 41 | 42 | default_node_pool { 43 | name = "general" 44 | vm_size = "Standard_D3_v2" # 4 vCPU, 14 GiB Memory 45 | vnet_subnet_id = azurerm_subnet.subnet1.id 46 | orchestrator_version = local.eks_version 47 | type = "VirtualMachineScaleSets" 48 | enable_auto_scaling = true 49 | node_count = 1 50 | min_count = 1 51 | max_count = 10 52 | 53 | node_labels = { 54 | role = "general" 55 | } 56 | } 57 | 58 | identity { 59 | type = "UserAssigned" 60 | identity_ids = [azurerm_user_assigned_identity.base.id] 61 | } 62 | 63 | tags = { 64 | env = local.env 65 | } 66 | 67 | lifecycle { 68 | ignore_changes = [default_node_pool[0].node_count] 69 | } 70 | 71 | depends_on = [ 72 | azurerm_role_assignment.base 73 | ] 74 | } 75 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-aks/README.md: -------------------------------------------------------------------------------- 1 | To spin up an AKS cluster in Azure, please follow the below steps. 2 | 3 | Clone down this repository and modify locals.tf to contain your desired region, resource group name, and cluster name. Optionally modify the K8s version to reflect the 4 | latest one. 5 | 6 | In subnets.tf, you can use your existing subnets or create new ones. If you do not modify it, it will create new subnets in Azure. 7 | 8 | Lastly, you can also modify some settings in aks.tf to reflect your need. The area of most interest is the `default node pool` where you may adjust the VM type, auto scaling, 9 | and min / max node count. 10 | 11 | Once you are ready with your TF scripts, you will begin by logging into Azure via the CLI. 12 | 13 | Open a terminal and run `brew install azure-cli`. Then, run `az login`. 
14 | NOTE: You may need to add a `TENANT_ID` argument to the `az login` command. 15 | 16 | Authenticate with `az login`, making sure your terminal has access to your Azure Cloud account. 17 | 18 | Next, find your subscription ID by running `az account list`. Copy your subscription's ID and then run 19 | 20 | `az account set --subscription SUBSCRIPTION_ID` 21 | 22 | Finally, run your standard TF commands to deploy this AKS cluster. 23 | 24 | `terraform init` 25 | 26 | `terraform validate` 27 | 28 | `terraform plan -out tf_plan` 29 | 30 | `terraform apply "tf_plan"` 31 | 32 | To get access to your AKS cluster, you will need its kubeconfig locally. To obtain this run, 33 | 34 | `az aks get-credentials --resource-group RESOURCE_GROUP_NAME --name AKS_CLUSTER_NAME`. 35 | 36 | Note now that ~/.kube/config's contents will be updated with the kubeconfig of your AKS cluster. 37 | 38 | Finally, test your connection by running `kubectl get nodes` 39 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-eks/README.md: -------------------------------------------------------------------------------- 1 | To spin up an EKS cluster in AWS, 2 | 3 | Simply change the commented fields in main.tf and run the standard TF commands. Ensure that you have the AWS CLI setup with the correct permissions and access keys, etc. 4 | 5 | `terraform init` 6 | 7 | `terraform validate` 8 | 9 | `terraform plan -out eks_plan` 10 | 11 | `terraform apply "eks_plan"` 12 | 13 | You should also run `aws eks update-kubeconfig --region us-east-1 --name NAME_OF_EKS_CLUSTER` to update your kubeconfig. 
14 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-eks/main.tf: -------------------------------------------------------------------------------- 1 | # Script to spin up a quick EKS cluster that can be used for testing Runhouse Kubernetes support 2 | 3 | provider "aws" { 4 | region = local.region 5 | } 6 | 7 | locals { 8 | name = "eks-clus-1" # change this field 9 | region = "us-east-1" 10 | 11 | vpc_cidr = "10.123.0.0/16" 12 | azs = ["us-east-1a", "us-east-1b"] 13 | 14 | public_subnets = ["10.123.1.0/24", "10.123.2.0/24"] 15 | private_subnets = ["10.123.3.0/24", "10.123.4.0/24"] 16 | intra_subnets = ["10.123.5.0/24", "10.123.6.0/24"] 17 | 18 | tags = { 19 | Example = local.name 20 | } 21 | } 22 | 23 | module "vpc" { 24 | source = "terraform-aws-modules/vpc/aws" 25 | version = "~> 4.0" 26 | 27 | name = local.name 28 | cidr = local.vpc_cidr 29 | 30 | azs = local.azs 31 | private_subnets = local.private_subnets 32 | public_subnets = local.public_subnets 33 | intra_subnets = local.intra_subnets 34 | 35 | enable_nat_gateway = true 36 | 37 | public_subnet_tags = { 38 | "kubernetes.io/role/elb" = 1 39 | } 40 | 41 | private_subnet_tags = { 42 | "kubernetes.io/role/internal-elb" = 1 43 | } 44 | } 45 | 46 | module "eks" { 47 | source = "terraform-aws-modules/eks/aws" 48 | version = "19.16.0" 49 | 50 | cluster_name = local.name 51 | cluster_endpoint_public_access = true 52 | 53 | cluster_addons = { 54 | coredns = { 55 | most_recent = true 56 | } 57 | kube-proxy = { 58 | most_recent = true 59 | } 60 | vpc-cni = { 61 | most_recent = true 62 | } 63 | } 64 | 65 | vpc_id = module.vpc.vpc_id 66 | subnet_ids = module.vpc.private_subnets 67 | control_plane_subnet_ids = module.vpc.intra_subnets 68 | 69 | # EKS Managed Node Group(s) 70 | eks_managed_node_group_defaults = { 71 | ami_type = "AL2_x86_64" 72 | instance_types = ["m6i.xlarge"] 73 | 74 | attach_cluster_primary_security_group = true 75 | } 76 | 77 | 
eks_managed_node_groups = { 78 | eks-clus-1-wg = { # change this field 79 | min_size = 2 80 | max_size = 10 81 | desired_size = 4 82 | 83 | instance_types = ["m6i.xlarge"] # 4 vCPU 16 GB Memory 84 | capacity_type = "ON_DEMAND" 85 | 86 | tags = { 87 | ExtraTag = "helloworld" 88 | } 89 | } 90 | } 91 | 92 | tags = local.tags 93 | } 94 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-gke/1-provider.tf: -------------------------------------------------------------------------------- 1 | # https://registry.terraform.io/providers/hashicorp/google/latest/docs 2 | provider "google" { 3 | project = "runhouse-prod" # project name needs to exist in GCP already 4 | region = "us-east1" 5 | } 6 | 7 | # https://www.terraform.io/language/settings/backends/gcs 8 | terraform { 9 | required_providers { 10 | google = { 11 | source = "hashicorp/google" 12 | version = "~> 4.0" 13 | } 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-gke/2-vpc.tf: -------------------------------------------------------------------------------- 1 | # https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/google_project_service 2 | resource "google_project_service" "compute" { 3 | service = "compute.googleapis.com" 4 | } 5 | 6 | resource "google_project_service" "container" { 7 | service = "container.googleapis.com" 8 | } 9 | 10 | # https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_network 11 | resource "google_compute_network" "main" { 12 | name = "main" 13 | routing_mode = "REGIONAL" 14 | auto_create_subnetworks = false 15 | mtu = 1460 16 | delete_default_routes_on_create = false 17 | 18 | depends_on = [ 19 | google_project_service.compute, 20 | google_project_service.container 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- 
/scripts/kubernetes_cluster/tf-gke/3-subnets.tf: -------------------------------------------------------------------------------- 1 | # https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_subnetwork 2 | resource "google_compute_subnetwork" "private" { 3 | name = "private" 4 | ip_cidr_range = "10.0.0.0/18" 5 | region = "us-east1" 6 | network = google_compute_network.main.id 7 | private_ip_google_access = true 8 | 9 | secondary_ip_range { # CIDR for pods 10 | range_name = "k8s-pod-range" 11 | ip_cidr_range = "10.48.0.0/14" 12 | } 13 | secondary_ip_range { # CIDR for services 14 | range_name = "k8s-service-range" 15 | ip_cidr_range = "10.52.0.0/20" 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-gke/4-router.tf: -------------------------------------------------------------------------------- 1 | # https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_router 2 | resource "google_compute_router" "router" { 3 | name = "router" 4 | region = "us-east1" 5 | network = google_compute_network.main.id 6 | } 7 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-gke/5-nat.tf: -------------------------------------------------------------------------------- 1 | # https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_router_nat 2 | resource "google_compute_router_nat" "nat" { 3 | name = "nat" 4 | router = google_compute_router.router.name 5 | region = "us-east1" 6 | 7 | source_subnetwork_ip_ranges_to_nat = "LIST_OF_SUBNETWORKS" 8 | nat_ip_allocate_option = "MANUAL_ONLY" 9 | 10 | subnetwork { 11 | name = google_compute_subnetwork.private.id 12 | source_ip_ranges_to_nat = ["ALL_IP_RANGES"] 13 | } 14 | 15 | nat_ips = [google_compute_address.nat.self_link] 16 | } 17 | 18 | # 
https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_address 19 | resource "google_compute_address" "nat" { 20 | name = "nat" 21 | address_type = "EXTERNAL" 22 | network_tier = "STANDARD" # can also be set to PREMIUM 23 | 24 | depends_on = [google_project_service.compute] 25 | } 26 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-gke/6-firewalls.tf: -------------------------------------------------------------------------------- 1 | # https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_firewall 2 | resource "google_compute_firewall" "allow-ssh" { 3 | name = "allow-ssh" 4 | network = google_compute_network.main.name 5 | 6 | allow { 7 | protocol = "tcp" 8 | ports = ["22"] 9 | } 10 | 11 | source_ranges = ["0.0.0.0/0"] # (warning) allows any IP to connect over SSH 12 | } 13 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-gke/7-kubernetes.tf: -------------------------------------------------------------------------------- 1 | # https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster 2 | resource "google_container_cluster" "primary" { 3 | name = "primary" 4 | location = "us-east1" # can deploy to a region or multiple AZs 5 | remove_default_node_pool = true 6 | initial_node_count = 1 7 | network = google_compute_network.main.self_link 8 | subnetwork = google_compute_subnetwork.private.self_link 9 | logging_service = "logging.googleapis.com/kubernetes" 10 | monitoring_service = "monitoring.googleapis.com/kubernetes" 11 | networking_mode = "VPC_NATIVE" 12 | 13 | # Optional, if you want multi-zonal cluster 14 | # node_locations = [ 15 | # "us-east1-b" 16 | # ] 17 | 18 | addons_config { 19 | horizontal_pod_autoscaling { 20 | disabled = false 21 | } 22 | } 23 | 24 | release_channel { 25 | channel = "REGULAR" 26 | } 27 | 28 | 
workload_identity_config { 29 | workload_pool = "runhouse-prod.svc.id.goog" 30 | } 31 | 32 | ip_allocation_policy { 33 | cluster_secondary_range_name = "k8s-pod-range" 34 | services_secondary_range_name = "k8s-service-range" 35 | } 36 | 37 | private_cluster_config { 38 | enable_private_nodes = true 39 | enable_private_endpoint = false 40 | master_ipv4_cidr_block = "172.16.0.0/28" 41 | } 42 | 43 | # Jenkins use case 44 | # master_authorized_networks_config { 45 | # cidr_blocks { 46 | # cidr_block = "10.0.0.0/18" 47 | # display_name = "private-subnet-w-jenkins" 48 | # } 49 | # } 50 | } 51 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-gke/8-node-pools.tf: -------------------------------------------------------------------------------- 1 | # https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/google_service_account 2 | resource "google_service_account" "kubernetes" { 3 | account_id = "kubernetes" 4 | } 5 | 6 | # https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_node_pool 7 | resource "google_container_node_pool" "general" { 8 | name = "general" 9 | cluster = google_container_cluster.primary.id 10 | node_count = 1 11 | 12 | management { 13 | auto_repair = true 14 | auto_upgrade = true 15 | } 16 | 17 | node_config { 18 | preemptible = false 19 | machine_type = "e2-small" 20 | 21 | labels = { 22 | role = "general" 23 | } 24 | 25 | service_account = google_service_account.kubernetes.email 26 | oauth_scopes = [ 27 | "https://www.googleapis.com/auth/cloud-platform" 28 | ] 29 | } 30 | } 31 | 32 | resource "google_container_node_pool" "regular" { 33 | name = "regular" 34 | cluster = google_container_cluster.primary.id 35 | 36 | management { 37 | auto_repair = true 38 | auto_upgrade = true 39 | } 40 | 41 | autoscaling { 42 | min_node_count = 1 43 | max_node_count = 10 44 | } 45 | 46 | node_config { 47 | preemptible = true 48 | machine_type = 
"c3-standard-4" # 4 vCPU 16 GB Memory 49 | 50 | service_account = google_service_account.kubernetes.email 51 | oauth_scopes = [ 52 | "https://www.googleapis.com/auth/cloud-platform" 53 | ] 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /scripts/kubernetes_cluster/tf-gke/README.md: -------------------------------------------------------------------------------- 1 | Basic GKE cluster setup guide in Terraform 2 | 3 | To spin up a GKE cluster in GCP, please follow the below steps: 4 | 5 | `brew install --cask google-cloud-sdk` 6 | 7 | Ensure your have GCP access first with the appropriate permissions level. 8 | 9 | `gcloud auth application-default login` This will prompt you to login via browser, using your gmail account. 10 | 11 | `gcloud auth application-default set-quota-project runhouse-prod` 12 | 13 | `terraform init` 14 | 15 | `terraform validate` 16 | 17 | `terraform plan -out gke_plan` 18 | 19 | `terraform apply "gke_plan"` 20 | 21 | 22 | `gcloud config set project runhouse-prod` 23 | 24 | `gcloud components install gke-gcloud-auth-plugin` This is neccesary for kubectl with GKE to work 25 | 26 | Finally, go to your GKE cluster in the GCP console and copy the command found by pressing the `Connect` tab. Run this command. 
27 | 28 | Test your access to the GKE cluster by running `kubectl get nodes` 29 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/tests/__init__.py -------------------------------------------------------------------------------- /tests/constants.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | TEST_ORG = "test-org" 4 | TESTING_LOG_LEVEL = "DEBUG" 5 | TESTING_AUTOSTOP_INTERVAL = 15 6 | 7 | TEST_ENV_VARS = { 8 | "var1": "val1", 9 | "var2": "val2", 10 | "RH_LOG_LEVEL": os.getenv("RH_LOG_LEVEL") or TESTING_LOG_LEVEL, 11 | "RH_AUTOSTOP_INTERVAL": str( 12 | os.getenv("RH_AUTOSTOP_INTERVAL") or TESTING_AUTOSTOP_INTERVAL 13 | ), 14 | } 15 | 16 | TEST_REQS = [ 17 | "pytest", 18 | "httpx", 19 | "pytest_asyncio", 20 | "pandas", 21 | "numpy<=1.26.4", 22 | ] 23 | 24 | DEFAULT_KEYPAIR_KEYPATH = "~/.ssh/sky-key" 25 | -------------------------------------------------------------------------------- /tests/fixtures/test_fake_package/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="test_fake_package", 5 | version="0.1", 6 | packages=find_packages(), 7 | install_requires=[], 8 | author="Rohin Bhasin", 9 | author_email="bhasin.rohin@gmail.com", 10 | description="A simple example package", 11 | python_requires=">=3.6", 12 | ) 13 | -------------------------------------------------------------------------------- /tests/fixtures/test_fake_package/test_fake_package/__init__.py: -------------------------------------------------------------------------------- 1 | from .function_to_import import editable_package_function 2 | from .module_to_import import TestModuleFromPackage 3 | 
-------------------------------------------------------------------------------- /tests/fixtures/test_fake_package/test_fake_package/function_to_import.py: -------------------------------------------------------------------------------- 1 | def editable_package_function(): 2 | print("Hello from the editable package!") 3 | return "Hello from the editable package!" 4 | -------------------------------------------------------------------------------- /tests/fixtures/test_fake_package/test_fake_package/module_to_import.py: -------------------------------------------------------------------------------- 1 | class TestModuleFromPackage: 2 | @staticmethod 3 | def hello_world(): 4 | print("Hello from the editable package module!") 5 | return "Hello from the editable package module!" 6 | -------------------------------------------------------------------------------- /tests/fixtures/test_fake_package_copy/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="test_fake_package_copy", 5 | version="0.1", 6 | packages=find_packages(), 7 | install_requires=[], 8 | author="Rohin Bhasin", 9 | author_email="bhasin.rohin@gmail.com", 10 | description="A simple example package", 11 | python_requires=">=3.6", 12 | ) 13 | -------------------------------------------------------------------------------- /tests/fixtures/test_fake_package_copy/test_fake_package_copy/__init__.py: -------------------------------------------------------------------------------- 1 | from .function_to_import import editable_package_function 2 | from .module_to_import import TestModuleFromPackage 3 | -------------------------------------------------------------------------------- /tests/fixtures/test_fake_package_copy/test_fake_package_copy/function_to_import.py: -------------------------------------------------------------------------------- 1 | def editable_package_function(): 2 | print("Hello from the 
editable package!") 3 | return "Hello from the editable package!" 4 | -------------------------------------------------------------------------------- /tests/fixtures/test_fake_package_copy/test_fake_package_copy/module_to_import.py: -------------------------------------------------------------------------------- 1 | class TestModuleFromPackage: 2 | @staticmethod 3 | def hello_world(): 4 | print("Hello from the editable package module!") 5 | return "Hello from the editable package module!" 6 | -------------------------------------------------------------------------------- /tests/fixtures/utils.py: -------------------------------------------------------------------------------- 1 | def create_s3_bucket(bucket_name: str): 2 | """Create bucket in S3 if it does not already exist.""" 3 | from sky.data.storage import S3Store 4 | 5 | s3_store = S3Store(name=bucket_name, source="") 6 | return s3_store 7 | 8 | 9 | def create_gcs_bucket(bucket_name: str): 10 | """Create bucket in GS if it does not already exist.""" 11 | from sky.data.storage import GcsStore 12 | 13 | gcs_store = GcsStore(name=bucket_name, source="") 14 | return gcs_store 15 | -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-mock 3 | httpx < 0.28.0 4 | pytest_asyncio 5 | datasets 6 | dask 7 | tqdm 8 | fastapi 9 | ray[default]>=2.9.0 10 | 11 | # packages for local and unit tests 12 | boto3 13 | google-cloud-storage 14 | docker 15 | pandas 16 | numpy<=1.26.4 17 | openapi-core==0.19.1 18 | plotly 19 | 20 | # packages for minimal+ tests 21 | skypilot==0.7.0 22 | 23 | # requests must be lowered 24 | requests<2.32.0 25 | -------------------------------------------------------------------------------- /tests/test_den/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/tests/test_den/__init__.py -------------------------------------------------------------------------------- /tests/test_den/test_defaults.py: -------------------------------------------------------------------------------- 1 | import runhouse as rh 2 | 3 | 4 | def test_download_defaults(): 5 | rh.globals.configs.defaults_cache["default_folder"] = "nonsense" 6 | local_defaults = rh.configs.load_defaults_from_file() 7 | local_defaults.pop("secrets") 8 | rh.configs.upload_defaults(defaults=local_defaults) 9 | loaded_defaults = rh.configs.load_defaults_from_den() 10 | assert local_defaults == loaded_defaults 11 | assert rh.globals.rns_client.default_folder == "nonsense" 12 | -------------------------------------------------------------------------------- /tests/test_login.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import runhouse as rh 4 | import sky 5 | from runhouse.rns.login import _login_download_secrets 6 | 7 | 8 | def add_secrets_to_vault(headers): 9 | """Add some test secrets to Vault""" 10 | # Add real credentials for AWS and SKY to test sky status 11 | rh.provider_secret( 12 | name="/aws", # add backslash / to name to force it to be vault secret 13 | provider="aws", 14 | values={ 15 | "access_key": os.getenv("TEST_AWS_ACCESS_KEY"), 16 | "secret_key": os.getenv("TEST_AWS_SECRET_KEY"), 17 | }, 18 | ).save(headers=headers) 19 | 20 | rh.provider_secret( 21 | name="/sky", 22 | provider="sky", 23 | values={ 24 | "private_key": os.getenv("TEST_SKY_PRIVATE_KEY"), 25 | "public_key": os.getenv("TEST_SKY_PUBLIC_KEY"), 26 | }, 27 | ).save(headers=headers) 28 | 29 | rh.provider_secret( 30 | name="/snowflake", 31 | provider="snowflake", 32 | values={"token": "ABCD1234"}, 33 | ).save(headers=headers) 34 | 35 | 36 | def test_login_flow_in_new_env(): 37 | token = os.getenv("KITCHEN_TESTER_TOKEN") 38 | headers = 
{"Authorization": f"Bearer {token}"} 39 | 40 | add_secrets_to_vault(headers) 41 | 42 | secrets_in_vault = rh.Secret.vault_secrets(headers=headers) 43 | assert secrets_in_vault, "No secrets found in Vault" 44 | 45 | # Run login download secrets stored in Vault into the new env 46 | _login_download_secrets(headers=headers) 47 | 48 | # Once secrets are saved down to their local config, confirm we have sky enabled 49 | sky.check.check(quiet=True) 50 | clouds = sky.global_user_state.get_enabled_clouds() 51 | cloud_names = [str(c).lower() for c in clouds] 52 | assert "aws" in cloud_names 53 | 54 | for secret in secrets_in_vault.values(): 55 | secret.delete(headers=headers) 56 | 57 | secrets_in_vault = rh.Secret.vault_secrets(headers=headers) 58 | assert not secrets_in_vault 59 | -------------------------------------------------------------------------------- /tests/test_performance.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import requests 4 | 5 | from runhouse.globals import rns_client 6 | from runhouse.logger import get_logger 7 | 8 | logger = get_logger(__name__) 9 | 10 | 11 | def profile(func, reps=10): 12 | times = [] 13 | for _ in range(reps): 14 | start = time.time() 15 | assert func() 16 | times.append(round((time.time() - start) * 1000, 2)) 17 | return times, sum(times) / len(times) 18 | 19 | 20 | def run_performance_tests(summer_func): 21 | cluster = summer_func.system 22 | times_list, avg_time = profile(lambda: summer_func.system.keys() is not None) 23 | print(f"Listing keys took {round(avg_time, 2)} ms: {times_list}") 24 | 25 | times_list, avg_time = profile(lambda: summer_func(1, 5) == 6) 26 | print(f"Call with logs took {round(avg_time, 2)} ms: {times_list}") 27 | 28 | times_list, avg_time = profile(lambda: summer_func(1, 5, stream_logs=False) == 6) 29 | print(f"Call without logs took {round(avg_time, 2)} ms: {times_list}") 30 | 31 | port = cluster.client.port 32 | suffix = "https" if 
cluster._use_https else "http" 33 | address = cluster.server_address 34 | 35 | call_url = f"{suffix}://{address}:{port}/summer_func/call/?serialization=None" 36 | logger.info(f"Call url: {call_url}") 37 | times_list, avg_time = profile( 38 | lambda: requests.post( 39 | call_url, 40 | json={"args": [1, 2]}, 41 | headers=rns_client.request_headers(cluster.rns_address) 42 | if cluster.den_auth 43 | else None, 44 | verify=cluster.client.verify, 45 | ).json() 46 | == 3 47 | ) 48 | print(f"{suffix} call took {round(avg_time, 2)} ms: {times_list}") 49 | 50 | 51 | def test_roundtrip_performance(summer_func): 52 | run_performance_tests(summer_func) 53 | 54 | 55 | def test_https_roundtrip_performance(summer_func_with_auth): 56 | run_performance_tests(summer_func_with_auth) 57 | -------------------------------------------------------------------------------- /tests/test_requirements/aws_test_requirements.txt: -------------------------------------------------------------------------------- 1 | awscli==1.29.17 2 | boto3==1.28.17 3 | pycryptodome==3.12.0 4 | -------------------------------------------------------------------------------- /tests/test_requirements/google_tests_requirements.txt: -------------------------------------------------------------------------------- 1 | google-api-python-client 2 | google-cloud-storage 3 | gcsfs 4 | -------------------------------------------------------------------------------- /tests/test_requirements/tutorial_requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | diffusers 3 | transformers 4 | -------------------------------------------------------------------------------- /tests/test_resources/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-house/runhouse/2e70c71d6c79459cf88f5375268e5ef679a86e7a/tests/test_resources/__init__.py 
def get_uname():
    # Simple payload executed remotely to verify basic function dispatch;
    # os.uname() proves the call actually ran inside the container's OS.
    return os.uname()


@pytest.mark.level("release")
def test_docker_cluster():
    """End-to-end test of `rh.DockerCluster` against a locally built image.

    If the cluster isn't up: builds the slim Runhouse image when missing,
    starts a container exposing the Runhouse server port (32300), and
    restarts the server inside it. Then checks that functions can be sent
    to and called on the cluster.
    """
    import docker

    client = docker.from_env()

    cluster = rh.DockerCluster(
        name="test-cluster",
        container_name="runhouse-test-container",
    )
    if not cluster.is_up():
        # Locate the repo root via the installed runhouse package so the
        # Dockerfile path resolves regardless of the test's working directory.
        rh_parent_path = Path(importlib.util.find_spec("runhouse").origin).parent.parent
        dockerfile_path = rh_parent_path / "docker/slim"
        # Rebuild the image if not already built
        if not client.images.list(name="runhouse-slim"):
            client.images.build(
                path=".",
                dockerfile=str(dockerfile_path),
                tag="runhouse-slim",
            )
        container = client.containers.run(
            "runhouse-slim",
            command="tail -f /dev/null",  # no-op command keeps the container alive
            detach=True,
            ports={"32300": 32300},
            shm_size="3gb",  # Needed by Ray
            name="runhouse-test-container",
        )
        container.start()
        # Installs the local runhouse version inside the container and starts the server,
        # skip if you've pre-installed runhouse[server] in the image and started the server in the docker CMD
        cluster.restart_server()

    cluster.install_packages(["pytest"])

    # sync_local=False: rely on the runhouse already installed in the image
    # rather than syncing the local checkout over.
    ray_resources = rh.function(ray.available_resources).to(cluster, sync_local=False)
    assert ray_resources()

    get_uname_dc = rh.function(get_uname).to(cluster)
    assert get_uname_dc()


class ExceptionModule(rh.Module):
    """Trivial rh.Module whose source file imports plotly at load time.

    NOTE(review): presumably used to exercise error handling when a module's
    import-time dependency (plotly) is missing on the cluster — confirm
    against the tests that instantiate it.
    """

    def __init__(self):
        super().__init__()

    def test_fn(self):
        # Intentionally trivial; the interesting behavior is the module's
        # import side effect, not this method.
        return None
import pytest


@pytest.fixture(scope="session")
def package(request):
    """Indirect fixture: resolve whichever package fixture was parametrized in.

    Lets the same test run against multiple storage-backed package types.
    """
    return request.getfixturevalue(request.param)


@pytest.fixture
def local_package(local_folder):
    """Package backed by a local folder; recorded in init_args for tracking."""
    import runhouse as rh

    from tests.conftest import init_args

    kwargs = {"path": local_folder.path, "install_method": "local"}
    pkg = rh.package(**kwargs)
    init_args[id(pkg)] = kwargs
    return pkg


@pytest.fixture
def s3_package(s3_folder):
    """Package backed by an S3 folder."""
    import runhouse as rh

    return rh.package(
        path=s3_folder.path, system=s3_folder.system, install_method="local"
    )


def summer(a: int, b: int):
    """Add two ints; the canonical remote-callable test payload."""
    print("Running summer function")
    return a + b


def save_and_load_artifacts():
    """Save a builtin cluster config, reload it by name, return its name."""
    import runhouse as rh

    saved = rh.ondemand_cluster("^rh-cpu").save()
    return rh.load(name=saved.name).name


def slow_running_func(a, b):
    """Like summer, but stalls 20s — used to test long-running calls."""
    from time import sleep

    sleep(20)
    return a + b


@pytest.fixture(scope="session")
def summer_func(local_launched_ondemand_aws_docker_cluster):
    """summer deployed to the Docker-launched AWS cluster; args tracked."""
    import runhouse as rh

    from tests.conftest import init_args

    fn_args = {"name": "summer_func", "fn": summer}
    remote_fn = rh.function(**fn_args).to(local_launched_ondemand_aws_docker_cluster)
    init_args[id(remote_fn)] = fn_args
    return remote_fn


@pytest.fixture(scope="session")
def summer_func_with_auth(ondemand_aws_https_cluster_with_auth):
    """summer deployed to an HTTPS cluster with Den auth enabled."""
    import runhouse as rh

    return rh.function(summer, name="summer_func").to(
        ondemand_aws_https_cluster_with_auth
    )


@pytest.fixture(scope="session")
def summer_func_shared(shared_cluster):
    """summer deployed to a cluster shared with another account."""
    import runhouse as rh

    return rh.function(summer, name="summer_func").to(shared_cluster)


@pytest.fixture(scope="session")
def func_with_artifacts(local_launched_ondemand_aws_docker_cluster):
    """save_and_load_artifacts deployed remotely; exercises Den save/load."""
    import runhouse as rh

    return rh.function(save_and_load_artifacts, name="artifacts_func").to(
        local_launched_ondemand_aws_docker_cluster
    )


@pytest.fixture(scope="session")
def slow_func(local_launched_ondemand_aws_docker_cluster):
    """slow_running_func deployed remotely; used for timeout/streaming tests."""
    import runhouse as rh

    return rh.function(slow_running_func, name="slow_func").to(
        local_launched_ondemand_aws_docker_cluster
    )
import fastapi

# Minimal FastAPI app used as a fixture for wrapping arbitrary ASGI apps
# as Runhouse server modules.
app = fastapi.FastAPI()


@app.get("/summer/{a}")
def summer(a: int, b: int):
    # `a` is bound from the path; `b` has no default so FastAPI reads it
    # from the query string.
    return a + b


@app.post("/my/deeply/{arg1}/nested/endpoint/{arg2}")
async def my_deeply_nested_async_endpoint(arg1: str, arg2: int, arg3: float):
    # arg1/arg2 from the path, arg3 from the query; echoes all three back
    # to verify multi-segment path routing survives the wrapping.
    return arg1, arg2, arg3


@app.get("/my/streaming/endpoint")
def my_streaming_endpoint():
    # Bare generator endpoint. NOTE(review): presumably here to test
    # streaming through the server-module wrapper — plain FastAPI would not
    # stream a bare generator without StreamingResponse; confirm the wrapper
    # handles this case.
    for i in range(10):
        yield i


@app.get("/my/endpoint/with/optional/body/params/and/header")
def my_endpoint_with_optional_body_params_and_header(
    a: int = fastapi.Body(None),
    b: int = fastapi.Body(None),
    c: int = fastapi.Header(None),
):
    # All three are optional (default None): a/b from the JSON body on a GET,
    # c from a request header — exercises unusual parameter sources.
    return a, b, c


if __name__ == "__main__":
    import uvicorn

    # Run standalone on port 8000 for local manual testing.
    uvicorn.run(app, port=8000)
import runhouse as rh


def sd_generate(
    prompt,
    num_images=1,
    steps=100,
    guidance_scale=7.5,
    model_id="stabilityai/stable-diffusion-2-base",
):
    """Generate `num_images` Stable Diffusion images for `prompt` on CUDA.

    Imports are deferred into the body so the function can be serialized and
    shipped to a GPU cluster where torch/diffusers are installed, without
    requiring them locally.
    """
    import torch
    from diffusers import StableDiffusionPipeline

    # NOTE(review): `revision="fp16"` is the legacy way to select fp16
    # weights; newer diffusers releases prefer `variant="fp16"` — confirm
    # the pinned diffusers version before changing.
    pipe = StableDiffusionPipeline.from_pretrained(
        model_id, torch_dtype=torch.float16, revision="fp16"
    ).to("cuda")
    return pipe(
        [prompt] * num_images, num_inference_steps=steps, guidance_scale=guidance_scale
    ).images


def test_sd_generate(a10g_gpu_cluster):
    """Deploy sd_generate to an A10G GPU cluster and check images come back."""
    generate_gpu = rh.function(fn=sd_generate).to(
        a10g_gpu_cluster, reqs=["pytest", "diffusers", "torch", "transformers"]
    )

    images = generate_gpu(
        prompt="A hot dog made of matcha powder.", num_images=4, steps=50
    )
    # Truthiness check: a non-empty list of PIL images.
    assert images