├── .buildkite ├── generate_pipeline.py └── test_buildkite_pipeline_generation.py ├── .dockerignore ├── .github ├── issue_template.md ├── pull_request_template.md └── workflows │ ├── dashboard.yml │ ├── docker-build.yaml │ ├── extract-buildkite.yml │ ├── format.yml │ ├── fusermount-server-image.yaml │ ├── go-reviewable.yaml │ ├── helm-docker-release.yaml │ ├── mypy.yml │ ├── nightly-build.yml │ ├── publish-and-validate.yml │ ├── publish-helm.yml │ ├── pylint.yml │ ├── pytest-generic.yml │ ├── pytest.yml │ ├── release-build.yml │ ├── release-publish.yml │ ├── smoke-tests-trigger.yaml │ ├── stale.yml │ ├── test-doc-build.yml │ └── wait-for-buildkite.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .pylintrc ├── .readthedocs.yml ├── CONTRIBUTING.md ├── Dockerfile ├── Dockerfile_k8s ├── Dockerfile_k8s_gpu ├── LICENSE ├── MANIFEST.in ├── README.md ├── addons └── fuse-proxy │ ├── .gitignore │ ├── Dockerfile │ ├── Makefile │ ├── README.md │ ├── cmd │ ├── fusermount-server │ │ └── main.go │ ├── fusermount-shim │ │ └── main.go │ └── fusermount-wrapper │ │ └── main.go │ ├── go.mod │ ├── go.sum │ └── pkg │ ├── client │ └── client.go │ ├── common │ └── common.go │ └── server │ └── server.go ├── charts └── skypilot │ ├── .gitignore │ ├── .helmignore │ ├── Chart.yaml │ ├── developer.md │ ├── templates │ ├── NOTES.txt │ ├── _helpers.tpl │ ├── api-configmap.yaml │ ├── api-deployment.yaml │ ├── api-secrets.yaml │ ├── api-service.yaml │ ├── auth.yaml │ ├── ingress-nodeport.yaml │ ├── ingress.yaml │ ├── oauth2-proxy-deployment.yaml │ ├── oauth2-proxy-ingress.yaml │ ├── oauth2-proxy-redis.yaml │ ├── oauth2-proxy-service.yaml │ ├── pvc.yaml │ ├── rbac.yaml │ └── system-rbac.yaml │ └── values.yaml ├── docs ├── .nojekyll ├── Makefile ├── README.md ├── build.sh ├── make.bat ├── repo-images │ ├── README │ ├── managed-job-schedule-state-diagram.png │ └── managed-job-status-diagram.png ├── requirements-docs.txt └── source │ ├── .gitignore │ ├── _static │ ├── SkyPilot_wide_dark.svg │ ├── 
SkyPilot_wide_light.svg │ ├── custom.css │ ├── custom.js │ ├── favicon.ico │ ├── intro.gif │ ├── intro.mp4 │ └── rtd-data.js │ ├── _templates │ ├── author.html │ ├── header.html │ ├── layout.html │ ├── main-sidebar-home.html │ ├── main-sidebar.html │ └── navbar-skypilot-logo.html │ ├── admin │ └── workspaces.rst │ ├── cloud-setup │ ├── cloud-permissions │ │ ├── aws.rst │ │ ├── gcp.rst │ │ ├── index.rst │ │ ├── kubernetes.rst │ │ ├── nebius.rst │ │ └── vsphere.rst │ ├── policy.rst │ └── quota.rst │ ├── compute │ ├── cloud-vm.rst │ ├── gpus.rst │ ├── show-gpus-all.txt │ └── show-gpus-h100-8.txt │ ├── conf.py │ ├── developers │ ├── CONTRIBUTING.md │ └── index.rst │ ├── docs │ └── index.rst │ ├── examples │ ├── applications │ │ ├── batch_inference.md │ │ ├── index.rst │ │ ├── localgpt.md │ │ ├── rag.md │ │ ├── stable_diffusion.md │ │ ├── tabby.md │ │ └── vector_database.md │ ├── auto-failover.rst │ ├── docker-containers.rst │ ├── frameworks │ │ ├── airflow.md │ │ ├── dvc.md │ │ ├── index.rst │ │ ├── jupyter.md │ │ └── mpi.md │ ├── index.rst │ ├── interactive-development.rst │ ├── managed-jobs.rst │ ├── models │ │ ├── codellama.md │ │ ├── dbrx.md │ │ ├── deepseek-janus.md │ │ ├── deepseek-r1-distilled.md │ │ ├── deepseek-r1.md │ │ ├── gemma.md │ │ ├── gemma3.md │ │ ├── gpt-2.md │ │ ├── index.rst │ │ ├── llama-2.md │ │ ├── llama-3.md │ │ ├── llama-3_1.md │ │ ├── llama-3_2.md │ │ ├── llama-4.md │ │ ├── mixtral.md │ │ ├── pixtral.md │ │ ├── qwen.md │ │ ├── vicuna.md │ │ └── yi.md │ ├── performance │ │ ├── aws_efa.md │ │ ├── gcp_gpu_direct_tcpx.md │ │ ├── index.rst │ │ └── nebius_infiniband.md │ ├── ports.rst │ ├── serving │ │ ├── cog.md │ │ ├── index.rst │ │ ├── lorax.md │ │ ├── ollama.md │ │ ├── sglang.md │ │ ├── tgi.md │ │ └── vllm.md │ ├── syncing-code-artifacts.rst │ └── training │ │ ├── axolotl.md │ │ ├── deepspeed.md │ │ ├── distributed-pytorch.md │ │ ├── distributed-tensorflow.md │ │ ├── index.rst │ │ ├── llama-2-finetuning.md │ │ ├── llama-3_1-finetuning.md │ │ ├── 
nemo.md │ │ ├── ray.md │ │ ├── tpu.md │ │ └── unsloth.md │ ├── extension │ └── linting.py │ ├── gallery │ ├── applications │ │ ├── localgpt.rst │ │ ├── rag.rst │ │ ├── tabby.rst │ │ └── vector_database.rst │ ├── frameworks │ │ ├── lorax.rst │ │ ├── ollama.rst │ │ ├── sglang.rst │ │ ├── tgi.rst │ │ └── vllm.rst │ ├── index.rst │ └── llms │ │ ├── codellama.rst │ │ ├── dbrx.rst │ │ ├── deepseek-janus.rst │ │ ├── deepseek-r1-distilled.rst │ │ ├── deepseek-r1.rst │ │ ├── gemma.rst │ │ ├── gpt-2.rst │ │ ├── index.rst │ │ ├── llama-2.rst │ │ ├── llama-3.rst │ │ ├── llama-3_1.rst │ │ ├── llama-3_2.rst │ │ ├── mixtral.rst │ │ ├── pixtral.rst │ │ ├── qwen.rst │ │ ├── vicuna.rst │ │ └── yi.rst │ ├── generate_examples.py │ ├── getting-started │ ├── installation.rst │ ├── quickstart.rst │ └── tutorial.rst │ ├── images │ ├── SkyPilot-logo-wide.png │ ├── ai-gallery-cover.png │ ├── client-server │ │ ├── arch.png │ │ ├── auth-proxy-internals.svg │ │ ├── auth-proxy-user-flow.svg │ │ ├── cluster-users.png │ │ ├── executor.png │ │ ├── high-level-arch.png │ │ ├── local.png │ │ ├── okta-setup.png │ │ ├── okta.png │ │ ├── remote.png │ │ └── token-page.png │ ├── cloud-logos-dark.png │ ├── cloud-logos-light.png │ ├── config-cheatsheet-dark.svg │ ├── config-cheatsheet-light.svg │ ├── dashboard-clusters.png │ ├── dashboard-managed-jobs.png │ ├── gcp-vm.png │ ├── jupyter-auth.png │ ├── jupyter-covid.png │ ├── jupyter-create.png │ ├── jupyter-gpu.png │ ├── k8s-pod.png │ ├── k8s-skypilot-architecture-dark.png │ ├── k8s-skypilot-architecture-light.png │ ├── managed-jobs-arch.png │ ├── managed-jobs-dashboard.png │ ├── multi-kubernetes.svg │ ├── screenshots │ │ ├── aws │ │ │ ├── aws-add-policy.png │ │ │ ├── aws-add-role-entity.png │ │ │ ├── aws-add-role.png │ │ │ ├── aws-add-user.png │ │ │ ├── aws-create-access-key.png │ │ │ └── aws-create-policy.png │ │ ├── gcp │ │ │ ├── cloud-nat.png │ │ │ ├── create-iam.png │ │ │ ├── create-role.png │ │ │ ├── create-service-account.png │ │ │ ├── 
service-account-grant-role.png │ │ │ └── service-account-name.png │ │ ├── kubernetes │ │ │ └── kubernetes-dashboard.png │ │ ├── nebius │ │ │ └── nebius-k8s-attach-fs.png │ │ └── vsphere │ │ │ ├── content-lib-item-tag-adding.png │ │ │ ├── content-lib-item.png │ │ │ ├── content-lib-local.png │ │ │ ├── content-lib-name.png │ │ │ ├── content-lib-security-policy.png │ │ │ ├── content-lib-storage.png │ │ │ ├── content-libs-navigate.png │ │ │ ├── content-libs.png │ │ │ ├── vm-clone-to-template-cl.png │ │ │ ├── vm-clone-to-template-ovf.png │ │ │ ├── vm-clone-to-template.png │ │ │ ├── vsphere-catagory-create.png │ │ │ ├── vsphere-catagory-create_navigate.png │ │ │ ├── vsphere-catagory-create_navigate_new.png │ │ │ ├── vsphere-datastore-tag-adding.png │ │ │ ├── vsphere-tags-create.png │ │ │ ├── vsphere-tags-create_navigate.png │ │ │ ├── vsphere-vm-storage-policy-inventory.png │ │ │ ├── vsphere-vm-storage-policy-name.png │ │ │ ├── vsphere-vm-storage-policy-navigate-new.png │ │ │ ├── vsphere-vm-storage-policy-navigate.png │ │ │ ├── vsphere-vm-storage-policy-review.png │ │ │ ├── vsphere-vm-storage-policy-rule.png │ │ │ └── vsphere-vm-storage-policy-tag.png │ ├── sky-above-clouds-gen.jpg │ ├── sky-existing-infra-workflow-dark.png │ ├── sky-existing-infra-workflow-light.png │ ├── sky-serve-architecture.png │ ├── sky-serve-status-full.png │ ├── sky-serve-status-output-provisioning.png │ ├── sky-serve-status-tgi.png │ ├── sky-serve-status-vicuna-ready.png │ ├── sky-serve-status-vllm.png │ ├── sky-storage-modes.svg │ ├── skypilot-abstractions-long-2.png │ ├── skypilot-wide-dark-1k.png │ ├── skypilot-wide-light-1k.png │ ├── ssh-node-pools │ │ ├── infra.png │ │ └── pool-details.png │ └── workspaces │ │ ├── config.png │ │ ├── edit.png │ │ ├── overview.png │ │ └── resources.png │ ├── index.rst │ ├── overview.rst │ ├── reference │ ├── api-server │ │ ├── api-server-admin-deploy.rst │ │ ├── api-server-troubleshooting.rst │ │ ├── api-server-tunning.rst │ │ ├── api-server-upgrade.rst │ │ ├── 
api-server.rst │ │ ├── examples │ │ │ ├── api-server-auth-proxy.rst │ │ │ ├── api-server-persistence.rst │ │ │ └── example-deploy-gke-nebius-okta.rst │ │ └── helm-values-spec.rst │ ├── api.rst │ ├── async.rst │ ├── auto-stop.rst │ ├── cli.rst │ ├── comparison.rst │ ├── config-sources.rst │ ├── config.rst │ ├── faq.rst │ ├── job-queue.rst │ ├── kubernetes │ │ ├── examples │ │ │ └── index.rst │ │ ├── index.rst │ │ ├── kubernetes-deployment.rst │ │ ├── kubernetes-getting-started.rst │ │ ├── kubernetes-ports.rst │ │ ├── kubernetes-priorities.rst │ │ ├── kubernetes-setup.rst │ │ ├── kubernetes-troubleshooting.rst │ │ ├── multi-kubernetes.rst │ │ └── skypilot-and-vanilla-k8s.rst │ ├── logging.rst │ ├── storage.rst │ ├── tpu.rst │ ├── training-guide.rst │ ├── volumes.rst │ └── yaml-spec.rst │ ├── reservations │ ├── existing-machines.rst │ └── reservations.rst │ ├── running-jobs │ ├── distributed-jobs.rst │ ├── environment-variables.rst │ └── many-jobs.rst │ ├── serving │ ├── auth.rst │ ├── autoscaling.rst │ ├── https.rst │ ├── sky-serve.rst │ ├── spot-policy.rst │ ├── update.rst │ └── user-guides.rst │ └── sky-computing.rst ├── examples ├── README.md ├── admin_policy │ ├── add_labels.yaml │ ├── disable_public_ip.yaml │ ├── dynamic_kubernetes_contexts_update.yaml │ ├── enforce_autostop.yaml │ ├── example_policy │ │ ├── example_policy │ │ │ ├── __init__.py │ │ │ └── skypilot_policy.py │ │ └── pyproject.toml │ ├── reject_all.yaml │ ├── task.yaml │ └── use_spot_for_gpu.yaml ├── airflow │ ├── README.md │ ├── data_preprocessing.yaml │ ├── eval.yaml │ ├── sky_train_dag.py │ └── train.yaml ├── api-deploy-gke-nebius-okta │ └── README.md ├── autogluon.yaml ├── aws-neuron │ ├── README.md │ ├── inferentia.yaml │ └── multi-accelerator.yaml ├── aws_efa │ ├── README.md │ └── nccl_efa.yaml ├── azure_start_stop.yaml ├── benchmark │ ├── keras_asr.yaml │ ├── keras_asr │ │ └── callback.patch │ ├── lightning_gan.yaml │ ├── lightning_gan │ │ └── callback.patch │ ├── timm.yaml │ ├── timm │ │ 
├── callback.patch │ │ └── dummy_dataset.patch │ ├── transformers_qa.yaml │ └── transformers_qa │ │ └── callback.patch ├── cog │ ├── README.md │ ├── cog.yaml │ ├── predict.py │ └── sky.yaml ├── containerized_app.py ├── custom_image.yaml ├── deepspeed-multinode │ ├── README.md │ └── sky.yaml ├── detectron2_app.yaml ├── detectron2_docker.yaml ├── disk_size.yaml ├── distributed-pytorch │ ├── README.md │ ├── train-rdzv.yaml │ └── train.yaml ├── distributed_ray_train │ ├── README.md │ ├── ray_train.yaml │ └── train.py ├── docker │ ├── compose │ │ ├── compose_example.yaml │ │ └── docker-compose.yml │ ├── detectron2 │ │ ├── Dockerfile │ │ └── deploy.Dockerfile │ ├── echo_app.py │ ├── echo_app.yaml │ └── echo_app │ │ ├── Dockerfile │ │ ├── README.md │ │ └── echo.py ├── dvc │ ├── README.md │ └── dvc_pipeline.yaml ├── env_check.yaml ├── example_app.py ├── gcp_gpu_direct_tcpx │ ├── README.md │ ├── gpu_direct_tcpx.yaml │ └── tcpx_sglang_serving.yaml ├── gcp_start_stop.yaml ├── horovod_distributed_tf_app.py ├── http_server_with_custom_ports │ ├── server.py │ └── task.yaml ├── huggingface_glue_imdb_app.py ├── huggingface_glue_imdb_app.yaml ├── huggingface_glue_imdb_grid_search_app.py ├── hyperpod-eks │ ├── README.md │ └── train.yaml ├── image_with_tag.yaml ├── job_queue │ ├── cluster.yaml │ ├── cluster_docker.yaml │ ├── cluster_multinode.yaml │ ├── job.yaml │ ├── job_docker.yaml │ ├── job_gpu.yaml │ ├── job_ibm.yaml │ ├── job_multinode.yaml │ └── job_multinode_ibm.yaml ├── jupyter_app.py ├── jupyter_lab.yaml ├── k8s_cloud_deploy │ ├── README.md │ ├── cloud_k8s.yaml │ └── launch_k8s.sh ├── managed_job.yaml ├── managed_job_with_storage.yaml ├── managed_spot.yaml ├── many_gpu_vms.yaml ├── minimal.yaml ├── mpirun.yaml ├── multi_accelerators.yaml ├── multi_echo.py ├── multi_hostname.py ├── multi_hostname.yaml ├── multi_resources.yaml ├── nccl_test.yaml ├── nebius_infiniband │ ├── README.md │ ├── infiniband.yaml │ ├── nccl.yaml │ ├── nccl_network_tier.yaml │ ├── nccl_no_ib.yaml │ └── 
nccl_vm_ib.yaml ├── nemo │ ├── README.md │ ├── nemo_bert.yaml │ ├── nemo_gpt_distributed.yaml │ ├── nemo_gpt_preprocessing.yaml │ └── nemo_gpt_singlenode.yaml ├── oci │ ├── config.yaml │ ├── dataset-mount.yaml │ ├── dataset-upload-and-mount.yaml │ ├── gpu-oraclelinux9.yaml │ ├── gpu-ubuntu-2204.yaml │ ├── oci-mounts.yaml │ ├── oci_cpu-sky-preemptible.yaml │ ├── oci_cpu-sky.yaml │ ├── oci_gpu-sky.yaml │ ├── serve-http-cpu.yaml │ └── serve-qwen-7b.yaml ├── per_region_images.yaml ├── perf │ ├── results.md │ └── storage_rawperf.yaml ├── playground │ ├── min_fail.yaml │ ├── min_progress_bar.yaml │ ├── storage_playground.py │ └── symlink_playground.yaml ├── ray_tune_app.py ├── ray_tune_app.yaml ├── ray_tune_examples │ ├── tune_basic_example.py │ └── tune_ptl_example.py ├── resnet_app.py ├── resnet_app.yaml ├── resnet_app_storage.py ├── resnet_app_storage.yaml ├── resnet_app_storage_spot.yaml ├── resnet_distributed_tf_app.py ├── resnet_distributed_torch.yaml ├── resnet_distributed_torch_app.py ├── resnet_distributed_torch_scripts │ ├── run.sh │ └── setup.sh ├── resnet_distributed_torch_with_script.yaml ├── resnet_inference_app.yaml ├── sample_dotenv ├── serve │ ├── gorilla │ │ ├── gorilla.yaml │ │ └── use_gorilla.ipynb │ ├── http_server │ │ ├── server.py │ │ └── task.yaml │ ├── https │ │ └── service.yaml │ ├── huggingface-tgi.yaml │ ├── llama2 │ │ ├── chat.py │ │ └── llama2.yaml │ ├── load_balancing_policies_example.yaml │ ├── min_replicas_zero.yaml │ ├── minimal.yaml │ ├── misc │ │ └── cancel │ │ │ ├── README.md │ │ │ ├── send_cancel_request.py │ │ │ ├── server.py │ │ │ └── service.yaml │ ├── ray_serve │ │ ├── ray_serve.yaml │ │ └── serve.py │ ├── spot_policy │ │ ├── base_on_demand_fallback_replicas.yaml │ │ ├── dynamic_on_demand_fallback.yaml │ │ └── multi_accelerators.yaml │ ├── stable_diffusion_service.yaml │ ├── vicuna-v1.5.yaml │ └── vllm.yaml ├── show_gpus.sh ├── spot │ ├── bert_qa.yaml │ ├── lightning_cifar10.yaml │ ├── lightning_cifar10 │ │ ├── requirements.txt │ 
│ └── train.py │ ├── resnet.yaml │ └── resnet_ddp │ │ ├── README.md │ │ ├── requirements.txt │ │ └── resnet_ddp.py ├── spot_pipeline │ ├── bert_qa_train_eval.yaml │ ├── multi_jobs.yaml │ └── single.yaml ├── stable_diffusion │ ├── Dockerfile │ ├── README.md │ ├── docker-compose.yml │ ├── pushing_docker_image.md │ └── stable_diffusion_docker.yaml ├── storage │ ├── checkpointed_training.yaml │ ├── hostname_echo_demo.yaml │ └── pingpong.yaml ├── storage_demo.yaml ├── tensorboard_app.py ├── tensorflow_distributed │ ├── README.md │ ├── tf_distributed.yaml │ └── train.py ├── time_estimators.py ├── timm_app.py ├── torch_ddp_benchmark │ ├── torch_ddp_benchmark.py │ └── torch_ddp_benchmark.yaml ├── tpu │ ├── README.md │ ├── tpu_app.py │ ├── tpu_app.yaml │ ├── tpu_app_code │ │ ├── requirements.txt │ │ └── run_tpu.py │ ├── tpu_node_mnist.yaml │ ├── tpuvm_mnist.yaml │ └── v6e │ │ ├── README.md │ │ ├── benchmark-llama2-7b.yaml │ │ ├── config-8B.json │ │ ├── fsdp_config.json │ │ ├── serve-llama2-7b.yaml │ │ └── train-llama3-8b.yaml ├── unsloth │ ├── README.md │ ├── unsloth.yaml │ └── unsloth_example.py ├── using_file_mounts.yaml ├── using_file_mounts_with_env_vars.yaml └── vector_database │ ├── README.md │ ├── batch_compute_vectors.py │ ├── build_vectordb.yaml │ ├── compute_vectors.yaml │ ├── scripts │ ├── build_vectordb.py │ ├── compute_vectors.py │ └── serve_vectordb.py │ └── serve_vectordb.yaml ├── format.sh ├── llm ├── axolotl │ ├── axolotl-docker.yaml │ ├── axolotl-spot.yaml │ ├── axolotl.yaml │ ├── mistral │ │ ├── qlora-checkpoint.yaml │ │ └── qlora.yaml │ └── readme.md ├── batch_inference │ ├── README.md │ ├── batch_compute_vectors.py │ ├── compute_text_vectors.yaml │ ├── monitor_progress.yaml │ └── scripts │ │ ├── base_vector_processor.py │ │ ├── monitor_progress.py │ │ └── text_vector_processor.py ├── codellama │ ├── README.md │ ├── complete.py │ ├── endpoint.yaml │ ├── gui.yaml │ └── tabby.yaml ├── dbrx │ ├── README.md │ └── dbrx.yaml ├── deepseek-janus │ ├── README.md 
│ ├── janus_1.5b.yaml │ └── januspro_7b.yaml ├── deepseek-r1-distilled │ ├── README.md │ └── deepseek-r1-vllm.yaml ├── deepseek-r1 │ ├── README.md │ ├── deepseek-r1-671B-A100.yaml │ └── deepseek-r1-671B.yaml ├── falcon │ ├── README.md │ ├── falcon.yaml │ └── train.py ├── gemma │ ├── README.md │ └── serve.yaml ├── gemma3 │ ├── README.md │ └── gemma3.yaml ├── gpt-2 │ ├── README.md │ ├── gpt2-data.yaml │ ├── gpt2-pipeline.yaml │ ├── gpt2-train.yaml │ └── gpt2.yaml ├── llama-2 │ ├── README.md │ ├── chatbot-hf.yaml │ └── chatbot-meta.yaml ├── llama-3 │ ├── README.md │ ├── gui.yaml │ └── llama3.yaml ├── llama-3_1-finetuning │ ├── configs │ │ ├── 70B-lora.yaml │ │ └── 8B-lora.yaml │ ├── lora.yaml │ ├── readme.md │ └── serve.yaml ├── llama-3_1 │ ├── README.md │ └── llama-3_1.yaml ├── llama-3_2 │ ├── README.md │ ├── llama3_2-vision-11b.yaml │ └── llama3_2.yaml ├── llama-4 │ ├── README.md │ └── llama4.yaml ├── llama-chatbots │ ├── README.md │ ├── llama-13b-upload.yaml │ ├── llama-13b.yaml │ ├── llama-30b-upload.yaml │ ├── llama-30b.yaml │ ├── llama-65b-upload.yaml │ ├── llama-65b.yaml │ ├── llama-7b-upload.yaml │ └── llama-7b.yaml ├── localgpt │ ├── README.md │ └── localgpt.yaml ├── lorax │ ├── README.md │ └── lorax.yaml ├── mixtral │ ├── README.md │ └── serve.yaml ├── ollama │ ├── README.md │ └── ollama.yaml ├── pixtral │ ├── README.md │ └── pixtral.yaml ├── qwen │ ├── README.md │ ├── gui.yaml │ ├── qwen15-110b.yaml │ ├── qwen2-vl-7b.yaml │ ├── qwen25-72b.yaml │ ├── qwen25-7b.yaml │ └── qwen3-235b.yaml ├── rag │ ├── README.md │ ├── batch_compute_embeddings.py │ ├── build_rag.yaml │ ├── compute_embeddings.yaml │ ├── scripts │ │ ├── build_rag.py │ │ ├── compute_embeddings.py │ │ ├── serve_rag.py │ │ └── templates │ │ │ └── index.html │ └── serve_rag.yaml ├── sglang │ ├── README.md │ ├── llama2.yaml │ └── llava.yaml ├── tabby │ ├── README.md │ ├── docker-compose.cuda.yaml │ ├── docker-compose.yaml │ ├── tabby.yaml │ └── tabby │ │ └── config.toml ├── tgi │ ├── README.md │ └── 
serve.yaml ├── vicuna-llama-2 │ ├── README.md │ ├── scripts │ │ ├── flash_attn_patch.py │ │ ├── hardcoded_questions.py │ │ ├── train.py │ │ ├── train_flash_attn.py │ │ ├── train_xformers.py │ │ └── xformers_patch.py │ ├── serve.yaml │ └── train.yaml ├── vicuna │ ├── LICENSE │ ├── README.md │ ├── dummy.json │ ├── scripts │ │ └── sync_local_checkpoint.sh │ ├── serve-openai-api-endpoint.yaml │ ├── serve.yaml │ └── train.yaml ├── vllm │ ├── README.md │ ├── serve-openai-api-docker.yaml │ ├── serve-openai-api.yaml │ ├── serve.yaml │ ├── service-with-auth.yaml │ └── service.yaml └── yi │ ├── README.md │ ├── yi15-34b.yaml │ ├── yi15-6b.yaml │ ├── yi15-9b.yaml │ ├── yicoder-1_5b.yaml │ └── yicoder-9b.yaml ├── pyproject.toml ├── requirements-dev.txt ├── setup.py ├── sky ├── __init__.py ├── adaptors │ ├── README.md │ ├── __init__.py │ ├── aws.py │ ├── azure.py │ ├── cloudflare.py │ ├── common.py │ ├── cudo.py │ ├── do.py │ ├── docker.py │ ├── gcp.py │ ├── ibm.py │ ├── kubernetes.py │ ├── nebius.py │ ├── oci.py │ ├── runpod.py │ ├── vast.py │ └── vsphere.py ├── admin_policy.py ├── authentication.py ├── backends │ ├── __init__.py │ ├── backend.py │ ├── backend_utils.py │ ├── cloud_vm_ray_backend.py │ ├── docker_utils.py │ ├── local_docker_backend.py │ ├── monkey_patches │ │ └── monkey_patch_ray_up.py │ ├── playground │ │ └── demo_dockerutils.py │ └── wheel_utils.py ├── benchmark │ ├── __init__.py │ ├── benchmark_state.py │ └── benchmark_utils.py ├── callbacks │ ├── setup.py │ └── sky_callback │ │ ├── __init__.py │ │ ├── api.py │ │ ├── base.py │ │ ├── integrations │ │ ├── __init__.py │ │ ├── keras.py │ │ ├── pytorch_lightning.py │ │ └── transformers.py │ │ └── utils.py ├── check.py ├── cli.py ├── client │ ├── __init__.py │ ├── cli.py │ ├── common.py │ ├── oauth.py │ └── sdk.py ├── cloud_stores.py ├── clouds │ ├── __init__.py │ ├── aws.py │ ├── azure.py │ ├── cloud.py │ ├── cudo.py │ ├── do.py │ ├── fluidstack.py │ ├── gcp.py │ ├── ibm.py │ ├── kubernetes.py │ ├── lambda_cloud.py 
│ ├── nebius.py │ ├── oci.py │ ├── paperspace.py │ ├── runpod.py │ ├── scp.py │ ├── service_catalog │ │ ├── README.md │ │ ├── __init__.py │ │ ├── aws_catalog.py │ │ ├── azure_catalog.py │ │ ├── common.py │ │ ├── config.py │ │ ├── constants.py │ │ ├── cudo_catalog.py │ │ ├── data_fetchers │ │ │ ├── __init__.py │ │ │ ├── analyze.py │ │ │ ├── fetch_aws.py │ │ │ ├── fetch_azure.py │ │ │ ├── fetch_cudo.py │ │ │ ├── fetch_fluidstack.py │ │ │ ├── fetch_gcp.py │ │ │ ├── fetch_ibm.py │ │ │ ├── fetch_lambda_cloud.py │ │ │ ├── fetch_vast.py │ │ │ ├── fetch_vsphere.py │ │ │ └── requirements.txt │ │ ├── do_catalog.py │ │ ├── fluidstack_catalog.py │ │ ├── gcp_catalog.py │ │ ├── ibm_catalog.py │ │ ├── images │ │ │ ├── README.md │ │ │ ├── aws_utils │ │ │ │ ├── image_delete.py │ │ │ │ └── image_gen.py │ │ │ ├── plugins.pkr.hcl │ │ │ ├── provisioners │ │ │ │ ├── cuda-azure-grid.sh │ │ │ │ ├── cuda.sh │ │ │ │ ├── docker.sh │ │ │ │ ├── nvidia-container-toolkit.sh │ │ │ │ ├── skypilot.sh │ │ │ │ └── user-toolkit.sh │ │ │ ├── skypilot-aws-cpu-ubuntu.pkr.hcl │ │ │ ├── skypilot-aws-gpu-ubuntu.pkr.hcl │ │ │ ├── skypilot-azure-cpu-ubuntu.pkr.hcl │ │ │ ├── skypilot-azure-gpu-ubuntu.pkr.hcl │ │ │ ├── skypilot-gcp-cpu-ubuntu.pkr.hcl │ │ │ ├── skypilot-gcp-gpu-ubuntu.pkr.hcl │ │ │ └── skypilot-k8s-image.sh │ │ ├── kubernetes_catalog.py │ │ ├── lambda_catalog.py │ │ ├── nebius_catalog.py │ │ ├── oci_catalog.py │ │ ├── paperspace_catalog.py │ │ ├── runpod_catalog.py │ │ ├── scp_catalog.py │ │ ├── ssh_catalog.py │ │ ├── vast_catalog.py │ │ └── vsphere_catalog.py │ ├── ssh.py │ ├── utils │ │ ├── README.md │ │ ├── __init__.py │ │ ├── aws_utils.py │ │ ├── azure_utils.py │ │ ├── gcp_utils.py │ │ ├── oci_utils.py │ │ └── scp_utils.py │ ├── vast.py │ └── vsphere.py ├── core.py ├── dag.py ├── dashboard │ ├── .eslintrc.json │ ├── .gitignore │ ├── .prettierrc │ ├── README.md │ ├── components.json │ ├── eslint.config.mjs │ ├── jest.config.js │ ├── jest.setup.js │ ├── jsconfig.json │ ├── next.config.mjs │ 
├── package-lock.json │ ├── package.json │ ├── postcss.config.mjs │ ├── public │ │ ├── favicon.ico │ │ ├── skypilot.svg │ │ └── videos │ │ │ └── cursor-small.mp4 │ ├── server.js │ ├── src │ │ ├── app │ │ │ └── globals.css │ │ ├── components │ │ │ ├── clusters.jsx │ │ │ ├── elements │ │ │ │ ├── ErrorDisplay.jsx │ │ │ │ ├── StatusBadge.jsx │ │ │ │ ├── events.jsx │ │ │ │ ├── icons.jsx │ │ │ │ ├── layout.jsx │ │ │ │ ├── modals.jsx │ │ │ │ ├── sidebar.jsx │ │ │ │ └── version-display.jsx │ │ │ ├── infra.jsx │ │ │ ├── jobs.jsx │ │ │ ├── ui │ │ │ │ ├── alert.jsx │ │ │ │ ├── avatar.jsx │ │ │ │ ├── button.jsx │ │ │ │ ├── card.jsx │ │ │ │ ├── dialog.jsx │ │ │ │ ├── flip_card.jsx │ │ │ │ ├── input.jsx │ │ │ │ ├── label.jsx │ │ │ │ ├── select.jsx │ │ │ │ ├── table.jsx │ │ │ │ ├── tabs.jsx │ │ │ │ └── textarea.jsx │ │ │ ├── users.jsx │ │ │ ├── utils.jsx │ │ │ ├── workspace-editor.jsx │ │ │ └── workspaces.jsx │ │ ├── data │ │ │ ├── connectors │ │ │ │ ├── clusters.jsx │ │ │ │ ├── constants.jsx │ │ │ │ ├── infra.jsx │ │ │ │ ├── jobs.jsx │ │ │ │ ├── toast.jsx │ │ │ │ ├── users.js │ │ │ │ └── workspaces.jsx │ │ │ └── utils.jsx │ │ ├── hooks │ │ │ └── useMobile.js │ │ ├── lib │ │ │ ├── README.md │ │ │ ├── cache-preloader.js │ │ │ ├── cache.js │ │ │ ├── config.js │ │ │ └── utils.js │ │ └── pages │ │ │ ├── _app.js │ │ │ ├── clusters.js │ │ │ ├── clusters │ │ │ ├── [cluster].js │ │ │ └── [cluster] │ │ │ │ └── [job].js │ │ │ ├── config.js │ │ │ ├── index.js │ │ │ ├── infra.js │ │ │ ├── infra │ │ │ └── [context].js │ │ │ ├── jobs.js │ │ │ ├── jobs │ │ │ └── [job].js │ │ │ ├── users.js │ │ │ ├── workspace │ │ │ └── new.js │ │ │ ├── workspaces.js │ │ │ └── workspaces │ │ │ └── [name].js │ └── tailwind.config.js ├── data │ ├── __init__.py │ ├── data_transfer.py │ ├── data_utils.py │ ├── mounting_utils.py │ ├── storage.py │ └── storage_utils.py ├── design_docs │ ├── client_server.md │ ├── cluster_name.md │ ├── cluster_status.md │ ├── figures │ │ ├── cluster-state-transition.svg │ │ └── 
grafana-loki-setup.png │ ├── usage_collection.md │ └── workspaces.md ├── exceptions.py ├── execution.py ├── global_user_state.py ├── jobs │ ├── README.md │ ├── __init__.py │ ├── client │ │ ├── __init__.py │ │ └── sdk.py │ ├── constants.py │ ├── controller.py │ ├── dashboard │ │ ├── dashboard.py │ │ ├── static │ │ │ └── favicon.ico │ │ └── templates │ │ │ └── index.html │ ├── recovery_strategy.py │ ├── scheduler.py │ ├── server │ │ ├── __init__.py │ │ ├── core.py │ │ ├── dashboard_utils.py │ │ └── server.py │ ├── state.py │ └── utils.py ├── models.py ├── optimizer.py ├── provision │ ├── __init__.py │ ├── aws │ │ ├── __init__.py │ │ ├── config.py │ │ ├── instance.py │ │ └── utils.py │ ├── azure │ │ ├── __init__.py │ │ ├── azure-config-template.json │ │ ├── config.py │ │ └── instance.py │ ├── common.py │ ├── constants.py │ ├── cudo │ │ ├── __init__.py │ │ ├── config.py │ │ ├── cudo_machine_type.py │ │ ├── cudo_utils.py │ │ ├── cudo_wrapper.py │ │ └── instance.py │ ├── do │ │ ├── __init__.py │ │ ├── config.py │ │ ├── constants.py │ │ ├── instance.py │ │ └── utils.py │ ├── docker_utils.py │ ├── fluidstack │ │ ├── __init__.py │ │ ├── config.py │ │ ├── fluidstack_utils.py │ │ └── instance.py │ ├── gcp │ │ ├── __init__.py │ │ ├── config.py │ │ ├── constants.py │ │ ├── instance.py │ │ ├── instance_utils.py │ │ ├── mig_utils.py │ │ └── volume_utils.py │ ├── instance_setup.py │ ├── kubernetes │ │ ├── __init__.py │ │ ├── config.py │ │ ├── constants.py │ │ ├── instance.py │ │ ├── manifests │ │ │ └── fusermount-server-daemonset.yaml │ │ ├── network.py │ │ ├── network_utils.py │ │ └── utils.py │ ├── lambda_cloud │ │ ├── __init__.py │ │ ├── config.py │ │ ├── instance.py │ │ └── lambda_utils.py │ ├── logging.py │ ├── metadata_utils.py │ ├── nebius │ │ ├── __init__.py │ │ ├── config.py │ │ ├── instance.py │ │ └── utils.py │ ├── oci │ │ ├── __init__.py │ │ ├── config.py │ │ ├── instance.py │ │ └── query_utils.py │ ├── paperspace │ │ ├── __init__.py │ │ ├── config.py │ │ ├── 
constants.py │ │ ├── instance.py │ │ └── utils.py │ ├── provisioner.py │ ├── runpod │ │ ├── __init__.py │ │ ├── api │ │ │ ├── __init__.py │ │ │ ├── commands.py │ │ │ └── pods.py │ │ ├── config.py │ │ ├── instance.py │ │ └── utils.py │ ├── ssh │ │ └── __init__.py │ ├── vast │ │ ├── __init__.py │ │ ├── config.py │ │ ├── instance.py │ │ └── utils.py │ └── vsphere │ │ ├── __init__.py │ │ ├── common │ │ ├── __init__.py │ │ ├── cls_api_client.py │ │ ├── cls_api_helper.py │ │ ├── custom_script.py │ │ ├── id_generator.py │ │ ├── metadata_utils.py │ │ ├── service_manager.py │ │ ├── service_manager_factory.py │ │ ├── ssl_helper.py │ │ ├── vapiconnect.py │ │ └── vim_utils.py │ │ ├── config.py │ │ ├── instance.py │ │ └── vsphere_utils.py ├── resources.py ├── serve │ ├── README.md │ ├── __init__.py │ ├── autoscalers.py │ ├── client │ │ ├── __init__.py │ │ └── sdk.py │ ├── constants.py │ ├── controller.py │ ├── load_balancer.py │ ├── load_balancing_policies.py │ ├── replica_managers.py │ ├── serve_state.py │ ├── serve_utils.py │ ├── server │ │ ├── __init__.py │ │ ├── core.py │ │ └── server.py │ ├── service.py │ ├── service_spec.py │ └── spot_placer.py ├── server │ ├── __init__.py │ ├── common.py │ ├── config.py │ ├── constants.py │ ├── html │ │ ├── log.html │ │ └── token_page.html │ ├── requests │ │ ├── __init__.py │ │ ├── event_loop.py │ │ ├── executor.py │ │ ├── payloads.py │ │ ├── preconditions.py │ │ ├── process.py │ │ ├── queues │ │ │ ├── __init__.py │ │ │ ├── local_queue.py │ │ │ └── mp_queue.py │ │ ├── requests.py │ │ └── serializers │ │ │ ├── __init__.py │ │ │ ├── decoders.py │ │ │ └── encoders.py │ ├── server.py │ ├── stream_utils.py │ └── uvicorn.py ├── setup_files │ ├── MANIFEST.in │ ├── dependencies.py │ └── setup.py ├── sky_logging.py ├── skylet │ ├── LICENSE │ ├── README.md │ ├── __init__.py │ ├── attempt_skylet.py │ ├── autostop_lib.py │ ├── configs.py │ ├── constants.py │ ├── events.py │ ├── job_lib.py │ ├── log_lib.py │ ├── log_lib.pyi │ ├── providers │ │ ├── 
__init__.py │ │ ├── command_runner.py │ │ ├── ibm │ │ │ ├── __init__.py │ │ │ ├── node_provider.py │ │ │ ├── utils.py │ │ │ └── vpc_provider.py │ │ └── scp │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ └── node_provider.py │ ├── ray_patches │ │ ├── __init__.py │ │ ├── autoscaler.py.patch │ │ ├── cli.py.patch │ │ ├── command_runner.py.patch │ │ ├── log_monitor.py.patch │ │ ├── resource_demand_scheduler.py.patch │ │ ├── updater.py.patch │ │ └── worker.py.patch │ ├── skylet.py │ └── subprocess_daemon.py ├── skypilot_config.py ├── task.py ├── templates │ ├── aws-ray.yml.j2 │ ├── azure-ray.yml.j2 │ ├── cudo-ray.yml.j2 │ ├── do-ray.yml.j2 │ ├── fluidstack-ray.yml.j2 │ ├── gcp-ray.yml.j2 │ ├── ibm-ray.yml.j2 │ ├── jobs-controller.yaml.j2 │ ├── kubernetes-ingress.yml.j2 │ ├── kubernetes-loadbalancer.yml.j2 │ ├── kubernetes-port-forward-proxy-command.sh │ ├── kubernetes-ray.yml.j2 │ ├── kubernetes-ssh-jump.yml.j2 │ ├── lambda-ray.yml.j2 │ ├── local-ray.yml.j2 │ ├── nebius-ray.yml.j2 │ ├── oci-ray.yml.j2 │ ├── paperspace-ray.yml.j2 │ ├── runpod-ray.yml.j2 │ ├── scp-ray.yml.j2 │ ├── sky-serve-controller.yaml.j2 │ ├── skypilot-server-kubernetes-proxy.sh │ ├── vast-ray.yml.j2 │ ├── vsphere-ray.yml.j2 │ └── websocket_proxy.py ├── usage │ ├── __init__.py │ ├── constants.py │ ├── loki-s3-config.yaml │ └── usage_lib.py ├── utils │ ├── __init__.py │ ├── accelerator_registry.py │ ├── admin_policy_utils.py │ ├── annotations.py │ ├── atomic.py │ ├── aws │ │ ├── __init__.py │ │ └── get_default_security_group.py │ ├── cli_utils │ │ ├── __init__.py │ │ └── status_utils.py │ ├── cluster_utils.py │ ├── command_runner.py │ ├── command_runner.pyi │ ├── common.py │ ├── common_utils.py │ ├── config_utils.py │ ├── context.py │ ├── context_utils.py │ ├── control_master_utils.py │ ├── controller_utils.py │ ├── dag_utils.py │ ├── db_utils.py │ ├── env_options.py │ ├── infra_utils.py │ ├── kubernetes │ │ ├── __init__.py │ │ ├── cleanup-tunnel.sh │ │ ├── config_map_utils.py │ │ ├── 
create_cluster.sh │ │ ├── delete_cluster.sh │ │ ├── deploy_remote_cluster.py │ │ ├── exec_kubeconfig_converter.py │ │ ├── generate_kind_config.py │ │ ├── generate_kubeconfig.sh │ │ ├── gpu_labeler.py │ │ ├── k8s_gpu_labeler_job.yaml │ │ ├── k8s_gpu_labeler_setup.yaml │ │ ├── kubernetes_deploy_utils.py │ │ ├── rsync_helper.sh │ │ ├── ssh-tunnel.sh │ │ └── ssh_jump_lifecycle_manager.py │ ├── kubernetes_enums.py │ ├── log_utils.py │ ├── message_utils.py │ ├── registry.py │ ├── resources_utils.py │ ├── rich_console_utils.py │ ├── rich_utils.py │ ├── schemas.py │ ├── status_lib.py │ ├── subprocess_utils.py │ ├── timeline.py │ ├── ux_utils.py │ └── validator.py └── workspaces │ ├── __init__.py │ ├── core.py │ └── server.py └── tests ├── common_test_fixtures.py ├── conftest.py ├── default_aws_az_mappings.csv ├── default_vsphere_vms.csv ├── git_info_exclude_test ├── gitignore_test ├── kubernetes ├── README.md ├── cpu_test_pod.yaml ├── eks_test_cluster.yaml ├── gpu_test_pod.yaml ├── ingress_test.yaml ├── loadbalancer_test_svc.yaml ├── networking_benchmarks │ ├── k8s_network_benchmarks.md │ ├── rsync_bench.sh │ └── skylaunch_bench.sh └── scripts │ ├── clean_k8s.sh │ ├── dashboard.yaml │ ├── delete.sh │ ├── deploy_k3s.sh │ ├── install_dashboard.sh │ ├── ray_k8s_sky.yaml │ ├── run.sh │ └── skypilot_ssh_k8s_deployment.yaml ├── load_tests ├── README.md ├── serve.yaml ├── sys_profiling.py ├── test_distribute_load_on_server.py ├── test_load_on_server.py └── test_queue_dispatcher.py ├── mypy_files.txt ├── run_smoke_tests.yaml ├── skyserve ├── auto_restart.yaml ├── cancel │ ├── cancel.yaml │ └── send_cancel_request.py ├── failures │ ├── initial_delay.yaml │ ├── probing.py │ └── probing.yaml ├── high_availability │ ├── config.yaml │ └── service.yaml ├── http │ ├── aws.yaml │ ├── azure.yaml │ ├── gcp.yaml │ ├── kubernetes.yaml │ └── oci.yaml ├── https │ └── service.yaml ├── llm │ ├── get_response.py │ ├── prompt_output.json │ └── service.yaml ├── load_balancer │ ├── server.py │ ├── 
service.yaml │ └── test_round_robin.py ├── multi_ports.yaml ├── readiness_timeout │ ├── server.py │ ├── task.yaml │ └── task_large_timeout.yaml ├── restart │ ├── user_bug.py │ └── user_bug.yaml ├── spot │ ├── base_ondemand_fallback.yaml │ ├── dynamic_ondemand_fallback.yaml │ ├── recovery.yaml │ ├── spot_hedge.yaml │ └── spot_hedge_T4.yaml ├── streaming │ ├── example.txt │ ├── send_streaming_request.py │ ├── server.py │ └── streaming.yaml └── update │ ├── bump_version_after.yaml │ ├── bump_version_before.yaml │ ├── new.yaml │ ├── new_autoscaler_after.yaml │ ├── new_autoscaler_before.yaml │ ├── new_server.py │ ├── num_min_one.yaml │ ├── num_min_two.yaml │ ├── old.yaml │ └── old_server.py ├── smoke_tests ├── __init__.py ├── backward_compat │ ├── sdk_backward_compat_utils.py │ └── test_backward_compat.py ├── docker │ ├── Dockerfile_test │ ├── __init__.py │ ├── docker_utils.py │ ├── entrypoint.sh │ └── stop_sky_resource.sh ├── smoke_tests_utils.py ├── test_api_server.py ├── test_basic.py ├── test_cluster_job.py ├── test_images.py ├── test_managed_job.py ├── test_mount_and_storage.py ├── test_quick_tests_core.py ├── test_region_and_zone.py ├── test_sky_serve.py ├── test_volume_mount.py └── test_workspaces.py ├── stress └── mountedstorage │ ├── mount_stress.yaml │ └── read_parallel.py ├── test_api.py ├── test_cli.py ├── test_config.py ├── test_db_utils.py ├── test_failover.py ├── test_global_user_state.py ├── test_jobs.py ├── test_jobs_and_serve.py ├── test_list_accelerators.py ├── test_optimizer_dryruns.py ├── test_optimizer_random_dag.py ├── test_serve_autoscaler.py ├── test_smoke.py ├── test_storage.py ├── test_wheels.py ├── test_yaml_parser.py ├── test_yamls ├── different_default_conda_env.yaml ├── failed_setup.yaml ├── failed_setup_pipeline.yaml ├── failed_worker_run.yaml ├── failed_worker_setup.yaml ├── force_enable_external_ips_config.yaml ├── gcp_per_region_images.yaml ├── intermediate_bucket.yaml ├── low_resource_sky_config.yaml ├── minimal.yaml ├── 
minimal_test_quick_tests_core.yaml ├── pipeline.yaml ├── pipeline_aws.yaml ├── pipeline_gcp.yaml ├── test_aws_config.yaml ├── test_custom_default_conda_env.yaml ├── test_custom_image.yaml ├── test_ibm_cos_storage_mounting.yaml ├── test_k8s_logs.yaml ├── test_labels.yaml.j2 ├── test_long_setup.yaml ├── test_managed_jobs_retry.yaml ├── test_multiple_accelerators_ordered.yaml ├── test_multiple_accelerators_ordered_with_default.yaml ├── test_multiple_accelerators_unordered.yaml ├── test_multiple_accelerators_unordered_with_default.yaml ├── test_multiple_resources.yaml ├── test_nebius_storage_mounting.yaml ├── test_only_setup.yaml ├── test_r2_storage_mounting.yaml ├── test_rclone_mount.yaml ├── test_serve_autoscaler.yaml ├── test_skyignore.yaml ├── test_skyignore_verification.py ├── test_storage_mounting.yaml.j2 ├── test_volume_mount.yaml.j2 ├── use_intermediate_bucket_config.yaml ├── use_internal_ips_config.yaml └── use_mig_config.yaml └── unit_tests ├── kubernetes ├── test_gpu_label_formatters.py ├── test_instance_type.py └── test_kubernetes_utils.py ├── test_adaptor.py ├── test_admin_policy.py ├── test_authentication.py ├── test_aws.py ├── test_aws_utils.py ├── test_azure_utils.py ├── test_backend_utils.py ├── test_cloud.py ├── test_controller_utils.py ├── test_dag.py ├── test_dag_utils.py ├── test_exceptions.py ├── test_gcp.py ├── test_jobs_utils.py ├── test_lambda.py ├── test_resources.py ├── test_sky ├── adaptors │ └── test_oci.py ├── clouds │ ├── test_kubernetes.py │ └── test_ssh.py ├── server │ ├── requests │ │ ├── queues │ │ │ └── test_mp_queue.py │ │ ├── test_precond.py │ │ ├── test_process.py │ │ └── test_requests.py │ ├── test_common.py │ ├── test_config.py │ ├── test_sdk.py │ └── test_server.py ├── storage │ └── test_storage_utils.py ├── test_sky_logging.py ├── test_task.py ├── utils │ ├── kubernetes │ │ └── test_skypilot_config_configmap_sync.py │ ├── test_cli_utils.py │ ├── test_common_utils.py │ ├── test_config_utils.py │ ├── test_context_utils.py │ ├── 
test_infra_utils.py │ ├── test_rich_utils.py │ ├── test_schemas.py │ ├── test_subprocess_utils.py │ └── text_context.py └── workspaces │ ├── test_workspace_config_concurrency.py │ ├── test_workspace_management.py │ └── test_workspace_race_condition_demo.py ├── test_sky_import.py └── test_zip_and_unzip.py /.dockerignore: -------------------------------------------------------------------------------- 1 | **/.git 2 | -------------------------------------------------------------------------------- /.github/issue_template.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Issue 3 | about: Use this to open new issues. 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | 12 | 13 | 14 | 15 | _Version & Commit info:_ 16 | * `sky -v`: PLEASE_FILL_IN 17 | * `sky -c`: PLEASE_FILL_IN 18 | -------------------------------------------------------------------------------- /.github/workflows/dashboard.yml: -------------------------------------------------------------------------------- 1 | name: Dashboard Linting and Formatting 2 | 3 | on: 4 | # Trigger the workflow on push or pull request, 5 | # but only for the main branch 6 | push: 7 | branches: 8 | - master 9 | - 'releases/**' 10 | pull_request: 11 | branches: 12 | - master 13 | - 'releases/**' 14 | merge_group: 15 | 16 | jobs: 17 | dashboard: 18 | runs-on: ubuntu-latest 19 | steps: 20 | - uses: actions/checkout@v3 21 | - name: Install dependencies and check 22 | run: | 23 | npm --prefix sky/dashboard install 24 | npm --prefix sky/dashboard run lint 25 | npm --prefix sky/dashboard run format:check 26 | npm --prefix sky/dashboard run build 27 | -------------------------------------------------------------------------------- /.github/workflows/go-reviewable.yaml: -------------------------------------------------------------------------------- 1 | name: go-reviewable 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - master 7 | - 'releases/**' 8 | paths: 9 | - 
'addons/fuse-proxy/**' 10 | 11 | jobs: 12 | reviewable: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | - name: Set up Go 17 | uses: actions/setup-go@v4 18 | with: 19 | go-version: '1.23' 20 | - name: Run reviewable 21 | working-directory: addons/fuse-proxy 22 | run: make reviewable 23 | -------------------------------------------------------------------------------- /.github/workflows/pytest-generic.yml: -------------------------------------------------------------------------------- 1 | # This is needed for GitHub Actions for the "Waiting for status to be reported" problem, 2 | # according to https://docs.github.com/en/repositories/configuring-branches-and-merges-in-your-repository/defining-the-mergeability-of-pull-requests/troubleshooting-required-status-checks 3 | name: Python Tests 4 | on: 5 | # Trigger the workflow on push or pull request, 6 | # but only for the main branch 7 | push: 8 | branches: 9 | - master 10 | - 'releases/**' 11 | pull_request: 12 | branches: 13 | - master 14 | - 'releases/**' 15 | merge_group: 16 | 17 | jobs: 18 | python-test: 19 | runs-on: ubuntu-latest 20 | steps: 21 | - run: 'echo "No tests to run"' 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/.DS_Store 2 | **/*.pyc 3 | **/__pycache__/ 4 | *.egg-info/ 5 | *.eggs/ 6 | 7 | docs/build/ 8 | docs/_build/ 9 | build/ 10 | sky_logs/ 11 | sky/clouds/service_catalog/data_fetchers/*.csv 12 | .vscode/ 13 | .idea/ 14 | .env 15 | 16 | # For editor files 17 | *.swp 18 | .buildkite/*.yaml 19 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 
| version: 2 7 | 8 | build: 9 | os: "ubuntu-22.04" 10 | tools: 11 | python: "3.10" 12 | 13 | # Build documentation in the docs/ directory with Sphinx 14 | sphinx: 15 | configuration: docs/source/conf.py 16 | 17 | # Optionally set the version of Python and requirements required to build your docs 18 | python: 19 | install: 20 | - method: pip 21 | path: . 22 | - requirements: docs/requirements-docs.txt 23 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | sky/setup_files/MANIFEST.in -------------------------------------------------------------------------------- /addons/fuse-proxy/.gitignore: -------------------------------------------------------------------------------- 1 | bin 2 | -------------------------------------------------------------------------------- /addons/fuse-proxy/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/skypilot-org/skypilot/addons/fuse-proxy 2 | 3 | go 1.23.3 4 | 5 | require ( 6 | github.com/pfnet-research/meta-fuse-csi-plugin v0.2.2 7 | github.com/sevlyar/go-daemon v0.1.6 8 | k8s.io/klog/v2 v2.100.1 9 | ) 10 | 11 | require ( 12 | github.com/kardianos/osext v0.0.0-20190222173326-2bc1f35cddc0 // indirect 13 | golang.org/x/sys v0.18.0 // indirect 14 | ) 15 | 16 | require ( 17 | github.com/go-logr/logr v1.2.4 // indirect 18 | github.com/spf13/pflag v1.0.6 19 | k8s.io/apimachinery v0.28.1 // indirect 20 | ) 21 | -------------------------------------------------------------------------------- /charts/skypilot/.gitignore: -------------------------------------------------------------------------------- 1 | Chart.lock 2 | charts/ 3 | -------------------------------------------------------------------------------- /charts/skypilot/.helmignore: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/charts/skypilot/.helmignore -------------------------------------------------------------------------------- /charts/skypilot/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: skypilot 3 | description: A Helm chart for deploying SkyPilot API server on Kubernetes 4 | type: application 5 | version: 0.0.0 6 | appVersion: "0.0" 7 | dependencies: 8 | - name: ingress-nginx 9 | version: 4.11.3 10 | repository: https://kubernetes.github.io/ingress-nginx 11 | condition: ingress-nginx.enabled 12 | -------------------------------------------------------------------------------- /charts/skypilot/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | {{- if not .Values.apiService.skipResourceCheck }} 2 | {{- include "skypilot.checkResources" . }} 3 | {{- end }} 4 | {{- include "skypilot.checkUpgradeConfig" . 
}} 5 | -------------------------------------------------------------------------------- /charts/skypilot/templates/api-configmap.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: {{ .Release.Name }}-config 5 | namespace: {{ .Release.Namespace }} 6 | data: 7 | config.yaml: |- 8 | {{- if .Values.apiService.config }} 9 | {{- .Values.apiService.config | nindent 4 }} 10 | {{- else }} 11 | {} 12 | {{- end }} 13 | -------------------------------------------------------------------------------- /charts/skypilot/templates/api-secrets.yaml: -------------------------------------------------------------------------------- 1 | {{- /* Use secret since sshNodePools config may contain credentials */ -}} 2 | {{- if .Values.apiService.sshNodePools}} 3 | apiVersion: v1 4 | kind: Secret 5 | metadata: 6 | name: {{ .Release.Name }}-ssh-node-pools 7 | namespace: {{ .Release.Namespace }} 8 | stringData: 9 | ssh_node_pools.yaml: | 10 | {{ .Values.apiService.sshNodePools | indent 4 }} 11 | {{- end }} 12 | -------------------------------------------------------------------------------- /charts/skypilot/templates/api-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ .Release.Name }}-api-service 5 | namespace: {{ .Release.Namespace }} 6 | spec: 7 | type: ClusterIP # Use clusterIP to allow ingress to authenticate 8 | ports: 9 | - port: 80 10 | targetPort: 46580 # Assuming your container listens on port 46580 11 | protocol: TCP 12 | selector: 13 | app: {{ .Release.Name }}-api 14 | skypilot.co/ready: "true" 15 | -------------------------------------------------------------------------------- /charts/skypilot/templates/auth.yaml: -------------------------------------------------------------------------------- 1 | {{- if and (not .Values.ingress.authSecret) .Values.ingress.authCredentials (not 
(index .Values.ingress "oauth2-proxy" "enabled")) }} 2 | apiVersion: v1 3 | kind: Secret 4 | metadata: 5 | name: {{ .Release.Name }}-basic-auth 6 | namespace: {{ .Release.Namespace }} 7 | type: Opaque 8 | stringData: 9 | auth: {{ .Values.ingress.authCredentials | quote }} 10 | {{- end }} 11 | -------------------------------------------------------------------------------- /charts/skypilot/templates/oauth2-proxy-service.yaml: -------------------------------------------------------------------------------- 1 | {{- if and .Values.ingress.enabled (index .Values.ingress "oauth2-proxy" "enabled") }} 2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | labels: 6 | app: {{ .Release.Name }}-oauth2-proxy 7 | skypilot.co/component: oauth2-proxy 8 | name: {{ .Release.Name }}-oauth2-proxy 9 | namespace: {{ .Release.Namespace }} 10 | spec: 11 | ports: 12 | - name: http 13 | port: 4180 14 | protocol: TCP 15 | targetPort: 4180 16 | selector: 17 | app: {{ .Release.Name }}-oauth2-proxy 18 | {{- end }} 19 | -------------------------------------------------------------------------------- /docs/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/.nojekyll -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | 22 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Documentation 2 | Sphinx docs based on ReadTheDocs. 3 | 4 | ## Styleguide 5 | 6 | - Each page's title is in Title Case. 7 | - Each subsection's title is in Sentence case. 8 | 9 | ## Build and view locally 10 | ```bash 11 | pip install -r requirements-docs.txt 12 | ./build.sh --watch --port 8000 13 | ``` 14 | -------------------------------------------------------------------------------- /docs/repo-images/README: -------------------------------------------------------------------------------- 1 | Images in this directory are intended to be used by README.md files in the repo. 2 | They should not be used in the docs. 
3 | -------------------------------------------------------------------------------- /docs/repo-images/managed-job-schedule-state-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/repo-images/managed-job-schedule-state-diagram.png -------------------------------------------------------------------------------- /docs/repo-images/managed-job-status-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/repo-images/managed-job-status-diagram.png -------------------------------------------------------------------------------- /docs/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | linkify-it-py==2.0.3 2 | myst-parser==2.0.0 3 | sphinx==7.1.2 4 | sphinx-click==5.0.1 5 | sphinx-copybutton==0.5.2 6 | sphinxemoji==0.2.0 7 | sphinx-design==0.5.0 8 | pydata-sphinx-theme==0.14.4 9 | Pygments==2.16.1 10 | sphinx-autobuild==2021.3.14 11 | sphinx-autodoc-typehints==1.25.2 12 | sphinx-book-theme==1.1.0 13 | sphinx-togglebutton==0.3.2 14 | sphinx-notfound-page==1.0.4 15 | sphinxcontrib-applehelp==1.0.7 16 | sphinxcontrib-devhelp==1.0.5 17 | sphinxcontrib-googleanalytics==0.4 18 | sphinxcontrib-htmlhelp==2.0.4 19 | sphinxcontrib-jsmath==1.0.1 20 | sphinxcontrib-qthelp==1.0.6 21 | sphinxcontrib-serializinghtml==1.1.9 22 | -------------------------------------------------------------------------------- /docs/source/.gitignore: -------------------------------------------------------------------------------- 1 | generated-examples/ 2 | -------------------------------------------------------------------------------- /docs/source/_static/favicon.ico: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/_static/favicon.ico -------------------------------------------------------------------------------- /docs/source/_static/intro.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/_static/intro.gif -------------------------------------------------------------------------------- /docs/source/_static/intro.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/_static/intro.mp4 -------------------------------------------------------------------------------- /docs/source/_static/rtd-data.js: -------------------------------------------------------------------------------- 1 | // Dummy data for testing ReadTheDocs footer insertion 2 | // This mimics RTD data for a project that uses both versions + languages 3 | var READTHEDOCS_DATA = { 4 | project: "frc-docs", 5 | version: "latest", 6 | language: "en", 7 | proxied_api_host: "https://readthedocs.org", 8 | }; 9 | -------------------------------------------------------------------------------- /docs/source/_templates/author.html: -------------------------------------------------------------------------------- 1 | {% if author %} 2 |

3 | By {{ author }} 4 |

5 | {% endif %} 6 | -------------------------------------------------------------------------------- /docs/source/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "pydata_sphinx_theme/layout.html" %} 2 | 3 | {% block docs_navbar %} 4 | {% include "header.html" ignore missing %} 5 | {{ super() }} 6 | {% endblock %} 7 | -------------------------------------------------------------------------------- /docs/source/_templates/main-sidebar-home.html: -------------------------------------------------------------------------------- 1 | {# Displays the TOC-subtree for pages nested under the currently active top-level TOCtree element. #} 2 | 17 | -------------------------------------------------------------------------------- /docs/source/_templates/main-sidebar.html: -------------------------------------------------------------------------------- 1 | {# Displays the TOC-subtree for pages nested under the currently active top-level TOCtree element. #} 2 | 16 | -------------------------------------------------------------------------------- /docs/source/developers/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ../../../CONTRIBUTING.md -------------------------------------------------------------------------------- /docs/source/developers/index.rst: -------------------------------------------------------------------------------- 1 | Developer Guides 2 | ================= 3 | 4 | .. 
toctree:: 5 | :maxdepth: 1 6 | 7 | ../developers/CONTRIBUTING 8 | Guide: Adding a New Cloud 9 | -------------------------------------------------------------------------------- /docs/source/examples/applications/batch_inference.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/batch_inference.md -------------------------------------------------------------------------------- /docs/source/examples/applications/index.rst: -------------------------------------------------------------------------------- 1 | AI Applications 2 | ==================== 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | 7 | DeepSeek-R1 for RAG 8 | Large-Scale Batch Inference 9 | Image Vector Database 10 | Tabby: Coding Assistant 11 | LocalGPT: Chat with PDF 12 | Stable Diffusion 13 | -------------------------------------------------------------------------------- /docs/source/examples/applications/localgpt.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/localgpt.md -------------------------------------------------------------------------------- /docs/source/examples/applications/rag.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/rag.md -------------------------------------------------------------------------------- /docs/source/examples/applications/stable_diffusion.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/stable_diffusion.md -------------------------------------------------------------------------------- /docs/source/examples/applications/tabby.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/tabby.md -------------------------------------------------------------------------------- /docs/source/examples/applications/vector_database.md: 
-------------------------------------------------------------------------------- 1 | ../../generated-examples/vector_database.md -------------------------------------------------------------------------------- /docs/source/examples/frameworks/airflow.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/airflow.md -------------------------------------------------------------------------------- /docs/source/examples/frameworks/dvc.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/dvc.md -------------------------------------------------------------------------------- /docs/source/examples/frameworks/index.rst: -------------------------------------------------------------------------------- 1 | Frameworks 2 | ==================== 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | 7 | Airflow 8 | Cross-cloud data transfer 9 | DVC 10 | GCP DWS/Kueue 11 | Jupyter 12 | MLFlow 13 | MPI 14 | -------------------------------------------------------------------------------- /docs/source/examples/frameworks/jupyter.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/jupyter_lab.md -------------------------------------------------------------------------------- /docs/source/examples/frameworks/mpi.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/mpirun.md -------------------------------------------------------------------------------- /docs/source/examples/models/codellama.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/codellama.md -------------------------------------------------------------------------------- /docs/source/examples/models/dbrx.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/dbrx.md 
-------------------------------------------------------------------------------- /docs/source/examples/models/deepseek-janus.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/deepseek-janus.md -------------------------------------------------------------------------------- /docs/source/examples/models/deepseek-r1-distilled.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/deepseek-r1-distilled.md -------------------------------------------------------------------------------- /docs/source/examples/models/deepseek-r1.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/deepseek-r1.md -------------------------------------------------------------------------------- /docs/source/examples/models/gemma.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/gemma.md -------------------------------------------------------------------------------- /docs/source/examples/models/gemma3.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/gemma3.md -------------------------------------------------------------------------------- /docs/source/examples/models/gpt-2.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/gpt-2.md -------------------------------------------------------------------------------- /docs/source/examples/models/index.rst: -------------------------------------------------------------------------------- 1 | Models 2 | ============ 3 | 4 | .. 
toctree:: 5 | :maxdepth: 1 6 | 7 | DeepSeek-R1 8 | DeepSeek-R1 Distilled 9 | DeepSeek-Janus 10 | Gemma 3 11 | Llama 4 12 | Llama 3.2 13 | Llama 3.1 14 | Llama 3 15 | Llama 2 16 | CodeLlama 17 | Pixtral 18 | Mixtral 19 | Mistral 7B 20 | Qwen 2.5 21 | Yi 22 | Gemma 23 | DBRX 24 | GPT-2 via llm.c 25 | Vicuna 26 | -------------------------------------------------------------------------------- /docs/source/examples/models/llama-2.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/llama-2.md -------------------------------------------------------------------------------- /docs/source/examples/models/llama-3.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/llama-3.md -------------------------------------------------------------------------------- /docs/source/examples/models/llama-3_1.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/llama-3_1.md -------------------------------------------------------------------------------- /docs/source/examples/models/llama-3_2.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/llama-3_2.md -------------------------------------------------------------------------------- /docs/source/examples/models/llama-4.md: -------------------------------------------------------------------------------- 1 | ../../../../llm/llama-4/README.md -------------------------------------------------------------------------------- /docs/source/examples/models/mixtral.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/mixtral.md -------------------------------------------------------------------------------- /docs/source/examples/models/pixtral.md: -------------------------------------------------------------------------------- 1 | 
../../generated-examples/pixtral.md -------------------------------------------------------------------------------- /docs/source/examples/models/qwen.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/qwen.md -------------------------------------------------------------------------------- /docs/source/examples/models/vicuna.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/vicuna.md -------------------------------------------------------------------------------- /docs/source/examples/models/yi.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/yi.md -------------------------------------------------------------------------------- /docs/source/examples/performance/aws_efa.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/aws_efa.md -------------------------------------------------------------------------------- /docs/source/examples/performance/gcp_gpu_direct_tcpx.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/gcp_gpu_direct_tcpx.md -------------------------------------------------------------------------------- /docs/source/examples/performance/index.rst: -------------------------------------------------------------------------------- 1 | AI Performance 2 | ==================== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 1 6 | 7 | AWS EFA 8 | GCP GPUDirect-TCPX 9 | Nebius with InfiniBand 10 | -------------------------------------------------------------------------------- /docs/source/examples/performance/nebius_infiniband.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/nebius_infiniband.md -------------------------------------------------------------------------------- /docs/source/examples/serving/cog.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/cog.md -------------------------------------------------------------------------------- /docs/source/examples/serving/index.rst: -------------------------------------------------------------------------------- 1 | Serving 2 | ==================== 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | 7 | vLLM 8 | SGLang 9 | Ollama 10 | Hugging Face TGI 11 | LoRAX 12 | Cog 13 | -------------------------------------------------------------------------------- /docs/source/examples/serving/lorax.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/lorax.md -------------------------------------------------------------------------------- /docs/source/examples/serving/ollama.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/ollama.md -------------------------------------------------------------------------------- /docs/source/examples/serving/sglang.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/sglang.md -------------------------------------------------------------------------------- /docs/source/examples/serving/tgi.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/tgi.md 
-------------------------------------------------------------------------------- /docs/source/examples/serving/vllm.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/vllm.md -------------------------------------------------------------------------------- /docs/source/examples/training/axolotl.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/axolotl.md -------------------------------------------------------------------------------- /docs/source/examples/training/deepspeed.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/deepspeed-multinode.md -------------------------------------------------------------------------------- /docs/source/examples/training/distributed-pytorch.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/distributed-pytorch.md -------------------------------------------------------------------------------- /docs/source/examples/training/distributed-tensorflow.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/tensorflow_distributed.md -------------------------------------------------------------------------------- /docs/source/examples/training/index.rst: -------------------------------------------------------------------------------- 1 | Training 2 | ========= 3 | 4 | .. 
toctree:: 5 | :maxdepth: 1 6 | 7 | Axolotl 8 | DeepSpeed 9 | Distributed PyTorch 10 | Distributed TensorFlow 11 | Finetuning Llama 3 12 | Finetuning Llama 2 13 | NeMo 14 | Ray 15 | Training on TPUs 16 | Unsloth 17 | Vertex AI 18 | -------------------------------------------------------------------------------- /docs/source/examples/training/llama-2-finetuning.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/vicuna-llama-2.md -------------------------------------------------------------------------------- /docs/source/examples/training/llama-3_1-finetuning.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/llama-3_1-finetuning.md -------------------------------------------------------------------------------- /docs/source/examples/training/nemo.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/nemo.md -------------------------------------------------------------------------------- /docs/source/examples/training/ray.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/distributed_ray_train.md -------------------------------------------------------------------------------- /docs/source/examples/training/tpu.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/tpu.md -------------------------------------------------------------------------------- /docs/source/examples/training/unsloth.md: -------------------------------------------------------------------------------- 1 | ../../generated-examples/unsloth.md -------------------------------------------------------------------------------- /docs/source/gallery/applications/localgpt.rst: -------------------------------------------------------------------------------- 1 | .. 
People may hit this old URL; we redirect to the new one instead of 404ing. 2 | 3 | :orphan: 4 | 5 | .. raw:: html 6 | 7 | 10 | 11 | -------------------------------------------------------------------------------- /docs/source/gallery/applications/rag.rst: -------------------------------------------------------------------------------- 1 | .. People may hit this old URL; we redirect to the new one instead of 404ing. 2 | 3 | :orphan: 4 | 5 | .. raw:: html 6 | 7 | 10 | 11 | -------------------------------------------------------------------------------- /docs/source/gallery/applications/tabby.rst: -------------------------------------------------------------------------------- 1 | .. People may hit this old URL; we redirect to the new one instead of 404ing. 2 | 3 | :orphan: 4 | 5 | .. raw:: html 6 | 7 | 10 | 11 | -------------------------------------------------------------------------------- /docs/source/gallery/applications/vector_database.rst: -------------------------------------------------------------------------------- 1 | .. People may hit this old URL; we redirect to the new one instead of 404ing. 2 | 3 | :orphan: 4 | 5 | .. raw:: html 6 | 7 | 10 | 11 | -------------------------------------------------------------------------------- /docs/source/gallery/frameworks/lorax.rst: -------------------------------------------------------------------------------- 1 | .. People may hit this old URL; we redirect to the new one instead of 404ing. 2 | 3 | :orphan: 4 | 5 | .. raw:: html 6 | 7 | 10 | 11 | -------------------------------------------------------------------------------- /docs/source/gallery/frameworks/ollama.rst: -------------------------------------------------------------------------------- 1 | .. People may hit this old URL; we redirect to the new one instead of 404ing. 2 | 3 | :orphan: 4 | 5 | .. 
raw:: html 6 | 7 | 10 | 11 | -------------------------------------------------------------------------------- /docs/source/gallery/frameworks/sglang.rst: -------------------------------------------------------------------------------- 1 | .. People may hit this old URL; we redirect to the new one instead of 404ing. 2 | 3 | :orphan: 4 | 5 | .. raw:: html 6 | 7 | 10 | 11 | -------------------------------------------------------------------------------- /docs/source/gallery/frameworks/tgi.rst: -------------------------------------------------------------------------------- 1 | .. People may hit this old URL; we redirect to the new one instead of 404ing. 2 | 3 | :orphan: 4 | 5 | .. raw:: html 6 | 7 | 10 | 11 | -------------------------------------------------------------------------------- /docs/source/gallery/frameworks/vllm.rst: -------------------------------------------------------------------------------- 1 | .. People may hit this old URL; we redirect to the new one instead of 404ing. 2 | 3 | :orphan: 4 | 5 | .. raw:: html 6 | 7 | 10 | 11 | -------------------------------------------------------------------------------- /docs/source/gallery/index.rst: -------------------------------------------------------------------------------- 1 | .. The whole gallery/ folder is deprecated, and replaced by examples/. It is only kept here for redirecting old URLs. 2 | .. People may hit this old URL; we redirect to the new one instead of 404ing. 3 | 4 | :orphan: 5 | 6 | .. _ai-gallery: 7 | 8 | .. raw:: html 9 | 10 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /docs/source/gallery/llms/codellama.rst: -------------------------------------------------------------------------------- 1 | .. People may hit this old URL; we redirect to the new one instead of 404ing. 2 | 3 | :orphan: 4 | 5 | .. 
raw:: html 6 | 7 | 10 | 11 | -------------------------------------------------------------------------------- /docs/source/gallery/llms/dbrx.rst: -------------------------------------------------------------------------------- 1 | .. People may hit this old URL; we redirect to the new one instead of 404ing. 2 | 3 | :orphan: 4 | 5 | .. raw:: html 6 | 7 | 10 | 11 | -------------------------------------------------------------------------------- /docs/source/gallery/llms/deepseek-janus.rst: -------------------------------------------------------------------------------- 1 | .. People may hit this old URL; we redirect to the new one instead of 404ing. 2 | 3 | :orphan: 4 | 5 | .. raw:: html 6 | 7 | 10 | 11 | -------------------------------------------------------------------------------- /docs/source/gallery/llms/deepseek-r1-distilled.rst: -------------------------------------------------------------------------------- 1 | .. People may hit this old URL; we redirect to the new one instead of 404ing. 2 | 3 | :orphan: 4 | 5 | .. raw:: html 6 | 7 | 10 | 11 | -------------------------------------------------------------------------------- /docs/source/gallery/llms/deepseek-r1.rst: -------------------------------------------------------------------------------- 1 | .. People may hit this old URL; we redirect to the new one instead of 404ing. 2 | 3 | :orphan: 4 | 5 | .. raw:: html 6 | 7 | 10 | 11 | -------------------------------------------------------------------------------- /docs/source/gallery/llms/gemma.rst: -------------------------------------------------------------------------------- 1 | .. People may hit this old URL; we redirect to the new one instead of 404ing. 2 | 3 | :orphan: 4 | 5 | .. raw:: html 6 | 7 | 10 | 11 | -------------------------------------------------------------------------------- /docs/source/gallery/llms/gpt-2.rst: -------------------------------------------------------------------------------- 1 | .. 
People may hit this old URL; we redirect to the new one instead of 404ing. 2 | 3 | :orphan: 4 | 5 | .. raw:: html 6 | 7 | 10 | 11 | -------------------------------------------------------------------------------- /docs/source/gallery/llms/index.rst: -------------------------------------------------------------------------------- 1 | .. The whole gallery/ folder is deprecated, and replaced by examples/. It is only kept here for redirecting old URLs. 2 | .. People may hit this old URL; we redirect to the new one instead of 404ing. 3 | 4 | :orphan: 5 | 6 | .. raw:: html 7 | 8 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /docs/source/gallery/llms/llama-2.rst: -------------------------------------------------------------------------------- 1 | .. People may hit this old URL; we redirect to the new one instead of 404ing. 2 | 3 | :orphan: 4 | 5 | .. raw:: html 6 | 7 | 10 | 11 | -------------------------------------------------------------------------------- /docs/source/gallery/llms/llama-3.rst: -------------------------------------------------------------------------------- 1 | .. People may hit this old URL; we redirect to the new one instead of 404ing. 2 | 3 | :orphan: 4 | 5 | .. raw:: html 6 | 7 | 10 | 11 | -------------------------------------------------------------------------------- /docs/source/gallery/llms/llama-3_1.rst: -------------------------------------------------------------------------------- 1 | .. People may hit this old URL; we redirect to the new one instead of 404ing. 2 | 3 | :orphan: 4 | 5 | .. raw:: html 6 | 7 | 10 | 11 | -------------------------------------------------------------------------------- /docs/source/gallery/llms/llama-3_2.rst: -------------------------------------------------------------------------------- 1 | .. People may hit this old URL; we redirect to the new one instead of 404ing. 2 | 3 | :orphan: 4 | 5 | .. 
raw:: html 6 | 7 | 10 | 11 | -------------------------------------------------------------------------------- /docs/source/gallery/llms/mixtral.rst: -------------------------------------------------------------------------------- 1 | .. People may hit this old URL; we redirect to the new one instead of 404ing. 2 | 3 | :orphan: 4 | 5 | .. raw:: html 6 | 7 | 10 | 11 | -------------------------------------------------------------------------------- /docs/source/gallery/llms/pixtral.rst: -------------------------------------------------------------------------------- 1 | .. People may hit this old URL; we redirect to the new one instead of 404ing. 2 | 3 | :orphan: 4 | 5 | .. raw:: html 6 | 7 | 10 | 11 | -------------------------------------------------------------------------------- /docs/source/gallery/llms/qwen.rst: -------------------------------------------------------------------------------- 1 | .. People may hit this old URL; we redirect to the new one instead of 404ing. 2 | 3 | :orphan: 4 | 5 | .. raw:: html 6 | 7 | 10 | 11 | -------------------------------------------------------------------------------- /docs/source/gallery/llms/vicuna.rst: -------------------------------------------------------------------------------- 1 | .. People may hit this old URL; we redirect to the new one instead of 404ing. 2 | 3 | :orphan: 4 | 5 | .. raw:: html 6 | 7 | 10 | 11 | -------------------------------------------------------------------------------- /docs/source/gallery/llms/yi.rst: -------------------------------------------------------------------------------- 1 | .. People may hit this old URL; we redirect to the new one instead of 404ing. 2 | 3 | :orphan: 4 | 5 | .. 
raw:: html 6 | 7 | 10 | 11 | -------------------------------------------------------------------------------- /docs/source/images/SkyPilot-logo-wide.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/SkyPilot-logo-wide.png -------------------------------------------------------------------------------- /docs/source/images/ai-gallery-cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/ai-gallery-cover.png -------------------------------------------------------------------------------- /docs/source/images/client-server/arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/client-server/arch.png -------------------------------------------------------------------------------- /docs/source/images/client-server/cluster-users.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/client-server/cluster-users.png -------------------------------------------------------------------------------- /docs/source/images/client-server/executor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/client-server/executor.png -------------------------------------------------------------------------------- /docs/source/images/client-server/high-level-arch.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/client-server/high-level-arch.png -------------------------------------------------------------------------------- /docs/source/images/client-server/local.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/client-server/local.png -------------------------------------------------------------------------------- /docs/source/images/client-server/okta-setup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/client-server/okta-setup.png -------------------------------------------------------------------------------- /docs/source/images/client-server/okta.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/client-server/okta.png -------------------------------------------------------------------------------- /docs/source/images/client-server/remote.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/client-server/remote.png -------------------------------------------------------------------------------- /docs/source/images/client-server/token-page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/client-server/token-page.png -------------------------------------------------------------------------------- 
/docs/source/images/cloud-logos-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/cloud-logos-dark.png -------------------------------------------------------------------------------- /docs/source/images/cloud-logos-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/cloud-logos-light.png -------------------------------------------------------------------------------- /docs/source/images/dashboard-clusters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/dashboard-clusters.png -------------------------------------------------------------------------------- /docs/source/images/dashboard-managed-jobs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/dashboard-managed-jobs.png -------------------------------------------------------------------------------- /docs/source/images/gcp-vm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/gcp-vm.png -------------------------------------------------------------------------------- /docs/source/images/jupyter-auth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/jupyter-auth.png 
-------------------------------------------------------------------------------- /docs/source/images/jupyter-covid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/jupyter-covid.png -------------------------------------------------------------------------------- /docs/source/images/jupyter-create.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/jupyter-create.png -------------------------------------------------------------------------------- /docs/source/images/jupyter-gpu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/jupyter-gpu.png -------------------------------------------------------------------------------- /docs/source/images/k8s-pod.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/k8s-pod.png -------------------------------------------------------------------------------- /docs/source/images/k8s-skypilot-architecture-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/k8s-skypilot-architecture-dark.png -------------------------------------------------------------------------------- /docs/source/images/k8s-skypilot-architecture-light.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/k8s-skypilot-architecture-light.png -------------------------------------------------------------------------------- /docs/source/images/managed-jobs-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/managed-jobs-arch.png -------------------------------------------------------------------------------- /docs/source/images/managed-jobs-dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/managed-jobs-dashboard.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/aws/aws-add-policy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/aws/aws-add-policy.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/aws/aws-add-role-entity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/aws/aws-add-role-entity.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/aws/aws-add-role.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/aws/aws-add-role.png 
-------------------------------------------------------------------------------- /docs/source/images/screenshots/aws/aws-add-user.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/aws/aws-add-user.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/aws/aws-create-access-key.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/aws/aws-create-access-key.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/aws/aws-create-policy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/aws/aws-create-policy.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/gcp/cloud-nat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/gcp/cloud-nat.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/gcp/create-iam.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/gcp/create-iam.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/gcp/create-role.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/gcp/create-role.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/gcp/create-service-account.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/gcp/create-service-account.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/gcp/service-account-grant-role.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/gcp/service-account-grant-role.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/gcp/service-account-name.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/gcp/service-account-name.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/kubernetes/kubernetes-dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/kubernetes/kubernetes-dashboard.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/nebius/nebius-k8s-attach-fs.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/nebius/nebius-k8s-attach-fs.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/vsphere/content-lib-item-tag-adding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/vsphere/content-lib-item-tag-adding.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/vsphere/content-lib-item.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/vsphere/content-lib-item.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/vsphere/content-lib-local.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/vsphere/content-lib-local.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/vsphere/content-lib-name.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/vsphere/content-lib-name.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/vsphere/content-lib-security-policy.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/vsphere/content-lib-security-policy.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/vsphere/content-lib-storage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/vsphere/content-lib-storage.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/vsphere/content-libs-navigate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/vsphere/content-libs-navigate.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/vsphere/content-libs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/vsphere/content-libs.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/vsphere/vm-clone-to-template-cl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/vsphere/vm-clone-to-template-cl.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/vsphere/vm-clone-to-template-ovf.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/vsphere/vm-clone-to-template-ovf.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/vsphere/vm-clone-to-template.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/vsphere/vm-clone-to-template.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/vsphere/vsphere-catagory-create.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/vsphere/vsphere-catagory-create.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/vsphere/vsphere-catagory-create_navigate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/vsphere/vsphere-catagory-create_navigate.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/vsphere/vsphere-catagory-create_navigate_new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/vsphere/vsphere-catagory-create_navigate_new.png -------------------------------------------------------------------------------- 
/docs/source/images/screenshots/vsphere/vsphere-datastore-tag-adding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/vsphere/vsphere-datastore-tag-adding.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/vsphere/vsphere-tags-create.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/vsphere/vsphere-tags-create.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/vsphere/vsphere-tags-create_navigate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/vsphere/vsphere-tags-create_navigate.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/vsphere/vsphere-vm-storage-policy-inventory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/vsphere/vsphere-vm-storage-policy-inventory.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/vsphere/vsphere-vm-storage-policy-name.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/vsphere/vsphere-vm-storage-policy-name.png 
-------------------------------------------------------------------------------- /docs/source/images/screenshots/vsphere/vsphere-vm-storage-policy-navigate-new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/vsphere/vsphere-vm-storage-policy-navigate-new.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/vsphere/vsphere-vm-storage-policy-navigate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/vsphere/vsphere-vm-storage-policy-navigate.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/vsphere/vsphere-vm-storage-policy-review.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/vsphere/vsphere-vm-storage-policy-review.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/vsphere/vsphere-vm-storage-policy-rule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/vsphere/vsphere-vm-storage-policy-rule.png -------------------------------------------------------------------------------- /docs/source/images/screenshots/vsphere/vsphere-vm-storage-policy-tag.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/screenshots/vsphere/vsphere-vm-storage-policy-tag.png -------------------------------------------------------------------------------- /docs/source/images/sky-above-clouds-gen.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/sky-above-clouds-gen.jpg -------------------------------------------------------------------------------- /docs/source/images/sky-existing-infra-workflow-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/sky-existing-infra-workflow-dark.png -------------------------------------------------------------------------------- /docs/source/images/sky-existing-infra-workflow-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/sky-existing-infra-workflow-light.png -------------------------------------------------------------------------------- /docs/source/images/sky-serve-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/sky-serve-architecture.png -------------------------------------------------------------------------------- /docs/source/images/sky-serve-status-full.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/sky-serve-status-full.png 
-------------------------------------------------------------------------------- /docs/source/images/sky-serve-status-output-provisioning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/sky-serve-status-output-provisioning.png -------------------------------------------------------------------------------- /docs/source/images/sky-serve-status-tgi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/sky-serve-status-tgi.png -------------------------------------------------------------------------------- /docs/source/images/sky-serve-status-vicuna-ready.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/sky-serve-status-vicuna-ready.png -------------------------------------------------------------------------------- /docs/source/images/sky-serve-status-vllm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/sky-serve-status-vllm.png -------------------------------------------------------------------------------- /docs/source/images/skypilot-abstractions-long-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/skypilot-abstractions-long-2.png -------------------------------------------------------------------------------- /docs/source/images/skypilot-wide-dark-1k.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/skypilot-wide-dark-1k.png -------------------------------------------------------------------------------- /docs/source/images/skypilot-wide-light-1k.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/skypilot-wide-light-1k.png -------------------------------------------------------------------------------- /docs/source/images/ssh-node-pools/infra.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/ssh-node-pools/infra.png -------------------------------------------------------------------------------- /docs/source/images/ssh-node-pools/pool-details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/ssh-node-pools/pool-details.png -------------------------------------------------------------------------------- /docs/source/images/workspaces/config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/workspaces/config.png -------------------------------------------------------------------------------- /docs/source/images/workspaces/edit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/workspaces/edit.png 
-------------------------------------------------------------------------------- /docs/source/images/workspaces/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/workspaces/overview.png -------------------------------------------------------------------------------- /docs/source/images/workspaces/resources.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/docs/source/images/workspaces/resources.png -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to SkyPilot! 2 | ==================== 3 | 4 | 5 | .. raw:: html 6 | 7 | 10 | 11 | 12 | .. toctree:: 13 | :maxdepth: 1 14 | :caption: Contents 15 | :hidden: 16 | 17 | Docs 18 | Blog 19 | Community 20 | 21 | -------------------------------------------------------------------------------- /docs/source/reference/comparison.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | .. People may hit this old URL; we redirect to the new one instead of 404ing. 4 | .. raw:: html 5 | 6 | 9 | 10 | -------------------------------------------------------------------------------- /docs/source/reference/kubernetes/examples/index.rst: -------------------------------------------------------------------------------- 1 | .. _kubernetes-examples: 2 | 3 | Kubernetes Examples 4 | =================== 5 | 6 | .. 
toctree:: 7 | :maxdepth: 1 8 | 9 | Dynamic Workload Scheduler 10 | Kueue 11 | Multi-region Kubernetes 12 | -------------------------------------------------------------------------------- /docs/source/serving/user-guides.rst: -------------------------------------------------------------------------------- 1 | Serving User Guides 2 | ================================================ 3 | 4 | .. toctree:: 5 | 6 | autoscaling 7 | update 8 | auth 9 | spot-policy 10 | https 11 | -------------------------------------------------------------------------------- /examples/admin_policy/add_labels.yaml: -------------------------------------------------------------------------------- 1 | admin_policy: example_policy.AddLabelsPolicy 2 | -------------------------------------------------------------------------------- /examples/admin_policy/disable_public_ip.yaml: -------------------------------------------------------------------------------- 1 | admin_policy: example_policy.DisablePublicIpPolicy 2 | -------------------------------------------------------------------------------- /examples/admin_policy/dynamic_kubernetes_contexts_update.yaml: -------------------------------------------------------------------------------- 1 | admin_policy: example_policy.DynamicKubernetesContextsUpdatePolicy 2 | -------------------------------------------------------------------------------- /examples/admin_policy/enforce_autostop.yaml: -------------------------------------------------------------------------------- 1 | admin_policy: example_policy.EnforceAutostopPolicy 2 | -------------------------------------------------------------------------------- /examples/admin_policy/example_policy/example_policy/__init__.py: -------------------------------------------------------------------------------- 1 | """Example admin policy module and prebuilt policies.""" 2 | from example_policy.skypilot_policy import AddLabelsPolicy 3 | from example_policy.skypilot_policy import DisablePublicIpPolicy 4 | from 
example_policy.skypilot_policy import DynamicKubernetesContextsUpdatePolicy 5 | from example_policy.skypilot_policy import EnforceAutostopPolicy 6 | from example_policy.skypilot_policy import RejectAllPolicy 7 | from example_policy.skypilot_policy import UseSpotForGpuPolicy 8 | -------------------------------------------------------------------------------- /examples/admin_policy/example_policy/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "example_policy" 7 | version = "0.0.1" 8 | -------------------------------------------------------------------------------- /examples/admin_policy/reject_all.yaml: -------------------------------------------------------------------------------- 1 | admin_policy: example_policy.RejectAllPolicy 2 | -------------------------------------------------------------------------------- /examples/admin_policy/task.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | infra: aws 3 | cpus: 2 4 | labels: 5 | other_labels: test 6 | 7 | 8 | setup: | 9 | echo "setup" 10 | 11 | run: | 12 | echo "run" 13 | -------------------------------------------------------------------------------- /examples/admin_policy/use_spot_for_gpu.yaml: -------------------------------------------------------------------------------- 1 | admin_policy: example_policy.UseSpotForGpuPolicy 2 | -------------------------------------------------------------------------------- /examples/airflow/data_preprocessing.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | cpus: 1 3 | 4 | envs: 5 | DATA_BUCKET_NAME: sky-demo-data-test 6 | DATA_BUCKET_STORE_TYPE: s3 7 | 8 | file_mounts: 9 | /data: 10 | name: $DATA_BUCKET_NAME 11 | store: $DATA_BUCKET_STORE_TYPE 12 | 13 | setup: | 14 | echo "Setting up 
dependencies for data preprocessing..." 15 | 16 | run: | 17 | echo "Running data preprocessing..." 18 | 19 | # Generate few files with random data to simulate data preprocessing 20 | for i in {0..9}; do 21 | dd if=/dev/urandom of=/data/file_$i bs=1M count=10 22 | done 23 | 24 | echo "Data preprocessing completed, wrote to $DATA_BUCKET_NAME" 25 | 26 | -------------------------------------------------------------------------------- /examples/airflow/eval.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | cpus: 1 3 | # Add GPUs here 4 | 5 | envs: 6 | DATA_BUCKET_NAME: sky-demo-data-test 7 | DATA_BUCKET_STORE_TYPE: s3 8 | 9 | file_mounts: 10 | /data: 11 | name: $DATA_BUCKET_NAME 12 | store: $DATA_BUCKET_STORE_TYPE 13 | 14 | setup: | 15 | echo "Setting up dependencies for eval..." 16 | 17 | run: | 18 | echo "Evaluating the trained model..." 19 | 20 | # Run a mock evaluation job that reads the trained model from /data/trained_model.txt 21 | cat /data/trained_model.txt || true 22 | # Generate a mock accuracy 23 | ACCURACY=$(shuf -i 90-100 -n 1) 24 | echo "Metric - accuracy: $ACCURACY%" 25 | echo "Evaluation report" > /data/evaluation_report.txt 26 | 27 | echo "Evaluation completed, report written to $DATA_BUCKET_NAME" 28 | -------------------------------------------------------------------------------- /examples/autogluon.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | infra: gcp 3 | 4 | setup: | 5 | git clone https://github.com/autogluon/autogluon.git 6 | 7 | conda activate autogluon 8 | if [ $? 
-eq 0 ]; then 9 | echo 'conda env exists' 10 | else 11 | conda create -n autogluon python=3.8 -y 12 | conda activate autogluon 13 | pip install torch==1.13.1+cpu torchvision==0.14.1+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html 14 | pip install autogluon 15 | # Ray + Torch Dataloader failed with latest grpcio 16 | # See: https://github.com/ray-project/ray/pull/33903 17 | pip install grpcio==1.51.3 18 | fi 19 | 20 | run: | 21 | conda activate autogluon 22 | cd autogluon 23 | python examples/automm/tabular_dl/example_tabular.py --mode single_hpo 24 | -------------------------------------------------------------------------------- /examples/azure_start_stop.yaml: -------------------------------------------------------------------------------- 1 | # start and stop Azure instances 2 | name: azure-start-stop 3 | 4 | resources: 5 | infra: azure 6 | 7 | # Optimizing for smoke tests 8 | # 2 nodes: smoke tests ~37 mins 9 | # 1 node: smoke tests ~19 mins 10 | # num_nodes: 2 11 | 12 | # The setup command. Will be run under the working directory. 13 | setup: 'echo "azure-start-stop [setup]"' 14 | 15 | # The command to run. Will be run under the working directory. 
16 | run: 'echo "azure-start-stop [run]"' 17 | -------------------------------------------------------------------------------- /examples/cog/cog.yaml: -------------------------------------------------------------------------------- 1 | build: 2 | python_version: "3.8" 3 | python_packages: 4 | - "pillow==8.2.0" 5 | system_packages: 6 | - "libpng-dev" 7 | - "libjpeg-dev" 8 | predict: "predict.py:Predictor" 9 | -------------------------------------------------------------------------------- /examples/cog/predict.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | 3 | import cog 4 | from PIL import Image 5 | from PIL import ImageFilter 6 | 7 | 8 | class Predictor(cog.BasePredictor): 9 | 10 | def predict( 11 | self, 12 | image: cog.Path = cog.Input(description='Input image'), 13 | blur: float = cog.Input(description='Blur radius', default=5), 14 | ) -> cog.Path: 15 | if blur == 0: 16 | return input 17 | im = Image.open(str(image)) 18 | im = im.filter(ImageFilter.BoxBlur(blur)) 19 | out_path = cog.Path(tempfile.mkdtemp()) / 'out.png' 20 | im.save(str(out_path)) 21 | return out_path 22 | -------------------------------------------------------------------------------- /examples/custom_image.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | infra: aws/us-east-2 3 | # Nvidia image from 4 | # https://aws.amazon.com/marketplace/pp/prodview-rf7na2b2ttvdg 5 | image_id: ami-062ddd90fb6f8267a 6 | accelerators: V100 7 | 8 | setup: | 9 | echo "running setup" 10 | 11 | run: | 12 | echo "hello sky" 13 | -------------------------------------------------------------------------------- /examples/deepspeed-multinode/README.md: -------------------------------------------------------------------------------- 1 | # DeepSpeed 2 | 3 | This example shows how to launch a multinode DeepSpeed training job with SkyPilot. 
4 | 5 | -------------------------------------------------------------------------------- /examples/disk_size.yaml: -------------------------------------------------------------------------------- 1 | # A minimal example that requests a 512GB OS disk. 2 | # 3 | # Runs a task that simply lists the default conda environments. 4 | # 5 | # Usage: 6 | # sky launch -c min disk_size.yaml 7 | # sky down min 8 | 9 | name: minimal 10 | 11 | resources: 12 | infra: azure 13 | disk_size: 512 14 | 15 | setup: | 16 | echo "running setup" 17 | lsblk 18 | 19 | run: | 20 | conda env list 21 | -------------------------------------------------------------------------------- /examples/distributed_ray_train/README.md: -------------------------------------------------------------------------------- 1 | # Ray 2 | 3 | This example shows how to launch distributed Ray jobs with SkyPilot. 4 | -------------------------------------------------------------------------------- /examples/docker/compose/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | gpu-app1: 5 | image: nvidia/cuda:11.5.2-runtime-ubuntu20.04 6 | command: nvidia-smi # To keep running in a loop, add -l 1 7 | deploy: 8 | resources: 9 | reservations: 10 | devices: 11 | - driver: nvidia 12 | device_ids: ['0'] 13 | capabilities: [gpu] 14 | 15 | gpu-app2: 16 | image: nvidia/cuda:11.5.2-runtime-ubuntu20.04 17 | command: nvidia-smi 18 | deploy: 19 | resources: 20 | reservations: 21 | devices: 22 | - driver: nvidia 23 | device_ids: ['1'] # Allocates GPU ID 1 to this container.
Inside the container, this will be visible as device id 0 24 | capabilities: [gpu] 25 | -------------------------------------------------------------------------------- /examples/docker/echo_app/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python 2 | 3 | ADD echo.py /app/echo.py 4 | 5 | WORKDIR /app 6 | 7 | ENTRYPOINT ["python", "echo.py"] 8 | -------------------------------------------------------------------------------- /examples/docker/echo_app/README.md: -------------------------------------------------------------------------------- 1 | # Echo App 2 | 3 | A simple app that ingests a file and writes it out back to a specified path. 4 | -------------------------------------------------------------------------------- /examples/docker/echo_app/echo.py: -------------------------------------------------------------------------------- 1 | """Echo app 2 | 3 | Reads a file, echoes it and writes back to a specified path. 4 | """ 5 | import argparse 6 | 7 | 8 | def main(): 9 | """Main function""" 10 | parser = argparse.ArgumentParser(description='Echo app') 11 | parser.add_argument('input', type=str) 12 | parser.add_argument('output', type=str) 13 | args = parser.parse_args() 14 | 15 | with open(args.input, 'r') as input_file: 16 | content = input_file.read() 17 | print("===== echo app =====") 18 | print("Input file content:") 19 | print(content) 20 | with open(args.output, 'w') as output_file: 21 | output_file.write(content) 22 | print("Output written to {}".format(args.output)) 23 | 24 | 25 | if __name__ == '__main__': 26 | main() 27 | -------------------------------------------------------------------------------- /examples/dvc/README.md: -------------------------------------------------------------------------------- 1 | # DVC 2 | 3 | This example shows how to use DVC with SkyPilot. 
4 | -------------------------------------------------------------------------------- /examples/dvc/dvc_pipeline.yaml: -------------------------------------------------------------------------------- 1 | # adapted from https://alex000kim.com/posts/2023-08-10-ml-experiments-in-cloud-skypilot-dvc/ 2 | name: dvc-pipeline 3 | resources: 4 | accelerators: T4:1 5 | infra: aws/us-east-2 6 | 7 | workdir: . 8 | file_mounts: 9 | ~/.ssh/id_rsa: ~/.ssh/id_rsa 10 | ~/.ssh/id_rsa.pub: ~/.ssh/id_rsa.pub 11 | ~/.gitconfig: ~/.gitconfig 12 | setup: | 13 | pip install -r requirements.txt 14 | pip install dvc[s3] 15 | run: | 16 | # pull data versioned by DVC from DVC remote 17 | dvc pull 18 | # run DVC pipeline as an experiment 19 | dvc exp run --pull --allow-missing 20 | # push experiment results to DVC remote 21 | dvc exp push origin 22 | -------------------------------------------------------------------------------- /examples/gcp_start_stop.yaml: -------------------------------------------------------------------------------- 1 | # start and stop GCP instances 2 | name: gcp-start-stop 3 | 4 | resources: 5 | infra: gcp 6 | 7 | num_nodes: 2 8 | 9 | # The setup command. Will be run under the working directory. 10 | setup: 'echo "gcp_start_stop [setup]"' 11 | 12 | # The command to run. Will be run under the working directory. 
13 | run: 'echo "gcp_start_stop [run]"' 14 | -------------------------------------------------------------------------------- /examples/http_server_with_custom_ports/task.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | ports: 3 | - 33828 4 | 5 | workdir: ./examples/http_server_with_custom_ports 6 | 7 | run: python3 server.py 8 | -------------------------------------------------------------------------------- /examples/image_with_tag.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | infra: aws 3 | image_id: skypilot:gpu-ubuntu-1804 4 | 5 | 6 | setup: | 7 | echo "running setup" 8 | 9 | run: | 10 | conda env list 11 | -------------------------------------------------------------------------------- /examples/job_queue/cluster.yaml: -------------------------------------------------------------------------------- 1 | # A dummy task for cluster creation. 2 | # 3 | # Runs a dummy task that provisions a cluster. 4 | # 5 | # Usage: 6 | # sky launch -c jq cluster.yaml 7 | # sky exec jq job.yaml 8 | 9 | resources: 10 | accelerators: T4 11 | -------------------------------------------------------------------------------- /examples/job_queue/cluster_docker.yaml: -------------------------------------------------------------------------------- 1 | # A dummy task for cluster creation. 2 | # 3 | # Runs a dummy task that provisions a cluster. 4 | # 5 | # Usage: 6 | # sky launch -c djq cluster_docker.yaml 7 | # sky exec djq job_docker.yaml 8 | 9 | resources: 10 | accelerators: T4 11 | image_id: docker:ubuntu:20.04 12 | -------------------------------------------------------------------------------- /examples/job_queue/cluster_multinode.yaml: -------------------------------------------------------------------------------- 1 | # A dummy task for multinode cluster creation. 2 | # 3 | # Runs a dummy task that provisions a cluster.
4 | # 5 | # Usage: 6 | # sky launch -c mjq cluster_multinode.yaml 7 | # sky exec mjq job_multinode.yaml 8 | # sky exec mjq job.yaml 9 | 10 | resources: 11 | accelerators: T4 12 | 13 | num_nodes: 2 14 | -------------------------------------------------------------------------------- /examples/job_queue/job.yaml: -------------------------------------------------------------------------------- 1 | # A task submitted to an existing cluster. 2 | # 3 | # Runs a task on an existing cluster. 4 | # 5 | # Usage: 6 | # sky launch -c jq cluster.yaml 7 | # sky exec jq job.yaml 8 | 9 | name: job 10 | 11 | resources: 12 | accelerators: T4:0.5 13 | 14 | setup: | 15 | echo "running setup" 16 | 17 | run: | 18 | timestamp=$(date +%s) 19 | conda env list 20 | for i in {1..180}; do 21 | echo "$timestamp $i" 22 | sleep 1 23 | done 24 | -------------------------------------------------------------------------------- /examples/job_queue/job_docker.yaml: -------------------------------------------------------------------------------- 1 | # A task submitted to an existing cluster. 2 | # 3 | # Runs a task on an existing cluster with docker. 4 | # 5 | # Usage: 6 | # sky launch -c djq cluster_docker.yaml 7 | # sky exec djq job_docker.yaml 8 | 9 | name: job_docker 10 | 11 | envs: 12 | TIME_TO_SLEEP: 180 13 | 14 | resources: 15 | accelerators: T4:0.5 16 | image_id: docker:ubuntu:20.04 17 | 18 | setup: | 19 | echo "running setup" 20 | 21 | run: | 22 | timestamp=$(date +%s) 23 | conda env list 24 | for i in $(seq 1 $TIME_TO_SLEEP); do 25 | echo "$timestamp $i" 26 | sleep 1 27 | done 28 | -------------------------------------------------------------------------------- /examples/job_queue/job_gpu.yaml: -------------------------------------------------------------------------------- 1 | # A task submitted to an existing cluster. 2 | # 3 | # Runs a task on an existing cluster.
4 | # 5 | # Usage: 6 | # sky launch -c jq cluster.yaml 7 | # sky exec -c jq job.yaml 8 | 9 | name: job 10 | 11 | resources: 12 | accelerators: K80:0.5 13 | 14 | # setup: | 15 | # conda create -n test python=3.7 -y 16 | # conda activate test 17 | # conda install pytorch torchvision torchaudio cudatoolkit=10.2 -c pytorch 18 | 19 | 20 | 21 | run: | 22 | timestamp=$(date +%s) 23 | conda activate test 24 | echo "started" 25 | python -u -c "import torch; a = torch.randn(10000, 10000).cuda(); b = torch.randn(10000, 10000).cuda(); [print((a @ b).sum()) for _ in range(10000000000)]" 26 | echo "ended" 27 | -------------------------------------------------------------------------------- /examples/job_queue/job_ibm.yaml: -------------------------------------------------------------------------------- 1 | # A task submitted to an existing cluster. 2 | # 3 | # Runs a task on an existing cluster. 4 | # 5 | # Usage: 6 | # sky launch -c jq cluster.yaml 7 | # sky exec jq job_ibm.yaml 8 | 9 | name: job 10 | 11 | resources: 12 | accelerators: v100:0.5 13 | 14 | setup: | 15 | echo "running setup" 16 | 17 | run: | 18 | timestamp=$(date +%s) 19 | conda env list 20 | for i in {1..120}; do 21 | echo "$timestamp $i" 22 | sleep 1 23 | done 24 | -------------------------------------------------------------------------------- /examples/job_queue/job_multinode.yaml: -------------------------------------------------------------------------------- 1 | # A task that runs on an existing multinode cluster. 2 | # 3 | # Runs a task that requires multinode.
4 | # 5 | # Usage: 6 | # sky launch -c mjq cluster_multinode.yaml 7 | # sky exec mjq job_multinode.yaml 8 | # sky exec mjq job.yaml 9 | 10 | name: job_multinode 11 | 12 | resources: 13 | accelerators: T4:0.5 14 | 15 | num_nodes: 2 16 | 17 | setup: | 18 | echo "running setup" 19 | sleep 80 20 | 21 | run: | 22 | timestamp=$(date +%s) 23 | conda env list 24 | for i in {1..360}; do 25 | echo "$timestamp $i" 26 | sleep 1 27 | done 28 | -------------------------------------------------------------------------------- /examples/job_queue/job_multinode_ibm.yaml: -------------------------------------------------------------------------------- 1 | # A task that runs on an existing multinode cluster. 2 | # 3 | # Runs a task that requires multinode. 4 | # 5 | # Usage: 6 | # sky launch -c mjq cluster_multinode.yaml 7 | # sky exec mjq job_multinode_ibm.yaml 8 | # sky exec mjq job_ibm.yaml 9 | 10 | name: job_multinode 11 | 12 | resources: 13 | accelerators: v100:0.5 14 | 15 | num_nodes: 2 16 | 17 | setup: | 18 | echo "running setup" 19 | sleep 80 20 | 21 | run: | 22 | timestamp=$(date +%s) 23 | conda env list 24 | for i in {1..240}; do 25 | echo "$timestamp $i" 26 | sleep 1 27 | done 28 | -------------------------------------------------------------------------------- /examples/jupyter_lab.yaml: -------------------------------------------------------------------------------- 1 | # Example: Launch Jupyter Lab and auto-expose its port to the Internet. 2 | # 3 | # Usage: 4 | # $ sky launch -c jupyter jupyter_lab.yaml 5 | # # Then look in the logs for some output like: 6 | # # Jupyter Server 2.7.0 is running at: 7 | # # http://127.0.0.1:29324/lab?token= 8 | # # Run 9 | # $ sky status -a jupyter 10 | # # to get the HEAD_IP of the cluster, replace the 127.0.0.1 with 11 | # # the HEAD_IP and open the URL in a browser. 12 | # 13 | # # This is an alternative to port forwarding.
14 | 15 | resources: 16 | ports: 17 | - 29324 18 | 19 | setup: pip install jupyter 20 | 21 | run: jupyter lab --port 29324 --no-browser --ip=0.0.0.0 22 | -------------------------------------------------------------------------------- /examples/managed_job.yaml: -------------------------------------------------------------------------------- 1 | name: minimal 2 | 3 | setup: | 4 | echo "running setup" 5 | pip install tqdm 6 | 7 | run: | 8 | conda env list 9 | echo "start counting" 10 | python -u - << EOF 11 | import time 12 | import tqdm 13 | 14 | for i in tqdm.trange(240): 15 | time.sleep(1) 16 | 17 | EOF 18 | -------------------------------------------------------------------------------- /examples/managed_spot.yaml: -------------------------------------------------------------------------------- 1 | name: minimal 2 | 3 | resources: 4 | use_spot: true 5 | 6 | setup: | 7 | echo "running setup" 8 | pip install tqdm 9 | 10 | run: | 11 | conda env list 12 | python -u - << EOF 13 | import time 14 | import tqdm 15 | 16 | for i in tqdm.trange(240): 17 | time.sleep(1) 18 | 19 | EOF 20 | -------------------------------------------------------------------------------- /examples/many_gpu_vms.yaml: -------------------------------------------------------------------------------- 1 | name: many_gpu_vms 2 | 3 | resources: 4 | infra: aws 5 | accelerators: V100:8 6 | # use_spot: true 7 | 8 | num_nodes: 16 9 | 10 | setup: 'pip3 install wandb' 11 | 12 | run: "python3 -c 'import wandb; print(wandb.__path__)'; nvidia-smi" 13 | -------------------------------------------------------------------------------- /examples/minimal.yaml: -------------------------------------------------------------------------------- 1 | # A minimal example. 2 | # 3 | # Runs a task that simply lists the default conda environments. 
4 | # 5 | # Usage: 6 | # sky launch -c min minimal.yaml 7 | # sky down min 8 | 9 | name: minimal 10 | 11 | resources: 12 | infra: aws 13 | 14 | setup: | 15 | echo "running setup" 16 | 17 | run: | 18 | conda env list 19 | -------------------------------------------------------------------------------- /examples/mpirun.yaml: -------------------------------------------------------------------------------- 1 | workdir: . 2 | 3 | resources: 4 | infra: aws 5 | 6 | num_nodes: 2 # Total number of nodes (1 head + 1 worker) 7 | 8 | setup: | 9 | echo "Running setup on node ${SKYPILOT_NODE_RANK}." 10 | # Install MPI if not already present. This will vary based on your OS/distro. 11 | sudo apt update 12 | sudo apt install -y openmpi-bin openmpi-common libopenmpi-dev 13 | 14 | run: | 15 | if [ "${SKYPILOT_NODE_RANK}" == "0" ]; then 16 | echo "head node" 17 | num_nodes=`echo "$SKYPILOT_NODE_IPS" | wc -l` 18 | mpi_nodes=$(echo "$SKYPILOT_NODE_IPS" | tr '\n' ',') 19 | mpi_nodes=${mpi_nodes::-1} 20 | echo "$mpi_nodes" 21 | mpirun -np $num_nodes -H $mpi_nodes bash -c 'echo "mpirun hello from IP $(hostname -I)"' 22 | else 23 | echo "worker nodes" 24 | fi 25 | -------------------------------------------------------------------------------- /examples/multi_accelerators.yaml: -------------------------------------------------------------------------------- 1 | name: multi-accelerators 2 | 3 | resources: 4 | 5 | # Ordered list of accelerators: Try the accelerators in the specified order. 6 | # accelerators: ['A100-40GB:1', 'V100:1', 'K80:1', 'T4:1'] 7 | 8 | # Unordered set of accelerators: Optimize all specified accelerators together, and try accelerator with lowest cost first. 
9 | accelerators: {'A100-40GB:1', 'K80:1', 'V100:1', 'T4:1', 'T4:4'} 10 | 11 | run: | 12 | nvidia-smi 13 | -------------------------------------------------------------------------------- /examples/multi_hostname.py: -------------------------------------------------------------------------------- 1 | import sky 2 | 3 | with sky.Dag() as dag: 4 | # The run command will be run on *all* nodes. 5 | # Should see two lines: 6 | # My hostname: 7 | # My hostname: 8 | sky.Task(run='echo My hostname: $(hostname)', 9 | num_nodes=2).set_resources(sky.Resources(infra='aws')) 10 | 11 | sky.launch(dag) 12 | -------------------------------------------------------------------------------- /examples/multi_hostname.yaml: -------------------------------------------------------------------------------- 1 | name: multi_hostname 2 | 3 | num_nodes: 2 4 | 5 | # The run command will be run on *all* nodes. 6 | # Should see two lines: 7 | # My hostname: 8 | # My hostname: 9 | run: 'echo My hostname: $(hostname)' 10 | -------------------------------------------------------------------------------- /examples/multi_resources.yaml: -------------------------------------------------------------------------------- 1 | name: multi-resources 2 | 3 | resources: 4 | ordered: 5 | - infra: aws 6 | accelerators: A10g 7 | - infra: gcp 8 | accelerators: L4 9 | 10 | # resources: 11 | # any_of: 12 | # - infra: aws 13 | # accelerators: A10g 14 | # - infra: gcp 15 | # accelerators: L4 16 | 17 | run: | 18 | nvidia-smi 19 | -------------------------------------------------------------------------------- /examples/nebius_infiniband/infiniband.yaml: -------------------------------------------------------------------------------- 1 | # This example is used to test the InfiniBand 2 | # connection between two VMs. 
3 | resources: 4 | cloud: nebius 5 | region: eu-north1 6 | accelerators: H100:8 7 | 8 | num_nodes: 2 9 | 10 | setup: | 11 | sudo apt install perftest -y 12 | 13 | run: | 14 | MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1) 15 | if [ "${SKYPILOT_NODE_RANK}" == "0" ]; then 16 | ib_send_bw --report_gbits -n 1000 -F > /dev/null 17 | elif [ "${SKYPILOT_NODE_RANK}" == "1" ]; then 18 | echo "MASTER_ADDR: $MASTER_ADDR" 19 | sleep 2 # wait for the master to start 20 | ib_send_bw $MASTER_ADDR --report_gbits -n 1000 -F 21 | fi 22 | -------------------------------------------------------------------------------- /examples/nemo/README.md: -------------------------------------------------------------------------------- 1 | # Nvidia NeMo 2 | 3 | This example shows how to launch Nvidia NeMo jobs with SkyPilot. 4 | -------------------------------------------------------------------------------- /examples/oci/config.yaml: -------------------------------------------------------------------------------- 1 | oci: 2 | default: 3 | # oci_config_profile: DEFAULT 4 | compartment_ocid: ocid1.compartment.oc1..aaaaaaaahr7aicqtodxmcfor6pbqn3hvsngpftozyxzqw36gj4kh3w3kkj4q 5 | # image_tag_general: skypilot:cpu-ubuntu-2004 6 | # image_tag_gpu: skypilot:gpu-ubuntu-2004 7 | 8 | ap-seoul-1: 9 | vcn_subnet: ocid1.subnet.oc1.ap-seoul-1.aaaaaaaa5c6wndifsij6yfyfehmi3tazn6mvhhiewqmajzcrlryurnl7nuja 10 | 11 | us-ashburn-1: 12 | vcn_subnet: ocid1.subnet.oc1.iad.aaaaaaaafbj7i3aqc4ofjaapa5edakde6g4ea2yaslcsay32cthp7qo55pxa 13 | -------------------------------------------------------------------------------- /examples/oci/oci-mounts.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | infra: oci 3 | 4 | file_mounts: 5 | ~/tmpfile: ~/tmpfile 6 | ~/a/b/c/tmpfile: ~/tmpfile 7 | /tmp/workdir: ~/tmp-workdir 8 | 9 | /mydir: 10 | name: skybucket 11 | source: ['~/tmp-workdir'] 12 | store: oci 13 | mode: MOUNT 14 | 15 | setup: | 16 | echo "*** Setup ***" 17 | 18 
| run: | 19 | echo "*** Run ***" 20 | 21 | ls -lthr ~/tmpfile 22 | ls -lthr ~/a/b/c 23 | echo hi >> /tmp/workdir/new_file 24 | ls -lthr /tmp/workdir 25 | 26 | ls -lthr /mydir 27 | -------------------------------------------------------------------------------- /examples/oci/serve-http-cpu.yaml: -------------------------------------------------------------------------------- 1 | service: 2 | readiness_probe: / 3 | replicas: 2 4 | 5 | resources: 6 | infra: oci/us-sanjose-1 7 | ports: 8080 8 | cpus: 2+ 9 | 10 | run: python -m http.server 8080 11 | -------------------------------------------------------------------------------- /examples/oci/serve-qwen-7b.yaml: -------------------------------------------------------------------------------- 1 | # service.yaml 2 | service: 3 | readiness_probe: /v1/models 4 | replicas: 2 5 | 6 | # Fields below describe each replica. 7 | resources: 8 | infra: oci/us-sanjose-1 9 | ports: 8080 10 | accelerators: {A10:1} 11 | 12 | setup: | 13 | conda create -n vllm python=3.12 -y 14 | conda activate vllm 15 | pip install vllm==0.6.3.post1 16 | pip install vllm-flash-attn==2.6.2 17 | 18 | run: | 19 | conda activate vllm 20 | python -u -m vllm.entrypoints.openai.api_server \ 21 | --host 0.0.0.0 --port 8080 \ 22 | --model Qwen/Qwen2-7B-Instruct \ 23 | --served-model-name Qwen2-7B-Instruct \ 24 | --device=cuda --dtype auto --max-model-len=2048 25 | -------------------------------------------------------------------------------- /examples/per_region_images.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | infra: aws 3 | instance_type: g4dn.xlarge 4 | image_id: 5 | us-west-2: skypilot:gpu-ubuntu-1804 6 | us-east-2: skypilot:gpu-ubuntu-2004 7 | 8 | 9 | setup: | 10 | echo "running setup" 11 | 12 | run: | 13 | conda env list 14 | nvidia-smi 15 | -------------------------------------------------------------------------------- /examples/playground/min_fail.yaml: 
-------------------------------------------------------------------------------- 1 | resources: 2 | infra: aws 3 | 4 | setup: | 5 | echo "running setup" 6 | 7 | run: | 8 | conda env list 9 | exit 1 10 | -------------------------------------------------------------------------------- /examples/playground/min_progress_bar.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | infra: aws 3 | 4 | setup: | 5 | echo "running setup" 6 | 7 | run: | 8 | conda env list 9 | python3 -u -c "from tqdm import tqdm; import time; import sys; [time.sleep(0.5) for i in tqdm(range(120))]; print('done'); print('new')" 10 | -------------------------------------------------------------------------------- /examples/ray_tune_app.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | infra: aws 3 | accelerators: V100 4 | 5 | num_nodes: 2 6 | 7 | workdir: examples/ray_tune_examples 8 | 9 | setup: | 10 | pip3 install --upgrade pip 11 | pip3 install ray[tune] pytorch-lightning==1.4.9 lightning-bolts torchvision 12 | 13 | run: | 14 | if [ "${SKYPILOT_NODE_RANK}" == "0" ]; then 15 | python3 tune_ptl_example.py 16 | fi 17 | -------------------------------------------------------------------------------- /examples/resnet_distributed_torch_scripts/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | conda activate resnet 4 | conda env list 5 | 6 | cd pytorch-distributed-resnet 7 | num_nodes=`echo "$SKYPILOT_NODE_IPS" | wc -l` 8 | master_addr=`echo "$SKYPILOT_NODE_IPS" | head -n1` 9 | echo MASTER_ADDR $master_addr 10 | python -m torch.distributed.launch --nproc_per_node=1 \ 11 | --nnodes=$num_nodes --node_rank=${SKYPILOT_NODE_RANK} --master_addr=$master_addr \ 12 | --master_port=8008 resnet_ddp.py --num_epochs 20 13 | -------------------------------------------------------------------------------- 
/examples/resnet_distributed_torch_scripts/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | [ -d pytorch-distributed-resnet ] || git clone https://github.com/michaelzhiluo/pytorch-distributed-resnet 3 | cd pytorch-distributed-resnet 4 | 5 | conda activate resnet 6 | if [ $? -eq 0 ]; then 7 | echo "conda env exists" 8 | else 9 | echo "conda env does not exist" 10 | conda create -n resnet python=3.7 -y 11 | conda activate resnet 12 | fi 13 | # SkyPilot's default image on AWS/GCP has CUDA 11.6 (Azure 11.5). 14 | pip install -r requirements.txt torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 15 | 16 | mkdir -p data 17 | mkdir -p saved_models 18 | cd data 19 | wget -c --quiet https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz 20 | tar -xvzf cifar-10-python.tar.gz 21 | -------------------------------------------------------------------------------- /examples/resnet_distributed_torch_with_script.yaml: -------------------------------------------------------------------------------- 1 | name: resnet-distributed-app 2 | 3 | 4 | resources: 5 | infra: aws 6 | accelerators: V100 7 | 8 | num_nodes: 2 9 | 10 | workdir: ./examples/resnet_distributed_torch_scripts 11 | 12 | setup: | 13 | bash ./setup.sh 14 | 15 | run: | 16 | bash ./run.sh 17 | -------------------------------------------------------------------------------- /examples/sample_dotenv: -------------------------------------------------------------------------------- 1 | TEST_ENV2="success" 2 | -------------------------------------------------------------------------------- /examples/serve/http_server/task.yaml: -------------------------------------------------------------------------------- 1 | # SkyServe YAML to run a simple http server. 2 | # 3 | # Usage: 4 | # sky serve up -n http examples/serve/http_server/task.yaml 5 | # The endpoint will be printed in the console. 
You 6 | # could also check the endpoint by running: 7 | # sky serve status --endpoint http 8 | 9 | service: 10 | readiness_probe: 11 | path: /health 12 | initial_delay_seconds: 20 13 | replicas: 2 14 | 15 | resources: 16 | ports: 8080 17 | cpus: 2+ 18 | 19 | workdir: examples/serve/http_server 20 | 21 | run: python3 server.py 22 | -------------------------------------------------------------------------------- /examples/serve/huggingface-tgi.yaml: -------------------------------------------------------------------------------- 1 | # SkyServe YAML to run HuggingFace TGI 2 | # 3 | # Usage: 4 | # sky serve up -n tgi huggingface-tgi.yaml \ 5 | # [--env MODEL_ID=] 6 | # Then visit the endpoint printed in the console. You could also 7 | # check the endpoint by running: 8 | # sky serve status --endpoint tgi 9 | 10 | envs: 11 | MODEL_ID: lmsys/vicuna-13b-v1.5 12 | 13 | service: 14 | readiness_probe: /health 15 | replicas: 2 16 | 17 | resources: 18 | ports: 8080 19 | accelerators: A100:1 20 | 21 | run: | 22 | docker run --gpus all --shm-size 1g -p 8080:80 \ 23 | -v ~/data:/data ghcr.io/huggingface/text-generation-inference \ 24 | --model-id $MODEL_ID 25 | -------------------------------------------------------------------------------- /examples/serve/minimal.yaml: -------------------------------------------------------------------------------- 1 | # A minimal example of a serve application. 2 | 3 | service: 4 | readiness_probe: / 5 | replicas: 1 6 | 7 | resources: 8 | ports: 8080 9 | cpus: 2+ 10 | 11 | run: python3 -m http.server 8080 12 | -------------------------------------------------------------------------------- /examples/serve/misc/cancel/service.yaml: -------------------------------------------------------------------------------- 1 | # Usage: Please refer to the README.md in this directory.
2 | 3 | service: 4 | readiness_probe: 5 | path: /health 6 | initial_delay_seconds: 120 7 | 8 | resources: 9 | ports: 9000 10 | cpus: 2+ 11 | 12 | workdir: examples/serve/misc/cancel 13 | 14 | run: python3 server.py --port 9000 15 | -------------------------------------------------------------------------------- /examples/serve/ray_serve/ray_serve.yaml: -------------------------------------------------------------------------------- 1 | # SkyServe YAML to run a simple rayserve endpoint. 2 | # 3 | # Usage: 4 | # sky serve up examples/serve/ray_serve/ray_serve.yaml 5 | 6 | service: 7 | readiness_probe: / 8 | replicas: 1 9 | 10 | resources: 11 | ports: 8000 12 | cpus: 2+ 13 | 14 | workdir: examples/serve/ray_serve 15 | 16 | setup: pip install "ray[serve]" 17 | 18 | run: serve run serve:app --host 0.0.0.0 19 | -------------------------------------------------------------------------------- /examples/serve/ray_serve/serve.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | from ray import serve 4 | from starlette import requests 5 | 6 | 7 | # 2 Ray actors, each running on 1 vCPU. 8 | @serve.deployment(route_prefix='/', num_replicas=2) 9 | class ModelDeployment: 10 | 11 | def __init__(self, msg: str): 12 | self._msg = msg 13 | 14 | def __call__(self, request: requests.Request) -> Dict: 15 | del request # unused 16 | return {'result': self._msg} 17 | 18 | 19 | app = ModelDeployment.bind(msg='Hello Ray Serve!') 20 | -------------------------------------------------------------------------------- /examples/serve/spot_policy/base_on_demand_fallback_replicas.yaml: -------------------------------------------------------------------------------- 1 | # SkyServe YAML to launch a service with mixed spot and on-demand instances. 2 | # The policy will maintain `base_ondemand_fallback_replicas` number of on-demand instances, in addition to spot instances. 
3 | # On-demand instances are counted in autoscaling decisions (i.e., between `min_replicas` and `max_replicas`). 4 | 5 | service: 6 | readiness_probe: /health 7 | replica_policy: 8 | min_replicas: 2 9 | max_replicas: 3 10 | target_qps_per_replica: 1 11 | base_ondemand_fallback_replicas: 1 12 | 13 | resources: 14 | ports: 8081 15 | cpus: 2+ 16 | # use_spot is needed for ondemand fallback 17 | use_spot: true 18 | 19 | workdir: examples/serve/http_server 20 | 21 | run: python3 server.py 22 | -------------------------------------------------------------------------------- /examples/serve/spot_policy/dynamic_on_demand_fallback.yaml: -------------------------------------------------------------------------------- 1 | # SkyServe YAML to launch a service with mixed spot and on-demand instances. 2 | # The policy will dynamically fallback to on-demand instances when spot instances are not available. 3 | 4 | service: 5 | readiness_probe: /health 6 | replica_policy: 7 | min_replicas: 2 8 | max_replicas: 3 9 | target_qps_per_replica: 1 10 | dynamic_ondemand_fallback: true 11 | 12 | resources: 13 | any_of: 14 | - infra: gcp/*/us-central1-a 15 | - infra: gcp/us-east1 16 | ports: 8081 17 | cpus: 2+ 18 | # use_spot is needed for ondemand fallback 19 | use_spot: true 20 | 21 | workdir: examples/serve/http_server 22 | 23 | run: python3 server.py 24 | -------------------------------------------------------------------------------- /examples/serve/spot_policy/multi_accelerators.yaml: -------------------------------------------------------------------------------- 1 | # SkyServe YAML to launch a service with mixed spot and on-demand instances and an ordered preference for accelerators. 2 | # The policy will maintain `base_ondemand_fallback_replicas` number of on-demand instances, in addition to spot instances. 
3 | 4 | service: 5 | readiness_probe: /health 6 | replica_policy: 7 | min_replicas: 2 8 | max_replicas: 3 9 | target_qps_per_replica: 1 10 | base_ondemand_fallback_replicas: 1 11 | 12 | resources: 13 | ordered: 14 | - accelerators: V100 15 | - accelerators: T4 16 | ports: 8081 17 | cpus: 2+ 18 | # use_spot is needed for ondemand fallback 19 | use_spot: true 20 | 21 | workdir: examples/serve/http_server 22 | 23 | run: python3 server.py 24 | -------------------------------------------------------------------------------- /examples/show_gpus.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | sky show-gpus --help 5 | sky show-gpus 6 | sky show-gpus V100 7 | sky show-gpus A100 8 | sky show-gpus --all 9 | -------------------------------------------------------------------------------- /examples/spot/lightning_cifar10/requirements.txt: -------------------------------------------------------------------------------- 1 | pytorch-lightning>=1.3 2 | torchvision 3 | wandb 4 | torchmetrics==0.4.1 5 | torch>=1.6, <1.9 6 | lightning-bolts 7 | -------------------------------------------------------------------------------- /examples/spot/resnet_ddp/README.md: -------------------------------------------------------------------------------- 1 | # pytorch-distributed-resnet 2 | Example of Pytorch Resnet Distributed Training - pulled from https://leimao.github.io/blog/PyTorch-Distributed-Training/ 3 | -------------------------------------------------------------------------------- /examples/spot/resnet_ddp/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision 3 | wandb 4 | absl-py 5 | -------------------------------------------------------------------------------- /examples/spot_pipeline/single.yaml: -------------------------------------------------------------------------------- 1 | name: dag-name 2 | 3 | --- 4 | name: simple-task 5 | 6 
| resources: 7 | cpus: 2+ 8 | memory: 8+ 9 | 10 | setup: | 11 | echo setup for task 1 12 | 13 | run: | 14 | echo run for task 1 15 | 16 | -------------------------------------------------------------------------------- /examples/stable_diffusion/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.9' 2 | 3 | services: 4 | model: 5 | image: berkeleyskypilot/stable-diffusion 6 | restart: on-failure 7 | ports: 8 | - "7860:7860" 9 | volumes: 10 | - ./cache:/cache 11 | - ./output:/output 12 | - ./models:/models 13 | environment: 14 | - CLI_ARGS=--extra-models-cpu --optimized-turbo 15 | deploy: 16 | resources: 17 | reservations: 18 | devices: 19 | - driver: nvidia 20 | device_ids: ['0'] 21 | capabilities: [gpu] 22 | -------------------------------------------------------------------------------- /examples/tensorflow_distributed/README.md: -------------------------------------------------------------------------------- 1 | # Distributed TensorFlow 2 | 3 | This example shows how to launch a distributed TensorFlow training job with SkyPilot. 4 | -------------------------------------------------------------------------------- /examples/tpu/README.md: -------------------------------------------------------------------------------- 1 | # TPU 2 | 3 | This example shows how to launch TPU jobs with SkyPilot. 4 | 5 | > Note: Some examples may be old. See the `v6e/` files for the latest examples. See also: https://docs.skypilot.co/en/latest/reference/tpu.html. 6 | -------------------------------------------------------------------------------- /examples/tpu/tpu_app.yaml: -------------------------------------------------------------------------------- 1 | name: tpu_app 2 | 3 | # The working directory contains all code and will be synced to remote. 4 | workdir: ./examples/tpu/tpu_app_code 5 | 6 | resources: 7 | accelerators: tpu-v2-8 8 | 9 | # The setup command. Will be run under the working directory. 
10 | setup: | 11 | pip install --upgrade pip 12 | 13 | conda activate huggingface 14 | if [ $? -eq 0 ]; then 15 | echo 'conda env exists' 16 | else 17 | conda create -n huggingface python=3.8 -y 18 | conda activate huggingface 19 | pip install -r requirements.txt 20 | fi 21 | 22 | # The command to run. Will be run under the working directory. 23 | run: | 24 | conda activate huggingface 25 | python -u run_tpu.py 26 | -------------------------------------------------------------------------------- /examples/tpu/tpu_app_code/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow==2.5.1 2 | tensorflow-datasets==4.4.0 3 | transformers==4.12.0 4 | tensorflow-text==2.5.0 5 | cloud-tpu-client==0.10 6 | -------------------------------------------------------------------------------- /examples/tpu/v6e/benchmark-llama2-7b.yaml: -------------------------------------------------------------------------------- 1 | envs: 2 | model_name: llama-2 3 | tokenizer_path: /home/gcpuser/sky_workdir/ckpt/llama2-7b/original/tokenizer.model 4 | 5 | run: | 6 | cd JetStream 7 | python benchmarks/benchmark_serving.py \ 8 | --tokenizer=$tokenizer_path --num-prompts=100 \ 9 | --dataset openorca --save-request-outputs \ 10 | --warmup-mode=sampled --model=$model_name 11 | -------------------------------------------------------------------------------- /examples/tpu/v6e/fsdp_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "fsdp_transformer_layer_cls_to_wrap": [ 3 | "LlamaDecoderLayer" 4 | ], 5 | "xla": true, 6 | "xla_fsdp_v2": true, 7 | "xla_fsdp_grad_ckpt": true 8 | } 9 | -------------------------------------------------------------------------------- /examples/unsloth/README.md: -------------------------------------------------------------------------------- 1 | # Unsloth 2 | 3 | This example shows how to launch Unsloth jobs with SkyPilot. 
4 | -------------------------------------------------------------------------------- /llm/axolotl/axolotl.yaml: -------------------------------------------------------------------------------- 1 | # Usage: 2 | # HF_TOKEN=abc sky launch -c axolotl axolotl.yaml --env HF_TOKEN -y -i30 --down 3 | 4 | name: axolotl 5 | 6 | resources: 7 | accelerators: L4:1 8 | image_id: docker:winglian/axolotl:main-py3.10-cu118-2.0.1 9 | 10 | workdir: mistral 11 | 12 | run: | 13 | huggingface-cli login --token ${HF_TOKEN} 14 | 15 | accelerate launch -m axolotl.cli.train qlora.yaml 16 | 17 | envs: 18 | HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. 19 | -------------------------------------------------------------------------------- /llm/batch_inference/monitor_progress.yaml: -------------------------------------------------------------------------------- 1 | name: batch-inference-monitor-progress 2 | 3 | workdir: . 4 | 5 | resources: 6 | cpus: 2 7 | memory: 8+ 8 | infra: aws 9 | ports: 10 | - 8000 11 | 12 | envs: 13 | # make sure this is the same as the source in compute_vectors.yaml 14 | EMBEDDINGS_BUCKET_NAME: sky-text-embeddings 15 | 16 | file_mounts: 17 | /output: 18 | name: ${EMBEDDINGS_BUCKET_NAME} 19 | # this needs to be the same as the source in compute_vectors.yaml 20 | mode: MOUNT 21 | store: s3 22 | 23 | 24 | setup: | 25 | pip install fastapi uvicorn aiofiles 26 | pip install pandas pyarrow plotly 27 | 28 | run: | 29 | python scripts/monitor_progress.py --metrics-dir /output/metrics 30 | -------------------------------------------------------------------------------- /llm/deepseek-janus/janus_1.5b.yaml: -------------------------------------------------------------------------------- 1 | envs: 2 | HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. 
3 | 4 | resources: 5 | accelerators: {L4:1, A10G:1, A10:1, A100:8, A100-80GB:1} 6 | ports: 7 | - 8000 8 | disk_tier: best 9 | memory: 32+ 10 | 11 | setup: | 12 | git clone https://github.com/deepseek-ai/Janus.git 13 | pip install -e Janus[gradio] 14 | pip install diffusers==0.32.2 15 | python -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')" 16 | 17 | run: | 18 | cd Janus && python demo/app.py -------------------------------------------------------------------------------- /llm/deepseek-janus/januspro_7b.yaml: -------------------------------------------------------------------------------- 1 | envs: 2 | HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. 3 | 4 | resources: 5 | accelerators: {L4:1, A10G:1, A10:1, A100:8, A100-80GB:1} 6 | ports: 7 | - 8000 8 | disk_tier: best 9 | memory: 64+ 10 | 11 | setup: | 12 | git clone https://github.com/deepseek-ai/Janus.git 13 | pip install -e Janus[gradio] 14 | pip install diffusers==0.32.2 15 | python -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')" 16 | 17 | run: | 18 | cd Janus && python demo/app_januspro.py -------------------------------------------------------------------------------- /llm/deepseek-r1-distilled/deepseek-r1-vllm.yaml: -------------------------------------------------------------------------------- 1 | envs: 2 | MODEL_NAME: deepseek-ai/DeepSeek-R1-Distill-Llama-8B 3 | MAX_MODEL_LEN: 4096 4 | 5 | resources: 6 | accelerators: {L4:1, A10G:1, A10:1, A100:1, A100-80GB:1} 7 | ports: 8 | - 8000 9 | disk_tier: best 10 | 11 | setup: | 12 | uv pip install transformers==4.48.1 13 | uv pip install vllm==0.6.6.post1 14 | 15 | 16 | run: | 17 | echo 'Starting vllm openai api server...' 
18 | python -m vllm.entrypoints.openai.api_server \ 19 | --host 0.0.0.0 \ 20 | --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ 21 | --model $MODEL_NAME \ 22 | --max-model-len $MAX_MODEL_LEN 23 | -------------------------------------------------------------------------------- /llm/lorax/lorax.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | accelerators: {A10G, A10, L4, A100, A100-80GB} 3 | memory: 32+ 4 | ports: 5 | - 8080 6 | 7 | envs: 8 | MODEL_ID: mistralai/Mistral-7B-Instruct-v0.1 9 | 10 | run: | 11 | docker run --gpus all --shm-size 1g -p 8080:80 -v ~/data:/data \ 12 | ghcr.io/predibase/lorax:latest \ 13 | --model-id $MODEL_ID 14 | -------------------------------------------------------------------------------- /llm/rag/build_rag.yaml: -------------------------------------------------------------------------------- 1 | name: build-legal-rag 2 | 3 | workdir: . 4 | 5 | resources: 6 | memory: 32+ # Need more memory for merging embeddings 7 | infra: aws 8 | 9 | envs: 10 | EMBEDDINGS_BUCKET_NAME: sky-rag-embeddings 11 | VECTORDB_BUCKET_NAME: sky-rag-vectordb 12 | 13 | file_mounts: 14 | /embeddings: 15 | name: ${EMBEDDINGS_BUCKET_NAME} 16 | # this needs to be the same as the output in compute_embeddings.yaml 17 | mode: MOUNT 18 | 19 | /vectordb: 20 | name: ${VECTORDB_BUCKET_NAME} 21 | mode: MOUNT 22 | 23 | setup: | 24 | pip install chromadb pandas tqdm pyarrow 25 | 26 | run: | 27 | python scripts/build_rag.py \ 28 | --collection-name legal_docs \ 29 | --persist-dir /vectordb/chroma \ 30 | --embeddings-dir /embeddings \ 31 | --batch-size 1000 32 | -------------------------------------------------------------------------------- /llm/tabby/docker-compose.cuda.yaml: -------------------------------------------------------------------------------- 1 | version: '3.5' 2 | 3 | services: 4 | tabby: 5 | restart: always 6 | container_name: tabby 7 | image: tabbyml/tabby 8 | command: serve --model TabbyML/StarCoder-1B 
--device cuda 9 | volumes: 10 | - "./tabby:/data" 11 | ports: 12 | - 8080:8080 13 | deploy: 14 | resources: 15 | reservations: 16 | devices: 17 | - driver: nvidia 18 | count: 1 19 | capabilities: [gpu] 20 | -------------------------------------------------------------------------------- /llm/tabby/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.5' 2 | 3 | services: 4 | tabby: 5 | restart: always 6 | container_name: tabby 7 | image: tabbyml/tabby 8 | command: serve --model TabbyML/StarCoder-1B 9 | volumes: 10 | - "./tabby:/data" 11 | ports: 12 | - 8080:8080 13 | -------------------------------------------------------------------------------- /llm/tabby/tabby/config.toml: -------------------------------------------------------------------------------- 1 | [[repositories]] 2 | git_url = "https://github.com/skypilot-org/skypilot" 3 | -------------------------------------------------------------------------------- /llm/tgi/serve.yaml: -------------------------------------------------------------------------------- 1 | # SkyServe YAML to run HuggingFace TGI 2 | # 3 | # Usage: 4 | # sky serve up -n tgi huggingface-tgi.yaml \ 5 | # [--env MODEL_ID=] 6 | # Then visit the endpoint printed in the console. 
You could also 7 | # check the endpoint by running: 8 | # sky serve status --endpoint tgi 9 | 10 | envs: 11 | MODEL_ID: lmsys/vicuna-13b-v1.5 12 | 13 | service: 14 | readiness_probe: /health 15 | replicas: 2 16 | 17 | resources: 18 | ports: 8080 19 | accelerators: A100:1 20 | 21 | run: | 22 | docker run --gpus all --shm-size 1g -p 8080:80 \ 23 | -v ~/data:/data ghcr.io/huggingface/text-generation-inference \ 24 | --model-id $MODEL_ID 25 | -------------------------------------------------------------------------------- /llm/vicuna-llama-2/scripts/train_flash_attn.py: -------------------------------------------------------------------------------- 1 | # Make it more memory efficient by monkey patching the LLaMA model with FlashAttn. 2 | 3 | # Need to call this before importing transformers. 4 | from flash_attn_patch import replace_llama_attn_with_flash_attn 5 | 6 | replace_llama_attn_with_flash_attn() 7 | 8 | from train import train 9 | 10 | if __name__ == "__main__": 11 | train() 12 | -------------------------------------------------------------------------------- /llm/vllm/serve-openai-api-docker.yaml: -------------------------------------------------------------------------------- 1 | envs: 2 | MODEL_NAME: meta-llama/Llama-2-7b-chat-hf 3 | HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. 4 | 5 | resources: 6 | image_id: docker:vllm/vllm-openai:latest 7 | accelerators: {L4:1, A10G:1, A10:1, A100:1, A100-80GB:1} 8 | ports: 9 | - 8000 10 | 11 | setup: | 12 | conda deactivate 13 | python3 -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')" 14 | 15 | run: | 16 | conda deactivate 17 | echo 'Starting vllm openai api server...' 
18 | python -m vllm.entrypoints.openai.api_server \ 19 | --model $MODEL_NAME --tokenizer hf-internal-testing/llama-tokenizer \ 20 | --host 0.0.0.0 21 | -------------------------------------------------------------------------------- /llm/yi/yi15-34b.yaml: -------------------------------------------------------------------------------- 1 | envs: 2 | MODEL_NAME: 01-ai/Yi-1.5-34B-Chat 3 | 4 | resources: 5 | accelerators: {A100:4, A100:8, A100-80GB:2, A100-80GB:4, A100-80GB:8} 6 | disk_size: 1024 7 | disk_tier: best 8 | memory: 32+ 9 | ports: 8000 10 | 11 | setup: | 12 | pip install vllm==0.6.1.post2 13 | pip install vllm-flash-attn 14 | 15 | run: | 16 | export PATH=$PATH:/sbin 17 | vllm serve $MODEL_NAME \ 18 | --host 0.0.0.0 \ 19 | --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ 20 | --max-model-len 1024 | tee ~/openai_api_server.log 21 | -------------------------------------------------------------------------------- /llm/yi/yi15-6b.yaml: -------------------------------------------------------------------------------- 1 | envs: 2 | MODEL_NAME: 01-ai/Yi-1.5-6B-Chat 3 | 4 | resources: 5 | accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} 6 | disk_tier: best 7 | ports: 8000 8 | 9 | setup: | 10 | pip install vllm==0.6.1.post2 11 | pip install vllm-flash-attn 12 | 13 | run: | 14 | export PATH=$PATH:/sbin 15 | vllm serve $MODEL_NAME \ 16 | --host 0.0.0.0 \ 17 | --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ 18 | --max-model-len 1024 | tee ~/openai_api_server.log 19 | -------------------------------------------------------------------------------- /llm/yi/yi15-9b.yaml: -------------------------------------------------------------------------------- 1 | envs: 2 | MODEL_NAME: 01-ai/Yi-1.5-9B-Chat 3 | 4 | resources: 5 | accelerators: {L4:8, A10g:8, A10:8, A100:4, A100:8, A100-80GB:2, A100-80GB:4, A100-80GB:8} 6 | disk_tier: best 7 | ports: 8000 8 | 9 | setup: | 10 | pip install vllm==0.6.1.post2 11 | pip install vllm-flash-attn 12 | 13 | run: | 14 | export 
PATH=$PATH:/sbin 15 | vllm serve $MODEL_NAME \ 16 | --host 0.0.0.0 \ 17 | --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ 18 | --max-model-len 1024 | tee ~/openai_api_server.log 19 | -------------------------------------------------------------------------------- /llm/yi/yicoder-1_5b.yaml: -------------------------------------------------------------------------------- 1 | envs: 2 | MODEL_NAME: 01-ai/Yi-Coder-1.5B-Chat 3 | 4 | resources: 5 | accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} 6 | disk_tier: best 7 | ports: 8000 8 | 9 | setup: | 10 | pip install vllm==0.6.1.post2 11 | pip install vllm-flash-attn 12 | 13 | run: | 14 | export PATH=$PATH:/sbin 15 | vllm serve $MODEL_NAME \ 16 | --host 0.0.0.0 \ 17 | --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ 18 | --max-model-len 1024 | tee ~/openai_api_server.log 19 | -------------------------------------------------------------------------------- /llm/yi/yicoder-9b.yaml: -------------------------------------------------------------------------------- 1 | envs: 2 | MODEL_NAME: 01-ai/Yi-Coder-9B-Chat 3 | 4 | resources: 5 | accelerators: {L4:8, A10g:8, A10:8, A100:4, A100:8, A100-80GB:2, A100-80GB:4, A100-80GB:8} 6 | disk_tier: best 7 | ports: 8000 8 | 9 | setup: | 10 | pip install vllm==0.6.1.post2 11 | pip install vllm-flash-attn 12 | 13 | run: | 14 | export PATH=$PATH:/sbin 15 | vllm serve $MODEL_NAME \ 16 | --host 0.0.0.0 \ 17 | --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ 18 | --max-model-len 1024 | tee ~/openai_api_server.log 19 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | sky/setup_files/setup.py -------------------------------------------------------------------------------- /sky/adaptors/README.md: -------------------------------------------------------------------------------- 1 | This directory is for third-party cloud adaptors. 
These adaptors wrap the underlying packages, so cloud-specific packages are loaded on demand. 2 | -------------------------------------------------------------------------------- /sky/adaptors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/sky/adaptors/__init__.py -------------------------------------------------------------------------------- /sky/adaptors/cudo.py: -------------------------------------------------------------------------------- 1 | """Cudo Compute cloud adaptor.""" 2 | 3 | from sky.adaptors import common 4 | 5 | cudo = common.LazyImport( 6 | 'cudo_compute', 7 | import_error_message='Failed to import dependencies for Cudo Compute. ' 8 | 'Try running: pip install "skypilot[cudo]"') 9 | -------------------------------------------------------------------------------- /sky/adaptors/docker.py: -------------------------------------------------------------------------------- 1 | """Docker adaptors""" 2 | 3 | # pylint: disable=import-outside-toplevel 4 | 5 | from sky.adaptors import common 6 | 7 | docker = common.LazyImport( 8 | 'docker', 9 | import_error_message='Failed to import dependencies for Docker. ' 10 | 'See README for how to install it.') 11 | 12 | 13 | def from_env(): 14 | return docker.from_env() 15 | 16 | 17 | def build_error(): 18 | return docker.errors.BuildError 19 | 20 | 21 | def not_found_error(): 22 | return docker.errors.NotFound 23 | 24 | 25 | def api_error(): 26 | return docker.errors.APIError 27 | -------------------------------------------------------------------------------- /sky/adaptors/runpod.py: -------------------------------------------------------------------------------- 1 | """RunPod cloud adaptor.""" 2 | 3 | from sky.adaptors import common 4 | 5 | runpod = common.LazyImport( 6 | 'runpod', 7 | import_error_message='Failed to import dependencies for RunPod. 
' 8 | 'Try running: pip install "skypilot[runpod]"') 9 | -------------------------------------------------------------------------------- /sky/backends/__init__.py: -------------------------------------------------------------------------------- 1 | """Sky Backends.""" 2 | from sky.backends.backend import Backend 3 | from sky.backends.backend import ResourceHandle 4 | from sky.backends.cloud_vm_ray_backend import CloudVmRayBackend 5 | from sky.backends.cloud_vm_ray_backend import CloudVmRayResourceHandle 6 | from sky.backends.local_docker_backend import LocalDockerBackend 7 | from sky.backends.local_docker_backend import LocalDockerResourceHandle 8 | 9 | __all__ = [ 10 | 'Backend', 'ResourceHandle', 'CloudVmRayBackend', 11 | 'CloudVmRayResourceHandle', 'LocalDockerBackend', 12 | 'LocalDockerResourceHandle' 13 | ] 14 | -------------------------------------------------------------------------------- /sky/benchmark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/sky/benchmark/__init__.py -------------------------------------------------------------------------------- /sky/callbacks/setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | setuptools.setup( 4 | name='sky-callback', 5 | version='0.1.1-dev0', 6 | packages=setuptools.find_packages(), 7 | install_requires=['psutil'], 8 | ) 9 | -------------------------------------------------------------------------------- /sky/callbacks/sky_callback/integrations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/sky/callbacks/sky_callback/integrations/__init__.py -------------------------------------------------------------------------------- /sky/client/__init__.py: 
-------------------------------------------------------------------------------- 1 | """Module for the SkyPilot Client.""" 2 | -------------------------------------------------------------------------------- /sky/client/cli.py: -------------------------------------------------------------------------------- 1 | ../cli.py -------------------------------------------------------------------------------- /sky/clouds/service_catalog/README.md: -------------------------------------------------------------------------------- 1 | # Service Catalog 2 | 3 | This module provides information for clouds supported by SkyPilot, including the instance type offerings, their pricing and data transfer costs. It also provides functions to query this information, and to select the most suitable instance types based on resource requirements. Primarily used by the Clouds module. 4 | 5 | - `data_fetchers/fetch_{aws,azure}.py`: each file is a standalone script that queries the cloud APIs to produce the pricing list files. 6 | - `data_fetchers/fetch_gcp.py`: A script that generates the GCP catalog by crawling GCP websites. 7 | - `{aws,azure,gcp}_catalog.py`: Singleton-classes that load the data files and provide functions to query for instance offerings based on resource requirements.
8 | -------------------------------------------------------------------------------- /sky/clouds/service_catalog/constants.py: -------------------------------------------------------------------------------- 1 | """Constants used for service catalog.""" 2 | HOSTED_CATALOG_DIR_URL = 'https://raw.githubusercontent.com/skypilot-org/skypilot-catalog/master/catalogs' # pylint: disable=line-too-long 3 | HOSTED_CATALOG_DIR_URL_S3_MIRROR = 'https://skypilot-catalog.s3.us-east-1.amazonaws.com/catalogs' # pylint: disable=line-too-long 4 | CATALOG_SCHEMA_VERSION = 'v7' 5 | CATALOG_DIR = '~/.sky/catalogs' 6 | ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci', 7 | 'kubernetes', 'runpod', 'vast', 'vsphere', 'cudo', 'fluidstack', 8 | 'paperspace', 'do', 'nebius', 'ssh') 9 | -------------------------------------------------------------------------------- /sky/clouds/service_catalog/data_fetchers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/sky/clouds/service_catalog/data_fetchers/__init__.py -------------------------------------------------------------------------------- /sky/clouds/service_catalog/data_fetchers/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py 2 | boto3 3 | lxml 4 | pandas 5 | ray 6 | requests 7 | -------------------------------------------------------------------------------- /sky/clouds/service_catalog/images/plugins.pkr.hcl: -------------------------------------------------------------------------------- 1 | packer { 2 | required_plugins { 3 | amazon = { 4 | version = ">= 1.2.8" 5 | source = "github.com/hashicorp/amazon" 6 | } 7 | } 8 | } 9 | 10 | packer { 11 | required_plugins { 12 | googlecompute = { 13 | version = ">= 1.1.1" 14 | source = "github.com/hashicorp/googlecompute" 15 | } 16 | } 17 | } 18 | 
-------------------------------------------------------------------------------- /sky/clouds/service_catalog/images/provisioners/user-toolkit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script installs popular toolkits for users to use in the base environment. 3 | 4 | eval "$(~/miniconda3/bin/conda shell.bash hook)" 5 | conda activate base 6 | pip install numpy 7 | pip install pandas 8 | 9 | if [ "$AZURE_GRID_DRIVER" = 1 ]; then 10 | # Need PyTorch X.X.X+cu121 version to be compatible with older NVIDIA driver (535.161.08 or lower) 11 | pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 12 | fi 13 | -------------------------------------------------------------------------------- /sky/clouds/utils/README.md: -------------------------------------------------------------------------------- 1 | # Utility for Clouds 2 | 3 | This folder contains the utility functions for clouds which are required by both 4 | the `sky.skylet.providers` and other modules in SkyPilot. 5 | 6 | This is to avoid importing other unnecessary modules in `sky.skylet.providers`. 7 | When a utility file is placed under, e.g., `sky.skylet.providers.<cloud>`, and is 8 | imported by other modules in SkyPilot, Python will import the `__init__.py` file in 9 | the folder, which will then import 10 | `sky.skylet.providers.<cloud>.node_provider`, causing the import of `ray`. 11 | Importing `ray` will cause failure for clouds that have adopted the new provisioner 12 | #1702 and removed the dependency of ray #2625.
13 | -------------------------------------------------------------------------------- /sky/clouds/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/sky/clouds/utils/__init__.py -------------------------------------------------------------------------------- /sky/dashboard/.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": ["next/core-web-vitals", "prettier"] 3 | } 4 | -------------------------------------------------------------------------------- /sky/dashboard/.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 2 | 3 | # dependencies 4 | /node_modules 5 | /.pnp 6 | .pnp.js 7 | .yarn/install-state.gz 8 | 9 | # testing 10 | /coverage 11 | 12 | # next.js 13 | /.next/ 14 | /out/ 15 | 16 | # production 17 | /build 18 | 19 | # misc 20 | .DS_Store 21 | *.pem 22 | 23 | # debug 24 | npm-debug.log* 25 | yarn-debug.log* 26 | yarn-error.log* 27 | 28 | # local env files 29 | .env*.local 30 | 31 | # vercel 32 | .vercel 33 | 34 | # typescript 35 | *.tsbuildinfo 36 | next-env.d.ts 37 | 38 | .vscode 39 | .swc 40 | -------------------------------------------------------------------------------- /sky/dashboard/.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "semi": true, 3 | "trailingComma": "es5", 4 | "singleQuote": true, 5 | "tabWidth": 2, 6 | "useTabs": false 7 | } 8 | -------------------------------------------------------------------------------- /sky/dashboard/components.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://ui.shadcn.com/schema.json", 3 | "style": "default", 4 | "rsc": true, 5 | "tsx": false, 6 | 
"tailwind": { 7 | "config": "tailwind.config.js", 8 | "css": "src/app/globals.css", 9 | "baseColor": "slate", 10 | "cssVariables": true 11 | }, 12 | "aliases": { 13 | "components": "@/components", 14 | "utils": "@/lib/utils" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /sky/dashboard/eslint.config.mjs: -------------------------------------------------------------------------------- 1 | import globals from 'globals'; 2 | import pluginJs from '@eslint/js'; 3 | import pluginReactConfig from 'eslint-plugin-react/configs/recommended.js'; 4 | 5 | export default [ 6 | { files: ['**/*.{js,mjs,cjs,jsx}'] }, 7 | { languageOptions: { parserOptions: { ecmaFeatures: { jsx: true } } } }, 8 | { languageOptions: { globals: globals.browser } }, 9 | pluginJs.configs.recommended, 10 | pluginReactConfig, 11 | ]; 12 | -------------------------------------------------------------------------------- /sky/dashboard/jest.setup.js: -------------------------------------------------------------------------------- 1 | require('@testing-library/jest-dom'); 2 | 3 | // Mock fetch 4 | global.fetch = jest.fn(); 5 | 6 | // Keep original console methods for testing 7 | const originalConsole = { ...console }; 8 | global.console = { 9 | ...console, 10 | error: (...args) => { 11 | originalConsole.error(...args); 12 | }, 13 | warn: (...args) => { 14 | originalConsole.warn(...args); 15 | }, 16 | log: (...args) => { 17 | originalConsole.log(...args); 18 | }, 19 | info: (...args) => { 20 | originalConsole.info(...args); 21 | }, 22 | debug: (...args) => { 23 | originalConsole.debug(...args); 24 | }, 25 | }; 26 | 27 | // Add Jest globals 28 | global.describe = describe; 29 | global.test = test; 30 | global.expect = expect; 31 | 32 | // Add any global test setup here 33 | -------------------------------------------------------------------------------- /sky/dashboard/jsconfig.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "baseUrl": ".", 4 | "paths": { 5 | "@/*": ["./src/*"] 6 | }, 7 | "jsx": "react", 8 | "checkJs": true, 9 | "resolveJsonModule": true, 10 | "moduleResolution": "node", 11 | "target": "es6", 12 | "module": "commonjs", 13 | "allowSyntheticDefaultImports": true, 14 | "esModuleInterop": true 15 | }, 16 | "include": ["src/**/*"], 17 | "exclude": ["node_modules", ".next", "out"] 18 | } 19 | -------------------------------------------------------------------------------- /sky/dashboard/next.config.mjs: -------------------------------------------------------------------------------- 1 | /** @type {import('next').NextConfig} */ 2 | const nextConfig = { 3 | basePath: '/dashboard', 4 | output: 'export', 5 | images: { 6 | unoptimized: true, 7 | }, 8 | env: { 9 | SKYPILOT_API_SERVER_ENDPOINT: process.env.SKYPILOT_API_SERVER_ENDPOINT, 10 | INFRA_CACHE_DURATION_MINUTES: 11 | process.env.INFRA_CACHE_DURATION_MINUTES || '10', 12 | INFRA_CACHE_DEBUG: process.env.INFRA_CACHE_DEBUG || 'false', 13 | }, 14 | }; 15 | 16 | export default nextConfig; 17 | -------------------------------------------------------------------------------- /sky/dashboard/postcss.config.mjs: -------------------------------------------------------------------------------- 1 | /** @type {import('postcss-load-config').Config} */ 2 | const config = { 3 | plugins: { 4 | tailwindcss: {}, 5 | }, 6 | }; 7 | 8 | export default config; 9 | -------------------------------------------------------------------------------- /sky/dashboard/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/sky/dashboard/public/favicon.ico -------------------------------------------------------------------------------- /sky/dashboard/public/videos/cursor-small.mp4: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/skypilot-org/skypilot/370d5fa04663145f36b1dc67ab8ec229b190bc54/sky/dashboard/public/videos/cursor-small.mp4 -------------------------------------------------------------------------------- /sky/dashboard/src/components/elements/version-display.jsx: -------------------------------------------------------------------------------- 1 | import React, { useState, useEffect } from 'react'; 2 | import { ENDPOINT } from '@/data/connectors/constants'; 3 | 4 | export function VersionDisplay() { 5 | const [version, setVersion] = useState(null); 6 | 7 | useEffect(() => { 8 | fetch(`${ENDPOINT}/api/health`) 9 | .then((res) => res.json()) 10 | .then((data) => { 11 | if (data.version) { 12 | setVersion(data.version); 13 | } 14 | }) 15 | .catch((error) => { 16 | console.error('Error fetching version:', error); 17 | }); 18 | }, []); 19 | 20 | if (!version) return null; 21 | 22 | return
Version: {version}
; 23 | } 24 | -------------------------------------------------------------------------------- /sky/dashboard/src/components/ui/input.jsx: -------------------------------------------------------------------------------- 1 | import * as React from 'react'; 2 | 3 | import { cn } from '@/lib/utils'; 4 | 5 | const Input = React.forwardRef(({ className, type, ...props }, ref) => { 6 | return ( 7 | 16 | ); 17 | }); 18 | Input.displayName = 'Input'; 19 | 20 | export { Input }; 21 | -------------------------------------------------------------------------------- /sky/dashboard/src/components/ui/label.jsx: -------------------------------------------------------------------------------- 1 | 'use client'; 2 | 3 | import * as React from 'react'; 4 | import * as LabelPrimitive from '@radix-ui/react-label'; 5 | import { cva } from 'class-variance-authority'; 6 | 7 | import { cn } from '@/lib/utils'; 8 | 9 | const labelVariants = cva( 10 | 'text-sm font-medium leading-none peer-disabled:cursor-not-allowed peer-disabled:opacity-70' 11 | ); 12 | 13 | const Label = React.forwardRef(({ className, ...props }, ref) => ( 14 | 19 | )); 20 | Label.displayName = LabelPrimitive.Root.displayName; 21 | 22 | export { Label }; 23 | -------------------------------------------------------------------------------- /sky/dashboard/src/components/ui/textarea.jsx: -------------------------------------------------------------------------------- 1 | import * as React from 'react'; 2 | 3 | import { cn } from '@/lib/utils'; 4 | 5 | const Textarea = React.forwardRef(({ className, ...props }, ref) => { 6 | return ( 7 |