├── .gitignore ├── README.md ├── argo-workflow ├── README.md ├── argo-screenshot.png ├── argo.png └── gpu-say-workflow.yaml ├── cuda-ssh ├── README.md ├── sshd-data-pvc.yaml ├── sshd-deployment.yaml ├── sshd-root-pvc.yaml └── sshd-service.yaml ├── finetuner-workflow ├── finetune-pvc.yaml ├── finetune-role.yaml ├── finetune-workflow.yaml └── finetuner │ ├── Dockerfile │ ├── compiler_wrapper.f95 │ ├── ds_config.json │ ├── evaluator.py │ ├── finetuner.py │ ├── inference.py │ ├── requirements-precompilable.txt │ ├── requirements.txt │ └── utils.py ├── getting-started └── k8ctl_setup.ps1 ├── kubeflow └── training-operator │ ├── gpt-neox │ ├── 01-pvc.yaml │ ├── 02-finetune-role.yaml │ ├── 03-wanbd-secret.yaml │ └── 04-finetune-workflow.yaml │ └── resnet50 │ ├── Dockerfile.mpi │ ├── Dockerfile.pytorch │ ├── k8s │ ├── imagenet-download-job.yaml │ ├── imagenet-mpijob.yaml │ ├── imagenet-pytorchjob.yaml │ ├── kaggle-secret.yaml │ ├── model-pvc.yaml │ └── wanbd-secret.yaml │ ├── resnet50_horovod.py │ ├── resnet50_pytorch.py │ └── util.py ├── online-inference ├── README.md ├── bloom-176b-deepspeed │ ├── 00-pvc.yaml │ ├── 01-download-job.yaml │ ├── 02-inference-service.yaml │ ├── Dockerfile │ ├── Dockerfile.downloader │ ├── downloader │ │ ├── download.py │ │ └── requirements.txt │ └── files │ │ ├── isvc-patch.txt │ │ └── requirements.txt ├── bloom-176b │ ├── 00-bloom-176b-pvc.yaml │ ├── 01-bloom-176b-download-job.yaml │ ├── 02-bloom-176b-inferenceservice.yaml │ └── model │ │ ├── Dockerfile │ │ ├── bloom.py │ │ ├── requirements.txt │ │ └── scripts │ │ └── download_model ├── custom-basnet │ ├── README.md │ ├── basnet-inferenceservice.yaml │ ├── client │ │ ├── .DS_Store │ │ ├── Dockerfile │ │ ├── expected_output.png │ │ ├── images │ │ │ ├── .DS_Store │ │ │ ├── cut_mask.png │ │ │ ├── output.png │ │ │ └── test.png │ │ ├── main.py │ │ └── requirements.txt │ └── object-detector-inferenceservice.yaml ├── custom-pytorch-aitextgen │ ├── README.md │ ├── aitextgen-inferenceservice.yaml │ └── custom-predictor │ │ ├── Dockerfile │ │ ├── model.py │ │ └── requirements.txt ├── custom-sentiment │ ├── README.md │ ├── custom-predictor │ │ ├── Dockerfile │ │ ├── model.py │ │ └── requirements.txt │ ├── image-secrets-serviceaccount.patch.yaml │ ├── model-storage-pvc.yaml │ ├── sample.json │ ├── sentiment-inferenceservice.yaml │ └── sleep-deployment.yaml ├── dalle-mini │ ├── 00-model-pvc.yaml │ ├── 01-model-download-job.yaml │ ├── 02-inference-service.yaml │ ├── Dockerfile │ ├── Dockerfile.downloader │ ├── downloader │ │ └── download.py │ └── model │ │ ├── requirements.txt │ │ └── service.py ├── fastertransformer │ ├── README.md │ ├── build │ │ └── Dockerfile │ ├── client │ │ ├── Dockerfile │ │ ├── example.py │ │ ├── gpt_bpe │ │ │ ├── gpt2-merges.txt │ │ │ ├── gpt2-vocab.json │ │ │ └── gpt_token_encoder.py │ │ ├── hf_tokenizer │ │ │ ├── 20B_tokenizer.json │ │ │ └── hf_tokenize.py │ │ ├── requirements.txt │ │ └── sample_request.json │ ├── download-weights-job-gpt-neox.yml │ ├── download-weights-job-gptj.yml │ ├── ft-inference-service-gptj.yml │ ├── ft-inference-service-neox.yml │ └── model-storage-pvc.yml ├── hf-llm │ ├── .dockerignore │ ├── 00-optional-s3-secret.yaml │ ├── 01-optional-s3-serialize-job.yaml │ ├── 02-inference-service.yaml │ ├── Dockerfile │ ├── serializer │ │ ├── requirements.txt │ │ └── serialize.py │ └── service │ │ ├── requirements.txt │ │ └── service.py ├── image-classifier │ ├── jupyter │ │ ├── inception.ipynb │ │ ├── model-storage-pvc.yaml │ │ ├── tensorflow-deployment.yaml │ │ └── 
tensorflow-service.yaml │ ├── service │ │ ├── classifier-inferenceservice.yaml │ │ ├── predict_url.sh │ │ └── test_base64.sh │ └── transformer │ │ ├── Dockerfile │ │ ├── main.py │ │ ├── requirements.txt │ │ ├── test_base64.sh │ │ └── transformer.py ├── overview.png ├── stable-diffusion │ ├── 00-optional-s3-secret.yaml │ ├── 01-optional-s3-serialize-job.yaml │ ├── 02-inference-service.yaml │ ├── Dockerfile │ ├── README.md │ ├── serializer │ │ ├── requirements.txt │ │ └── serialize.py │ └── service │ │ ├── requirements.txt │ │ └── service.py ├── tensorizer-isvc │ ├── README.md │ ├── benchmark │ │ ├── inputs.txt │ │ ├── load_test.py │ │ └── locustfile.py │ ├── model-download │ │ ├── Dockerfile │ │ ├── model-download-job.yaml │ │ ├── model_download.py │ │ └── requirements.txt │ ├── pvc.yaml │ └── tensorizer_hf_isvc │ │ ├── flask │ │ ├── Dockerfile │ │ ├── flask_api.py │ │ ├── hf-isvc.yaml │ │ ├── requirements.txt │ │ └── tensorizer-isvc.yaml │ │ ├── kserve │ │ ├── Dockerfile │ │ ├── hf-isvc.yaml │ │ ├── kserve_api.py │ │ ├── requirements.txt │ │ └── tensorizer-isvc.yaml │ │ └── load_model.py └── vllm │ ├── 00-s3-secret.yaml │ ├── 01-s3-serialize-job.yaml │ ├── 02-inference-service.yaml │ ├── Dockerfile │ └── README.md ├── sd-dreambooth-workflow ├── db-finetune-pvc.yaml ├── db-workflow-event-binding.yaml ├── db-workflow-template.yaml ├── huggingface-secret.yaml ├── inference-role.yaml └── wandb-secret.yaml ├── sd-finetuner-workflow ├── huggingface-secret.yaml ├── inference-role.yaml ├── sd-finetune-pvc.yaml ├── sd-finetune-workflow-event-binding.yaml ├── sd-finetune-workflow-template.yaml ├── sd-finetuner │ ├── Dockerfile │ ├── accelerate_config.yaml │ ├── datasets.py │ ├── finetuner.py │ └── requirements.txt └── wandb-secret.yaml ├── spark ├── cpu-pod-template.yaml ├── docker │ ├── Dockerfile │ ├── download_imgdataset.py │ └── requirements.txt ├── example-spark-submit.sh ├── jupyter │ ├── interactive-example.ipynb │ └── jupyter-service.yaml ├── spark-pvc.yaml ├── spark-role.yaml └── wandb-secret.yaml ├── tensorflow-jupyter ├── README.md ├── jupyter-pvc.yaml ├── screenshot.png ├── tensorflow-deployment.yaml └── tensorflow-service.yaml └── virtual-server ├── clone_block_volume.sh ├── examples ├── curl │ ├── README.md │ ├── run.sh │ └── virtual-server.json ├── go │ ├── .gitignore │ ├── Makefile │ ├── README.md │ ├── go.mod │ ├── go.sum │ └── main.go ├── kubectl │ ├── README.md │ ├── virtual-server-block-pvc.yaml │ ├── virtual-server-cloudinit.yaml │ ├── virtual-server-direct-attach-lb.yaml │ ├── virtual-server-ephemeral-disk.yaml │ ├── virtual-server-ephemeral-root-disk.yaml │ ├── virtual-server-shared-pvc.yaml │ ├── virtual-server-static-mac.yaml │ ├── virtual-server-windows-cpu-only.yaml │ ├── virtual-server-windows-internal-ip-only.yaml │ ├── virtual-server-windows.yaml │ └── virtual-server.yaml ├── nodejs │ ├── Readme.md │ ├── client.js │ ├── main.js │ ├── package-lock.json │ ├── package.json │ └── util.js ├── python │ ├── .gitignore │ ├── README.md │ ├── kubevirtclient.py │ ├── main.py │ └── vsclient.py └── terraform │ ├── README.md │ ├── examples │ └── module-use.tf │ ├── main.tf │ ├── outputs.tf │ ├── variables.tf │ └── vs.tf └── pvc-clone.sh /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CoreWeave Kubernetes Cloud 2 | Documentation lives at 
[https://docs.coreweave.com](https://docs.coreweave.com). This repository contains related examples. 3 | 4 | -------------------------------------------------------------------------------- /argo-workflow/README.md: -------------------------------------------------------------------------------- 1 | ![Argo](argo.png) 2 | ![Screenshot](argo-screenshot.png) 3 | 4 | ### Introduction 5 | [Argo Workflows](https://argoproj.github.io/argo-workflows/) is a great tool to orchestrate parallel execution of GPU jobs. It manages retries and parallelism for you, and allows you to submit workflows via CLI, [Rest API](https://github.com/argoproj/argo/blob/master/examples/rest-examples.md) and the [Kubernetes API](https://github.com/argoproj/argo/blob/master/docs/rest-api.md). 6 | 7 | ### Getting Started 8 | 9 | Please visit the [CoreWeave Docs](https://docs.coreweave.com/workflows/argo). 10 | -------------------------------------------------------------------------------- /argo-workflow/argo-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coreweave/kubernetes-cloud/ed5c832f666badc124f0a12d9c60260920ee9089/argo-workflow/argo-screenshot.png -------------------------------------------------------------------------------- /argo-workflow/argo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coreweave/kubernetes-cloud/ed5c832f666badc124f0a12d9c60260920ee9089/argo-workflow/argo.png -------------------------------------------------------------------------------- /argo-workflow/gpu-say-workflow.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: gpu-say 5 | spec: 6 | entrypoint: main 7 | activeDeadlineSeconds: 300 # Cancel operation if not finished in 5 minutes 8 | ttlSecondsAfterFinished: 86400 # Clean out old workflows after a day 9 | # Parameters can be passed/overridden via the argo CLI. 
10 | # To override the printed message, run `argo submit` with the -p option: 11 | # $ argo submit examples/arguments-parameters.yaml -p messages='["CoreWeave", "Is", "Fun"]' 12 | arguments: 13 | parameters: 14 | - name: messages 15 | value: '["Argo", "Is", "Awesome"]' 16 | 17 | templates: 18 | - name: main 19 | steps: 20 | - - name: echo 21 | template: gpu-echo 22 | arguments: 23 | parameters: 24 | - name: message 25 | value: "{{item}}" 26 | withParam: "{{workflow.parameters.messages}}" 27 | 28 | - name: gpu-echo 29 | inputs: 30 | parameters: 31 | - name: message 32 | retryStrategy: 33 | limit: 1 34 | script: 35 | image: nvidia/cuda:10.2-runtime-ubuntu18.04 36 | command: [bash] 37 | source: | 38 | nvidia-smi 39 | echo "Input was: {{inputs.parameters.message}}" 40 | 41 | resources: 42 | requests: 43 | memory: 128Mi 44 | cpu: 500m # Half a core 45 | limits: 46 | nvidia.com/gpu: 1 # Allocate one GPU 47 | affinity: 48 | nodeAffinity: 49 | requiredDuringSchedulingIgnoredDuringExecution: 50 | # This will REQUIRE the Pod to be run on a system with a GPU with 8, 10 or 11GB VRAM 51 | nodeSelectorTerms: 52 | - matchExpressions: 53 | - key: gpu.nvidia.com/vram 54 | operator: In 55 | values: 56 | - "8" 57 | - "10" 58 | - "11" 59 | -------------------------------------------------------------------------------- /cuda-ssh/README.md: -------------------------------------------------------------------------------- 1 | ## CUDA Development Toolkit with SSH Server 2 | 3 | The guide for this example can be found in the [Documentation](https://docs.coreweave.com/coreweave-kubernetes/examples/cuda-ssh). -------------------------------------------------------------------------------- /cuda-ssh/sshd-data-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: sshd-data-pv-claim 5 | spec: 6 | # https://docs.coreweave.com/storage/storage 7 | storageClassName: block-hdd-ord1 8 | accessModes: 9 | - ReadWriteOnce 10 | resources: 11 | requests: 12 | storage: 500Gi 13 | -------------------------------------------------------------------------------- /cuda-ssh/sshd-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: sshd 5 | spec: 6 | strategy: 7 | type: Recreate 8 | replicas: 1 9 | selector: 10 | matchLabels: 11 | app.kubernetes.io/name: sshd 12 | template: 13 | metadata: 14 | labels: 15 | app.kubernetes.io/name: sshd 16 | spec: 17 | terminationGracePeriodSeconds: 10 18 | initContainers: 19 | - name: init 20 | image: ghcr.io/coreweave/ml-containers/cuda-ssh:209c517-torch-ceeb8c2-nccl-cuda11.8.0-nccl2.16.2-1-torch2.0.1-vision0.15.2-audio2.0.2 21 | command: ["/bin/bash", "-c"] 22 | args: 23 | - | 24 | if [ ! 
-f /target/initialized ]; then 25 | dpkg-reconfigure openssh-server && \ 26 | cp -ax / /target && \ 27 | echo 'Initialization complete' && \ 28 | touch /target/initialized; 29 | fi 30 | resources: 31 | requests: 32 | cpu: 1 33 | memory: 1Gi 34 | volumeMounts: 35 | - name: root-storage 36 | mountPath: /target 37 | 38 | containers: 39 | - name: sshd 40 | command: ["/usr/bin/tini", "--"] 41 | args: ["service", "ssh", "start", "-D"] 42 | tty: true 43 | image: ghcr.io/coreweave/ml-containers/cuda-ssh:209c517-torch-ceeb8c2-nccl-cuda11.8.0-nccl2.16.2-1-torch2.0.1-vision0.15.2-audio2.0.2 44 | ports: 45 | - name: sshd 46 | containerPort: 22 47 | protocol: TCP 48 | volumeMounts: 49 | - name: data-storage 50 | mountPath: /mnt/data 51 | - name: root-storage 52 | mountPath: /bin 53 | subPath: bin 54 | - name: root-storage 55 | mountPath: /boot 56 | subPath: boot 57 | - name: root-storage 58 | mountPath: /etc 59 | subPath: etc 60 | - name: root-storage 61 | mountPath: /home 62 | subPath: home 63 | - name: root-storage 64 | mountPath: /lib 65 | subPath: lib 66 | - name: root-storage 67 | mountPath: /lib64 68 | subPath: lib64 69 | - name: root-storage 70 | mountPath: /opt 71 | subPath: opt 72 | - name: root-storage 73 | mountPath: /root 74 | subPath: root 75 | - name: root-storage 76 | mountPath: /sbin 77 | subPath: sbin 78 | - name: root-storage 79 | mountPath: /srv 80 | subPath: srv 81 | - name: root-storage 82 | mountPath: /usr 83 | subPath: usr 84 | - name: root-storage 85 | mountPath: /var 86 | subPath: var 87 | - name: run-lock 88 | mountPath: /run/lock 89 | 90 | resources: 91 | requests: 92 | cpu: 2500m # The CPU unit is milli-cores. 500m is 0.5 cores 93 | memory: 64Gi 94 | limits: 95 | cpu: 7000m 96 | memory: 128Gi 97 | nvidia.com/gpu: 6 98 | # GPUs can only be allocated as a limit, which both reserves and limits the number of GPUs the Pod will have access to 99 | # Making individual Pods resource light is advantageous for bin-packing. Since this Pod is for general purpose interactive testing 100 | # we allocate 6 GPUs to it 101 | 102 | # Node affinity can be used to require / prefer the Pods to be scheduled on a node with a specific hardware type 103 | # No affinity allows scheduling on all hardware types that can fulfill the resource request. 104 | # In this example, without affinity, any NVIDIA GPU would be allowed to run the Pod. 
105 | # Read more about affinity at: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity 106 | affinity: 107 | nodeAffinity: 108 | # This will REQUIRE the Pod to be run on a system with an RTX A5000 GPU 109 | requiredDuringSchedulingIgnoredDuringExecution: 110 | nodeSelectorTerms: 111 | - matchExpressions: 112 | - key: gpu.nvidia.com/class 113 | operator: In 114 | values: 115 | - RTX_A5000 116 | - key: topology.kubernetes.io/region 117 | operator: In 118 | values: 119 | - ORD1 120 | 121 | volumes: 122 | - name: root-storage 123 | persistentVolumeClaim: 124 | claimName: sshd-root-pv-claim 125 | - name: data-storage 126 | persistentVolumeClaim: 127 | claimName: sshd-data-pv-claim 128 | - name: run-lock 129 | emptyDir: 130 | medium: Memory 131 | restartPolicy: Always 132 | -------------------------------------------------------------------------------- /cuda-ssh/sshd-root-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: sshd-root-pv-claim 5 | spec: 6 | # https://docs.coreweave.com/storage/storage 7 | storageClassName: block-nvme-ord1 8 | accessModes: 9 | - ReadWriteOnce 10 | resources: 11 | requests: 12 | storage: 200Gi 13 | -------------------------------------------------------------------------------- /cuda-ssh/sshd-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | annotations: 5 | metallb.universe.tf/address-pool: public-ord1 6 | # Setting a sharing key might save public IP addresses 7 | # See https://metallb.universe.tf/usage/#ip-address-sharing for more detail 8 | metallb.universe.tf/allow-shared-ip: example-1 9 | name: sshd 10 | spec: 11 | type: LoadBalancer 12 | externalTrafficPolicy: Local 13 | ports: 14 | - name: sshd 15 | port: 22 16 | protocol: TCP 17 | targetPort: sshd 18 | selector: 19 | app.kubernetes.io/name: sshd 20 | -------------------------------------------------------------------------------- /finetuner-workflow/finetune-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: finetune-data 5 | spec: 6 | storageClassName: shared-hdd-ord1 7 | accessModes: 8 | - ReadWriteMany 9 | resources: 10 | requests: 11 | storage: 2000Gi 12 | -------------------------------------------------------------------------------- /finetuner-workflow/finetune-role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: finetune 5 | --- 6 | apiVersion: rbac.authorization.k8s.io/v1 7 | kind: Role 8 | metadata: 9 | name: role:finetune 10 | rules: 11 | - apiGroups: 12 | - "" 13 | resources: 14 | - pods 15 | verbs: 16 | - 'patch' 17 | - apiGroups: 18 | - serving.kubeflow.org 19 | resources: 20 | - inferenceservices 21 | verbs: 22 | - '*' 23 | - apiGroups: 24 | - serving.knative.dev 25 | resources: 26 | - services 27 | - revisions 28 | verbs: 29 | - '*' 30 | --- 31 | apiVersion: rbac.authorization.k8s.io/v1 32 | kind: RoleBinding 33 | metadata: 34 | name: rolebinding:finetune-finetune 35 | roleRef: 36 | apiGroup: rbac.authorization.k8s.io 37 | kind: Role 38 | name: role:finetune 39 | subjects: 40 | - kind: ServiceAccount 41 | name: finetune 42 | -------------------------------------------------------------------------------- 
/finetuner-workflow/finetuner/Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:1.2 2 | 3 | ARG BASE_IMAGE=ghcr.io/coreweave/ml-containers/torch:afecfe9-base-cuda11.8.0-torch2.0.0-vision0.15.1 4 | 5 | # Dependencies requiring NVCC are built ahead of time in a separate stage 6 | # so that the ~2 GiB dev library installations don't have to be included 7 | # in the final finetuner image. 8 | # gcc-10/g++-10/lld do not need to be installed here, but they improve the build. 9 | # gfortran-10 is just for compiler_wrapper.f95. 10 | FROM ${BASE_IMAGE} as builder 11 | RUN apt-get install -y --no-install-recommends \ 12 | cuda-nvcc-11-8 cuda-nvml-dev-11-8 libcurand-dev-11-8 \ 13 | libcublas-dev-11-8 libcusparse-dev-11-8 \ 14 | libcusolver-dev-11-8 cuda-nvprof-11-8 \ 15 | cuda-profiler-api-11-8 \ 16 | ninja-build \ 17 | gcc-10 g++-10 gfortran-10 lld && \ 18 | apt-get clean && \ 19 | update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 10 && \ 20 | update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 10 && \ 21 | update-alternatives --install \ 22 | /usr/bin/gfortran gfortran /usr/bin/gfortran-10 10 && \ 23 | update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld 1 24 | RUN mkdir /wheels 25 | WORKDIR /wheels 26 | COPY compiler_wrapper.f95 . 27 | COPY requirements-precompilable.txt . 28 | RUN gfortran -O3 ./compiler_wrapper.f95 -o ./compiler && \ 29 | python3 -m pip install -U --no-cache-dir \ 30 | packaging setuptools wheel pip && \ 31 | DS_BUILD_UTILS=1 DS_BUILD_CPU_ADAM=1 \ 32 | CC=$(realpath ./compiler) python3 -m pip wheel \ 33 | --no-cache-dir --no-build-isolation --no-deps \ 34 | -r requirements-precompilable.txt && \ 35 | rm ./compiler_wrapper.f95 ./compiler ./requirements-precompilable.txt 36 | 37 | FROM ${BASE_IMAGE} 38 | RUN mkdir /app 39 | WORKDIR /app 40 | RUN --mount=type=bind,from=builder,source=/wheels,target=. \ 41 | pip3 install --no-cache-dir ./*.whl 42 | COPY requirements.txt . 43 | COPY requirements-precompilable.txt . 44 | RUN pip3 install --no-cache-dir -r requirements.txt 45 | COPY ds_config.json . 46 | COPY finetuner.py . 47 | COPY evaluator.py . 48 | COPY inference.py . 49 | COPY utils.py . 50 | CMD [ "/usr/bin/python3", "finetuner.py" ] 51 | -------------------------------------------------------------------------------- /finetuner-workflow/finetuner/compiler_wrapper.f95: -------------------------------------------------------------------------------- 1 | PROGRAM compiler_wrapper 2 | ! Wraps GCC invocations, 3 | ! replacing -D__AVX512__ and -D__SCALAR__ preprocessor definitions 4 | ! with -D__AVX256__, and -march=native with -march=skylake, 5 | ! for better reproducibility and compatibility. 6 | IMPLICIT NONE 7 | INTEGER :: i, exitcode = 0, full_length = 0, truncated = 0 8 | CHARACTER(len=:), ALLOCATABLE :: arg, command 9 | ALLOCATE(CHARACTER(len=128) :: arg) 10 | command = "gcc" 11 | 12 | DO i = 1, COMMAND_ARGUMENT_COUNT() 13 | DO 14 | CALL GET_COMMAND_ARGUMENT(i, arg, full_length, truncated) 15 | IF (truncated == 0) THEN 16 | EXIT 17 | ELSEIF (truncated == -1) THEN 18 | DEALLOCATE(arg) 19 | ALLOCATE(CHARACTER(len=full_length) :: arg) 20 | ELSE 21 | CALL EXIT(95) 22 | END IF 23 | END DO 24 | IF (arg == "-march=native") THEN 25 | command = command // " '-march=skylake'" 26 | ELSEIF (arg == "-D__AVX512__" .OR. 
arg == "-D__SCALAR__") THEN 27 | command = command // " '-D__AVX256__'" 28 | ELSE 29 | command = command // shell_escaped(arg) 30 | END IF 31 | END DO 32 | CALL SYSTEM(command, exitcode) 33 | IF (exitcode > 255) THEN 34 | exitcode = MAX(IAND(exitcode, 255), 1) 35 | ENDIF 36 | CALL EXIT(exitcode) 37 | 38 | 39 | CONTAINS 40 | FUNCTION shell_escaped(str) result(out) 41 | ! Turns [str] into [ 'str'] and replaces all 42 | ! internal ['] characters with ['"'"'] 43 | IMPLICIT NONE 44 | CHARACTER(len=*), INTENT(IN) :: str 45 | CHARACTER(len=:), ALLOCATABLE :: out 46 | INTEGER :: i, out_i, old_len, out_len 47 | 48 | old_len = LEN_TRIM(str) 49 | ! Figure out the new length to allocate by scanning `str`. 50 | ! This always needs to add at least [ '] at the beginning 51 | ! and ['] at the end, so the length increases by at least 3. 52 | out_len = old_len + 3 53 | DO i = 1, old_len 54 | IF (str(i:i) == "'") THEN 55 | out_len = out_len + 4 56 | END IF 57 | END DO 58 | ALLOCATE(CHARACTER(len=out_len) :: out) 59 | 60 | ! Copy over the string, performing necessary escapes. 61 | out(1:2) = " '" 62 | out_i = 3 63 | DO i = 1, old_len 64 | IF (str(i:i) == "'") THEN 65 | ! Escape internal single-quotes 66 | out(out_i:out_i + 4) = '''"''"''' 67 | out_i = out_i + 5 68 | ELSE 69 | ! No escaping needed 70 | out(out_i:out_i) = str(i:i) 71 | out_i = out_i + 1 72 | END IF 73 | END DO 74 | out(out_i:out_i) = "'" 75 | END FUNCTION 76 | END PROGRAM 77 | -------------------------------------------------------------------------------- /finetuner-workflow/finetuner/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "optimizer": { 11 | "type": "AdamW", 12 | "params": { 13 | "lr": "auto", 14 | "betas": "auto", 15 | "eps": "auto", 16 | "weight_decay": "auto" 17 | } 18 | }, 19 | "scheduler": { 20 | "type": "WarmupLR", 21 | "params": { 22 | "warmup_min_lr": "auto", 23 | "warmup_max_lr": "auto", 24 | "warmup_num_steps": "auto" 25 | } 26 | }, 27 | "zero_optimization": { 28 | "stage": 3, 29 | "allgather_partitions": true, 30 | "allgather_bucket_size": 2e8, 31 | "overlap_comm": true, 32 | "reduce_scatter": true, 33 | "reduce_bucket_size": 2e8, 34 | "contiguous_gradients": true, 35 | "offload_optimizer": { 36 | "device": "cpu" 37 | }, 38 | "offload_param": { 39 | "device": "cpu" 40 | }, 41 | "stage3_gather_16bit_weights_on_model_save": true 42 | }, 43 | "gradient_accumulation_steps": "auto", 44 | "gradient_clipping": "auto", 45 | "communication_data_type": "fp32", 46 | "steps_per_print": 1000000000000000, 47 | "train_batch_size": "auto", 48 | "train_micro_batch_size_per_gpu": "auto", 49 | "wall_clock_breakdown": false 50 | } 51 | -------------------------------------------------------------------------------- /finetuner-workflow/finetuner/inference.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | 3 | import torch 4 | import uvicorn 5 | from fastapi import FastAPI 6 | from fastapi.middleware.cors import CORSMiddleware 7 | from pydantic import BaseModel 8 | from transformers import pipeline 9 | 10 | from utils import DashParser 11 | from utils import validation as val 12 | 13 | parser = DashParser(description="Text model inference HTTP server") 14 | 15 | parser.add_argument( 16 | "--model", 17 | type=str, 18 | default="distilgpt2", 19 
| help="Model to use for inference (directory, or HuggingFace ID) [default = distilgpt2]", 20 | ) 21 | parser.add_argument( 22 | "--device-id", 23 | type=val.non_negative(int, special_val=-1), 24 | default=0, 25 | help="GPU ID to use for inference, or -1 for CPU [default = 0]", 26 | ) 27 | parser.add_argument( 28 | "--port", 29 | type=val.non_negative(int), 30 | default=80, 31 | help="Port to listen on [default = 80 (http)]", 32 | ) 33 | parser.add_argument( 34 | "--ip", 35 | type=str, 36 | default="0.0.0.0", 37 | help="IP address to listen on [default = 0.0.0.0 (all interfaces)]", 38 | ) 39 | 40 | args = parser.parse_args() 41 | 42 | 43 | class Completion(BaseModel): 44 | prompt: str 45 | max_new_tokens: Optional[int] = 10 46 | temperature: Optional[float] = None 47 | top_p: Optional[float] = None 48 | top_k: Optional[int] = None 49 | typical_p: Optional[float] = None 50 | repetition_penalty: Optional[float] = None 51 | do_sample: Optional[bool] = True 52 | penalty_alpha: Optional[float] = None 53 | num_return_sequences: Optional[int] = 1 54 | stop_sequence: Optional[str] = None 55 | bad_words: Optional[List] = None 56 | 57 | 58 | app = FastAPI(title="Inference API") 59 | 60 | app.add_middleware( 61 | CORSMiddleware, 62 | allow_origins=["*"], 63 | allow_methods=["*"], 64 | allow_headers=["*"], 65 | ) 66 | 67 | model = pipeline( 68 | "text-generation", 69 | model=args.model, 70 | torch_dtype=None if args.device_id == -1 else torch.float16, 71 | device=args.device_id, 72 | ) 73 | 74 | 75 | @app.get("/") 76 | def get_health(): 77 | return "OK" 78 | 79 | 80 | @app.post("/completion") 81 | def completion(completion: Completion): 82 | try: 83 | return model( 84 | completion.prompt, 85 | max_new_tokens=completion.max_new_tokens, 86 | temperature=completion.temperature, 87 | top_p=completion.top_p, 88 | top_k=completion.top_k, 89 | repetition_penalty=completion.repetition_penalty, 90 | do_sample=completion.do_sample, 91 | penalty_alpha=completion.penalty_alpha, 92 | num_return_sequences=completion.num_return_sequences, 93 | stop_sequence=completion.stop_sequence, 94 | ) 95 | except Exception as e: 96 | return {"error": str(e)} 97 | 98 | 99 | if __name__ == "__main__": 100 | uvicorn.run("inference:app", host=args.ip, port=args.port) 101 | -------------------------------------------------------------------------------- /finetuner-workflow/finetuner/requirements-precompilable.txt: -------------------------------------------------------------------------------- 1 | deepspeed==0.9.2 2 | flash-attn==1.0.4 3 | einops==0.6.1 -------------------------------------------------------------------------------- /finetuner-workflow/finetuner/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers~=4.28.1 2 | numpy~=1.24.2 3 | wandb~=0.14.0 4 | torch==2.0.0 5 | psutil==5.9.4 6 | accelerate~=0.17.1 7 | tensorizer==1.1.0 8 | fastapi==0.85.1 9 | uvicorn==0.19.0 10 | -r requirements-precompilable.txt -------------------------------------------------------------------------------- /kubeflow/training-operator/gpt-neox/01-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: neox-checkpoints 5 | spec: 6 | storageClassName: shared-nvme-las1 7 | accessModes: 8 | - ReadWriteMany 9 | resources: 10 | requests: 11 | storage: 512Gi 12 | --- 13 | apiVersion: v1 14 | kind: PersistentVolumeClaim 15 | metadata: 16 | name: neox-data 17 | spec: 18 | 
storageClassName: shared-hdd-las1 19 | accessModes: 20 | - ReadWriteMany 21 | resources: 22 | requests: 23 | storage: 64Gi 24 | -------------------------------------------------------------------------------- /kubeflow/training-operator/gpt-neox/02-finetune-role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: finetune 5 | --- 6 | apiVersion: rbac.authorization.k8s.io/v1 7 | kind: Role 8 | metadata: 9 | name: role:finetune 10 | rules: 11 | # Permissions for the config map step 12 | - apiGroups: 13 | - "" 14 | resources: 15 | - "configmaps" 16 | verbs: 17 | - 'patch' 18 | - 'create' 19 | - 'get' 20 | 21 | # Permissions for the finetune step 22 | - apiGroups: 23 | - "kubeflow.org" 24 | resources: 25 | - "mpijobs" 26 | verbs: 27 | - "create" 28 | - "get" 29 | --- 30 | apiVersion: rbac.authorization.k8s.io/v1 31 | kind: RoleBinding 32 | metadata: 33 | name: rolebinding:finetune-finetune 34 | roleRef: 35 | apiGroup: rbac.authorization.k8s.io 36 | kind: Role 37 | name: role:finetune 38 | subjects: 39 | - kind: ServiceAccount 40 | name: finetune 41 | -------------------------------------------------------------------------------- /kubeflow/training-operator/gpt-neox/03-wanbd-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | token: enterYourSecret== 4 | kind: Secret 5 | metadata: 6 | name: wandb-token-secret 7 | type: Opaque 8 | -------------------------------------------------------------------------------- /kubeflow/training-operator/resnet50/Dockerfile.mpi: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:22.12-py3 2 | 3 | RUN HOROVOD_GPU_OPERATIONS=NCCL pip install tensorboardX horovod[pytorch] filelock wandb 4 | RUN mkdir -p /opt/resnet50 5 | 6 | RUN chgrp -R 0 /opt/resnet50 \ 7 | && chmod -R g+rwX /opt/resnet50 8 | 9 | RUN apt-get -qq update && \ 10 | apt-get -qq install -y --allow-change-held-packages --no-install-recommends \ 11 | openssh-server 12 | 13 | # SSH dependencies for MPI 14 | RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ 15 | echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ 16 | sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ 17 | mkdir /var/run/sshd -p 18 | 19 | WORKDIR /opt/resnet50/src 20 | ADD resnet50_horovod.py /opt/resnet50/src/resnet50.py 21 | ADD util.py /opt/resnet50/src/util.py 22 | -------------------------------------------------------------------------------- /kubeflow/training-operator/resnet50/Dockerfile.pytorch: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:22.12-py3 2 | 3 | RUN pip install tensorboardX filelock wandb 4 | RUN mkdir -p /opt/resnet50 5 | 6 | WORKDIR /opt/resnet50/src 7 | ADD resnet50_pytorch.py /opt/resnet50/src/resnet50.py 8 | ADD util.py /opt/resnet50/src/util.py 9 | 10 | RUN chgrp -R 0 /opt/resnet50 \ 11 | && chmod -R g+rwX /opt/resnet50 12 | 13 | ENTRYPOINT ["python", "/opt/resnet50/src/resnet50.py"] 14 | -------------------------------------------------------------------------------- /kubeflow/training-operator/resnet50/k8s/imagenet-download-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: imagenet-download 5 | spec: 6 | template: 7 | spec: 8 | containers: 9 | - 
name: model-downloader 10 | image: python:3.8 11 | imagePullPolicy: IfNotPresent 12 | command: [ "bash", "-c" ] 13 | args: 14 | - apt-get install unzip; 15 | pip install kaggle; 16 | mkdir /mnt/pvc/dataset; 17 | kaggle competitions download -c imagenet-object-localization-challenge -p /mnt/pvc/dataset; 18 | unzip /mnt/pvc/dataset/imagenet-object-localization-challenge.zip -d /mnt/pvc/dataset; 19 | rm /mnt/pvc/dataset/imagenet-object-localization-challenge.zip; 20 | cd /mnt/pvc/dataset/ILSVRC/Data/CLS-LOC/val/; 21 | wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash 22 | env: 23 | - name: KAGGLE_KEY 24 | valueFrom: 25 | secretKeyRef: 26 | name: kaggle-token-secret 27 | key: token 28 | - name: KAGGLE_USERNAME 29 | value: navarreprattcw 30 | volumeMounts: 31 | - name: kubeflow-imagenet 32 | mountPath: /mnt/pvc 33 | resources: 34 | requests: 35 | cpu: 1 36 | memory: 4Gi 37 | limits: 38 | cpu: 1 39 | memory: 4Gi 40 | volumes: 41 | - name: kubeflow-imagenet 42 | persistentVolumeClaim: 43 | claimName: kubeflow-imagenet 44 | affinity: 45 | nodeAffinity: 46 | requiredDuringSchedulingIgnoredDuringExecution: 47 | nodeSelectorTerms: 48 | - matchExpressions: 49 | - key: topology.kubernetes.io/region 50 | operator: In 51 | values: 52 | - LAS1 53 | restartPolicy: Never 54 | backoffLimit: 2 55 | -------------------------------------------------------------------------------- /kubeflow/training-operator/resnet50/k8s/imagenet-mpijob.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: "kubeflow.org/v2beta1" 2 | kind: "MPIJob" 3 | metadata: 4 | name: "imagenet-16gpu-mpijob" 5 | spec: 6 | slotsPerWorker: 8 7 | runPolicy: 8 | cleanPodPolicy: Running 9 | mpiReplicaSpecs: 10 | Launcher: 11 | replicas: 1 12 | restartPolicy: OnFailure 13 | template: 14 | spec: 15 | containers: 16 | - name: pytorch 17 | image: navarrepratt/pytorch_mpi_resnet50:6 18 | command: 19 | - "mpirun" 20 | - "-np" 21 | - "16" # Total processes = num workers * slots per workers 22 | - "-x" 23 | - "WANDB_API_KEY=$(WANDB_API_KEY)" 24 | - "-x" 25 | - "NCCL_DEBUG=INFO" 26 | - "--allow-run-as-root" 27 | - "python" 28 | - "/opt/resnet50/src/resnet50.py" 29 | - "--data-dir" 30 | - "/mnt/pvc/dataset/ILSVRC/Data/CLS-LOC" 31 | - "--model-dir" 32 | - "/mnt/pvc/mpi/checkpoints" 33 | - "--epochs" 34 | - "10" 35 | - "--batch-size" 36 | - "256" 37 | - "--wandb-project" 38 | - "resnet50-imagenet-horovod" 39 | - "--wandb-run" 40 | - "a40-16gpu" 41 | resources: 42 | requests: 43 | cpu: 2 44 | memory: 128Mi 45 | env: 46 | - name: WANDB_API_KEY 47 | valueFrom: 48 | secretKeyRef: 49 | name: wandb-token-secret 50 | key: token 51 | affinity: 52 | nodeAffinity: 53 | requiredDuringSchedulingIgnoredDuringExecution: 54 | nodeSelectorTerms: 55 | - matchExpressions: 56 | - key: failure-domain.beta.kubernetes.io/region 57 | operator: In 58 | values: 59 | - LAS1 60 | 61 | Worker: 62 | replicas: 2 63 | restartPolicy: OnFailure 64 | template: 65 | spec: 66 | containers: 67 | - name: pytorch 68 | image: navarrepratt/pytorch_mpi_resnet50:6 69 | resources: # Use the full node 70 | requests: 71 | cpu: 90 72 | memory: 700G 73 | nvidia.com/gpu: 8 74 | limits: 75 | cpu: 90 76 | memory: 700G 77 | nvidia.com/gpu: 8 78 | volumeMounts: 79 | - name: kubeflow-resnet50 80 | mountPath: /mnt/pvc 81 | - name: dshm 82 | mountPath: /dev/shm 83 | volumes: 84 | - name: kubeflow-resnet50 85 | persistentVolumeClaim: 86 | claimName: kubeflow-resnet50 87 | - emptyDir: 88 | medium: Memory 89 | name: dshm 
90 | affinity: 91 | nodeAffinity: 92 | requiredDuringSchedulingIgnoredDuringExecution: 93 | nodeSelectorTerms: 94 | - matchExpressions: 95 | - key: gpu.nvidia.com/model 96 | operator: In 97 | values: 98 | - A40 99 | - key: failure-domain.beta.kubernetes.io/region 100 | operator: In 101 | values: 102 | - LAS1 103 | -------------------------------------------------------------------------------- /kubeflow/training-operator/resnet50/k8s/kaggle-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | token: enterYourSecret== 4 | kind: Secret 5 | metadata: 6 | name: kaggle-token-secret 7 | type: Opaque 8 | -------------------------------------------------------------------------------- /kubeflow/training-operator/resnet50/k8s/model-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: kubeflow-imagenet 5 | spec: 6 | storageClassName: shared-hdd-las1 7 | accessModes: 8 | - ReadWriteMany 9 | resources: 10 | requests: 11 | storage: 1000Gi 12 | -------------------------------------------------------------------------------- /kubeflow/training-operator/resnet50/k8s/wanbd-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | token: enterYourSecret== 4 | kind: Secret 5 | metadata: 6 | name: wandb-token-secret 7 | type: Opaque 8 | -------------------------------------------------------------------------------- /online-inference/bloom-176b-deepspeed/00-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: microsoft-bloom-deepspeed-inference-fp16 5 | spec: 6 | storageClassName: shared-nvme-las1 7 | accessModes: 8 | - ReadWriteMany 9 | resources: 10 | requests: 11 | storage: 350Gi 12 | -------------------------------------------------------------------------------- /online-inference/bloom-176b-deepspeed/01-download-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: microsoft-bloom-deepspeed-inference-fp16-download 5 | spec: 6 | template: 7 | spec: 8 | containers: 9 | - name: model-downloader 10 | image: tweldoncw/huggingface-hub-downloader:2 11 | imagePullPolicy: IfNotPresent 12 | command: 13 | - "python3" 14 | - "/app/download.py" 15 | - "--model-id=microsoft/bloom-deepspeed-inference-fp16" 16 | - "--revision=main" 17 | env: 18 | - name: HF_HOME 19 | value: /mnt/models 20 | volumeMounts: 21 | - name: model-cache 22 | mountPath: /mnt/models 23 | resources: 24 | requests: 25 | cpu: 1 26 | memory: 4Gi 27 | limits: 28 | cpu: 1 29 | memory: 4Gi 30 | volumes: 31 | - name: model-cache 32 | persistentVolumeClaim: 33 | claimName: microsoft-bloom-deepspeed-inference-fp16 34 | affinity: 35 | nodeAffinity: 36 | requiredDuringSchedulingIgnoredDuringExecution: 37 | nodeSelectorTerms: 38 | - matchExpressions: 39 | - key: topology.kubernetes.io/region 40 | operator: In 41 | values: 42 | - LAS1 43 | restartPolicy: Never 44 | backoffLimit: 2 45 | -------------------------------------------------------------------------------- /online-inference/bloom-176b-deepspeed/02-inference-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.kubeflow.org/v1beta1 2 | kind: InferenceService 3 | metadata: 4 | 
name: microsoft-bloom-deepspeed-inference-fp16 5 | spec: 6 | predictor: 7 | containerConcurrency: 1 8 | minReplicas: 1 9 | maxReplicas: 1 10 | affinity: 11 | nodeAffinity: 12 | requiredDuringSchedulingIgnoredDuringExecution: 13 | nodeSelectorTerms: 14 | - matchExpressions: 15 | - key: topology.kubernetes.io/region 16 | operator: In 17 | values: 18 | - LAS1 19 | - key: gpu.nvidia.com/class 20 | operator: In 21 | values: 22 | - A100_NVLINK_80GB 23 | containers: 24 | - name: kfserving-container 25 | image: tweldoncw/microsoft-bloom-deepspeed-inference-fp16:7 26 | command: 27 | - "/usr/bin/bash" 28 | - "server.sh" 29 | ports: 30 | - containerPort: 5000 31 | protocol: TCP 32 | env: 33 | - name: STORAGE_URI # Kserve mounts the PVC at /mnt/pvc/ 34 | value: pvc://microsoft-bloom-deepspeed-inference-fp16/ 35 | - name: HF_HOME 36 | value: /mnt/models 37 | resources: 38 | limits: 39 | cpu: 12 40 | memory: 64Gi 41 | nvidia.com/gpu: 8 42 | requests: 43 | cpu: 12 44 | memory: 64Gi 45 | nvidia.com/gpu: 8 46 | -------------------------------------------------------------------------------- /online-inference/bloom-176b-deepspeed/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG CUDA_RELEASE=11.6.2-cudnn8-devel-ubuntu20.04 2 | FROM nvidia/cuda:${CUDA_RELEASE} 3 | ENV DEBIAN_FRONTEND=noninteractive 4 | ADD files/ /app 5 | WORKDIR /app 6 | RUN apt -y update && \ 7 | apt -y upgrade && \ 8 | apt install -y git python3 python3-pip python3-mpi4py && \ 9 | pip install --no-cache-dir -r requirements.txt && \ 10 | pip install --no-cache-dir git+https://github.com/microsoft/DeepSpeed-MII && \ 11 | git clone https://github.com/huggingface/transformers-bloom-inference.git 12 | WORKDIR /app/transformers-bloom-inference/bloom-inference-server 13 | RUN git checkout bd8af12 && \ 14 | git apply /app/isvc-patch.txt && \ 15 | chmod +x server.sh 16 | -------------------------------------------------------------------------------- /online-inference/bloom-176b-deepspeed/Dockerfile.downloader: -------------------------------------------------------------------------------- 1 | FROM python:3.9.13-alpine3.16 2 | RUN mkdir /app 3 | ADD downloader/ /app/ 4 | WORKDIR /app 5 | RUN pip3 install --no-cache-dir --upgrade pip 6 | RUN pip3 install --no-cache-dir -r requirements.txt 7 | CMD ["python3", "/app/download.py"] 8 | -------------------------------------------------------------------------------- /online-inference/bloom-176b-deepspeed/downloader/download.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import huggingface_hub as hf 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('--model-id', required=True) 7 | parser.add_argument('--revision', default="main") 8 | args = parser.parse_args() 9 | 10 | hf.snapshot_download(repo_id=args.model_id, revision=args.revision) 11 | -------------------------------------------------------------------------------- /online-inference/bloom-176b-deepspeed/downloader/requirements.txt: -------------------------------------------------------------------------------- 1 | huggingface_hub==0.8.1 -------------------------------------------------------------------------------- /online-inference/bloom-176b-deepspeed/files/isvc-patch.txt: -------------------------------------------------------------------------------- 1 | diff --git a/bloom-inference-server/models/model.py b/bloom-inference-server/models/model.py 2 | index a16c8bb..0c8d69c 100644 3 | --- 
a/bloom-inference-server/models/model.py 4 | +++ b/bloom-inference-server/models/model.py 5 | @@ -18,7 +18,8 @@ class Model: 6 | raise NotImplementedError("This is a dummy class") 7 | 8 | def generate(self, request: GenerateRequest) -> GenerateResponse: 9 | - input_tokens = self.tokenizer(request.text, return_tensors="pt", padding=True) 10 | + input_tokens = self.tokenizer( 11 | + request.text, return_tensors="pt", padding=True) 12 | 13 | for t in input_tokens: 14 | if torch.is_tensor(input_tokens[t]): 15 | @@ -58,14 +59,17 @@ class Model: 16 | 17 | input_token_lengths = [x.shape[0] for x in input_tokens.input_ids] 18 | output_token_lengths = [x.shape[0] for x in output_tokens] 19 | - generated_tokens = [o - i for i, o in zip(input_token_lengths, output_token_lengths)] 20 | + generated_tokens = [o - i for i, 21 | + o in zip(input_token_lengths, output_token_lengths)] 22 | 23 | if request.remove_input_from_output: 24 | # the generate method's output includes input too. Remove input if 25 | # that is requested by the user 26 | - output_tokens = [x[-i:] for x, i in zip(output_tokens, generated_tokens)] 27 | + output_tokens = [x[-i:] 28 | + for x, i in zip(output_tokens, generated_tokens)] 29 | 30 | - output_text = self.tokenizer.batch_decode(output_tokens, skip_special_tokens=True) 31 | + output_text = self.tokenizer.batch_decode( 32 | + output_tokens, skip_special_tokens=True) 33 | 34 | return GenerateResponse(text=output_text, num_generated_tokens=generated_tokens) 35 | 36 | @@ -79,14 +83,31 @@ class Model: 37 | 38 | 39 | def get_downloaded_model_path(model_name: str): 40 | - f = partial( 41 | - snapshot_download, 42 | - repo_id=model_name, 43 | - allow_patterns=["*"], 44 | - local_files_only=is_offline_mode(), 45 | - cache_dir=os.getenv("TRANSFORMERS_CACHE", None), 46 | - ) 47 | - # download only on 1 process 48 | - run_rank_n(f, barrier=True) 49 | - # now since the snapshot is downloaded, pass the model_path to all processes 50 | - return f() 51 | + # Modified to not use snapshot_download() which requires write permissions. 52 | + # InferenceServices mount PVC's as read-only. 
53 | + model_id_split = model_name.split('/') 54 | + model_org = model_id_split[0] 55 | + model_repo = model_id_split[1] 56 | + 57 | + model_directory = ( 58 | + "models" + 59 | + "--" + 60 | + model_org + 61 | + "--" + 62 | + model_repo 63 | + ) 64 | + 65 | + HF_HOME = os.getenv('HF_HOME', '/mnt/models') 66 | + HUB_CACHE = os.path.join(HF_HOME, "hub") 67 | + MODEL_REVISION = os.getenv('MODEL_REVISION', 'main') 68 | + 69 | + model_path = os.path.join(HUB_CACHE, model_directory) 70 | + 71 | + model_ref_path = os.path.join(model_path, 'refs', MODEL_REVISION) 72 | + 73 | + with open(model_ref_path, 'r') as f: 74 | + model_git_ref = f.readlines()[0] 75 | + 76 | + model_snapshot_path = os.path.join(model_path, "snapshots", model_git_ref) 77 | + 78 | + return model_snapshot_path 79 | diff --git a/bloom-inference-server/server.sh b/bloom-inference-server/server.sh 80 | old mode 100644 81 | new mode 100755 82 | index 92179fb..0dcf756 83 | --- a/bloom-inference-server/server.sh 84 | +++ b/bloom-inference-server/server.sh 85 | @@ -1,5 +1,5 @@ 86 | -export MODEL_NAME=bigscience/bloom 87 | -export DEPLOYMENT_FRAMEWORK=hf_accelerate 88 | +export MODEL_NAME=microsoft/bloom-deepspeed-inference-fp16 89 | +export DEPLOYMENT_FRAMEWORK=ds_inference 90 | export DTYPE=fp16 91 | 92 | # for more information on gunicorn see https://docs.gunicorn.org/en/stable/settings.html 93 | -------------------------------------------------------------------------------- /online-inference/bloom-176b-deepspeed/files/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision 3 | torchaudio 4 | --extra-index-url https://download.pytorch.org/whl/cu116 5 | transformers==4.21.3 6 | accelerate==0.12.0 7 | deepspeed>=0.7.3 8 | flask==2.2.2 9 | flask_api==3.0.post1 10 | gunicorn==20.1.0 11 | pydantic==1.10.2 12 | huggingface_hub>=0.9.0 13 | -------------------------------------------------------------------------------- /online-inference/bloom-176b/00-bloom-176b-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: model-cache 5 | spec: 6 | storageClassName: shared-nvme-ord1 7 | accessModes: 8 | - ReadWriteMany 9 | resources: 10 | requests: 11 | storage: 350Gi 12 | 13 | 14 | -------------------------------------------------------------------------------- /online-inference/bloom-176b/01-bloom-176b-download-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: bloom-176b-download 5 | spec: 6 | template: 7 | spec: 8 | containers: 9 | - name: model-downloader 10 | image: tweldoncw/bloom-176b:1 11 | imagePullPolicy: IfNotPresent 12 | command: ["bash", "-c"] 13 | args: 14 | - 'download_model bloom /mnt/pvc' 15 | volumeMounts: 16 | - name: cache 17 | mountPath: /mnt/pvc 18 | resources: 19 | requests: 20 | cpu: 1 21 | memory: 4Gi 22 | limits: 23 | cpu: 1 24 | memory: 4Gi 25 | volumes: 26 | - name: cache 27 | persistentVolumeClaim: 28 | claimName: model-cache 29 | affinity: 30 | nodeAffinity: 31 | requiredDuringSchedulingIgnoredDuringExecution: 32 | nodeSelectorTerms: 33 | - matchExpressions: 34 | - key: topology.kubernetes.io/region 35 | operator: In 36 | values: 37 | - ORD1 38 | restartPolicy: Never 39 | backoffLimit: 2 40 | -------------------------------------------------------------------------------- /online-inference/bloom-176b/02-bloom-176b-inferenceservice.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: serving.kubeflow.org/v1beta1 2 | kind: InferenceService 3 | metadata: 4 | name: bloom-176b 5 | spec: 6 | predictor: 7 | containerConcurrency: 1 8 | minReplicas: 1 9 | maxReplicas: 1 10 | affinity: 11 | nodeAffinity: 12 | requiredDuringSchedulingIgnoredDuringExecution: 13 | nodeSelectorTerms: 14 | - matchExpressions: 15 | - key: gpu.nvidia.com/class 16 | operator: In 17 | values: 18 | - A100_PCIE_80GB 19 | - key: topology.kubernetes.io/region 20 | operator: In 21 | values: 22 | - ORD1 23 | containers: 24 | - name: kserve-container 25 | image: tweldoncw/bloom-176b:1 26 | command: 27 | - "python3" 28 | - "/workspace/bloom.py" 29 | env: 30 | # The following values are defaults which may be changed as needed 31 | - name: MODEL_PATH 32 | value: "/mnt/pvc/bloom" 33 | - name: STORAGE_URI # Kserve mounts the PVC at /mnt/pvc/ 34 | value: pvc://model-cache/ 35 | - name: MODEL_DOWNLOAD_TIMEOUT 36 | value: "3600" 37 | # The following values are defaults which may be changed as needed here, as well in each predictor request. 38 | - name: MIN_LENGTH 39 | value: "1" 40 | - name: MAX_LENGTH 41 | value: "40" 42 | - name: TEMPERATURE 43 | value: "1.0" 44 | - name: TOP_K 45 | value: "50" 46 | - name: TOP_P 47 | value: "1.0" 48 | - name: REPETITION_PENALTY 49 | value: "1.125" 50 | resources: 51 | requests: 52 | cpu: 12 53 | memory: 64Gi 54 | nvidia.com/gpu: 5 55 | limits: 56 | cpu: 12 57 | memory: 64Gi 58 | nvidia.com/gpu: 5 59 | -------------------------------------------------------------------------------- /online-inference/bloom-176b/model/Dockerfile: -------------------------------------------------------------------------------- 1 | # PyTorch and Hugging Face 2 | FROM pytorch/pytorch:1.12.0-cuda11.3-cudnn8-runtime AS pytorch-huggingface 3 | 4 | # Upgrade packages 5 | RUN apt update && apt upgrade -y 6 | 7 | RUN apt update && apt install -y curl git wget zip tree 8 | 9 | ADD requirements.txt /tmp/ 10 | RUN pip3 install -r /tmp/requirements.txt && \ 11 | # Remove Apache Log4j 2 CVE-2021-44228, ray 1.9.1 has not upgraded log4j as they promised \ 12 | rm -rf /opt/conda/lib/python3.7/site-packages/ray/jars 13 | 14 | ADD scripts/ /usr/bin/ 15 | ADD bloom.py /workspace 16 | 17 | #RUN mkdir -p /inference 18 | #WORKDIR /inference 19 | # 20 | #ADD huggingface/wiki_corpus.txt huggingface/wiki_corpus.py ./ 21 | #ADD huggingface/huggingface.py ./ 22 | -------------------------------------------------------------------------------- /online-inference/bloom-176b/model/bloom.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import re 4 | import logging 5 | import kserve 6 | from typing import Dict 7 | 8 | from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline 9 | import torch 10 | 11 | mem_map = {0: '71GIB', 1: '71GIB', 2: '71GIB', 3: '71GIB', 4: '71GIB'} 12 | 13 | model_id = os.getenv('MODEL_ID', "bigscience/bloom") 14 | 15 | options = { 16 | 'MODEL_PATH': os.getenv('MODEL_PATH', "/mnt/pvc/bloom"), 17 | 'MODEL_NAME': re.sub(r'[^\w-]', '-', model_id).lower(), 18 | 'MODEL_TYPE': os.getenv('MODEL_TYPE', 'text-generation'), 19 | #'DEVICE_MAP': os.getenv('DEVICE_MAP', "auto"), 20 | 'MODEL_DOWNLOAD_TIMEOUT': int(os.getenv('MODEL_DOWNLOAD_TIMEOUT', 300)) 21 | } 22 | 23 | model_params = { 24 | 'MIN_LENGTH': int(os.getenv('MIN_LENGTH', 1)), 25 | 'MAX_LENGTH': int(os.getenv('MAX_LENGTH', 40)), 26 | 'TEMPERATURE': float(os.getenv('TEMPERATURE', 1.0)), 
27 | 'TOP_K': int(os.getenv('TOP_K', 50)), 28 | 'TOP_P': float(os.getenv('TOP_P', 1.0)), 29 | 'REPETITION_PENALTY': float(os.getenv('REPETITION_PENALTY', 1.0)), 30 | } 31 | 32 | logging.basicConfig(level=kserve.constants.KSERVE_LOGLEVEL) 33 | logger = logging.getLogger(options['MODEL_NAME']) 34 | 35 | class Model(kserve.Model): 36 | def __init__(self, name: str): 37 | super().__init__(name) 38 | self.name = name 39 | self.ready = False 40 | self.model = None 41 | self.tokenizer = None 42 | self.generator = None 43 | self.model_name = options['MODEL_NAME'] 44 | 45 | def load(self): 46 | self.model = AutoModelForCausalLM.from_pretrained(options["MODEL_PATH"], device_map="auto", max_memory=mem_map, torch_dtype=torch.bfloat16, local_files_only=True) 47 | self.model.bfloat16().eval() 48 | self.tokenizer = AutoTokenizer.from_pretrained(options["MODEL_PATH"], local_files_only=True) 49 | self.generator = pipeline( 50 | options['MODEL_TYPE'], 51 | model=self.model, 52 | tokenizer=self.tokenizer, 53 | device_map="auto", 54 | ) 55 | self.ready = True 56 | 57 | def predict(self, request: Dict) -> Dict: 58 | request_params = model_params.copy() 59 | 60 | if 'parameters' in request: 61 | parameters = request['parameters'] 62 | for k, pv in parameters.items(): 63 | pk = k.upper() 64 | if pk in request_params: 65 | logger.debug(f'Parameter {pk} changed from {request_params[pk]} to {pv}') 66 | request_params[pk] = pv 67 | 68 | return {'predictions': self.generator( 69 | request['instances'], 70 | #do_sample=True, 71 | min_length=request_params['MIN_LENGTH'], 72 | max_length=request_params['MAX_LENGTH'], 73 | temperature=request_params['TEMPERATURE'], 74 | top_k=request_params['TOP_K'], 75 | top_p=request_params['TOP_P'], 76 | repetition_penalty=request_params['REPETITION_PENALTY'] 77 | )} 78 | 79 | @staticmethod 80 | def is_ready(): 81 | ready_path = os.path.join(options['MODEL_PATH'], '.ready.txt') 82 | logger.info(f'Waiting for download to be ready ...') 83 | interval_time = 10 84 | intervals = options['MODEL_DOWNLOAD_TIMEOUT'] // interval_time 85 | for i in range(intervals): 86 | time.sleep(interval_time) 87 | if os.path.exists(ready_path): 88 | logger.info('Download ready') 89 | return 90 | raise Exception(f'Download timeout {interval_time * intervals}!') 91 | 92 | if __name__ == '__main__': 93 | Model.is_ready() 94 | with torch.no_grad(): 95 | model = Model(options['MODEL_NAME']) 96 | model.load() 97 | kserve.ModelServer().start([model]) 98 | -------------------------------------------------------------------------------- /online-inference/bloom-176b/model/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.11.0 2 | git+https://github.com/huggingface/transformers.git@ccc0897 # For device_map in pipelines() support 3 | kserve==0.8.0.2 -------------------------------------------------------------------------------- /online-inference/bloom-176b/model/scripts/download_model: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Author: Marcin Gucki 4 | # https://github.com/coreweave/ml-images/blob/master/huggingface/download_huggingface.sh 5 | 6 | set -ex 7 | 8 | if [[ $# -ne 2 ]]; then 9 | echo "Invalid number of arguments" 10 | echo "Usage: ./download_huggingface.sh " 11 | echo " - model_name - model name to download e.g. NovelAI/genji-python-6b" 12 | echo " - save_path - base directory where the model is saved, e.g. 
/mnt/pvc" 13 | exit 1 14 | fi 15 | 16 | MODEL_NAME=$1 17 | SAVE_PATH=$2 18 | 19 | BLOBSTORE_PREFIX="inference" 20 | PATH=${PATH}:"${CURRENT_DIR}/scripts/bin" 21 | 22 | echo "SAVE_PATH: ${SAVE_PATH}" 23 | echo "MODEL_NAME: ${MODEL_NAME}" 24 | 25 | mkdir -pv "${SAVE_PATH}/${MODEL_NAME}" 26 | 27 | function download_file { 28 | local FILE_PATH=$1 29 | local DIR_PATH=$(dirname "${FILE_PATH}") 30 | mkdir -p "${DIR_PATH}" 31 | curl "http://blobstore.s3.ord1.coreweave.com/inference/${FILE_PATH}" --output "${FILE_PATH}" 32 | } 33 | 34 | function download { 35 | echo "Downloading model ${MODEL_NAME} into ${SAVE_PATH}" 36 | 37 | pushd "${SAVE_PATH}" 38 | mkdir -p "${MODEL_NAME}" 39 | pushd "${MODEL_NAME}" 40 | FILE_LIST=($(curl --insecure "http://blobstore.s3.ord1.coreweave.com/inference/${MODEL_NAME}/files.txt" | awk '{print $4;}')) 41 | popd 42 | 43 | for file in "${FILE_LIST[@]}";do 44 | relative_path=${file#"s3://blobstore/inference/"} 45 | download_file "${relative_path}" 46 | done 47 | 48 | popd 49 | } 50 | 51 | function set_ready { 52 | echo "Save .ready.txt in ${SAVE_PATH}/${MODEL_NAME}" 53 | pushd "${SAVE_PATH}/${MODEL_NAME}" 54 | touch ".ready.txt" 55 | tree 56 | popd 57 | } 58 | 59 | date 60 | download 61 | set_ready 62 | date 63 | 64 | exit 0 65 | -------------------------------------------------------------------------------- /online-inference/custom-basnet/README.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | This example demonstrates deploying an auto-scaling Inference service from a pre-existing docker image. This can be useful when deploying off-the-shelf models that aren't available as ie. Tensorflow SavedModels. One example of this is the [IBM COCO Based Object Detector](https://github.com/IBM/MAX-Object-Detector). An [example InferenceService](./object-detector-inferenceservice.yaml) for that also exists in this repository. The rest of this example will focus on a [public wrapped version](https://github.com/cyrildiagne/basnet-http) of the [BASNet object detection model](https://github.com/NathanUA/BASNet). This example and the test client is based on work by [Cyril Diagne](https://twitter.com/cyrildiagne/status/1256916982764646402). 4 | 5 | **Input** 6 | ![input](./client/images/test.png) 7 | 8 | 9 | **Output** 10 | ![output](./client/expected_output.png) 11 | 12 | ### Getting Started 13 | 14 | After installing `kubectl` and adding your CoreWeave Cloud access credentials, the following steps will deploy the Inference Service. Clone all the files in this repository to follow along. 15 | 16 | 1. Apply the resources. This can be used to both create and update existing manifests 17 | ```bash 18 | $ kubectl apply -f basnet-inferenceservice.yaml 19 | inferenceservice.serving.kubeflow.org/basnet configured 20 | ``` 21 | 22 | 2. List pods to see that the Transformer and Predictor have launched successfully 23 | ```bash 24 | $ kubectl get pods 25 | NAME READY STATUS RESTARTS AGE 26 | basnet-predictor-default-sj9kr-deployment-76b67d669-4gjrp 2/2 Running 0 34s 27 | ``` 28 | If the predictor fails to init, look in the logs for clues `kubectl logs basnet-predictor-default-sj9kr-deployment-76b67d669-4gjrp kfserving-container`. 29 | 30 | 3. Once all the Pods are running, we can get the API endpoint for our model. Since this model doesn't adhere to the [Tensorflow V1 HTTP API](https://www.tensorflow.org/tfx/serving/api_rest#predict_api), we can't use the API endpoint provided by `kubectl get inferenceservices`. 
We have to hit up the predictor directly. 31 | ```bash 32 | $ kubectl get ksvc 33 | NAME URL LATESTCREATED LATESTREADY READY REASON 34 | basnet-predictor-default https://basnet-predictor-default.tenant-test.knative.chi.coreweave.com basnet-predictor-default-sj9kr basnet-predictor-default-sj9kr True 35 | ``` 36 | The URL in the output is the public API URL for your newly deployed model. 37 | 38 | 4. Enter the client directory. You can either run the test client locally or in docker. The output will be in `images/output.png`. 39 | ```bash 40 | $ cd client/ 41 | $ export SERVICE_URL=https://basnet-predictor-default.tenant-test.knative.chi.coreweave.com 42 | $ docker build -t test .; docker run --rm -it -v $(pwd)/images:/app/images test --basnet_service_host $SERVICE_URL 43 | INFO:root: > sending to BASNet... 44 | INFO:root:200 45 | INFO:root: > saving results... 46 | INFO:root: > opening mask... 47 | INFO:root: > compositing final image... 48 | INFO:root: > saving final image... 49 | $ open images/output.png 50 | ``` 51 | 52 | 5. Remove the inference service 53 | ```bash 54 | $ kubectl delete inferenceservices basnet 55 | inferenceservice.serving.kubeflow.org "basnet" deleted 56 | ``` -------------------------------------------------------------------------------- /online-inference/custom-basnet/basnet-inferenceservice.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.kubeflow.org/v1beta1 2 | kind: InferenceService 3 | metadata: 4 | labels: 5 | qos.coreweave.cloud/latency: low 6 | name: basnet 7 | spec: 8 | predictor: 9 | maxReplicas: 20 10 | minReplicas: 1 11 | containerConcurrency: 1 12 | containers: 13 | - name: kfserving-container 14 | image: docker.io/cyrildiagne/basnet-http 15 | ports: 16 | - containerPort: 80 17 | protocol: TCP 18 | resources: 19 | limits: 20 | cpu: "3" 21 | memory: 8Gi 22 | nvidia.com/gpu: "1" 23 | requests: 24 | cpu: 500m 25 | memory: 4Gi 26 | affinity: 27 | nodeAffinity: 28 | requiredDuringSchedulingIgnoredDuringExecution: 29 | nodeSelectorTerms: 30 | - matchExpressions: 31 | - key: gpu.nvidia.com/class 32 | operator: In 33 | values: 34 | - Quadro_RTX_5000 35 | -------------------------------------------------------------------------------- /online-inference/custom-basnet/client/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coreweave/kubernetes-cloud/ed5c832f666badc124f0a12d9c60260920ee9089/online-inference/custom-basnet/client/.DS_Store -------------------------------------------------------------------------------- /online-inference/custom-basnet/client/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7-slim 2 | 3 | #ARG DEBIAN_FRONTEND=noninteractive 4 | #RUN apt-get update && apt-get install -y build-essential 5 | 6 | ENV APP_HOME /app 7 | WORKDIR $APP_HOME 8 | 9 | # Install production dependencies. 
10 | COPY requirements.txt ./ 11 | RUN pip install --no-cache-dir -r ./requirements.txt 12 | 13 | # Copy local code to container image 14 | COPY main.py ./ 15 | 16 | ENTRYPOINT ["python", "main.py"] 17 | -------------------------------------------------------------------------------- /online-inference/custom-basnet/client/expected_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coreweave/kubernetes-cloud/ed5c832f666badc124f0a12d9c60260920ee9089/online-inference/custom-basnet/client/expected_output.png -------------------------------------------------------------------------------- /online-inference/custom-basnet/client/images/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coreweave/kubernetes-cloud/ed5c832f666badc124f0a12d9c60260920ee9089/online-inference/custom-basnet/client/images/.DS_Store -------------------------------------------------------------------------------- /online-inference/custom-basnet/client/images/cut_mask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coreweave/kubernetes-cloud/ed5c832f666badc124f0a12d9c60260920ee9089/online-inference/custom-basnet/client/images/cut_mask.png -------------------------------------------------------------------------------- /online-inference/custom-basnet/client/images/output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coreweave/kubernetes-cloud/ed5c832f666badc124f0a12d9c60260920ee9089/online-inference/custom-basnet/client/images/output.png -------------------------------------------------------------------------------- /online-inference/custom-basnet/client/images/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coreweave/kubernetes-cloud/ed5c832f666badc124f0a12d9c60260920ee9089/online-inference/custom-basnet/client/images/test.png -------------------------------------------------------------------------------- /online-inference/custom-basnet/client/main.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | import argparse 4 | import io 5 | from PIL import Image 6 | 7 | logging.basicConfig(level=logging.INFO) 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--basnet_service_host', required=True, help="The BASNet service host") 11 | args = parser.parse_args() 12 | 13 | # Send to BASNet service. 14 | logging.info(' > sending to BASNet...') 15 | source = open('images/test.png', 'rb') 16 | files = {'data': source } 17 | res = requests.post(args.basnet_service_host, files=files) 18 | logging.info(res.status_code) 19 | 20 | # Save mask locally. 21 | logging.info(' > saving results...') 22 | with open('images/cut_mask.png', 'wb') as f: 23 | f.write(res.content) 24 | # shutil.copyfileobj(res.raw, f) 25 | 26 | logging.info(' > opening mask...') 27 | mask = Image.open('images/cut_mask.png').convert("L").resize((512, 512)) 28 | 29 | # Convert string data to PIL Image. 30 | logging.info(' > compositing final image...') 31 | ref = Image.open(source).resize((512, 512)) 32 | empty = Image.new("RGBA", ref.size, 0) 33 | img = Image.composite(ref, empty, mask) 34 | 35 | # Save locally. 
36 | logging.info(' > saving final image...') 37 | img.save('images/output.png') 38 | -------------------------------------------------------------------------------- /online-inference/custom-basnet/client/requirements.txt: -------------------------------------------------------------------------------- 1 | requests==2.23.0 2 | Pillow==7.1.2 3 | -------------------------------------------------------------------------------- /online-inference/custom-basnet/object-detector-inferenceservice.yaml: -------------------------------------------------------------------------------- 1 | # This is a CPU only export of the model, for demonstration purposes only 2 | apiVersion: serving.kubeflow.org/v1alpha2 3 | kind: InferenceService 4 | metadata: 5 | labels: 6 | qos.coreweave.cloud/latency: low 7 | name: object-detector 8 | spec: 9 | default: 10 | predictor: 11 | custom: 12 | container: 13 | image: codait/max-object-detector 14 | name: kfserving-container 15 | ports: 16 | - containerPort: 80 17 | resources: 18 | limits: 19 | cpu: "3" 20 | memory: 8Gi 21 | requests: 22 | cpu: "1" 23 | memory: 4Gi 24 | -------------------------------------------------------------------------------- /online-inference/custom-pytorch-aitextgen/aitextgen-inferenceservice.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.kubeflow.org/v1beta1 2 | kind: InferenceService 3 | metadata: 4 | labels: 5 | qos.coreweave.cloud/latency: low 6 | name: aitextgen 7 | spec: 8 | predictor: 9 | maxReplicas: 10 10 | minReplicas: 3 11 | containerConcurrency: 1 12 | containers: 13 | - name: kfserving-container 14 | image: coreweave/aitextgen-model:11 15 | resources: 16 | limits: 17 | cpu: "3" 18 | memory: 18Gi 19 | nvidia.com/gpu: "1" 20 | requests: 21 | cpu: "1" 22 | memory: 10Gi 23 | affinity: 24 | nodeAffinity: 25 | requiredDuringSchedulingIgnoredDuringExecution: 26 | nodeSelectorTerms: 27 | - matchExpressions: 28 | - key: gpu.nvidia.com/class 29 | operator: In 30 | values: 31 | - Quadro_RTX_5000 32 | -------------------------------------------------------------------------------- /online-inference/custom-pytorch-aitextgen/custom-predictor/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.5-cuda10.1-cudnn7-devel 2 | 3 | ARG DEBIAN_FRONTEND=noninteractive 4 | RUN apt-get update && apt-get install -y build-essential git 5 | 6 | ENV APP_HOME /app 7 | WORKDIR $APP_HOME 8 | 9 | RUN git clone https://github.com/NVIDIA/apex 10 | RUN cd apex && /opt/conda/bin/python -u -c 'import sys, setuptools, tokenize; sys.argv[0] = '"'"'./setup.py'"'"'; __file__='"'"'.//setup.py'"'"';f=getattr(tokenize, '"'"'open'"'"', open)(__file__);code=f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' --cpp_ext --cuda_ext install --record /tmp/install-record.txt --single-version-externally-managed --compile --install-headers /opt/conda/include/python3.7m/apex 11 | RUN cd apex && pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ 12 | 13 | # Install production dependencies. 
14 | COPY requirements.txt ./ 15 | RUN pip install --no-cache-dir -r ./requirements.txt 16 | 17 | # Copy local code to container image 18 | COPY *.py ./ 19 | 20 | CMD ["python", "model.py"] 21 | -------------------------------------------------------------------------------- /online-inference/custom-pytorch-aitextgen/custom-predictor/model.py: -------------------------------------------------------------------------------- 1 | import kfserving 2 | from typing import List, Dict 3 | 4 | from aitextgen import aitextgen 5 | 6 | class Model(kfserving.KFModel): 7 | def __init__(self, name: str): 8 | super().__init__(name) 9 | self.name = name 10 | self.ready = False 11 | 12 | def load(self): 13 | self.ai = aitextgen(tf_gpt2="1558M", to_gpu=True, to_fp16=True) 14 | self.ready = True 15 | 16 | def predict(self, request: Dict) -> Dict: 17 | payload = request["text"] 18 | 19 | prediction = self.ai.generate_one(prompt=payload, max_length=request.get("length", 64)) 20 | 21 | return { 'prediction': prediction } 22 | 23 | if __name__ == "__main__": 24 | model = Model('aitextgen') 25 | model.load() 26 | kfserving.KFServer(workers=1).start([model]) 27 | -------------------------------------------------------------------------------- /online-inference/custom-pytorch-aitextgen/custom-predictor/requirements.txt: -------------------------------------------------------------------------------- 1 | kfserving==0.5.1 2 | aitextgen 3 | tensorflow 4 | 5 | -------------------------------------------------------------------------------- /online-inference/custom-sentiment/custom-predictor/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.5-cuda10.1-cudnn7-runtime 2 | 3 | ARG DEBIAN_FRONTEND=noninteractive 4 | RUN apt-get update && apt-get install -y build-essential 5 | 6 | ENV APP_HOME /app 7 | WORKDIR $APP_HOME 8 | 9 | # Install production dependencies. 10 | COPY requirements.txt ./ 11 | RUN pip install --no-cache-dir -r ./requirements.txt 12 | 13 | # Copy local code to container image 14 | COPY *.py ./ 15 | 16 | CMD ["python", "model.py"] 17 | -------------------------------------------------------------------------------- /online-inference/custom-sentiment/custom-predictor/model.py: -------------------------------------------------------------------------------- 1 | import kfserving 2 | from typing import List, Dict 3 | 4 | from fastai.text import load_learner 5 | 6 | class Model(kfserving.KFModel): 7 | def __init__(self, name: str): 8 | super().__init__(name) 9 | self.name = name 10 | self.ready = False 11 | 12 | def load(self): 13 | self.model = load_learner("/mnt/models") 14 | self.ready = True 15 | 16 | def predict(self, request: Dict) -> Dict: 17 | # Request and response follows the Tensorflow V1 HTTP API, 18 | # but does not have to. 
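        # Illustrative shapes only (the response label below is assumed, not taken from the repo;
        # the request matches sample.json in this directory):
        #   request  = {"instances": ["CoreWeave is my favourite cloud"]}
        #   response = {"predictions": ["positive"]}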
19 | # No batching, grab the first instance only 20 | payload = request["instances"][0] 21 | 22 | predictions = self.model.predict(payload) 23 | prediction = predictions[0].obj 24 | 25 | return { 'predictions': [prediction] } 26 | 27 | if __name__ == "__main__": 28 | model = Model('sentiment') 29 | model.load() 30 | kfserving.KFServer(workers=1).start([model]) 31 | -------------------------------------------------------------------------------- /online-inference/custom-sentiment/custom-predictor/requirements.txt: -------------------------------------------------------------------------------- 1 | kfserving==0.5.1 2 | fastai==1.0.61 3 | torch==1.5.0 4 | -------------------------------------------------------------------------------- /online-inference/custom-sentiment/image-secrets-serviceaccount.patch.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | imagePullSecrets: 4 | - name: docker-hub 5 | -------------------------------------------------------------------------------- /online-inference/custom-sentiment/model-storage-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: model-storage 5 | spec: 6 | # https://docs.coreweave.com/coreweave-kubernetes/storage 7 | storageClassName: shared-nvme-ord1 8 | accessModes: 9 | - ReadWriteMany 10 | resources: 11 | requests: 12 | storage: 30Gi 13 | -------------------------------------------------------------------------------- /online-inference/custom-sentiment/sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "instances": ["CoreWeave is my favourite cloud"] 3 | } 4 | -------------------------------------------------------------------------------- /online-inference/custom-sentiment/sentiment-inferenceservice.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.kubeflow.org/v1beta1 2 | kind: InferenceService 3 | metadata: 4 | labels: 5 | qos.coreweave.cloud/latency: low 6 | name: sentiment 7 | spec: 8 | predictor: 9 | maxReplicas: 10 10 | minReplicas: 0 11 | containerConcurrency: 1 12 | containers: 13 | - name: kfserving-container 14 | image: coreweave/fastai-sentiment:4 15 | env: 16 | - name: STORAGE_URI 17 | value: pvc://model-storage/sentiment 18 | resources: 19 | limits: 20 | cpu: "3" 21 | memory: 8Gi 22 | nvidia.com/gpu: "1" 23 | requests: 24 | cpu: "1" 25 | memory: 6Gi 26 | affinity: 27 | nodeAffinity: 28 | requiredDuringSchedulingIgnoredDuringExecution: 29 | nodeSelectorTerms: 30 | - matchExpressions: 31 | - key: gpu.nvidia.com/class 32 | operator: In 33 | values: 34 | - Quadro_RTX_5000 35 | -------------------------------------------------------------------------------- /online-inference/custom-sentiment/sleep-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: sleep 5 | spec: 6 | replicas: 1 7 | revisionHistoryLimit: 1 8 | selector: 9 | matchLabels: 10 | app.kubernetes.io/name: sleep 11 | strategy: 12 | type: Recreate 13 | template: 14 | metadata: 15 | labels: 16 | app.kubernetes.io/name: sleep 17 | spec: 18 | containers: 19 | - name: sleep 20 | image: banst/awscli:1.18.56 21 | # Simple way of keeping an idle container running 22 | command: [sleep] 23 | args: ["86400d"] 24 | imagePullPolicy: IfNotPresent 25 | resources: 26 | 
requests: 27 | cpu: 50m 28 | memory: 10Mi 29 | limits: 30 | cpu: 1 31 | memory: 128Mi 32 | volumeMounts: 33 | - name: model-storage 34 | mountPath: /models 35 | 36 | volumes: 37 | - name: model-storage 38 | persistentVolumeClaim: 39 | claimName: model-storage 40 | 41 | affinity: 42 | nodeAffinity: 43 | requiredDuringSchedulingIgnoredDuringExecution: 44 | nodeSelectorTerms: 45 | - matchExpressions: 46 | - key: topology.kubernetes.io/region 47 | operator: In 48 | values: 49 | - ORD1 50 | -------------------------------------------------------------------------------- /online-inference/dalle-mini/00-model-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: dalle-mini-model-cache 5 | spec: 6 | storageClassName: shared-nvme-ord1 7 | accessModes: 8 | - ReadWriteMany 9 | resources: 10 | requests: 11 | storage: "30Gi" -------------------------------------------------------------------------------- /online-inference/dalle-mini/01-model-download-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: dalle-mini-download 5 | #name: dalle-mega-download 6 | spec: 7 | template: 8 | spec: 9 | containers: 10 | - name: model-downloader 11 | image: tweldoncw/model-downloader:6 12 | imagePullPolicy: IfNotPresent 13 | command: 14 | - "python3" 15 | - "/app/download.py" 16 | - "--model-id=dalle-mini/dalle-mini" 17 | #- "--model-id=dalle-mini/dalle-mega" 18 | - "--model-cache=/mnt/pvc" 19 | volumeMounts: 20 | - name: model-cache 21 | mountPath: /mnt/pvc 22 | resources: 23 | requests: 24 | cpu: 1 25 | memory: 4Gi 26 | limits: 27 | cpu: 1 28 | memory: 4Gi 29 | volumes: 30 | - name: model-cache 31 | persistentVolumeClaim: 32 | claimName: dalle-mini-model-cache 33 | affinity: 34 | nodeAffinity: 35 | requiredDuringSchedulingIgnoredDuringExecution: 36 | nodeSelectorTerms: 37 | - matchExpressions: 38 | - key: topology.kubernetes.io/region 39 | operator: In 40 | values: 41 | - ORD1 42 | restartPolicy: Never 43 | backoffLimit: 2 44 | -------------------------------------------------------------------------------- /online-inference/dalle-mini/02-inference-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.kubeflow.org/v1beta1 2 | kind: InferenceService 3 | metadata: 4 | #name: dalle-mega 5 | name: dalle-mini 6 | spec: 7 | predictor: 8 | containerConcurrency: 1 9 | minReplicas: 1 10 | maxReplicas: 1 11 | affinity: 12 | nodeAffinity: 13 | requiredDuringSchedulingIgnoredDuringExecution: 14 | nodeSelectorTerms: 15 | - matchExpressions: 16 | - key: gpu.nvidia.com/class 17 | operator: In 18 | values: 19 | - RTX_A6000 20 | - key: topology.kubernetes.io/region 21 | operator: In 22 | values: 23 | - ORD1 24 | containers: 25 | - name: kserve-container 26 | image: tweldoncw/dalle-mini:7 27 | command: 28 | - "python3" 29 | - "/app/service.py" 30 | env: 31 | - name: MODEL_ID 32 | #value: "dalle-mini/dalle-mega" 33 | value: "dalle-mini/dalle-mini" 34 | - name: MODEL_CACHE 35 | value: "/mnt/models" 36 | - name: STORAGE_URI # Kserve mounts the PVC at /mnt/models/ 37 | value: pvc://dalle-mini-model-cache/ 38 | # The following env vars are the default model parameters, which can be changed as needed. 
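        # Assumed semantics, for orientation only: TOP_K and TOP_P control top-k and nucleus
        # sampling, TEMPERATURE scales the sampling distribution, and CONDITION_SCALE is the
        # classifier-free guidance ("super conditioning") weight used by DALL-E Mini;
        # service.py presumably reads these environment variables at startup.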
39 | - name: TOP_K 40 | value: "50" 41 | - name: TOP_P 42 | value: "1.0" 43 | - name: TEMPERATURE 44 | value: "1.0" 45 | - name: CONDITION_SCALE 46 | value: "10.0" 47 | resources: 48 | requests: 49 | cpu: 6 50 | memory: 48Gi 51 | nvidia.com/gpu: 1 52 | limits: 53 | cpu: 6 54 | memory: 48Gi 55 | nvidia.com/gpu: 1 56 | -------------------------------------------------------------------------------- /online-inference/dalle-mini/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG MODEL=dalle-mini 2 | ARG CUDA_RELEASE=12.2.0-devel-ubuntu20.04 3 | FROM nvidia/cuda:${CUDA_RELEASE} AS base 4 | ENV DEBIAN_FRONTEND=noninteractive 5 | RUN apt-mark unhold $(apt-mark showhold) 6 | RUN apt update && apt upgrade -y 7 | 8 | RUN apt install -y python3 python3-pip git 9 | RUN mkdir -p /app 10 | ADD model/ /app 11 | WORKDIR /app 12 | RUN pip3 install -r requirements.txt 13 | 14 | CMD ["python3", "/app/service.py"] -------------------------------------------------------------------------------- /online-inference/dalle-mini/Dockerfile.downloader: -------------------------------------------------------------------------------- 1 | FROM python:3.9.13-alpine3.16 2 | RUN mkdir /app 3 | RUN pip3 install huggingface_hub 4 | ADD downloader/download.py /app 5 | CMD ["python3", "/app/download.py"] -------------------------------------------------------------------------------- /online-inference/dalle-mini/downloader/download.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import tempfile 3 | import logging 4 | import shutil 5 | from huggingface_hub import snapshot_download 6 | import os 7 | 8 | logger = logging.getLogger("downloader") 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("--model-id", type=str, default="dalle-mini/dalle-mini") 12 | parser.add_argument("--model-cache", type=str, default="/model-cache") 13 | args = parser.parse_args() 14 | 15 | logger.info(f'Downloading {args.model_id}...') 16 | 17 | tmpdir = tempfile.TemporaryDirectory(dir=args.model_cache) 18 | model = snapshot_download(repo_id=args.model_id, cache_dir=tmpdir.name) 19 | model_dir = os.path.join(args.model_cache, args.model_id) 20 | os.makedirs(model_dir) 21 | 22 | os.chdir(model) 23 | for file in os.listdir(model): 24 | os.getcwd() 25 | src = os.readlink(os.path.join(model, file)) 26 | dest = os.path.join(model_dir, file) 27 | logger.info(f'moving {src} to {dest}') 28 | shutil.move(src, dest) 29 | 30 | ready = os.path.join(model_dir, '.ready.txt') 31 | with open(ready, 'w') as ready_file: 32 | pass 33 | 34 | tmpdir.cleanup() 35 | 36 | logger.info(f'Download complete') -------------------------------------------------------------------------------- /online-inference/dalle-mini/model/requirements.txt: -------------------------------------------------------------------------------- 1 | -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html 2 | jax[cuda]==0.3.15 3 | dalle-mini==0.1.1 4 | git+https://github.com/patil-suraj/vqgan-jax.git@10ef240 5 | jupyter==1.0.0 6 | jupyterlab==3.4.4 7 | ipywidgets==7.7.1 8 | tqdm==4.64.0 9 | kserve==0.9.0 10 | msrest==0.7.1 11 | 12 | -------------------------------------------------------------------------------- /online-inference/fastertransformer/README.md: -------------------------------------------------------------------------------- 1 | Please refer to [the 
documentation](https://docs.coreweave.com/machine-learning-and-ai/inference/examples/triton-inference/triton-inference-server-fastertransformer) for usage instructions. 2 | -------------------------------------------------------------------------------- /online-inference/fastertransformer/build/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Rahul Talari (rtalari@coreweave.com) 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Base Image 16 | ARG TRITON_VERSION=22.04 17 | ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3 18 | FROM ${BASE_IMAGE} as server-builder 19 | 20 | # Get NIVIDIA keys to authenticate 21 | RUN export this_distro="$(cat /etc/os-release | grep '^ID=' | awk -F'=' '{print $2}')" \ 22 | && export this_version="$(cat /etc/os-release | grep '^VERSION_ID=' | awk -F'=' '{print $2}' | sed 's/[^0-9]*//g')" \ 23 | && apt-key adv --fetch-keys "https://developer.download.nvidia.com/compute/cuda/repos/${this_distro}${this_version}/x86_64/7fa2af80.pub" \ 24 | && apt-key adv --fetch-keys "https://developer.download.nvidia.com/compute/cuda/repos/${this_distro}${this_version}/x86_64/3bf863cc.pub" 25 | 26 | # Run updates and install packages for build 27 | RUN apt-get update && \ 28 | apt-get install -y --no-install-recommends \ 29 | openssh-server zsh tmux mosh locales-all clangd sudo \ 30 | zip unzip wget build-essential autoconf autogen gdb \ 31 | python3.8 python3-pip python3-dev rapidjson-dev \ 32 | xz-utils zstd libz-dev && \ 33 | apt-get clean && \ 34 | rm -rf /var/lib/apt/lists/* 35 | 36 | # Setup workdir for build 37 | WORKDIR /workspace/build/ 38 | 39 | # CMake 40 | RUN CMAKE_VERSION=3.18 && \ 41 | CMAKE_BUILD=3.18.4 && \ 42 | wget -nv https://cmake.org/files/v${CMAKE_VERSION}/cmake-${CMAKE_BUILD}.tar.gz && \ 43 | tar -xf cmake-${CMAKE_BUILD}.tar.gz && \ 44 | cd cmake-${CMAKE_BUILD} && \ 45 | ./bootstrap --parallel=$(grep -c ^processor /proc/cpuinfo) -- -DCMAKE_USE_OPENSSL=OFF && \ 46 | make -j"$(grep -c ^processor /proc/cpuinfo)" install && \ 47 | cd /workspace/build/ && \ 48 | rm -rf /workspace/build/cmake-${CMAKE_BUILD} 49 | 50 | # backend build 51 | WORKDIR /workspace/build/triton-experiments 52 | 53 | RUN git clone https://github.com/triton-inference-server/fastertransformer_backend.git 54 | RUN mv /workspace/build/triton-experiments/fastertransformer_backend/cmake /workspace/build/triton-experiments 55 | RUN mv /workspace/build/triton-experiments/fastertransformer_backend/src /workspace/build/triton-experiments 56 | RUN mv /workspace/build/triton-experiments/fastertransformer_backend/CMakeLists.txt /workspace/build/triton-experiments 57 | 58 | ARG FORCE_BACKEND_REBUILD=0 59 | RUN mkdir build -p && \ 60 | cd build && \ 61 | cmake \ 62 | -D CMAKE_EXPORT_COMPILE_COMMANDS=1 \ 63 | -D CMAKE_BUILD_TYPE=Release \ 64 | -D CMAKE_INSTALL_PREFIX=/opt/tritonserver \ 65 | -D TRITON_COMMON_REPO_TAG="r${NVIDIA_TRITON_SERVER_VERSION}" \ 66 | -D 
TRITON_CORE_REPO_TAG="r${NVIDIA_TRITON_SERVER_VERSION}" \ 67 | -D TRITON_BACKEND_REPO_TAG="r${NVIDIA_TRITON_SERVER_VERSION}" \ 68 | .. && \ 69 | make -j"$(grep -c ^processor /proc/cpuinfo)" install 70 | 71 | # ================================= 72 | # Runner Image 73 | # ================================= 74 | 75 | FROM ${BASE_IMAGE} as server 76 | 77 | # TODO: Change to PARALLEL and see performance metrics 78 | ENV NCCL_LAUNCH_MODE=PARALLEL 79 | 80 | COPY --from=server-builder /opt/tritonserver/backends/fastertransformer /opt/tritonserver/backends/fastertransformer -------------------------------------------------------------------------------- /online-inference/fastertransformer/client/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Rahul Talari (rtalari@coreweave.com) 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Base Image 16 | ARG TRITON_VERSION=22.05 17 | ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3 18 | FROM ${BASE_IMAGE} as server-builder 19 | 20 | # Get NIVIDIA keys to authenticate 21 | RUN export this_distro="$(cat /etc/os-release | grep '^ID=' | awk -F'=' '{print $2}')" \ 22 | && export this_version="$(cat /etc/os-release | grep '^VERSION_ID=' | awk -F'=' '{print $2}' | sed 's/[^0-9]*//g')" \ 23 | && apt-key adv --fetch-keys "https://developer.download.nvidia.com/compute/cuda/repos/${this_distro}${this_version}/x86_64/7fa2af80.pub" \ 24 | && apt-key adv --fetch-keys "https://developer.download.nvidia.com/compute/cuda/repos/${this_distro}${this_version}/x86_64/3bf863cc.pub" 25 | 26 | # Run updates and install packages for build 27 | RUN apt-get update && \ 28 | apt-get install -y --no-install-recommends \ 29 | python3.8 python3-pip python3-dev && \ 30 | apt-get clean && \ 31 | rm -rf /var/lib/apt/lists/* 32 | 33 | # Setup workdir for build 34 | WORKDIR /workspace 35 | 36 | ADD gpt_bpe gpt_bpe 37 | ADD hf_tokenizer hf_tokenizer 38 | ADD example.py example.py 39 | ADD sample_request.json sample_request.json 40 | ADD requirements.txt requirements.txt 41 | RUN pip3 install torch==1.9.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html 42 | RUN pip3 install -r requirements.txt 43 | 44 | ENTRYPOINT [ "python3", "example.py" ] 45 | 46 | -------------------------------------------------------------------------------- /online-inference/fastertransformer/client/hf_tokenizer/hf_tokenize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from pathlib import Path 16 | from tokenizers import Tokenizer 17 | from typing import List, Union 18 | 19 | class HFTokenizer: 20 | def __init__(self, vocab_file): 21 | self.tokenizer = Tokenizer.from_file(vocab_file) 22 | 23 | def tokenize(self, text: str): 24 | return self.tokenizer.encode(text).ids 25 | 26 | def tokenize_batch(self, text_batch: Union[List[str], str]): 27 | return self.tokenizer.encode_batch(text_batch) 28 | 29 | def detokenize(self, token_ids): 30 | return self.tokenizer.decode(token_ids) -------------------------------------------------------------------------------- /online-inference/fastertransformer/client/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.1 2 | aiosignal==1.2.0 3 | async-timeout==4.0.2 4 | attrs==22.1.0 5 | Brotli==1.0.9 6 | certifi==2022.6.15 7 | charset-normalizer==2.1.0 8 | frozenlist==1.3.1 9 | gevent==21.12.0 10 | geventhttpclient==2.0 11 | greenlet==1.1.2 12 | grpcio==1.41.0 13 | idna==3.3 14 | multidict==6.0.2 15 | numpy==1.23.1 16 | protobuf==3.19.4 17 | python-rapidjson==1.8 18 | regex==2022.7.25 19 | six==1.16.0 20 | tritonclient==2.24.0 21 | tokenizers==0.12.1 22 | typing_extensions==4.3.0 23 | yarl==1.8.1 24 | zope.event==4.5.0 25 | zope.interface==5.4.0 -------------------------------------------------------------------------------- /online-inference/fastertransformer/client/sample_request.json: -------------------------------------------------------------------------------- 1 | { 2 | "request": [ 3 | { 4 | "name": "input_ids", 5 | "data": [], 6 | "dtype": "int32" 7 | }, 8 | { 9 | "name": "input_lengths", 10 | "data": [], 11 | "dtype": "int32" 12 | }, 13 | { 14 | "name": "request_output_len", 15 | "data": [[64]], 16 | "dtype": "int32" 17 | }, 18 | { 19 | "name": "beam_search_diversity_rate", 20 | "data": [[0]], 21 | "dtype": "float32" 22 | }, 23 | { 24 | "name": "temperature", 25 | "data": [[1.0]], 26 | "dtype": "float32" 27 | }, 28 | { 29 | "name": "len_penalty", 30 | "data": [[1.0]], 31 | "dtype": "float32" 32 | }, 33 | { 34 | "name": "repetition_penalty", 35 | "data": [[1.0]], 36 | "dtype": "float32" 37 | }, 38 | { 39 | "name": "random_seed", 40 | "data": [[0]], 41 | "dtype": "uint64" 42 | }, 43 | { 44 | "name": "is_return_log_probs", 45 | "data": [[false]], 46 | "dtype": "bool" 47 | }, 48 | { 49 | "name": "beam_width", 50 | "data": [[1]], 51 | "dtype": "int32" 52 | }, 53 | { 54 | "name": "runtime_top_k", 55 | "data": [[10]], 56 | "dtype": "int32" 57 | }, 58 | { 59 | "name": "runtime_top_p", 60 | "data": [[0.0]], 61 | "dtype": "float32" 62 | }, 63 | { 64 | "name": "stop_words_list", 65 | "data": [[[0], [-1]]], 66 | "dtype": "int32" 67 | }, 68 | { 69 | "name": "bad_words_list", 70 | "data": [[[0], [-1]]], 71 | "dtype": "int32" 72 | } 73 | ] 74 | } 75 | -------------------------------------------------------------------------------- /online-inference/fastertransformer/ft-inference-service-gptj.yml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.kubeflow.org/v1beta1 
2 | kind: InferenceService 3 | metadata: 4 | labels: 5 | qos.coreweave.cloud/latency: low 6 | name: fastertransformer-triton-gptj 7 | spec: 8 | predictor: 9 | maxReplicas: 1 10 | minReplicas: 1 11 | containerConcurrency: 1 12 | containers: 13 | - name: gptj-ft 14 | image: rtalaricw/gptj_ft:v1.2-22.04-new 15 | command: ["/opt/tritonserver/bin/tritonserver"] 16 | args: ["--model-repository=/mnt/pvc/gptj-store/triton-model-store"] 17 | env: 18 | - name: STORAGE_URI 19 | value: pvc://model-storage/ 20 | ports: 21 | # Uncomment to use GRPC 22 | # - containerPort: 8001 23 | # name: h2c 24 | # protocol: TCP 25 | - containerPort: 8000 26 | protocol: TCP 27 | resources: 28 | requests: 29 | cpu: 4 30 | memory: 8Gi 31 | nvidia.com/gpu: 1 32 | limits: 33 | cpu: 4 34 | memory: 8Gi 35 | nvidia.com/gpu: 1 36 | affinity: 37 | nodeAffinity: 38 | requiredDuringSchedulingIgnoredDuringExecution: 39 | nodeSelectorTerms: 40 | - matchExpressions: 41 | - key: gpu.nvidia.com/class 42 | operator: In 43 | values: 44 | - RTX_A5000 45 | - key: topology.kubernetes.io/region 46 | operator: In 47 | values: 48 | - LAS1 -------------------------------------------------------------------------------- /online-inference/fastertransformer/ft-inference-service-neox.yml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.kubeflow.org/v1beta1 2 | kind: InferenceService 3 | metadata: 4 | labels: 5 | qos.coreweave.cloud/latency: low 6 | name: fastertransformer-triton-neox 7 | spec: 8 | predictor: 9 | maxReplicas: 1 10 | minReplicas: 1 11 | containerConcurrency: 1 12 | containers: 13 | - name: gpt-neox-ft 14 | image: rtalaricw/gptj_ft:v1.2-22.04-new 15 | command: ["/opt/tritonserver/bin/tritonserver"] 16 | args: ["--model-repository=/mnt/pvc/gpt-neox/triton-model-store"] 17 | env: 18 | - name: STORAGE_URI 19 | value: pvc://model-storage/ 20 | ports: 21 | # Uncomment to use GRPC 22 | # - containerPort: 8001 23 | # name: h2c 24 | # protocol: TCP 25 | - containerPort: 8000 26 | protocol: TCP 27 | resources: 28 | requests: 29 | cpu: 4 30 | memory: 64Gi 31 | nvidia.com/gpu: 1 32 | limits: 33 | cpu: 4 34 | memory: 64Gi 35 | nvidia.com/gpu: 1 36 | affinity: 37 | nodeAffinity: 38 | requiredDuringSchedulingIgnoredDuringExecution: 39 | nodeSelectorTerms: 40 | - matchExpressions: 41 | - key: gpu.nvidia.com/class 42 | operator: In 43 | values: 44 | - RTX_A6000 45 | - key: topology.kubernetes.io/region 46 | operator: In 47 | values: 48 | - LAS1 -------------------------------------------------------------------------------- /online-inference/fastertransformer/model-storage-pvc.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: model-storage 5 | spec: 6 | # https://docs.coreweave.com/coreweave-kubernetes/storage 7 | storageClassName: shared-nvme-ord1 8 | accessModes: 9 | - ReadWriteMany 10 | resources: 11 | requests: 12 | storage: 150Gi 13 | 14 | -------------------------------------------------------------------------------- /online-inference/hf-llm/.dockerignore: -------------------------------------------------------------------------------- 1 | * 2 | !serializer/requirements.txt 3 | !serializer/*.py 4 | !service/requirements.txt 5 | !service/*.py 6 | -------------------------------------------------------------------------------- /online-inference/hf-llm/00-optional-s3-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 
| data: 3 | access_key: Replace_this_with_your_base64_encoded_access_key 4 | secret_key: Replace_this_with_your_base64_encoded_secret_key 5 | host_url: Replace_this_with_your_base64_encoded_host_url 6 | kind: Secret 7 | metadata: 8 | name: s3-credentials 9 | type: Opaque 10 | -------------------------------------------------------------------------------- /online-inference/hf-llm/01-optional-s3-serialize-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: hf-llm-serializer 5 | spec: 6 | template: 7 | spec: 8 | containers: 9 | - name: model-serializer 10 | image: ghcr.io/coreweave/ml-containers/hf-llm-inference:073f175 11 | imagePullPolicy: IfNotPresent 12 | command: 13 | - "python3" 14 | - "/app/serialize.py" 15 | - "--hf-model-id=distilgpt2" 16 | - "--precision=float16" 17 | - "--dest-bucket=your-bucket-here" 18 | env: 19 | - name: S3_KEY 20 | valueFrom: 21 | secretKeyRef: 22 | name: s3-credentials 23 | key: access_key 24 | - name: S3_SECRET 25 | valueFrom: 26 | secretKeyRef: 27 | name: s3-credentials 28 | key: secret_key 29 | - name: S3_HOST 30 | valueFrom: 31 | secretKeyRef: 32 | name: s3-credentials 33 | key: host_url 34 | resources: 35 | requests: 36 | cpu: 2 37 | memory: 16Gi 38 | limits: 39 | cpu: 2 40 | memory: 16Gi 41 | affinity: 42 | nodeAffinity: 43 | requiredDuringSchedulingIgnoredDuringExecution: 44 | nodeSelectorTerms: 45 | - matchExpressions: 46 | - key: topology.kubernetes.io/region 47 | operator: In 48 | values: 49 | - ORD1 50 | restartPolicy: Never 51 | backoffLimit: 2 52 | -------------------------------------------------------------------------------- /online-inference/hf-llm/02-inference-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.knative.dev/v1 2 | kind: Service 3 | metadata: 4 | name: hf-llm 5 | annotations: 6 | networking.knative.dev/ingress-class: kourier.ingress.networking.knative.dev 7 | labels: 8 | knative.coreweave.cloud/ingress: kourier.ingress.networking.knative.dev 9 | spec: 10 | template: 11 | metadata: 12 | annotations: 13 | autoscaling.knative.dev/minScale: "1" 14 | autoscaling.knative.dev/maxScale: "1" 15 | spec: 16 | affinity: 17 | nodeAffinity: 18 | requiredDuringSchedulingIgnoredDuringExecution: 19 | nodeSelectorTerms: 20 | - matchExpressions: 21 | - key: gpu.nvidia.com/class 22 | operator: In 23 | values: 24 | - Quadro_RTX_5000 25 | - key: topology.kubernetes.io/region 26 | operator: In 27 | values: 28 | - ORD1 29 | containers: 30 | - name: kfserving-container 31 | image: ghcr.io/coreweave/ml-containers/hf-llm-inference:073f175 32 | command: 33 | - "python3" 34 | - "/app/service.py" 35 | - "--model-uri=s3://tensorized/EleutherAI/pythia-70m" 36 | - "--precision=float16" 37 | - "--port=80" 38 | env: 39 | - name: S3_KEY 40 | valueFrom: 41 | secretKeyRef: 42 | name: s3-credentials 43 | key: access_key 44 | optional: true 45 | - name: S3_SECRET 46 | valueFrom: 47 | secretKeyRef: 48 | name: s3-credentials 49 | key: secret_key 50 | optional: true 51 | - name: S3_HOST 52 | valueFrom: 53 | secretKeyRef: 54 | name: s3-credentials 55 | key: host_url 56 | optional: true 57 | ports: 58 | - protocol: TCP 59 | containerPort: 80 60 | livenessProbe: 61 | httpGet: 62 | path: / 63 | port: 80 64 | initialDelaySeconds: 30 65 | periodSeconds: 30 66 | readinessProbe: 67 | httpGet: 68 | path: / 69 | port: 80 70 | initialDelaySeconds: 30 71 | periodSeconds: 30 72 | resources: 73 | requests: 74 | cpu: 
4 75 | memory: 16Gi 76 | nvidia.com/gpu: 1 77 | limits: 78 | cpu: 4 79 | memory: 16Gi 80 | nvidia.com/gpu: 1 81 | -------------------------------------------------------------------------------- /online-inference/hf-llm/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ghcr.io/coreweave/ml-containers/torch:afecfe9-base-cuda12.0.1-torch2.0.0-vision0.15.1 2 | ENV DEBIAN_FRONTEND=noninteractive 3 | 4 | RUN apt-get -qq update && \ 5 | apt-get -qq install --no-install-recommends -y git curl && \ 6 | apt-get clean 7 | 8 | ADD service/ /app/ 9 | COPY serializer/serialize.py /app/serialize.py 10 | WORKDIR /app 11 | 12 | RUN pip3 install --no-cache-dir --upgrade pip && \ 13 | pip3 install --no-cache-dir -r requirements.txt 14 | -------------------------------------------------------------------------------- /online-inference/hf-llm/serializer/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=2.0.0,<=2.3.0 2 | transformers==4.36.2 3 | tensorizer==2.7.1 4 | -------------------------------------------------------------------------------- /online-inference/hf-llm/serializer/serialize.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from argparse import ArgumentParser 4 | 5 | import torch 6 | from tensorizer import TensorSerializer, stream_io 7 | from transformers import AutoModelForCausalLM 8 | 9 | logging.basicConfig(level=logging.INFO) 10 | logger = logging.getLogger(__file__) 11 | 12 | s3_access_key_default = os.getenv("S3_KEY") or None 13 | s3_secret_access_key_default = os.getenv("S3_SECRET") or None 14 | s3_endpoint_default = os.getenv("S3_HOST") or "object.ord1.coreweave.com" 15 | 16 | parser = ArgumentParser() 17 | parser.add_argument("--hf-model-id", default="distilgpt2", type=str) 18 | parser.add_argument( 19 | "--precision", choices=["float16", "float32"], default="float16", type=str 20 | ) 21 | parser.add_argument("--dest-bucket", required=True, type=str) 22 | parser.add_argument( 23 | "--s3-access-key", 24 | default=s3_access_key_default, 25 | required=s3_access_key_default is None, 26 | type=str, 27 | ) 28 | parser.add_argument( 29 | "--s3-secret-access-key", 30 | default=s3_secret_access_key_default, 31 | required=s3_secret_access_key_default is None, 32 | type=str, 33 | ) 34 | parser.add_argument("--s3-endpoint", default=s3_endpoint_default, type=str) 35 | args = parser.parse_args() 36 | 37 | 38 | def save_artifact_s3(model, path): 39 | serializer = TensorSerializer( 40 | stream_io.open_stream( 41 | path_uri=path, 42 | mode="wb", 43 | s3_access_key_id=args.s3_access_key, 44 | s3_secret_access_key=args.s3_secret_access_key, 45 | s3_endpoint=args.s3_endpoint, 46 | s3_config_path=None, 47 | ) 48 | ) 49 | serializer.write_module(model) 50 | serializer.close() 51 | logger.info(f"Tensorized S3 artifact written to {path}") 52 | 53 | 54 | if __name__ == "__main__": 55 | model_id = args.hf_model_id 56 | model = AutoModelForCausalLM.from_pretrained( 57 | model_id, 58 | torch_dtype=torch.float16 59 | if args.precision == "float16" 60 | else torch.float32, 61 | ) 62 | 63 | model_file = "fp16/model.tensors" if args.precision == "float16" else "" 64 | uri = "s3://" + "/".join((args.dest_bucket, model_id, model_file)) 65 | 66 | save_artifact_s3(model, uri) 67 | -------------------------------------------------------------------------------- /online-inference/hf-llm/service/requirements.txt: 
-------------------------------------------------------------------------------- 1 | torch>=2.0.0,<=2.3.0 2 | transformers==4.36.2 3 | tensorizer==2.7.1 4 | fastapi==0.105.0 5 | uvicorn==0.24.0 6 | -------------------------------------------------------------------------------- /online-inference/image-classifier/jupyter/model-storage-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: model-storage 5 | spec: 6 | # Available shared filesystem storage classes. 7 | # Only use shared filesystems when mounting on multiple nodes is a requirement. 8 | # Regular storage classes provide better performance. 9 | # 10 | # sharedfs-hdd-replicated - HDD Backend shared filesystem with replicas 11 | # sharedfs-ssd-replicated - SSD Backed shared filesystem with replicas 12 | storageClassName: sharedfs-hdd-replicated 13 | accessModes: 14 | - ReadWriteMany 15 | resources: 16 | requests: 17 | storage: 30Gi 18 | -------------------------------------------------------------------------------- /online-inference/image-classifier/jupyter/tensorflow-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: tensorflow-jupyter 5 | spec: 6 | strategy: 7 | type: Recreate 8 | # Replicas controls the number of instances of the Pod to maintain running at all times 9 | replicas: 1 10 | selector: 11 | matchLabels: 12 | app.kubernetes.io/name: tensorflow-jupyter 13 | template: 14 | metadata: 15 | labels: 16 | app.kubernetes.io/name: tensorflow-jupyter 17 | spec: 18 | containers: 19 | - name: tf 20 | image: tensorflow/tensorflow:2.0.1-gpu-py3-jupyter 21 | 22 | ports: 23 | - name: notebook 24 | containerPort: 8888 25 | protocol: TCP 26 | 27 | readinessProbe: 28 | tcpSocket: 29 | port: notebook 30 | initialDelaySeconds: 5 31 | periodSeconds: 10 32 | livenessProbe: 33 | httpGet: 34 | path: / 35 | port: notebook 36 | initialDelaySeconds: 15 37 | periodSeconds: 15 38 | failureThreshold: 3 39 | timeoutSeconds: 10 40 | 41 | volumeMounts: 42 | - name: storage 43 | mountPath: /tf/notebooks 44 | - name: model-storage 45 | mountPath: /models 46 | 47 | resources: 48 | requests: 49 | cpu: 500m # The CPU unit is mili-cores. 500m is 0.5 cores 50 | memory: 2048Mi 51 | limits: 52 | # GPUs can only be allocated as a limit, which both reserves and limits the number of GPUs the Pod will have access to 53 | # Making individual Pods resource light is advantageous for bin-packing. In the case of Jupyter, we stick to two GPUs for 54 | # demonstration purposes 55 | nvidia.com/gpu: 1 56 | 57 | # Node affinity can be used to require / prefer the Pods to be scheduled on a node with a specific hardware type 58 | # No affinity allows scheduling on all hardware types that can fulfill the resource request. 59 | # In this example, without affinity, any NVIDIA GPU would be allowed to run the Pod. 
60 | # Read more about affinity at: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity 61 | affinity: 62 | nodeAffinity: 63 | # This will REQUIRE the Pod to be run on a system with a GPU with 8GB VRAM 64 | requiredDuringSchedulingIgnoredDuringExecution: 65 | nodeSelectorTerms: 66 | - matchExpressions: 67 | - key: gpu.nvidia.com/vram 68 | operator: In 69 | values: 70 | - "8" 71 | 72 | preferredDuringSchedulingIgnoredDuringExecution: 73 | - weight: 10 74 | preference: 75 | matchExpressions: 76 | - key: cpu.coreweave.cloud/family 77 | operator: In 78 | values: 79 | - i5 80 | - i7 81 | - i9 82 | - xeon 83 | - ryzen 84 | 85 | volumes: 86 | - name: storage 87 | persistentVolumeClaim: 88 | claimName: jupyter-pv-claim 89 | - name: model-storage 90 | persistentVolumeClaim: 91 | claimName: model-storage 92 | restartPolicy: Always 93 | -------------------------------------------------------------------------------- /online-inference/image-classifier/jupyter/tensorflow-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | annotations: 5 | metallb.universe.tf/address-pool: public 6 | # Setting a sharing key might save public IP addresses 7 | # See https://metallb.universe.tf/usage/#ip-address-sharing for more detail 8 | metallb.universe.tf/allow-shared-ip: example-1 9 | name: tensorflow-jupyter 10 | spec: 11 | type: LoadBalancer 12 | externalTrafficPolicy: Local 13 | ports: 14 | - name: notebook 15 | port: 8888 16 | protocol: TCP 17 | targetPort: notebook 18 | selector: 19 | app.kubernetes.io/name: tensorflow-jupyter 20 | -------------------------------------------------------------------------------- /online-inference/image-classifier/service/classifier-inferenceservice.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.kubeflow.org/v1beta1 2 | kind: InferenceService 3 | metadata: 4 | name: classifier 5 | annotations: 6 | os.coreweave.cloud/latency: low 7 | spec: 8 | predictor: 9 | # Max one request processed at the same time per container (GPU) 10 | minReplicas: 0 # Allow scale to zero 11 | maxReplicas: 3 12 | containerConcurrency: 1 13 | tensorflow: 14 | # The PVC and path inside the PVC to the model. The path is what we put after /models/ in export_dir in the notebook. 
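      # For example, an export_dir of /models/inception/ in the notebook (assumed here for
      # illustration) maps to storageUri: pvc://model-storage/inception/ on the model-storage PVC below.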
15 | storageUri: pvc://model-storage/inception/ 16 | runtimeVersion: "2.1.0-gpu" 17 | resources: 18 | requests: 19 | cpu: 1 20 | memory: 6Gi 21 | limits: 22 | cpu: 3 23 | memory: 10Gi 24 | nvidia.com/gpu: 1 25 | affinity: 26 | nodeAffinity: 27 | requiredDuringSchedulingIgnoredDuringExecution: 28 | nodeSelectorTerms: 29 | - matchExpressions: 30 | - key: gpu.nvidia.com/class 31 | operator: In 32 | values: 33 | - Tesla_V100 34 | 35 | transformer: 36 | minReplicas: 1 37 | maxReplicas: 2 38 | containers: 39 | - image: coreweave/inception-transformer:0.11 # Docker image of the code found in transformer/ 40 | name: user-container 41 | resources: 42 | requests: 43 | cpu: 200m 44 | memory: 64Mi 45 | limits: 46 | cpu: 3 47 | memory: 8Gi 48 | -------------------------------------------------------------------------------- /online-inference/image-classifier/service/predict_url.sh: -------------------------------------------------------------------------------- 1 | curl -v -d "{\"instances\": [{\"url\":\"$1\"}]}" $SERVICE_URL:predict 2 | -------------------------------------------------------------------------------- /online-inference/image-classifier/transformer/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7-slim 2 | 3 | RUN apt update && apt install -y git 4 | 5 | RUN pip install --upgrade pip 6 | RUN pip install 'git+git://github.com/coreweave/kfserving#egg=kfserving&subdirectory=python/kfserving' 7 | 8 | ADD requirements.txt . 9 | RUN pip install -r requirements.txt 10 | 11 | RUN mkdir -p /transformer/ 12 | WORKDIR /transformer 13 | 14 | COPY *.py ./ 15 | 16 | ENTRYPOINT ["python", "main.py"] 17 | -------------------------------------------------------------------------------- /online-inference/image-classifier/transformer/main.py: -------------------------------------------------------------------------------- 1 | import kfserving 2 | import argparse 3 | from transformer import Transformer 4 | 5 | parser = argparse.ArgumentParser(parents=[kfserving.kfserver.parser]) 6 | parser.add_argument('--model_name', default="model", 7 | help='The name that the model is served under.') 8 | parser.add_argument('--predictor_host', help='The URL for the model predict function', required=True) 9 | 10 | args, _ = parser.parse_known_args() 11 | 12 | if __name__ == "__main__": 13 | transformer = Transformer(args.model_name, predictor_host=args.predictor_host) 14 | kfserver = kfserving.KFServer(workers=4) 15 | kfserver.start(models=[transformer]) 16 | -------------------------------------------------------------------------------- /online-inference/image-classifier/transformer/requirements.txt: -------------------------------------------------------------------------------- 1 | pillow==7.1.2 2 | dill==0.3.1.1 3 | msgpack==0.6.2 4 | numpy==1.18.0 5 | requests==2.22.0 6 | -------------------------------------------------------------------------------- /online-inference/image-classifier/transformer/transformer.py: -------------------------------------------------------------------------------- 1 | import kfserving 2 | from typing import List, Dict 3 | import logging 4 | import requests 5 | import numpy as np 6 | import base64 7 | 8 | # The signature name is defined at time of export, in signature_def_map supplied to builder 9 | # Tensorflows default is serving_default 10 | SERVING_SIGNATURE_NAME = 'serving_default' 11 | 12 | logging.basicConfig(level=kfserving.constants.KFSERVING_LOGLEVEL) 13 | 14 | class Transformer(kfserving.KFModel): 15 | def 
__init__(self, name: str, predictor_host: str): 16 | super().__init__(name) 17 | self.predictor_host = predictor_host 18 | 19 | self.labels = requests.get( 20 | "https://storage.googleapis.com/download.tensorflow.org/data/ImageNetLabels.txt" 21 | ).text.split("\n") 22 | 23 | 24 | # Accept input either in base64 format or as a url 25 | def encode(self, input): 26 | if 'b64' in input: 27 | b64 = input['b64'] 28 | else: 29 | image = requests.get(input["url"]).content 30 | b64 = base64.b64encode(image).decode("utf-8") 31 | 32 | # Input name is defined when exporting the module 33 | # Tensorflow Serving decodes base64 encoded images when sent in an object with the b64 key. 34 | # https://towardsdatascience.com/serving-image-based-deep-learning-models-with-tensorflow-servings-restful-api-d365c16a7dc4 35 | return {"image_bytes": {"b64": b64 } } 36 | 37 | # Match up the most likely prediction to the labels 38 | def decode(self, prediction): 39 | return { 40 | 'class': self.labels[np.argmax(prediction)], 41 | 'score': max(prediction) 42 | } 43 | 44 | def preprocess(self, inputs: Dict) -> Dict: 45 | return {'signature_name': SERVING_SIGNATURE_NAME, 'instances': [self.encode(instance) for instance in inputs['instances']]} 46 | 47 | 48 | def postprocess(self, inputs: List) -> List: 49 | return {'predictions': [self.decode(prediction) for prediction in inputs['predictions']]} -------------------------------------------------------------------------------- /online-inference/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coreweave/kubernetes-cloud/ed5c832f666badc124f0a12d9c60260920ee9089/online-inference/overview.png -------------------------------------------------------------------------------- /online-inference/stable-diffusion/00-optional-s3-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | access_key: Replace_this_with_your_access_key 4 | kind: Secret 5 | metadata: 6 | name: s3-access-key 7 | type: Opaque 8 | --- 9 | apiVersion: v1 10 | data: 11 | secret_key: Replace_this_with_your_secret_key 12 | kind: Secret 13 | metadata: 14 | name: s3-secret-key 15 | type: Opaque 16 | --- 17 | apiVersion: v1 18 | data: 19 | url: Replace_this_with_your_host_url 20 | kind: Secret 21 | metadata: 22 | name: s3-host-url 23 | type: Opaque -------------------------------------------------------------------------------- /online-inference/stable-diffusion/01-optional-s3-serialize-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: stable-diffusion-serializer 5 | spec: 6 | template: 7 | spec: 8 | containers: 9 | - name: model-serializer 10 | image: ghcr.io/coreweave/ml-containers/sd-inference:amercurio-sd-overhaul-7d29c61 11 | imagePullPolicy: IfNotPresent 12 | command: 13 | - "python3" 14 | - "/app/serialize.py" 15 | - "--hf-model-id=runwayml/stable-diffusion-v1-5" 16 | - "--precision=float16" 17 | - "--dest-bucket=your-bucket-here" 18 | env: 19 | - name: AWS_KEY 20 | valueFrom: 21 | secretKeyRef: 22 | name: s3-access-key 23 | key: access_key 24 | - name: AWS_SECRET 25 | valueFrom: 26 | secretKeyRef: 27 | name: s3-secret-key 28 | key: secret_key 29 | - name: AWS_HOST 30 | valueFrom: 31 | secretKeyRef: 32 | name: s3-host-url 33 | key: url 34 | resources: 35 | requests: 36 | cpu: 2 37 | memory: 16Gi 38 | limits: 39 | cpu: 2 40 | memory: 16Gi 41 | affinity: 42 | 
nodeAffinity: 43 | requiredDuringSchedulingIgnoredDuringExecution: 44 | nodeSelectorTerms: 45 | - matchExpressions: 46 | - key: topology.kubernetes.io/region 47 | operator: In 48 | values: 49 | - ORD1 50 | restartPolicy: Never 51 | backoffLimit: 2 52 | -------------------------------------------------------------------------------- /online-inference/stable-diffusion/02-inference-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.knative.dev/v1 2 | kind: Service 3 | metadata: 4 | name: sd 5 | annotations: 6 | networking.knative.dev/ingress-class: kourier.ingress.networking.knative.dev 7 | labels: 8 | knative.coreweave.cloud/ingress: kourier.ingress.networking.knative.dev 9 | spec: 10 | template: 11 | metadata: 12 | annotations: 13 | autoscaling.knative.dev/minScale: "1" 14 | autoscaling.knative.dev/maxScale: "1" 15 | spec: 16 | affinity: 17 | nodeAffinity: 18 | requiredDuringSchedulingIgnoredDuringExecution: 19 | nodeSelectorTerms: 20 | - matchExpressions: 21 | - key: gpu.nvidia.com/class 22 | operator: In 23 | values: 24 | - Quadro_RTX_5000 25 | - key: topology.kubernetes.io/region 26 | operator: In 27 | values: 28 | - ORD1 29 | containers: 30 | - name: kfserving-container 31 | image: ghcr.io/coreweave/ml-containers/sd-inference:amercurio-sd-overhaul-7d29c61 32 | command: 33 | - "python3" 34 | - "/app/service.py" 35 | - "--model-uri=s3://tensorized/runwayml/stable-diffusion-v1-5" 36 | - "--precision=float16" 37 | - "--port=80" 38 | env: 39 | - name: AWS_KEY 40 | valueFrom: 41 | secretKeyRef: 42 | name: s3-access-key 43 | key: access_key 44 | optional: true 45 | - name: AWS_SECRET 46 | valueFrom: 47 | secretKeyRef: 48 | name: s3-secret-key 49 | key: secret_key 50 | optional: true 51 | - name: AWS_HOST 52 | valueFrom: 53 | secretKeyRef: 54 | name: s3-host-url 55 | key: url 56 | optional: true 57 | ports: 58 | - protocol: TCP 59 | containerPort: 80 60 | livenessProbe: 61 | httpGet: 62 | path: / 63 | port: 80 64 | initialDelaySeconds: 30 65 | periodSeconds: 30 66 | readinessProbe: 67 | httpGet: 68 | path: / 69 | port: 80 70 | initialDelaySeconds: 30 71 | periodSeconds: 30 72 | resources: 73 | requests: 74 | cpu: 4 75 | memory: 16Gi 76 | nvidia.com/gpu: 1 77 | limits: 78 | cpu: 4 79 | memory: 16Gi 80 | nvidia.com/gpu: 1 81 | -------------------------------------------------------------------------------- /online-inference/stable-diffusion/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ghcr.io/coreweave/ml-containers/torch:afecfe9-base-cuda11.8.0-torch2.0.0-vision0.15.1 2 | ENV DEBIAN_FRONTEND=noninteractive 3 | 4 | RUN apt update && apt upgrade -y && \ 5 | apt update && apt install -y python3 python3-pip git curl && \ 6 | apt clean 7 | 8 | ADD service/ /app/ 9 | COPY serializer/serialize.py /app/serialize.py 10 | WORKDIR /app 11 | 12 | RUN pip3 install --no-cache-dir --upgrade pip && \ 13 | pip3 install --no-cache-dir -r requirements.txt 14 | -------------------------------------------------------------------------------- /online-inference/stable-diffusion/README.md: -------------------------------------------------------------------------------- 1 | # Stable Diffusion 2 | Please refer to [CoreWeave Docs](https://docs.coreweave.com/machine-learning-and-ai/inference/examples/pytorch-jax/hugging-face/pytorch-hugging-face-diffusers-stable-diffusion-text-to-image) for a deployment tutorial. 
3 | -------------------------------------------------------------------------------- /online-inference/stable-diffusion/serializer/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.0.0 2 | transformers==4.33.1 3 | diffusers==0.20.2 4 | tensorizer==2.3.0 5 | -------------------------------------------------------------------------------- /online-inference/stable-diffusion/serializer/serialize.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import logging 4 | from tensorizer import TensorSerializer, stream_io 5 | from diffusers import StableDiffusionPipeline 6 | from argparse import ArgumentParser 7 | 8 | logging.basicConfig(level=logging.INFO) 9 | logger = logging.getLogger(__file__) 10 | 11 | parser = ArgumentParser() 12 | parser.add_argument("--hf-model-id", default="runwayml/stable-diffusion-v1-5", type=str) 13 | parser.add_argument("--precision", choices=["float16", "float32"], default="float16", type=str) 14 | parser.add_argument("--dest-bucket", default=None, required=True, type=str) 15 | parser.add_argument("--s3-access-key", default=os.getenv("AWS_KEY"), required=False, type=str) 16 | parser.add_argument("--s3-secret-access-key", default=os.getenv("AWS_SECRET"), required=False, type=str) 17 | parser.add_argument("--s3-endpoint", default=os.getenv("AWS_HOST", "object.ord1.coreweave.com"), required=False, type=str) 18 | args = parser.parse_args() 19 | 20 | def save_artifact(model, path, sub_path): 21 | serializer = TensorSerializer(path + sub_path) 22 | serializer.write_module(model) 23 | serializer.close() 24 | 25 | def save_artifact_s3(model, path, sub_path): 26 | serializer = TensorSerializer( 27 | stream_io.open_stream( 28 | path_uri = path + sub_path, 29 | mode = 'wb', 30 | s3_access_key_id = args.s3_access_key, 31 | s3_secret_access_key = args.s3_secret_access_key, 32 | s3_endpoint = args.s3_endpoint, 33 | s3_config_path=None 34 | ) 35 | ) 36 | serializer.write_module(model) 37 | serializer.close() 38 | logger.info(f"Tensorized S3 artifact written to {path + sub_path}") 39 | 40 | if __name__ == '__main__': 41 | model_id = args.hf_model_id 42 | model = StableDiffusionPipeline.from_pretrained( 43 | model_id, 44 | torch_dtype=torch.float16 if args.precision == "float16" else torch.float32 45 | ) 46 | 47 | BASE_S3_URL = f"s3://{args.dest_bucket}/" 48 | 49 | dtype_str = "/fp16" if args.precision == "float16" else "" 50 | 51 | save_artifact_s3(model.vae, BASE_S3_URL + model_id + dtype_str, '/vae.tensors') 52 | save_artifact_s3(model.unet, BASE_S3_URL + model_id + dtype_str, '/unet.tensors') 53 | save_artifact_s3(model.text_encoder, BASE_S3_URL + model_id + dtype_str, '/text_encoder.tensors') 54 | 55 | logger.info(f"Wrote tensorized S3 artifact to: {BASE_S3_URL + model_id}") 56 | -------------------------------------------------------------------------------- /online-inference/stable-diffusion/service/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.0.0 2 | transformers==4.33.1 3 | diffusers==0.20.2 4 | tensorizer==2.3.0 5 | numpy==1.24.2 6 | scipy==1.10.1 7 | fastapi==0.85.1 8 | uvicorn==0.16.0 9 | -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/README.md: -------------------------------------------------------------------------------- 1 | # GPT-J InferenceService with Tensorizer & HuggingFace 2 | 3 | The following 
instructions will guide you through setting up an 4 | [InferenceService](https://docs.coreweave.com/coreweave-machine-learning-and-ai/how-to-guides-and-tutorials/examples) 5 | with [Tensorizer](https://github.com/coreweave/tensorizer) 6 | or [HuggingFace Transformers](https://huggingface.co/docs/transformers/index) 7 | serving [GPT-J-6B](https://huggingface.co/EleutherAI/gpt-j-6b). 8 | 9 | From the root of `tensorizer-isvc`: 10 | 11 | - Provision a [PVC](https://kubernetes.io/docs/concepts/storage/persistent-volumes/) 12 | - `kubectl apply -f pvc.yaml` 13 | - Download the model to the PVC 14 | - `kubectl apply -f model-download/model-download-job.yaml` 15 | - Run the HuggingFace InferenceService (currently using KServe) 16 | - `kubectl apply -f tensorizer_hf_isvc/kserve/hf-isvc.yaml` 17 | - Or, run the Tensorizer InferenceService (currently using KServe) 18 | - `kubectl apply -f tensorizer_hf_isvc/kserve/tensorizer-isvc.yaml` 19 | - View the InferenceService deployment information and URL 20 | - `kubectl get isvc` 21 | - `http://` may be required in place of `https://` when connecting to the displayed URL 22 | - Test the InferenceService 23 | - The KServe services use [KServe's V1 protocol](https://kserve.github.io/website/0.10/modelserving/data_plane/v1_protocol/): 24 | ```bash 25 | curl http://<url>/v1/models/gptj:predict -X POST -H 'Content-Type: application/json' -d '{"instances": ["Hello!"]}' 26 | ``` 27 | - The Flask services simply encode queries into the URL path component: 28 | ```bash 29 | curl http://<url>/predict/Hello%21 30 | ``` 31 | - Run the benchmark 32 | - `python benchmark/load_test.py --kserve --url=<url> --requests=<number of requests>` 33 | - `load_test.py` defaults to running async requests with [`aiohttp`](https://pypi.org/project/aiohttp/) 34 | - `--sync` may be added to the command line to instead send requests sequentially 35 | using [`requests`](https://pypi.org/project/requests/) 36 | - Delete the InferenceService 37 | - `kubectl delete -f tensorizer_hf_isvc/<...>/<...>-isvc.yaml` 38 | - Use the same manifest file that was used with `kubectl apply` 39 | 40 | Each InferenceService manifest (`*-isvc.yaml`) runs a container defined 41 | in a Dockerfile in its same directory, such as `tensorizer_hf_isvc/kserve/Dockerfile`. 42 | These may be changed and rebuilt to customize the behavior of the InferenceService. 43 | 44 | > Note: The build context for each Dockerfile is its parent directory, so the build commands look like: 45 | > ```bash 46 | > docker build ./tensorizer_hf_isvc -f ./tensorizer_hf_isvc/kserve/Dockerfile 47 | > docker build ./tensorizer_hf_isvc -f ./tensorizer_hf_isvc/flask/Dockerfile 48 | > ``` 49 | -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/benchmark/inputs.txt: -------------------------------------------------------------------------------- 1 | Hello, how are you? 2 | What up dig dog? 3 | You are a killer!
4 | Live a good life 5 | Life is great 6 | Chilling on a roof 7 | Love you 8 | Mox is cute 9 | You are my enemy 10 | Change is required 11 | Love the life 12 | -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/benchmark/locustfile.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import random 3 | import urllib.parse 4 | 5 | from locust import HttpUser, task 6 | 7 | inputs_file_path = os.path.join( 8 | os.path.dirname(os.path.abspath(__file__)), "inputs.txt" 9 | ) 10 | 11 | with open(inputs_file_path, "r", encoding="utf-8") as inputs_file: 12 | inputs = [line.strip() for line in inputs_file] 13 | 14 | 15 | def random_inference_url() -> str: 16 | query = urllib.parse.quote(random.choice(inputs)) 17 | return f"/predict/{query}" 18 | 19 | 20 | class QuickstartUser(HttpUser): 21 | @task 22 | def predict(self): 23 | with self.client.get(random_inference_url()) as response: 24 | if response.status_code != 200: 25 | response.failure("Could not return response") 26 | -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/model-download/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ghcr.io/coreweave/ml-containers/torch:bb02bee-base-cuda11.8.0-torch2.0.0-vision0.15.1-audio2.0.1 2 | 3 | # Install cURL, for tensorizer 4 | RUN apt-get -qq update && \ 5 | apt-get -qq install --no-install-recommends -y curl && \ 6 | apt-get clean 7 | 8 | RUN mkdir -p /downloader/ 9 | WORKDIR /downloader 10 | 11 | COPY requirements.txt . 12 | RUN pip install --no-cache-dir --upgrade pip && \ 13 | pip install --no-cache-dir -r requirements.txt 14 | 15 | COPY model_download.py ./ 16 | 17 | ENTRYPOINT ["python", "model_download.py"] 18 | -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/model-download/model-download-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: model-download 5 | spec: 6 | template: 7 | spec: 8 | containers: 9 | - name: model-downloader 10 | image: rtalaricw/model-download-gptj:v2.0 11 | imagePullPolicy: IfNotPresent 12 | volumeMounts: 13 | - name: model-cache 14 | mountPath: /mnt 15 | resources: 16 | requests: 17 | cpu: 2 18 | memory: 40Gi 19 | limits: 20 | cpu: 2 21 | memory: 40Gi 22 | volumes: 23 | - name: model-cache 24 | persistentVolumeClaim: 25 | claimName: model-storage 26 | affinity: 27 | nodeAffinity: 28 | requiredDuringSchedulingIgnoredDuringExecution: 29 | nodeSelectorTerms: 30 | - matchExpressions: 31 | - key: topology.kubernetes.io/region 32 | operator: In 33 | values: 34 | - LAS1 35 | 36 | restartPolicy: Never 37 | backoffLimit: 1 -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/model-download/model_download.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tensorizer import TensorSerializer 3 | from transformers import AutoModelForCausalLM, AutoTokenizer 4 | 5 | model_ref = "EleutherAI/gpt-j-6b" 6 | 7 | tokenizer = AutoTokenizer.from_pretrained(model_ref) 8 | tokenizer.save_pretrained("/mnt") 9 | del tokenizer 10 | 11 | model = AutoModelForCausalLM.from_pretrained( 12 | model_ref, 13 | revision="float16", 14 | torch_dtype=torch.float16, 15 | 
low_cpu_mem_usage=True, 16 | ) 17 | # If only the tensorized model is desired, instead of saving the whole 18 | # PyTorch model, only the PretrainedConfig for the model need be saved 19 | # with the tokenizer and .tensors file. 20 | # model.config.save_pretrained("/mnt") 21 | model.save_pretrained("/mnt") 22 | 23 | serializer = TensorSerializer("/mnt/gptj.tensors") 24 | serializer.write_module(model, remove_tensors=True) 25 | serializer.close() 26 | -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/model-download/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.27.1 2 | tensorizer==1.1.0 3 | accelerate==0.19.0 4 | -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: model-storage 5 | spec: 6 | storageClassName: shared-nvme-las1 7 | accessModes: 8 | - ReadWriteMany 9 | resources: 10 | requests: 11 | storage: 200Gi -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/tensorizer_hf_isvc/flask/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ghcr.io/coreweave/ml-containers/torch:bb02bee-base-cuda11.8.0-torch2.0.0-vision0.15.1-audio2.0.1 2 | 3 | RUN mkdir -p /transformer/ 4 | WORKDIR /transformer 5 | 6 | COPY flask/requirements.txt . 7 | 8 | RUN pip install --no-cache-dir --upgrade pip && \ 9 | pip install --no-cache-dir -r requirements.txt 10 | 11 | COPY flask/flask_api.py . 12 | COPY load_model.py . 
13 | 14 | ENTRYPOINT ["python", "-m", "gunicorn", "-w1", "-b0.0.0.0", "flask_api:app", "--timeout", "300"] 15 | -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/tensorizer_hf_isvc/flask/flask_api.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | from flask import Flask, Response 5 | from load_model import load_model_based_on_type 6 | from transformers import AutoTokenizer 7 | 8 | MODEL_LOAD_TYPE = os.getenv("MODEL_LOAD_TYPE") 9 | 10 | 11 | class Transformer: 12 | def __init__(self): 13 | self.model = load_model_based_on_type(model_load_type=MODEL_LOAD_TYPE) 14 | 15 | self.model.eval() 16 | torch.manual_seed(100) 17 | 18 | self.tokenizer = AutoTokenizer.from_pretrained("/mnt/pvc") 19 | self.eos = self.tokenizer.eos_token_id 20 | 21 | def encode(self, text): 22 | input_ids = self.tokenizer.encode(text, return_tensors="pt").to("cuda") 23 | 24 | return input_ids 25 | 26 | # Match up the most likely prediction to the labels 27 | def decode(self, input_ids): 28 | with torch.no_grad(): 29 | output_ids = self.model.generate( 30 | input_ids, 31 | max_new_tokens=50, 32 | do_sample=True, 33 | pad_token_id=self.eos, 34 | ) 35 | 36 | print(f"tensor output IDs: {output_ids}") 37 | 38 | output = self.tokenizer.decode(output_ids[0], skip_special_tokens=True) 39 | 40 | print(f"tensor output: {output}\n", flush=True) 41 | 42 | return output 43 | 44 | 45 | llm = Transformer() 46 | app = Flask(__name__) 47 | 48 | 49 | @app.route("/") 50 | def index(): 51 | return Response(status=200) 52 | 53 | 54 | @app.route("/predict/") 55 | def predict(text): 56 | input_ids = llm.encode(text) 57 | output = llm.decode(input_ids) 58 | 59 | return Response(output, mimetype="text/plain", status=200) 60 | 61 | 62 | if __name__ == "__main__": 63 | app.run(host="0.0.0.0", port=8000) 64 | -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/tensorizer_hf_isvc/flask/hf-isvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.kubeflow.org/v1beta1 2 | kind: InferenceService 3 | metadata: 4 | labels: 5 | qos.coreweave.cloud/latency: low 6 | name: flask-hf-gptj 7 | spec: 8 | predictor: 9 | maxReplicas: 100 10 | minReplicas: 1 11 | containerConcurrency: 1 12 | containers: 13 | - name: flask-hf-gptj 14 | image: rtalaricw/gptj-hf-tensorizer-pvc:v2.1 15 | ports: 16 | - protocol: TCP 17 | containerPort: 8000 18 | env: 19 | - name: STORAGE_URI 20 | value: pvc://model-storage/ 21 | - name: MODEL_LOAD_TYPE 22 | value: hf 23 | - name: PYTHONUNBUFFERED 24 | value: "1" 25 | resources: 26 | requests: 27 | cpu: 8 28 | memory: 64Gi 29 | nvidia.com/gpu: 1 30 | limits: 31 | cpu: 8 32 | memory: 64Gi 33 | nvidia.com/gpu: 1 34 | affinity: 35 | nodeAffinity: 36 | requiredDuringSchedulingIgnoredDuringExecution: 37 | nodeSelectorTerms: 38 | - matchExpressions: 39 | - key: gpu.nvidia.com/class 40 | operator: In 41 | values: 42 | - A40 43 | - key: topology.kubernetes.io/region 44 | operator: In 45 | values: 46 | - LAS1 -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/tensorizer_hf_isvc/flask/requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==2.3.2 2 | gunicorn==20.1.0 3 | transformers==4.27.1 4 | tensorizer==1.1.0 5 | 
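(Illustrative sketch, not part of the repository files above.) The `flask_api.py` service defined above serves plain-text completions from `GET /predict/<text>` and a health check at `/`. A minimal client might look like the following, assuming a placeholder host in place of the URL reported by `kubectl get isvc`:

```python
# Hypothetical client for the Flask InferenceService above.
# BASE_URL is a placeholder; substitute the URL shown by `kubectl get isvc`.
import urllib.parse

import requests

BASE_URL = "http://flask-hf-gptj.example.com"  # placeholder host


def generate(prompt: str) -> str:
    # flask_api.py takes the prompt URL-encoded into the path of /predict/<text>
    url = f"{BASE_URL}/predict/{urllib.parse.quote(prompt)}"
    # The gunicorn worker is started with a 300-second timeout, so allow long requests
    response = requests.get(url, timeout=300)
    response.raise_for_status()
    return response.text  # the service returns the completion as plain text


if __name__ == "__main__":
    print(generate("Hello!"))
```

The prompt is URL-encoded into the path because the Flask route accepts the text as a path component rather than a JSON body.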
-------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/tensorizer_hf_isvc/flask/tensorizer-isvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.kubeflow.org/v1beta1 2 | kind: InferenceService 3 | metadata: 4 | labels: 5 | qos.coreweave.cloud/latency: low 6 | name: flask-tensorizer-gptj 7 | spec: 8 | predictor: 9 | maxReplicas: 100 10 | minReplicas: 1 11 | containerConcurrency: 1 12 | containers: 13 | - name: flask-tensorizer-gptj 14 | image: rtalaricw/gptj-hf-tensorizer-pvc:v2.1 15 | ports: 16 | - protocol: TCP 17 | containerPort: 8000 18 | env: 19 | - name: STORAGE_URI 20 | value: pvc://model-storage/ 21 | - name: MODEL_LOAD_TYPE 22 | value: tensorizer 23 | - name: PYTHONUNBUFFERED 24 | value: "1" 25 | resources: 26 | requests: 27 | cpu: 8 28 | memory: 64Gi 29 | nvidia.com/gpu: 1 30 | limits: 31 | cpu: 8 32 | memory: 64Gi 33 | nvidia.com/gpu: 1 34 | affinity: 35 | nodeAffinity: 36 | requiredDuringSchedulingIgnoredDuringExecution: 37 | nodeSelectorTerms: 38 | - matchExpressions: 39 | - key: gpu.nvidia.com/class 40 | operator: In 41 | values: 42 | - A40 43 | - key: topology.kubernetes.io/region 44 | operator: In 45 | values: 46 | - LAS1 -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/tensorizer_hf_isvc/kserve/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ghcr.io/coreweave/ml-containers/torch:bb02bee-base-cuda11.8.0-torch2.0.0-vision0.15.1-audio2.0.1 2 | 3 | RUN mkdir -p /transformer/ 4 | WORKDIR /transformer 5 | 6 | COPY kserve/requirements.txt . 7 | 8 | RUN pip install --no-cache-dir --upgrade pip && \ 9 | pip install --no-cache-dir -r requirements.txt 10 | 11 | COPY kserve/kserve_api.py . 12 | COPY load_model.py . 
13 | 14 | ENTRYPOINT ["python", "kserve_api.py"] 15 | -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/tensorizer_hf_isvc/kserve/hf-isvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.kubeflow.org/v1beta1 2 | kind: InferenceService 3 | metadata: 4 | labels: 5 | qos.coreweave.cloud/latency: low 6 | name: kserve-hf-gptj 7 | spec: 8 | predictor: 9 | maxReplicas: 100 10 | minReplicas: 1 11 | containerConcurrency: 1 12 | containers: 13 | - name: kserve-hf-gptj 14 | image: rtalaricw/gptj-hf-tensorizer-pvc-kserve:v2.1 15 | env: 16 | - name: STORAGE_URI 17 | value: pvc://model-storage/ 18 | - name: MODEL_LOAD_TYPE 19 | value: hf 20 | - name: PYTHONUNBUFFERED 21 | value: "1" 22 | resources: 23 | requests: 24 | cpu: 8 25 | memory: 64Gi 26 | nvidia.com/gpu: 1 27 | limits: 28 | cpu: 8 29 | memory: 64Gi 30 | nvidia.com/gpu: 1 31 | affinity: 32 | nodeAffinity: 33 | requiredDuringSchedulingIgnoredDuringExecution: 34 | nodeSelectorTerms: 35 | - matchExpressions: 36 | - key: gpu.nvidia.com/class 37 | operator: In 38 | values: 39 | - A40 40 | - key: topology.kubernetes.io/region 41 | operator: In 42 | values: 43 | - LAS1 -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/tensorizer_hf_isvc/kserve/kserve_api.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import Dict 4 | 5 | import kserve 6 | import kserve.errors 7 | import torch 8 | from load_model import load_model_based_on_type 9 | from transformers import AutoTokenizer 10 | 11 | MODEL_NAME = "gptj" 12 | MODEL_LOAD_TYPE = os.getenv("MODEL_LOAD_TYPE") 13 | 14 | logging.basicConfig(level=kserve.constants.KSERVE_LOGLEVEL) 15 | logger = logging.getLogger(MODEL_NAME) 16 | logger.info(f"Model Name: {MODEL_NAME}") 17 | 18 | 19 | class Model(kserve.Model): 20 | def __init__(self, name: str): 21 | super().__init__(name) 22 | self.name = name 23 | self.model = None 24 | self.tokenizer = None 25 | self.eos = None 26 | self.ready = False 27 | 28 | def load(self): 29 | logger.info(f"Loading {MODEL_NAME}") 30 | 31 | self.model = load_model_based_on_type(model_load_type=MODEL_LOAD_TYPE) 32 | 33 | self.model.eval() 34 | torch.manual_seed(100) 35 | 36 | self.tokenizer = AutoTokenizer.from_pretrained("/mnt/pvc") 37 | self.eos = self.tokenizer.eos_token_id 38 | 39 | self.ready = True 40 | 41 | def validate(self, payload: Dict): 42 | # Ensure that the request has the appropriate type to process 43 | if not isinstance(payload, Dict): 44 | raise kserve.errors.InvalidInput("Expected payload to be a dict") 45 | return super().validate(payload) 46 | 47 | def predict(self, payload: Dict, headers: Dict[str, str] = None) -> Dict: 48 | inputs = payload.get("instances") or ["Please input some text"] 49 | outputs = [] 50 | for text in inputs: 51 | input_ids = self.tokenizer.encode(text, return_tensors="pt").to( 52 | "cuda" 53 | ) 54 | 55 | with torch.no_grad(): 56 | output_ids = self.model.generate( 57 | input_ids, 58 | max_new_tokens=50, 59 | do_sample=True, 60 | pad_token_id=self.eos, 61 | ) 62 | 63 | print(f"tensor output IDs: {output_ids}") 64 | 65 | output = self.tokenizer.decode( 66 | output_ids[0], skip_special_tokens=True 67 | ) 68 | outputs.append(output) 69 | 70 | print(f"tensor output: {output}\n", flush=True) 71 | 72 | return {"predictions": outputs} 73 | 74 | 75 | if __name__ == "__main__": 76 | 
model = Model(name=MODEL_NAME) 77 | model.load() 78 | kserve.ModelServer().start([model]) 79 | -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/tensorizer_hf_isvc/kserve/requirements.txt: -------------------------------------------------------------------------------- 1 | kserve==0.10.1 2 | transformers==4.27.1 3 | tensorizer==1.1.0 4 | -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/tensorizer_hf_isvc/kserve/tensorizer-isvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.kubeflow.org/v1beta1 2 | kind: InferenceService 3 | metadata: 4 | labels: 5 | qos.coreweave.cloud/latency: low 6 | name: kserve-tensorizer-gptj 7 | spec: 8 | predictor: 9 | maxReplicas: 100 10 | minReplicas: 1 11 | containerConcurrency: 1 12 | containers: 13 | - name: kserve-tensorizer-gptj 14 | image: rtalaricw/gptj-hf-tensorizer-pvc-kserve:v2.1 15 | env: 16 | - name: STORAGE_URI 17 | value: pvc://model-storage/ 18 | - name: MODEL_LOAD_TYPE 19 | value: tensorizer 20 | - name: PYTHONUNBUFFERED 21 | value: "1" 22 | resources: 23 | requests: 24 | cpu: 8 25 | memory: 64Gi 26 | nvidia.com/gpu: 1 27 | limits: 28 | cpu: 8 29 | memory: 64Gi 30 | nvidia.com/gpu: 1 31 | affinity: 32 | nodeAffinity: 33 | requiredDuringSchedulingIgnoredDuringExecution: 34 | nodeSelectorTerms: 35 | - matchExpressions: 36 | - key: gpu.nvidia.com/class 37 | operator: In 38 | values: 39 | - A40 40 | - key: topology.kubernetes.io/region 41 | operator: In 42 | values: 43 | - LAS1 -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/tensorizer_hf_isvc/load_model.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Literal 3 | 4 | import torch 5 | from tensorizer import TensorDeserializer 6 | from tensorizer.utils import convert_bytes, get_mem_usage, no_init_or_tensor 7 | from transformers import AutoConfig, AutoModelForCausalLM, GPTJForCausalLM 8 | 9 | DEVICE = "cuda" 10 | 11 | 12 | def load_model_based_on_type( 13 | model_load_type: Literal["tensorizer", "hf"] = "tensorizer", 14 | model_path: str = "/mnt/pvc", 15 | ): 16 | """ 17 | Loads the model using Tensorizer or HuggingFace. 18 | 19 | Args: 20 | model_load_type: Method to load the model [Options: "tensorizer", "hf"] 21 | model_path: Path to the model files 22 | """ 23 | if model_load_type not in ("tensorizer", "hf"): 24 | raise ValueError( 25 | 'model_load_type must be either "tensorizer" or "hf";' 26 | f" got {model_load_type}" 27 | ) 28 | 29 | if model_load_type == "hf": 30 | start = time.time() 31 | model = GPTJForCausalLM.from_pretrained( 32 | model_path, torch_dtype=torch.float16 33 | ).to(DEVICE) 34 | duration = time.time() - start 35 | print( 36 | f"Deserialized model in {duration:0.2f}s" 37 | " using HuggingFace Transformers" 38 | ) 39 | 40 | return model 41 | 42 | # If the config file were not pre-downloaded along with the HuggingFace 43 | # model as in this example, this could use a HuggingFace model reference 44 | # instead of a path for a small download of just the relevant config file. 45 | # model_ref = "EleutherAI/gpt-j-6B" 46 | config = AutoConfig.from_pretrained(model_path) 47 | 48 | # This ensures that the model is not initialized. 
49 | with no_init_or_tensor(): 50 | model = AutoModelForCausalLM.from_config(config) 51 | 52 | before_mem = get_mem_usage() 53 | 54 | # Lazy load the tensors from PVC into the model. 55 | start = time.time() 56 | deserializer = TensorDeserializer( 57 | f"{model_path}/gptj.tensors", plaid_mode=True 58 | ) 59 | deserializer.load_into_module(model) 60 | end = time.time() 61 | 62 | # Brag about how fast we are. 63 | total_bytes_str = convert_bytes(deserializer.total_tensor_bytes) 64 | duration = end - start 65 | per_second = convert_bytes(deserializer.total_tensor_bytes / duration) 66 | after_mem = get_mem_usage() 67 | deserializer.close() 68 | print( 69 | f"Deserialized {total_bytes_str} in {duration:0.2f}s, {per_second}/s" 70 | " using Tensorizer" 71 | ) 72 | print(f"Memory usage before: {before_mem}") 73 | print(f"Memory usage after: {after_mem}") 74 | 75 | return model 76 | -------------------------------------------------------------------------------- /online-inference/vllm/00-s3-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Secret 3 | metadata: 4 | name: s3-credentials 5 | type: Opaque 6 | data: 7 | access_key: 8 | secret_key: 9 | host_url: 10 | -------------------------------------------------------------------------------- /online-inference/vllm/01-s3-serialize-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: vllm-serializer 5 | spec: 6 | template: 7 | spec: 8 | containers: 9 | - name: model-serializer 10 | image: rtalaricw/vllm:0.1 11 | imagePullPolicy: IfNotPresent 12 | command: 13 | - "/bin/sh" 14 | - "-c" 15 | - | 16 | cd /app/vllm && python3 -m examples.tensorize_vllm_model --model EleutherAI/pythia-70m serialize --serialized-directory s3://model-store/ --suffix vllm 17 | env: 18 | - name: S3_ACCESS_KEY_ID 19 | valueFrom: 20 | secretKeyRef: 21 | name: s3-credentials 22 | key: access_key 23 | - name: S3_SECRET_ACCESS_KEY 24 | valueFrom: 25 | secretKeyRef: 26 | name: s3-credentials 27 | key: secret_key 28 | - name: S3_ENDPOINT_URL 29 | valueFrom: 30 | secretKeyRef: 31 | name: s3-credentials 32 | key: host_url 33 | resources: 34 | requests: 35 | cpu: "2" 36 | memory: 16Gi 37 | nvidia.com/gpu: "1" 38 | limits: 39 | cpu: "2" 40 | memory: 16Gi 41 | nvidia.com/gpu: "1" 42 | affinity: 43 | nodeAffinity: 44 | requiredDuringSchedulingIgnoredDuringExecution: 45 | nodeSelectorTerms: 46 | - matchExpressions: 47 | - key: topology.kubernetes.io/region 48 | operator: In 49 | values: 50 | - ORD1 51 | - key: gpu.nvidia.com/class 52 | operator: In 53 | values: 54 | - RTX_A5000 55 | 56 | restartPolicy: Never 57 | backoffLimit: 1 58 | -------------------------------------------------------------------------------- /online-inference/vllm/02-inference-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.knative.dev/v1 2 | kind: Service 3 | metadata: 4 | name: vllm-inference-service 5 | annotations: 6 | networking.knative.dev/ingress-class: kourier.ingress.networking.knative.dev 7 | labels: 8 | knative.coreweave.cloud/ingress: kourier.ingress.networking.knative.dev 9 | spec: 10 | template: 11 | metadata: 12 | annotations: 13 | autoscaling.knative.dev/minScale: "1" 14 | autoscaling.knative.dev/maxScale: "1" 15 | spec: 16 | affinity: 17 | nodeAffinity: 18 | requiredDuringSchedulingIgnoredDuringExecution: 19 | nodeSelectorTerms: 20 | - matchExpressions: 21 | - 
key: gpu.nvidia.com/class 22 | operator: In 23 | values: 24 | - RTX_A5000 25 | - key: topology.kubernetes.io/region 26 | operator: In 27 | values: 28 | - ORD1 29 | containers: 30 | - name: kfserving-container 31 | image: rtalaricw/vllm:0.1 32 | command: 33 | - "/bin/sh" 34 | - "-c" 35 | - | 36 | python -m vllm.entrypoints.openai.api_server \ 37 | --model EleutherAI/pythia-70m \ 38 | --model-loader-extra-config '{"tensorizer_uri": "s3://model-store/vllm/EleutherAI/pythia-70m/vllm/model.tensors"}' \ 39 | --load-format tensorizer \ 40 | --port 80 41 | env: 42 | - name: S3_ACCESS_KEY_ID 43 | valueFrom: 44 | secretKeyRef: 45 | name: s3-credentials 46 | key: access_key 47 | - name: S3_SECRET_ACCESS_KEY 48 | valueFrom: 49 | secretKeyRef: 50 | name: s3-credentials 51 | key: secret_key 52 | - name: S3_ENDPOINT_URL 53 | valueFrom: 54 | secretKeyRef: 55 | name: s3-credentials 56 | key: host_url 57 | ports: 58 | - protocol: TCP 59 | containerPort: 80 60 | livenessProbe: 61 | httpGet: 62 | path: /v1/models 63 | port: 80 64 | initialDelaySeconds: 30 65 | periodSeconds: 30 66 | readinessProbe: 67 | httpGet: 68 | path: /health 69 | port: 80 70 | initialDelaySeconds: 30 71 | periodSeconds: 30 72 | resources: 73 | requests: 74 | cpu: 4 75 | memory: 16Gi 76 | nvidia.com/gpu: 1 77 | limits: 78 | cpu: 4 79 | memory: 16Gi 80 | nvidia.com/gpu: 1 81 | -------------------------------------------------------------------------------- /online-inference/vllm/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ghcr.io/coreweave/ml-containers/torch-extras:c1bf355-nccl-cuda12.2.2-ubuntu22.04-nccl2.19.3-1-torch2.2.2-vision0.17.2-audio2.2.2 2 | ENV DEBIAN_FRONTEND=noninteractive 3 | 4 | RUN apt-get -qq update && \ 5 | apt-get -qq install --no-install-recommends -y git curl libsodium23 && \ 6 | apt-get clean 7 | 8 | RUN pip install git+https://github.com/coreweave/vllm.git@sangstar/tensorizer-update#egg=vllm[tensorizer] 9 | 10 | WORKDIR /app 11 | 12 | RUN git clone -b sangstar/tensorizer-update https://github.com/coreweave/vllm.git 13 | RUN cd vllm && python3 setup.py build_ext --inplace 14 | -------------------------------------------------------------------------------- /online-inference/vllm/README.md: -------------------------------------------------------------------------------- 1 | This folder contains instructions to run the vLLM inference server. 2 | 3 | Some of the features include: 4 | 5 | 1. Serialize a [vLLM-supported model](https://github.com/vllm-project/vllm?tab=readme-ov-file#about) from the HuggingFace Model Hub. 6 | 2. Tensorizer support for fast model deserialization and loading from vLLM 7 | 8 | To run the example: 9 | 10 | 1. Run ```kubectl apply -f 00-optional-s3-secret.yaml``` and replace ```access_key```, ```secret_key``` and ```host_url``` 11 | 2. Run ```kubectl apply -f 01-optional-s3-serialize-job.yaml``` and replace ```--model EleutherAI/pythia-70m```, ```--serialized-directory s3://my-bucket/``` and optionally ```--suffix vllm``` 12 | 3. Run ```kubectl apply -f 02-inference-service.yaml``` and replace ```--model EleutherAI/pythia-70m``` and ```--model-loader-extra-config '{"tensorizer_uri": "s3://model-store/vllm/EleutherAI/pythia-70m/vllm/model.tensors"}'``` with your serialized model path 13 | 14 | You should have an inference service running a container with an OpenAI compatible server. 15 | 16 | To interact with the client, you can ```kubectl get ksvc``` to find your inference service named: ```vllm-inference-service``` to get the URL. 
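(Illustrative sketch, not part of this folder.) Once that URL is known, a completion request can be sent to the server's OpenAI-compatible `/v1/completions` endpoint. The base URL below is a placeholder, and the `model` field must match the `--model` flag passed in `02-inference-service.yaml`:

```python
# Hypothetical request to the vLLM OpenAI-compatible server deployed above.
# BASE_URL is a placeholder; use the URL reported by `kubectl get ksvc`.
import requests

BASE_URL = "http://vllm-inference-service.example.com"  # placeholder host

payload = {
    "model": "EleutherAI/pythia-70m",  # must match the --model flag in the service
    "prompt": "Hello, my name is",
    "max_tokens": 32,
}

response = requests.post(f"{BASE_URL}/v1/completions", json=payload, timeout=60)
response.raise_for_status()
print(response.json()["choices"][0]["text"])
```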
17 | 18 | The URL will be ```:80/```. 19 | 20 | You can use the OpenAI Python client or CURL to interact with it. More information about the client can be found here: https://docs.vllm.ai/en/latest/getting_started/quickstart.html 21 | -------------------------------------------------------------------------------- /sd-dreambooth-workflow/db-finetune-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: db-finetune-data 5 | spec: 6 | storageClassName: shared-hdd-las1 7 | accessModes: 8 | - ReadWriteMany 9 | resources: 10 | requests: 11 | storage: 2000Gi 12 | -------------------------------------------------------------------------------- /sd-dreambooth-workflow/db-workflow-event-binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: WorkflowEventBinding 3 | metadata: 4 | name: db-finetune-event-binding 5 | spec: 6 | event: 7 | selector: discriminator == "db-finetune" 8 | submit: 9 | workflowTemplateRef: 10 | name: db-finetune-template 11 | arguments: 12 | parameters: 13 | - name: run_name 14 | valueFrom: 15 | event: payload.run_name 16 | - name: instance_dataset 17 | valueFrom: 18 | event: payload.instance_dataset 19 | - name: instance_prompt 20 | valueFrom: 21 | event: payload.instance_prompt 22 | - name: class_dataset 23 | valueFrom: 24 | event: payload.class_dataset 25 | - name: class_prompt 26 | valueFrom: 27 | event: payload.class_prompt 28 | - name: output 29 | valueFrom: 30 | event: payload.output 31 | - name: num_class_images 32 | valueFrom: 33 | event: "payload.num_class_images == null ? 100: payload.num_class_images" 34 | - name: run_inference 35 | valueFrom: 36 | event: "payload.run_inference == null ? true : payload.run_inference" 37 | - name: inference_only 38 | valueFrom: 39 | event: "payload.inference_only == null ? 
false : payload.inference_only" 40 | -------------------------------------------------------------------------------- /sd-dreambooth-workflow/huggingface-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | token: enterYourSecret== 4 | kind: Secret 5 | metadata: 6 | name: huggingface-token-secret 7 | type: Opaque 8 | -------------------------------------------------------------------------------- /sd-dreambooth-workflow/inference-role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: inference 5 | --- 6 | apiVersion: rbac.authorization.k8s.io/v1 7 | kind: Role 8 | metadata: 9 | name: role:inference 10 | rules: 11 | - apiGroups: 12 | - serving.kubeflow.org 13 | resources: 14 | - inferenceservices 15 | verbs: 16 | - '*' 17 | - apiGroups: 18 | - serving.knative.dev 19 | resources: 20 | - services 21 | - revisions 22 | verbs: 23 | - '*' 24 | --- 25 | apiVersion: rbac.authorization.k8s.io/v1 26 | kind: RoleBinding 27 | metadata: 28 | name: rolebinding:inference-inference 29 | roleRef: 30 | apiGroup: rbac.authorization.k8s.io 31 | kind: Role 32 | name: role:inference 33 | subjects: 34 | - kind: ServiceAccount 35 | name: inference 36 | 37 | -------------------------------------------------------------------------------- /sd-dreambooth-workflow/wandb-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | token: enterYourSecret== 4 | kind: Secret 5 | metadata: 6 | name: wandb-token-secret 7 | type: Opaque 8 | -------------------------------------------------------------------------------- /sd-finetuner-workflow/huggingface-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | token: enterYourSecret== 4 | kind: Secret 5 | metadata: 6 | name: huggingface-token-secret 7 | type: Opaque 8 | -------------------------------------------------------------------------------- /sd-finetuner-workflow/inference-role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: inference 5 | --- 6 | apiVersion: rbac.authorization.k8s.io/v1 7 | kind: Role 8 | metadata: 9 | name: role:inference 10 | rules: 11 | - apiGroups: 12 | - serving.kubeflow.org 13 | resources: 14 | - inferenceservices 15 | verbs: 16 | - '*' 17 | - apiGroups: 18 | - serving.knative.dev 19 | resources: 20 | - services 21 | - revisions 22 | verbs: 23 | - '*' 24 | --- 25 | apiVersion: rbac.authorization.k8s.io/v1 26 | kind: RoleBinding 27 | metadata: 28 | name: rolebinding:inference-inference 29 | roleRef: 30 | apiGroup: rbac.authorization.k8s.io 31 | kind: Role 32 | name: role:inference 33 | subjects: 34 | - kind: ServiceAccount 35 | name: inference 36 | -------------------------------------------------------------------------------- /sd-finetuner-workflow/sd-finetune-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: sd-finetune-data 5 | spec: 6 | storageClassName: shared-hdd-ord1 7 | accessModes: 8 | - ReadWriteMany 9 | resources: 10 | requests: 11 | storage: 2000Gi 12 | -------------------------------------------------------------------------------- /sd-finetuner-workflow/sd-finetune-workflow-event-binding.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: WorkflowEventBinding 3 | metadata: 4 | name: sd-finetune-event-binding 5 | spec: 6 | event: 7 | selector: discriminator == "sd-finetune" 8 | submit: 9 | workflowTemplateRef: 10 | name: sd-finetune-template 11 | arguments: 12 | parameters: 13 | - name: run_name 14 | valueFrom: 15 | event: payload.run_name 16 | - name: dataset 17 | valueFrom: 18 | event: payload.dataset 19 | - name: run_inference 20 | valueFrom: 21 | event: "payload.run_inference == null ? true : payload.run_inference" 22 | - name: inference_only 23 | valueFrom: 24 | event: "payload.inference_only == null ? false : payload.inference_only" 25 | -------------------------------------------------------------------------------- /sd-finetuner-workflow/sd-finetuner/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gooseai/torch-base:6cfdc11 2 | RUN apt-get install -y cuda-nvcc-11-3 cuda-nvml-dev-11-3 libcurand-dev-11-3 \ 3 | libcublas-dev-11-3 libcusparse-dev-11-3 \ 4 | libcusolver-dev-11-3 cuda-nvprof-11-3 \ 5 | ninja-build && \ 6 | apt-get clean 7 | RUN mkdir /app 8 | WORKDIR /app 9 | COPY requirements.txt . 10 | RUN pip3 install --no-cache-dir -r requirements.txt 11 | RUN pip3 install --no-cache-dir torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116 --upgrade 12 | COPY accelerate_config.yaml . 13 | COPY datasets.py . 14 | COPY finetuner.py . 15 | CMD [ "/usr/bin/python3", "-m", "accelerate.commands.launch", "finetuner.py" ] 16 | -------------------------------------------------------------------------------- /sd-finetuner-workflow/sd-finetuner/accelerate_config.yaml: -------------------------------------------------------------------------------- 1 | command_file: null 2 | commands: null 3 | compute_environment: LOCAL_MACHINE 4 | deepspeed_config: {} 5 | distributed_type: MULTI_GPU 6 | downcast_bf16: 'no' 7 | fsdp_config: {} 8 | gpu_ids: all 9 | machine_rank: 0 10 | main_process_ip: null 11 | main_process_port: null 12 | main_training_function: main 13 | megatron_lm_config: {} 14 | mixed_precision: 'no' 15 | num_machines: 1 16 | num_processes: 1 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_name: null 20 | tpu_zone: null 21 | use_cpu: false 22 | -------------------------------------------------------------------------------- /sd-finetuner-workflow/sd-finetuner/requirements.txt: -------------------------------------------------------------------------------- 1 | diffusers==0.14.0 2 | numpy==1.23.4 3 | wandb==0.13.4 4 | torch 5 | torchvision 6 | transformers>=4.21.0 7 | huggingface-hub>=0.10.0 8 | Pillow==9.2.0 9 | tqdm==4.64.1 10 | ftfy==6.1.1 11 | bitsandbytes 12 | pynvml~=11.4.1 13 | psutil~=5.9.0 14 | accelerate==0.15.0 15 | -------------------------------------------------------------------------------- /sd-finetuner-workflow/wandb-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | token: enterYourSecret== 4 | kind: Secret 5 | metadata: 6 | name: wandb-token-secret 7 | type: Opaque 8 | -------------------------------------------------------------------------------- /spark/cpu-pod-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: cpu-job 5 | spec: 6 | terminationGracePeriodSeconds: 10 7 | containers: 8 | - name: cpu-job 9 | volumeMounts: 
10 | - mountPath: /dev/shm 11 | name: dshm 12 | - name: spark-pvc 13 | mountPath: /mnt/pvc 14 | readOnly: false 15 | 16 | affinity: 17 | nodeAffinity: 18 | requiredDuringSchedulingIgnoredDuringExecution: 19 | nodeSelectorTerms: 20 | - matchExpressions: 21 | - key: topology.kubernetes.io/region 22 | operator: In 23 | values: 24 | - "LGA1" 25 | - key: node.coreweave.cloud/cpu 26 | operator: In 27 | values: 28 | - amd-epyc-rome 29 | - amd-epyc-milan 30 | - intel-xeon-v3 31 | - intel-xeon-v4 32 | volumes: 33 | - name: dshm 34 | emptyDir: 35 | medium: Memory 36 | - name: spark-pvc 37 | persistentVolumeClaim: 38 | claimName: spark-pvc 39 | readOnly: false 40 | restartPolicy: Always 41 | -------------------------------------------------------------------------------- /spark/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG SPARK_VERSION="v3.4.0" 2 | FROM apache/spark-py:$SPARK_VERSION 3 | 4 | USER 0 5 | 6 | RUN mkdir /app 7 | 8 | ARG MSCOCO_SOURCE=https://huggingface.co/datasets/ChristophSchuhmann/MS_COCO_2017_URL_TEXT/resolve/main/mscoco.parquet 9 | RUN wget $MSCOCO_SOURCE -O /app/mscoco.parquet 10 | 11 | ADD requirements.txt /app/requirements.txt 12 | RUN pip install -r /app/requirements.txt 13 | 14 | ADD download_imgdataset.py /app/download_imgdataset.py 15 | -------------------------------------------------------------------------------- /spark/docker/download_imgdataset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | from img2dataset import download 5 | from pyspark.sql import SparkSession 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--url-list", type=Path, default=Path("/mnt/pvc/mscoco.parquet"), help="Path to the url list file") 9 | parser.add_argument("--output", type=Path, default=Path("/mnt/pvc/mscoco"), help="Path to output folder") 10 | parser.add_argument("--thread-count", "-t", type=int, default=16, help="Number of threads for img2dataset") 11 | args = parser.parse_args() 12 | 13 | args.output.mkdir(parents=True, exist_ok=True) 14 | 15 | if not args.url_list.exists(): 16 | raise ValueError(f"The URL list does not exist at: {args.url_list}") 17 | 18 | # All options are specified in the spark submit command. 
Any options specified here will override the spark submit conf 19 | spark = SparkSession.builder.getOrCreate() 20 | 21 | download( 22 | thread_count=args.thread_count, # Process count will be num executors * num cores per executor 23 | url_list=str(args.url_list), 24 | image_size=256, 25 | output_folder=str(args.output), 26 | output_format="webdataset", 27 | input_format="parquet", 28 | url_col="URL", 29 | caption_col="TEXT", 30 | subjob_size=1000, 31 | distributor="pyspark", 32 | ) 33 | -------------------------------------------------------------------------------- /spark/docker/requirements.txt: -------------------------------------------------------------------------------- 1 | img2dataset==1.41.0 2 | -------------------------------------------------------------------------------- /spark/example-spark-submit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Replace this command with your desired namespace if you don't want to use your default namespace 4 | NAMESPACE=$(kubectl config view --minify -o jsonpath='{..namespace}') 5 | echo "Using the namespace: $NAMESPACE" 6 | 7 | $SPARK_HOME/bin/spark-submit \ 8 | --master k8s://https://k8s.ord1.coreweave.com \ 9 | --deploy-mode cluster \ 10 | --name download-mscoco-16-64 \ 11 | --conf spark.driver.cores=16 \ 12 | --conf spark.kubernetes.driver.limit.cores=16 \ 13 | --conf spark.driver.memory="64G" \ 14 | --conf spark.executor.cores=16 \ 15 | --conf spark.kubernetes.executor.limit.cores=16 \ 16 | --conf spark.executor.memory="64G" \ 17 | --conf spark.executor.instances=1 \ 18 | --conf spark.kubernetes.driver.container.image=navarrepratt/spark-download-imgdataset:1.0.2 \ 19 | --conf spark.kubernetes.executor.container.image=navarrepratt/spark-download-imgdataset:1.0.2 \ 20 | --conf spark.kubernetes.driver.podTemplateFile=./cpu-pod-template.yaml \ 21 | --conf spark.kubernetes.executor.podTemplateFile=./cpu-pod-template.yaml \ 22 | --conf spark.kubernetes.namespace="$NAMESPACE" \ 23 | --conf spark.kubernetes.authenticate.driver.serviceAccountName=spark-sa \ 24 | local:///app/download_imgdataset.py --output /mnt/pvc/mscoco -t 2048 25 | -------------------------------------------------------------------------------- /spark/jupyter/jupyter-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: spark-jupyter 5 | spec: 6 | type: ClusterIP 7 | clusterIP: None 8 | ports: 9 | - name: notebook 10 | port: 8888 11 | protocol: TCP 12 | - name: spark-ui 13 | port: 4040 14 | protocol: TCP 15 | - name: blockmanager 16 | port: 7777 17 | protocol: TCP 18 | - name: driver 19 | port: 2222 20 | protocol: TCP 21 | selector: 22 | app.kubernetes.io/name: spark-jupyter 23 | --- 24 | apiVersion: apps/v1 25 | kind: Deployment 26 | metadata: 27 | name: spark-jupyter 28 | spec: 29 | strategy: 30 | type: Recreate 31 | replicas: 1 32 | selector: 33 | matchLabels: 34 | app.kubernetes.io/name: spark-jupyter 35 | template: 36 | metadata: 37 | labels: 38 | app.kubernetes.io/name: spark-jupyter 39 | spec: 40 | serviceAccountName: spark-sa 41 | containers: 42 | - name: jupyter 43 | image: jupyter/all-spark-notebook:python-3.10 44 | command: 45 | - "jupyter" 46 | - "lab" 47 | - "--ip" 48 | - "0.0.0.0" 49 | - "--no-browser" 50 | - "--allow-root" 51 | - "--notebook-dir" 52 | - "/mnt/pvc" 53 | - "--LabApp.token=''" 54 | 55 | securityContext: 56 | runAsUser: 0 57 | 58 | ports: 59 | - name: notebook 60 | containerPort: 8888 
61 | protocol: TCP 62 | - name: blockmanager 63 | containerPort: 7777 64 | protocol: TCP 65 | - name: driver 66 | containerPort: 2222 67 | protocol: TCP 68 | - name: spark-ui 69 | containerPort: 4040 70 | protocol: TCP 71 | 72 | readinessProbe: 73 | tcpSocket: 74 | port: notebook 75 | initialDelaySeconds: 5 76 | periodSeconds: 10 77 | livenessProbe: 78 | httpGet: 79 | path: / 80 | port: notebook 81 | initialDelaySeconds: 15 82 | periodSeconds: 15 83 | failureThreshold: 3 84 | timeoutSeconds: 10 85 | 86 | volumeMounts: 87 | - name: storage 88 | mountPath: /mnt/pvc 89 | 90 | env: 91 | - name: WANDB_API_KEY 92 | valueFrom: 93 | secretKeyRef: 94 | name: wandb-token-secret 95 | key: token 96 | 97 | resources: 98 | requests: 99 | cpu: "4" 100 | memory: 16Gi 101 | limits: 102 | cpu: "4" 103 | memory: 16Gi 104 | affinity: 105 | nodeAffinity: 106 | requiredDuringSchedulingIgnoredDuringExecution: 107 | nodeSelectorTerms: 108 | - matchExpressions: 109 | - key: topology.kubernetes.io/region 110 | operator: In 111 | values: 112 | - "LGA1" 113 | volumes: 114 | - name: storage 115 | persistentVolumeClaim: 116 | claimName: spark-pvc 117 | restartPolicy: Always -------------------------------------------------------------------------------- /spark/spark-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: spark-pvc 5 | spec: 6 | storageClassName: shared-nvme-lga1 7 | accessModes: 8 | - ReadWriteMany 9 | resources: 10 | requests: 11 | storage: "400Gi" 12 | -------------------------------------------------------------------------------- /spark/spark-role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: spark-sa 5 | --- 6 | apiVersion: rbac.authorization.k8s.io/v1 7 | kind: Role 8 | metadata: 9 | name: role:spark 10 | rules: 11 | - apiGroups: 12 | - "" 13 | resources: 14 | - configmaps 15 | - pods 16 | - services 17 | - persistentvolumeclaims 18 | verbs: 19 | - '*' 20 | --- 21 | apiVersion: rbac.authorization.k8s.io/v1 22 | kind: RoleBinding 23 | metadata: 24 | name: spark 25 | roleRef: 26 | apiGroup: rbac.authorization.k8s.io 27 | kind: Role 28 | name: role:spark 29 | subjects: 30 | - kind: ServiceAccount 31 | name: spark-sa 32 | -------------------------------------------------------------------------------- /spark/wandb-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | token: enterYourSecret== 4 | kind: Secret 5 | metadata: 6 | name: wandb-token-secret 7 | type: Opaque 8 | -------------------------------------------------------------------------------- /tensorflow-jupyter/jupyter-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: jupyter-pv-claim 5 | spec: 6 | # Available storage classes at time of writing are 7 | # block-nvme-lga1 - New York - NVMe Storage with 3 Replicas 8 | # block-hdd-lga1 - New York - HDD Storage with 3 Replicas 9 | # Other data centers currently available [ewr1, las1] 10 | storageClassName: block-nvme-lga1 11 | accessModes: 12 | - ReadWriteOnce 13 | resources: 14 | requests: 15 | storage: 10Gi 16 | -------------------------------------------------------------------------------- /tensorflow-jupyter/screenshot.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/coreweave/kubernetes-cloud/ed5c832f666badc124f0a12d9c60260920ee9089/tensorflow-jupyter/screenshot.png -------------------------------------------------------------------------------- /tensorflow-jupyter/tensorflow-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: tensorflow-jupyter 5 | spec: 6 | strategy: 7 | type: Recreate 8 | # Replicas controls the number of instances of the Pod to maintain running at all times 9 | replicas: 1 10 | selector: 11 | matchLabels: 12 | app.kubernetes.io/name: tensorflow-jupyter 13 | template: 14 | metadata: 15 | labels: 16 | app.kubernetes.io/name: tensorflow-jupyter 17 | spec: 18 | containers: 19 | - name: tf 20 | image: tensorflow/tensorflow:2.12.0-gpu-jupyter 21 | 22 | ports: 23 | - name: notebook 24 | containerPort: 8888 25 | protocol: TCP 26 | 27 | readinessProbe: 28 | tcpSocket: 29 | port: notebook 30 | initialDelaySeconds: 5 31 | periodSeconds: 10 32 | livenessProbe: 33 | httpGet: 34 | path: / 35 | port: notebook 36 | initialDelaySeconds: 15 37 | periodSeconds: 15 38 | failureThreshold: 3 39 | timeoutSeconds: 10 40 | 41 | volumeMounts: 42 | - name: storage 43 | mountPath: /tf/notebooks 44 | 45 | resources: 46 | requests: 47 | cpu: 500m # The CPU unit is milli-cores. 500m is 0.5 cores 48 | memory: 16Gi 49 | limits: 50 | cpu: 2000m 51 | memory: 16Gi 52 | # GPUs can only be allocated as a limit, which both reserves and limits the number of GPUs the Pod will have access to 53 | # Making individual Pods resource light is advantageous for bin-packing. In the case of Jupyter, we stick to two GPUs for 54 | # demonstration purposes 55 | nvidia.com/gpu: 2 56 | 57 | # Node affinity can be used to require / prefer the Pods to be scheduled on a node with a specific hardware type 58 | # No affinity allows scheduling on all hardware types that can fulfill the resource request. 59 | # In this example, without affinity, any NVIDIA GPU would be allowed to run the Pod. 60 | # Read more about affinity at: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity 61 | affinity: 62 | nodeAffinity: 63 | # This will REQUIRE the Pod to be run on a system with an NVIDIA A40 GPU 64 | requiredDuringSchedulingIgnoredDuringExecution: 65 | nodeSelectorTerms: 66 | - matchExpressions: 67 | - key: gpu.nvidia.com/class 68 | operator: In 69 | values: 70 | - A40 71 | - key: failure-domain.beta.kubernetes.io/region 72 | operator: In 73 | values: 74 | - LGA1 75 | 76 | # As ML testing doesn't require a lot of network throughput, we try to play nice and only schedule 77 | # the Pod on systems with only 1G network connections. We also desire decent CPUs. This is a preference, not a requirement. 78 | # If systems with i5 / i9 / Xeon CPUs and/or 1G ethernet are not available to fulfill the requested resources, the Pods 79 | # will be scheduled on higher end systems. 
80 | preferredDuringSchedulingIgnoredDuringExecution: 81 | # - weight: 10 82 | # preference: 83 | # matchExpressions: 84 | # - key: cpu.atlantic.cloud/family 85 | # operator: In 86 | # values: 87 | # - i7 88 | # - i5 89 | # - i9 90 | # - xeon 91 | - weight: 10 92 | preference: 93 | matchExpressions: 94 | - key: ethernet.atlantic.cloud/speed 95 | operator: In 96 | values: 97 | - 1G 98 | volumes: 99 | - name: storage 100 | persistentVolumeClaim: 101 | claimName: jupyter-pv-claim 102 | restartPolicy: Always 103 | -------------------------------------------------------------------------------- /tensorflow-jupyter/tensorflow-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | annotations: 5 | metallb.universe.tf/address-pool: public 6 | # Setting a sharing key might save public IP addresses 7 | # See https://metallb.universe.tf/usage/#ip-address-sharing for more detail 8 | metallb.universe.tf/allow-shared-ip: example-1 9 | name: tensorflow-jupyter 10 | spec: 11 | type: LoadBalancer 12 | externalTrafficPolicy: Local 13 | ports: 14 | - name: notebook 15 | port: 8888 16 | protocol: TCP 17 | targetPort: notebook 18 | selector: 19 | app.kubernetes.io/name: tensorflow-jupyter 20 | -------------------------------------------------------------------------------- /virtual-server/examples/curl/README.md: -------------------------------------------------------------------------------- 1 | ## Virtual Server `curl` Example 2 | 3 | This is an example implementation in `bash` of a Kubernetes client interacting with a Virtual Server resource on CoreWeave Cloud using `curl`. The example script provided creates, lists, and deletes a simple Ubuntu 20.04 Virtual Server with 2 CPU cores and 2Gi of memory. 4 | 5 | ## Usage 6 | 7 | ### Dependencies 8 | Before invoking the script, the `jq` and `curl` commands must be installed and available from the `PATH`. 9 | 10 | ### Environment variables 11 | In invoking this script, `TOKEN` and `NAMESPACE` will be exported as environment variables. The value of `NAMESPACE` should be set to the desired namespace. The value of `TOKEN` should be replaced with the value of `'token:'` generated in the `kubeconfig` file. 12 | 13 | > ℹ️ [See more about how to generate the kubeconfig file.](https://docs.coreweave.com/coreweave-kubernetes/getting-started#obtain-access-credentials) 14 | 15 | 16 | ### Running the script 17 | 18 | The script is invoked like so: 19 | 20 | ```bash 21 | TOKEN= NAMESPACE= ./run.sh 22 | ``` 23 | 24 | ## Implementation Breakdown 25 | 26 | The implementation consists of a few simple `curl` calls to two APIs: 27 | 28 | 1. **[Kubevirt](https://kubevirt.io/)** - An open-source project that allows running virtual systems on the Kubernetes cluster. 29 | 1. **[Virtual Server](https://docs.coreweave.com/virtual-servers/getting-started)** - A Kubernetes Custom Resource that allows deploying a virtual system and interacting with Kubevirt with ease. 30 | 31 | > 💡 **Additional resources** 32 | > 33 | > The latest resource details, such as statuses and conditions, are available on [Virtual Servers reference API](https://pkg.go.dev/github.com/coreweave/virtual-server/api/v1alpha1#VirtualServerConditionType) 34 | > The general description of Kubernetes RESTful API is available in [the official documentation of the Kubernetes API Overview](https://kubernetes.io/docs/reference/using-api/). 
Basic concepts of the API are described in [the official documentation of the Kubernetes API Concepts](https://kubernetes.io/docs/reference/using-api/api-concepts/). 35 | 36 | ## Virtual Server functions 37 | 38 | - `create_vs()` - creates the Virtual Server 39 | - `delete_vs()` - deletes the Virtual Server 40 | - `list_vs()` - lists of all the Virtual Servers in the designated namespace 41 | - `get_vs()` - prints formatted JSON details about the Virtual Server 42 | - `wait_until_vs_status()` - loops until the expected condition is met. 43 | 44 | ## Kubevirt functions 45 | 46 | **VM** 47 | - `start_vm()` - starts a Virtual Machine and creates a Virtual Machine Instance 48 | - `stop_vm()` - stops the Virtual Machine, then the deletes Virtual Machine Instance 49 | - `list_vm()` - lists all the Virtual Machines in namespace 50 | - `get_vm()` - prints formatted JSON details about the Virtual Machine 51 | 52 | **VMI** 53 | - `list_vmi()` - lists all the Virtual Machine Instances in the designated namespace 54 | - `get_vmi()` - prints formatted JSON details about Virtual Machine Instance 55 | 56 | > 💡 **Additional resources** 57 | > 58 | > The [Kubevirt Python client](https://github.com/kubevirt/client-python#documentation-for-api-endpoints) can list all of the Kubevirt RESTful API, both for VMs and VMIs. 59 | -------------------------------------------------------------------------------- /virtual-server/examples/curl/virtual-server.json: -------------------------------------------------------------------------------- 1 | { 2 | "apiVersion": "virtualservers.coreweave.com/v1alpha1", 3 | "kind": "VirtualServer", 4 | "metadata": { 5 | "name": "vs-example" 6 | }, 7 | "spec": { 8 | "region": "ORD1", 9 | "os": { 10 | "type": "linux" 11 | }, 12 | "initializeRunning": true, 13 | "resources": { 14 | "cpu": { 15 | "count": 2, 16 | "type": "amd-epyc-rome" 17 | }, 18 | "memory": "2Gi" 19 | }, 20 | "storage": { 21 | "root": { 22 | "size": "40Gi", 23 | "storageClassName": "block-nvme-ord1", 24 | "source": { 25 | "pvc": { 26 | "namespace": "vd-images", 27 | "name": "ubuntu2004-nvidia-515-86-01-1-docker-master-20221205-ord1" 28 | } 29 | } 30 | } 31 | }, 32 | "users": [ 33 | { 34 | "username": "myuser", 35 | "password": "password1234" 36 | } 37 | ], 38 | "network": { 39 | "public": true, 40 | "tcp": { 41 | "ports": [ 42 | 22 43 | ] 44 | } 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /virtual-server/examples/go/.gitignore: -------------------------------------------------------------------------------- 1 | vs 2 | -------------------------------------------------------------------------------- /virtual-server/examples/go/Makefile: -------------------------------------------------------------------------------- 1 | # Go parameters 2 | GOCMD=go 3 | GOMOD=$(GOCMD) mod 4 | GOBUILD=$(GOCMD) build 5 | GOCLEAN=$(GOCMD) clean 6 | BINARY_NAME=vs 7 | 8 | all: install run 9 | 10 | install: 11 | $(GOMOD) download 12 | 13 | run: 14 | export GO111MODULES=on 15 | $(GOBUILD) -o $(BINARY_NAME) -v ./... 16 | ./$(BINARY_NAME) 17 | 18 | clean: 19 | $(GOCLEAN) 20 | rm -rf $(BINARY_NAME) 21 | -------------------------------------------------------------------------------- /virtual-server/examples/go/README.md: -------------------------------------------------------------------------------- 1 | # Go example 2 | 3 | An example Go implementation of a kubernetes client that interacts with the CoreWeave VirtualServer resource as well as the kubevirt subresource api. 
4 | 5 | The Go example illustrates the following: 6 | 1. Builds a VirtualServer definition based on the API at https://github.com/coreweave/virtual-server. 7 | 2. Builds a Service and a PVC to be used as a Floating IP and an additional filesystem, respectively. 8 | 3. Removes an existing VirtualServer. 9 | 4. Creates a new VirtualServer. The instance is started automatically. 10 | 5. Waits for the VirtualServer to reach a ready status. 11 | 6. Creates a floating service when the environment variable `FLOATING_SERVICE_NAME` is specified. 12 | 7. Stops the instance and waits until it is fully stopped. 13 | 8. Deletes the VirtualServer. 14 | 15 | ## Run 16 | 17 | The first run takes longer while all necessary packages are downloaded. 18 | 19 | Be sure to use secure credentials for `USERNAME` and `PASSWORD`, as they will be used to create a user in your Virtual Server. 20 | ``` 21 | USERNAME= PASSWORD= KUBECONFIG=/home//.kubeconfig NAMESPACE= make 22 | ``` 23 | 24 | To create a floating service, the environment variable `FLOATING_SERVICE_NAME` must be specified: 25 | ``` 26 | FLOATING_SERVICE_NAME= PASSWORD= KUBECONFIG=/home//.kubeconfig NAMESPACE= make 27 | ``` 28 | -------------------------------------------------------------------------------- /virtual-server/examples/go/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/coreweave/kubernetes-cloud/virtual-server/examples/go 2 | 3 | go 1.13 4 | 5 | replace ( 6 | github.com/go-kit/kit => github.com/go-kit/kit v0.3.0 7 | github.com/openshift/api => github.com/openshift/api v0.0.0-20210105115604-44119421ec6b 8 | github.com/openshift/client-go => github.com/openshift/client-go v0.0.0-20210112165513-ebc401615f47 9 | github.com/operator-framework/operator-lifecycle-manager => github.com/operator-framework/operator-lifecycle-manager v0.17.0 10 | github.com/operator-framework/operator-registry => github.com/operator-framework/operator-registry v1.16.1 11 | k8s.io/api => k8s.io/api v0.20.2 12 | k8s.io/apimachinery => k8s.io/apimachinery v0.20.2 13 | k8s.io/client-go => k8s.io/client-go v0.20.2 14 | k8s.io/cluster-bootstrap => k8s.io/cluster-bootstrap v0.16.4 15 | kubevirt.io/containerized-data-importer => kubevirt.io/containerized-data-importer v1.26.1 16 | sigs.k8s.io/structured-merge-diff => sigs.k8s.io/structured-merge-diff v1.0.1-0.20191108220359-b1b620dd3f06 17 | ) 18 | 19 | require ( 20 | github.com/coreweave/virtual-server v1.15.0 21 | github.com/spf13/pflag v1.0.5 22 | k8s.io/api v0.20.2 23 | k8s.io/apimachinery v0.20.2 24 | kubevirt.io/client-go v0.39.0 25 | sigs.k8s.io/controller-runtime v0.8.3 26 | ) 27 | -------------------------------------------------------------------------------- /virtual-server/examples/kubectl/README.md: -------------------------------------------------------------------------------- 1 | This directory contains several example manifests for `VirtualServer`. 2 | 3 | To run any of the examples, issue: `kubectl apply -f ` 4 | 5 | CoreWeave provides base images for different operating systems, including images pre-loaded with NVIDIA drivers and remote desktop software. Refer to the [System Images Documentation](https://docs.coreweave.com/virtual-servers/coreweave-system-images) to learn how to list these images via CLI. 6 | 7 | - [virtual-server-direct-attach-lb.yaml](virtual-server-direct-attach-lb.yaml) shows how to directly attach the Load Balancer IP to a Virtual Server. This gives the VS an unfiltered public IP that is also visible to the VM itself.
This provides a classic VPS-style experience. 8 | 9 | - [virtual-server-windows-internal-ip-only.yaml](virtual-server-windows-internal-ip-only.yaml) creates a Windows Virtual Server with no public IP (STATIC internal IP only) - useful for servers that will only be accessed in your namespace, such as Domain Controllers. 10 | 11 | - [virtual-server-windows-cpu-only.yaml](virtual-server-windows-cpu-only.yaml) creates a Windows Virtual Server with no GPU - CPU compute only. 12 | 13 | - [virtual-server-shared-pvc.yaml](virtual-server-shared-pvc.yaml) attaches a shared `PVC` to the Virtual Server. The `PVC` is already formatted and mounted as `/mnt/shared-pvc`. 14 | 15 | - [virtual-server-ephemeral-root-disk.yaml](virtual-server-ephemeral-root-disk.yaml) boots a Virtual Server from a root-disk image in ephemeral mode. Changes to the VM root disk will be written to local node ephemeral storage, and lost on restart. Useful for ephemeral tasks such as pixel-streaming and data-processing. 16 | 17 | - [virtual-server-windows.yaml](virtual-server-windows.yaml) creates a Windows 10 Virtual Server. To get the external IP for remote desktop access via the `RDP` protocol, issue the following command: 18 | ``` 19 | kubectl get svc vs-windows10-tcp -o jsonpath="{.status.loadBalancer.ingress[*].ip}" 20 | ``` 21 | 22 | - [virtual-server-block-pvc.yaml](virtual-server-block-pvc.yaml) attaches an additional block `PVC` disk to the virtual machine. The new disk is raw and needs to be formatted. 23 | 24 | ``` 25 | 26 | myuser@vs-ubuntu2004-block-pvc:~$ sudo mkfs.ext4 /dev/vdb 27 | mke2fs 1.45.5 (07-Jan-2020) 28 | Discarding device blocks: done 29 | Creating filesystem with 5242880 4k blocks and 1310720 inodes 30 | Filesystem UUID: 0a05b295-9518-41f3-8b64-18d5902d419e 31 | Superblock backups stored on blocks: 32 | 32768, 98304, 163840, 229376, 294912, 819200, 884736, 1605632, 2654208, 33 | 4096000 34 | 35 | Allocating group tables: done 36 | Writing inode tables: done 37 | Creating journal (32768 blocks): done 38 | Writing superblocks and filesystem accounting information: done 39 | 40 | ``` 41 | 42 | Now, the disk is ready: 43 | ``` 44 | myuser@vs-ubuntu2004-block-pvc:~$ sudo mkdir /mnt/vdb && sudo mount /dev/vdb /mnt/vdb 45 | myuser@vs-ubuntu2004-block-pvc:~$ df -h 46 | Filesystem Size Used Avail Use% Mounted on 47 | udev 7.9G 0 7.9G 0% /dev 48 | tmpfs 1.6G 1.1M 1.6G 1% /run 49 | /dev/vda1 39G 2.6G 37G 7% / 50 | tmpfs 7.9G 0 7.9G 0% /dev/shm 51 | tmpfs 5.0M 0 5.0M 0% /run/lock 52 | tmpfs 7.9G 0 7.9G 0% /sys/fs/cgroup 53 | /dev/vda15 105M 7.8M 97M 8% /boot/efi 54 | /dev/loop0 71M 71M 0 100% /snap/lxd/19647 55 | /dev/loop1 56M 56M 0 100% /snap/core18/1988 56 | /dev/loop2 33M 33M 0 100% /snap/snapd/11107 57 | /dev/loop3 33M 33M 0 100% /snap/snapd/12704 58 | /dev/loop4 56M 56M 0 100% /snap/core18/2128 59 | /dev/loop5 71M 71M 0 100% /snap/lxd/21029 60 | tmpfs 1.6G 0 1.6G 0% /run/user/1001 61 | /dev/vdb 20G 45M 19G 1% /mnt/vdb 62 | ``` 63 | 64 | Additional examples and documentation: 65 | 66 | - [Kubernetes documentation](https://docs.coreweave.com/coreweave-kubernetes/getting-started) 67 | - [VirtualServer documentation](https://docs.coreweave.com/virtual-servers/getting-started) 68 | - [Advanced Label selectors](https://docs.coreweave.com/coreweave-kubernetes/label-selectors) 69 | - [CPU and GPU Availability](https://docs.coreweave.com/coreweave-kubernetes/node-types) 70 | - [Storage](https://docs.coreweave.com/coreweave-kubernetes/storage) 71 | 72 | --------------------------------------------------------------------------------
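After applying one of these manifests, it can help to watch the Virtual Server come up before connecting. A minimal sketch, assuming `kubectl` is pointed at your namespace, that the CRD's `virtualserver` resource name is registered, and using the `example-vs` name from [virtual-server.yaml](virtual-server.yaml); the `<name>-tcp` Service name is an assumption based on the pattern used elsewhere in these examples:

```bash
# Apply an example manifest
kubectl apply -f virtual-server.yaml

# Watch the VirtualServer resource until its status reports Ready
kubectl get virtualserver example-vs -w

# Once provisioned, the associated TCP Service exposes the external IP
kubectl get svc example-vs-tcp -o jsonpath="{.status.loadBalancer.ingress[*].ip}"
```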
/virtual-server/examples/kubectl/virtual-server-block-pvc.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: PersistentVolumeClaim 4 | metadata: 5 | name: vs-block-pvc 6 | spec: 7 | accessModes: 8 | - ReadWriteOnce 9 | storageClassName: block-nvme-ord1 10 | volumeMode: Block 11 | resources: 12 | requests: 13 | storage: 20Gi 14 | --- 15 | apiVersion: virtualservers.coreweave.com/v1alpha1 16 | kind: VirtualServer 17 | metadata: 18 | name: vs-ubuntu2004-block-pvc 19 | spec: 20 | region: ORD1 21 | os: 22 | type: linux 23 | resources: 24 | gpu: 25 | type: Quadro_RTX_4000 26 | count: 1 27 | cpu: 28 | count: 3 29 | memory: 16Gi 30 | storage: 31 | root: 32 | size: 40Gi 33 | storageClassName: block-nvme-ord1 34 | source: 35 | pvc: 36 | namespace: vd-images 37 | name: ubuntu2004-nvidia-510-47-03-1-docker-master-20220517-ord1 38 | additionalDisks: 39 | - name: additional-block-volume 40 | spec: 41 | persistentVolumeClaim: 42 | claimName: vs-block-pvc 43 | # users: 44 | # - username: SET YOUR USERNAME HERE 45 | # password: SET YOUR PASSWORD HERE 46 | # To use key-based authentication replace and uncomment ssh-rsa below with your public ssh key 47 | # sshpublickey: | 48 | # ssh-rsa AAAAB3NzaC1yc2EAAAA ... user@hostname 49 | network: 50 | public: true 51 | tcp: 52 | ports: 53 | - 22 54 | cloudInit: | 55 | # The disk_setup directive instructs Cloud-init to partition a disk. 56 | disk_setup: 57 | /dev/vdb: 58 | table_type: gpt 59 | layout: True 60 | overwrite: False 61 | # fs_setup describes the how the file systems are supposed to look. 62 | fs_setup: 63 | - label: None 64 | filesystem: ext4 65 | device: /dev/vdb 66 | partition: 'auto' 67 | # 'mounts' contains a list of lists; the inner list are entries for an /etc/fstab line 68 | mounts: 69 | - [ vdb, /mnt/block-pvc, auto, "defaults" ] 70 | initializeRunning: true 71 | -------------------------------------------------------------------------------- /virtual-server/examples/kubectl/virtual-server-cloudinit.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: virtualservers.coreweave.com/v1alpha1 2 | kind: VirtualServer 3 | metadata: 4 | name: example-vs 5 | spec: 6 | region: ORD1 7 | os: 8 | type: linux 9 | resources: 10 | gpu: 11 | type: Quadro_RTX_4000 12 | count: 1 13 | cpu: 14 | count: 4 15 | memory: 16Gi 16 | storage: 17 | root: 18 | size: 40Gi 19 | storageClassName: block-nvme-ord1 20 | source: 21 | pvc: 22 | namespace: vd-images 23 | name: ubuntu2004-nvidia-510-47-03-1-docker-master-20220421-ord1 24 | # Change user name and pasword 25 | # User is on the sudoers list 26 | # users: 27 | # - username: SET YOUR USERNAME HERE 28 | # password: SET YOUR PASSWORD HERE 29 | # To use key-based authentication replace and uncomment ssh-rsa below with your public ssh key 30 | # sshpublickey: | 31 | # ssh-rsa AAAAB3NzaC1yc2EAAAA ... user@hostname 32 | network: 33 | public: true 34 | tcp: 35 | ports: 36 | - 22 37 | - 443 38 | - 60443 39 | - 4172 40 | - 3389 41 | udp: 42 | ports: 43 | - 4172 44 | - 3389 45 | cloudInit: | 46 | # Write a simple script 47 | write_files: 48 | - content: | 49 | #!/bin/bash 50 | echo "Hello world!" 
51 | path: /home/myuser/script.sh 52 | permissions: '0744' 53 | owner: myuser:myuser 54 | # Update packages 55 | package_update: true 56 | # Install packages 57 | packages: 58 | - curl 59 | - git 60 | # Run additional commands 61 | runcmd: 62 | - [df, -h] 63 | - [git, version] 64 | - [curl, --version ] 65 | - [bash, /home/myuser/script.sh] 66 | initializeRunning: true 67 | -------------------------------------------------------------------------------- /virtual-server/examples/kubectl/virtual-server-direct-attach-lb.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: virtualservers.coreweave.com/v1alpha1 2 | kind: VirtualServer 3 | metadata: 4 | name: example-vs-direct 5 | spec: 6 | region: ORD1 7 | os: 8 | type: linux 9 | resources: 10 | gpu: 11 | type: Quadro_RTX_4000 12 | count: 1 13 | cpu: 14 | count: 4 15 | memory: 16Gi 16 | storage: 17 | root: 18 | size: 40Gi 19 | storageClassName: block-nvme-ord1 20 | source: 21 | pvc: 22 | namespace: vd-images 23 | name: ubuntu2004-nvidia-510-47-03-1-docker-master-20220421-ord1 24 | # Change user name and pasword 25 | # User is on the sudoers list 26 | # users: 27 | # - username: SET YOUR USERNAME HERE 28 | # password: SET YOUR PASSWORD HERE 29 | # To use key-based authentication replace and uncomment ssh-rsa below with your public ssh key 30 | # sshpublickey: | 31 | # ssh-rsa AAAAB3NzaC1yc2EAAAA ... user@hostname 32 | network: 33 | public: true 34 | directAttachLoadBalancerIP: true 35 | -------------------------------------------------------------------------------- /virtual-server/examples/kubectl/virtual-server-ephemeral-disk.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: virtualservers.coreweave.com/v1alpha1 2 | kind: VirtualServer 3 | metadata: 4 | name: vs-ubuntu2004-ephemeral-disk 5 | spec: 6 | region: ORD1 7 | os: 8 | type: linux 9 | resources: 10 | cpu: 11 | count: 2 12 | type: intel-xeon-v4 13 | memory: 4Gi 14 | storage: 15 | root: 16 | size: 40Gi 17 | storageClassName: block-nvme-ord1 18 | source: 19 | pvc: 20 | namespace: vd-images 21 | name: ubuntu2004-docker-master-20220708-ord1 22 | additionalDisks: 23 | - name: ephemeral-disk 24 | spec: 25 | emptyDisk: 26 | capacity: 10Gi 27 | # users: 28 | # - username: SET YOUR USERNAME HERE 29 | # password: SET YOUR PASSWORD HERE 30 | # To use key-based authentication replace and uncomment ssh-rsa below with your public ssh key 31 | # sshpublickey: | 32 | # ssh-rsa AAAAB3NzaC1yc2EAAAA ... user@hostname 33 | network: 34 | public: true 35 | tcp: 36 | ports: 37 | - 22 38 | # Format and mount the ephemeral disk 39 | cloudInit: | 40 | bootcmd: 41 | - test "$(lsblk /dev/vdb)" && mkfs.ext4 /dev/vdb 42 | - mkdir -p /mnt/vdb 43 | mounts: 44 | - [ "/dev/vdb", "/mnt/vdb", "ext4", "defaults,nofail", "0", "2" ] 45 | runcmd: 46 | - [df, -h] 47 | initializeRunning: true 48 | -------------------------------------------------------------------------------- /virtual-server/examples/kubectl/virtual-server-ephemeral-root-disk.yaml: -------------------------------------------------------------------------------- 1 | ### 2 | ## Ephemeral Root Disks 3 | # Many use cases, such as data processing or pixel-streaming is ephemeral. VM instances are short-lived and deleted on shut-down. 4 | # In these instances, leveraging ephemeral root disks will speed up instantiation as well as lower costs. 
5 | # Ephemeral root-disks don't require a new root volume to be allocated, removing a time-consuming step in the instantiation process. 6 | # Ephemeral disks are still writeable; modifications made at run-time are temporarily stored on the ephemeral disk of the serving node. 7 | # All changes written to the root disk are lost when the VM is shut down. A shared filesystem volume or NFS/SMB/Object storage should be used 8 | # to store persistent data in, e.g., data-processing use cases. 9 | # 10 | # To launch a VS using an ephemeral root disk, the source image needs to be cloned into a `ReadOnlyMany` type volume. 11 | ### 12 | --- 13 | apiVersion: v1 14 | kind: PersistentVolumeClaim 15 | metadata: 16 | name: image-rox 17 | spec: 18 | accessModes: 19 | - ReadOnlyMany 20 | dataSource: 21 | kind: PersistentVolumeClaim 22 | name: # This name will be the same name as a DataVolume/VirtualServer used as the source. 23 | resources: 24 | requests: 25 | storage: 40Gi # Must match the size of the source volume 26 | storageClassName: block-nvme-ord1 27 | volumeMode: Block 28 | --- 29 | apiVersion: virtualservers.coreweave.com/v1alpha1 30 | kind: VirtualServer 31 | metadata: 32 | name: example-vs 33 | spec: 34 | region: ORD1 35 | os: 36 | type: linux 37 | resources: 38 | gpu: 39 | type: Quadro_RTX_4000 40 | count: 1 41 | cpu: 42 | count: 4 43 | memory: 16Gi 44 | storage: 45 | root: 46 | size: 40Gi 47 | storageClassName: block-nvme-ord1 48 | ephemeral: true 49 | source: 50 | pvc: 51 | namespace: tenant-example # Replace with your namespace 52 | name: image-rox 53 | # Change user name and password 54 | # User is on the sudoers list 55 | # users: 56 | # - username: SET YOUR USERNAME HERE 57 | # password: SET YOUR PASSWORD HERE 58 | # To use key-based authentication replace and uncomment ssh-rsa below with your public ssh key 59 | # sshpublickey: | 60 | # ssh-rsa AAAAB3NzaC1yc2EAAAA ... user@hostname 61 | network: 62 | public: false 63 | -------------------------------------------------------------------------------- /virtual-server/examples/kubectl/virtual-server-shared-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: vs-shared-pvc 5 | spec: 6 | accessModes: 7 | - ReadWriteMany 8 | storageClassName: shared-nvme-ord1 9 | volumeMode: Filesystem 10 | resources: 11 | requests: 12 | storage: 20Gi 13 | --- 14 | apiVersion: virtualservers.coreweave.com/v1alpha1 15 | kind: VirtualServer 16 | metadata: 17 | name: vs-ubuntu2004-shared-pvc 18 | spec: 19 | region: ORD1 20 | os: 21 | type: linux 22 | resources: 23 | gpu: 24 | type: Quadro_RTX_4000 25 | count: 1 26 | cpu: 27 | count: 3 28 | memory: 16Gi 29 | storage: 30 | root: 31 | size: 40Gi 32 | storageClassName: block-nvme-ord1 33 | source: 34 | pvc: 35 | namespace: vd-images 36 | name: ubuntu2004-nvidia-510-47-03-1-docker-master-20220517-ord1 37 | filesystems: 38 | - name: shared-pvc 39 | spec: 40 | persistentVolumeClaim: 41 | claimName: vs-shared-pvc 42 | # Change user name and password 43 | # User is on the sudoers list 44 | # users: 45 | # - username: SET YOUR USERNAME HERE 46 | # password: SET YOUR PASSWORD HERE 47 | # To use key-based authentication replace and uncomment ssh-rsa below with your public ssh key 48 | # sshpublickey: | 49 | # ssh-rsa AAAAB3NzaC1yc2EAAAA ...
user@hostname 50 | network: 51 | public: true 52 | tcp: 53 | ports: 54 | - 22 55 | cloudInit: | 56 | # Write a simple script 57 | write_files: 58 | - content: | 59 | #!/bin/bash 60 | echo "Hello world!" 61 | path: /home/myuser/script.sh 62 | permissions: '0744' 63 | owner: myuser:myuser 64 | # Update packages 65 | package_update: true 66 | # Install packages 67 | packages: 68 | - curl 69 | - git 70 | # Run additional commands 71 | runcmd: 72 | - [df, -h] 73 | - [git, version] 74 | - [curl, --version ] 75 | - [bash, /home/myuser/script.sh] 76 | initializeRunning: true 77 | -------------------------------------------------------------------------------- /virtual-server/examples/kubectl/virtual-server-static-mac.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: virtualservers.coreweave.com/v1alpha1 2 | kind: VirtualServer 3 | metadata: 4 | name: example-vs-static-mac 5 | spec: 6 | region: ORD1 7 | os: 8 | type: linux 9 | resources: 10 | cpu: 11 | count: 2 12 | type: amd-epyc-rome 13 | memory: 2Gi 14 | storage: 15 | root: 16 | size: 40Gi 17 | storageClassName: block-nvme-ord1 18 | source: 19 | pvc: 20 | namespace: vd-images 21 | name: ubuntu2004-docker-master-20220103-ord1 22 | # Change user name and pasword 23 | # User is on the sudoers list 24 | # users: 25 | # - username: SET YOUR USERNAME HERE 26 | # password: SET YOUR PASSWORD HERE 27 | # To use key-based authentication replace and uncomment ssh-rsa below with your public ssh key 28 | # sshpublickey: | 29 | # ssh-rsa AAAAB3NzaC1yc2EAAAA ... user@hostname 30 | network: 31 | macAddress: A2-1F-EE-09-06-5D 32 | public: true 33 | tcp: 34 | ports: 35 | - 22 36 | initializeRunning: true 37 | 38 | -------------------------------------------------------------------------------- /virtual-server/examples/kubectl/virtual-server-windows-cpu-only.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: virtualservers.coreweave.com/v1alpha1 2 | kind: VirtualServer 3 | metadata: 4 | name: vs-windows-cpu 5 | spec: 6 | region: LAS1 7 | os: 8 | type: windows 9 | resources: 10 | cpu: 11 | # Reference CPU instance label selectors here: 12 | # https://docs.coreweave.com/resources/resource-based-pricing#cpu-only-instance-resource-pricing 13 | type: amd-epyc-rome 14 | count: 3 15 | memory: 16Gi 16 | storage: 17 | root: 18 | size: 80Gi 19 | storageClassName: block-nvme-las1 20 | source: 21 | pvc: 22 | namespace: vd-images 23 | # Reference querying source image here: 24 | # https://docs.coreweave.com/virtual-servers/root-disk-lifecycle-management/exporting-coreweave-images-to-a-writable-pvc#identifying-source-image 25 | name: winserver2019std-master-20210813-las1 26 | # Change user name and pasword 27 | # users: 28 | # - username: SET YOUR USERNAME HERE 29 | # password: SET YOUR PASSWORD HERE 30 | network: 31 | public: true 32 | tcp: 33 | ports: 34 | - 443 35 | - 60443 36 | - 4172 37 | - 3389 38 | udp: 39 | ports: 40 | - 4172 41 | - 3389 42 | initializeRunning: true 43 | -------------------------------------------------------------------------------- /virtual-server/examples/kubectl/virtual-server-windows-internal-ip-only.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: virtualservers.coreweave.com/v1alpha1 2 | kind: VirtualServer 3 | metadata: 4 | name: vs-windows-internal 5 | spec: 6 | region: ORD1 7 | os: 8 | type: windows 9 | resources: 10 | cpu: 11 | # Reference CPU instance label selectors here: 12 | # 
https://docs.coreweave.com/resources/resource-based-pricing#cpu-only-instance-resource-pricing 13 | type: amd-epyc-rome 14 | count: 4 15 | memory: 16Gi 16 | storage: 17 | root: 18 | size: 80Gi 19 | storageClassName: block-nvme-ord1 20 | source: 21 | pvc: 22 | namespace: vd-images 23 | # Reference querying source image here: 24 | # https://docs.coreweave.com/virtual-servers/root-disk-lifecycle-management/exporting-coreweave-images-to-a-writable-pvc#identifying-source-image 25 | name: winserver2019std-master-20210819-ord1 26 | # Change user name and pasword 27 | # users: 28 | # - username: SET YOUR USERNAME HERE 29 | # password: SET YOUR PASSWORD HERE 30 | network: 31 | directAttachLoadBalancerIP: true 32 | public: false 33 | initializeRunning: true -------------------------------------------------------------------------------- /virtual-server/examples/kubectl/virtual-server-windows.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: virtualservers.coreweave.com/v1alpha1 2 | kind: VirtualServer 3 | metadata: 4 | name: vs-windows10 5 | spec: 6 | region: LAS1 7 | os: 8 | type: windows 9 | resources: 10 | gpu: 11 | type: Quadro_RTX_4000 12 | count: 1 13 | cpu: 14 | count: 3 15 | memory: 16Gi 16 | storage: 17 | root: 18 | size: 80Gi 19 | storageClassName: block-nvme-las1 20 | source: 21 | pvc: 22 | namespace: vd-images 23 | # Reference querying source image here: 24 | # https://docs.coreweave.com/virtual-servers/root-disk-lifecycle-management/exporting-coreweave-images-to-a-writable-pvc#identifying-source-image 25 | name: win10-master-20210722-las1 26 | # Change user name and pasword 27 | # users: 28 | # - username: SET YOUR USERNAME HERE 29 | # password: SET YOUR PASSWORD HERE 30 | network: 31 | public: true 32 | tcp: 33 | ports: 34 | - 443 35 | - 60443 36 | - 4172 37 | - 3389 38 | udp: 39 | ports: 40 | - 4172 41 | - 3389 42 | initializeRunning: true 43 | -------------------------------------------------------------------------------- /virtual-server/examples/kubectl/virtual-server.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: virtualservers.coreweave.com/v1alpha1 2 | kind: VirtualServer 3 | metadata: 4 | name: example-vs 5 | spec: 6 | region: ORD1 7 | os: 8 | type: linux 9 | resources: 10 | gpu: 11 | type: Quadro_RTX_4000 12 | count: 1 13 | cpu: 14 | count: 4 15 | memory: 16Gi 16 | storage: 17 | root: 18 | size: 40Gi 19 | storageClassName: block-nvme-ord1 20 | source: 21 | pvc: 22 | namespace: vd-images 23 | name: ubuntu2004-nvidia-510-47-03-1-docker-master-20220421-ord1 24 | # Change user name and pasword 25 | # User is on the sudoers list 26 | # users: 27 | # - username: SET YOUR USERNAME HERE 28 | # password: SET YOUR PASSWORD HERE 29 | # To use key-based authentication replace and uncomment ssh-rsa below with your public ssh key 30 | # sshpublickey: | 31 | # ssh-rsa AAAAB3NzaC1yc2EAAAA ... 
user@hostname 32 | network: 33 | public: true 34 | tcp: 35 | ports: 36 | - 22 37 | -------------------------------------------------------------------------------- /virtual-server/examples/nodejs/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "virtual-server-client-example", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "client.js", 6 | "scripts": { 7 | "start": "node main.js" 8 | }, 9 | "author": "Yitzy Dier", 10 | "license": "ISC", 11 | "dependencies": { 12 | "kubernetes-client": "^9.0.0" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /virtual-server/examples/nodejs/util.js: -------------------------------------------------------------------------------- 1 | // Validates whether a quantity is a valid k8s resource.Quantity 2 | const k8sValidateQuantity = (size) => /^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$/.test(size) 3 | // Create a new blank VirtualServer Manifest object 4 | const newVirtualServerManifest = ({name, namespace}) => ({ 5 | apiVersion: "virtualservers.coreweave.com/v1alpha1", 6 | kind: "VirtualServer", 7 | metadata: { 8 | name, 9 | namespace 10 | }, 11 | spec: { 12 | affinity: {}, 13 | region: "", 14 | os: { 15 | definition: "a", 16 | type: "" 17 | }, 18 | resources: { 19 | definition: "a", 20 | gpu: { 21 | type: null, 22 | count: null, 23 | }, 24 | cpu: { 25 | type: null, 26 | count: null, 27 | }, 28 | memory: "" 29 | }, 30 | storage: { 31 | root: { 32 | size: "", 33 | source: { 34 | pvc: { 35 | namespace: "", 36 | name: "" 37 | }, 38 | storageClassName: "", 39 | volumeMode: null, 40 | accessMode: null 41 | } 42 | }, 43 | additionalDisks: [ 44 | 45 | ], 46 | filesystems: [ 47 | 48 | ], 49 | swap: null 50 | }, 51 | users: [ 52 | 53 | ], 54 | network: { 55 | tcp: [], 56 | udp: [], 57 | directAttachLoadBalancerIP: false, 58 | floatingIPs: [] 59 | }, 60 | initializeRunning: false 61 | } 62 | }) 63 | 64 | module.exports = { 65 | k8sValidateQuantity, 66 | newVirtualServerManifest 67 | } -------------------------------------------------------------------------------- /virtual-server/examples/python/.gitignore: -------------------------------------------------------------------------------- 1 | .*/ 2 | __pycache__/ 3 | -------------------------------------------------------------------------------- /virtual-server/examples/python/README.md: -------------------------------------------------------------------------------- 1 | # Python example 2 | 3 | An example Python implementation of a Kubernetes client that interacts with the CoreWeave VirtualServer resource as well as the Kubevirt subresource API. 4 | 5 | The Python example illustrates the following: 6 | 1. Removes an existing Virtual Server. 7 | 2. Creates a new Virtual Server based on the `my_virtualserver` example configuration. 8 | 3. Waits for the Virtual Server to reach a ready status. 9 | 4. Stops the Virtual Server instance and waits until it is stopped. 10 | 5. Deletes the Virtual Server instance. 11 | 12 | To work around unresolved issues with resource paths in the native Python client for Kubevirt (https://github.com/kubevirt/client-python), we introduced the `KubeVirtClient` class for basic operations on Kubevirt VirtualMachine resources. 13 | 14 | The `VSClient` class performs basic operations on the Virtual Server resource.
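For orientation before the install and run steps below, here is a condensed sketch of how these classes are driven - the same calls `main.py` makes. The `my_virtualserver` manifest dict is elided here; it is defined in full in `main.py`.

```python
import os

from vsclient import VSClient

# Same defaults as main.py
name = 'my-virtual-server'
namespace = os.environ.get('NAMESPACE', 'default')

vsclient = VSClient()

# Create the Virtual Server from a manifest dict (see `my_virtualserver` in main.py),
# then block until it reports a Ready status
vsclient.create(my_virtualserver)
print(vsclient.ready(namespace, name))

# Stop the underlying Virtual Machine through the Kubevirt subresource API
vsclient.kubevirt_api.stop(namespace, name)
print(vsclient.ready(namespace, name, expected_state='Stopped'))

# Clean up
vsclient.delete(namespace, name)
```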
15 | 16 | ## Install 17 | 18 | ``` 19 | virtualenv -p python3 .venv && source ./.venv/bin/activate 20 | pip install kubernetes 21 | ``` 22 | 23 | ## Run 24 | 25 | ``` 26 | Be sure to set secure credentials for your USERNAME and PASSWORD, as they will be used to create a user in your Virtual Server 27 | USERNAME= PASSWORD= NAMESPACE= KUBECONFIG=$HOME/.kube/config python3 main.py 28 | ``` 29 | -------------------------------------------------------------------------------- /virtual-server/examples/python/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import sys 4 | 5 | from kubernetes.client.rest import ApiException 6 | from vsclient import VSClient 7 | 8 | name = 'my-virtual-server' 9 | namespace = os.environ.get('NAMESPACE', 'default') 10 | username = os.environ.get('USERNAME') 11 | password = os.environ.get('PASSWORD') 12 | 13 | if username == None or password == None: 14 | print('USERNAME and PASSWORD environment variables are required') 15 | sys.exit() 16 | 17 | my_virtualserver = { 18 | 'apiVersion': f'{VSClient.GROUP}/{VSClient.VERSION}', 19 | 'kind': 'VirtualServer', 20 | 'metadata': {'name': name, 'namespace': namespace}, 21 | 'spec': { 22 | 'region': 'ORD1', # ord1, ewr1, ewr2 23 | 'os': { 24 | 'type': 'linux', 25 | }, 26 | 'resources': { 27 | 'gpu': { 28 | 'type': 'Quadro_RTX_4000', 29 | 'count': 1 30 | }, 31 | 'cpu': { 32 | # GPU type and CPU type are mutually exclusive i.e. CPU type cannot be specified when GPU type is selected. 33 | # CPU is selected automatically based on GPU type. 34 | # 'type': 'amd-epyc-rome', 35 | 'count': 2, 36 | }, 37 | 'memory': '16Gi' 38 | }, 39 | # Add user 40 | # SSH public key is optional and allows to login without a password 41 | # Public key is located in $HOME/.ssh/id_rsa.pub 42 | # publicKey = `ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDEQCQpab6UWuA ... user@hostname` 43 | 'users': [ 44 | { 45 | 'username': username, 46 | 'password': password, 47 | # SSHPublicKey: publicKey 48 | } 49 | ], 50 | # Add cloud config 51 | # more examples on https://cloudinit.readthedocs.io/en/latest/topics/examples.html 52 | 'cloudInit': """ 53 | # Update packages 54 | package_update: true 55 | # Install packages 56 | packages: 57 | - curl 58 | - git 59 | # Run additional commands 60 | runcmd: 61 | - [df, -h] 62 | - [git, version] 63 | - [curl, --version ] 64 | """, 65 | 'storage': { 66 | 'root': { 67 | 'size': '40Gi', 68 | 'source': { 69 | 'pvc': { 70 | 'name': 'ubuntu2004-nvidia-515-86-01-1-docker-master-20221205-ord1', 71 | 'namespace': 'vd-images' 72 | } 73 | }, 74 | 'storageClassName': 'block-nvme-ord1', 75 | 'volumeMode': 'Block', 76 | 'accessMode': 'ReadWriteOnce' 77 | } 78 | }, 79 | 'network': { 80 | 'tcp': { 81 | 'ports': [22, 443, 60443, 4172, 3389] 82 | }, 83 | 'udp': { 84 | 'ports': [4172, 3389] 85 | } 86 | }, 87 | 'initializeRunning': True 88 | } 89 | } 90 | 91 | 92 | vsclient = VSClient() 93 | 94 | try: 95 | vsclient.delete(namespace, name) 96 | except ApiException as e: 97 | if e.status == 404: 98 | print(f'VirtualServer {name} in namespace {namespace} already deleted') 99 | else: 100 | print(f'VirtualServer delete exception {e}') 101 | exit(1) 102 | 103 | # Create virtual server 104 | print(vsclient.create(my_virtualserver)) 105 | print(f'VirtualServer status: {vsclient.ready(namespace, name)}') 106 | 107 | # Stop the Virtual Machine Instance to apply changes. 
108 | print(vsclient.kubevirt_api.stop(namespace, name)) 109 | print(f'VirtualServer status: {vsclient.ready(namespace, name, expected_state="Stopped")}') 110 | 111 | # Update the manifest and attach directly to Load Balancer 112 | my_virtualserver['spec']['network']['tcp']['ports'] = [] 113 | my_virtualserver['spec']['network']['udp']['ports'] = [] 114 | my_virtualserver['spec']['network']['directAttachLoadBalancerIP'] = True 115 | print(vsclient.update(my_virtualserver)) 116 | 117 | print(vsclient.kubevirt_api.start(namespace, name)) 118 | print(f'VirtualServer status: {vsclient.ready(namespace, name)}') 119 | 120 | # Delete virtual server 121 | vsclient.delete(namespace, name) 122 | 123 | exit(0) 124 | -------------------------------------------------------------------------------- /virtual-server/examples/terraform/README.md: -------------------------------------------------------------------------------- 1 | # Deploying Virtual Servers to Kubernetes with Terraform 2 | 3 | This [Terraform](https://terraform.io) module uses the [kubernetes provider](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs) to deploy `VirtualServers` to [CoreWeave Cloud](https://coreweave.com). 4 | 5 | ## Setup 6 | 7 | This module requires your `user_namespace`, `kubeconfig_path`, your desired desktop `vs_username` (and you can optionally supply `vs_password`, or set `vs_generate_password` to `true`), `vs_image` (defaults to Ubuntu 20.04), `vs_gpu_enable` (and `vs_gpu_count`), and your desired `vs_name` to set your system hostname. 8 | 9 | ## Installation 10 | 11 | Run: 12 | 13 | ```bash 14 | terraform plan 15 | terraform apply -auto-approve 16 | ```17 | 18 | This module will output the network and credential information for the system, consumable by another module via the `vs_network` and `vs_password` outputs. 19 | 20 | ## Examples 21 | 22 | In the `examples/` directory is a sample Terraform plan that demonstrates consuming the module to create two Virtual Server instances.
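The variables noted under Setup can also be supplied through a `terraform.tfvars` file instead of `-var` flags; a minimal sketch, where every value below is a placeholder to replace with your own:

```hcl
kubeconfig_path      = "~/.kube/config"
user_namespace       = "tenant-example"
vs_name              = "my-vs"
vs_username          = "myuser"
vs_generate_password = true
```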
23 | -------------------------------------------------------------------------------- /virtual-server/examples/terraform/examples/module-use.tf: -------------------------------------------------------------------------------- 1 | variable "kubeconfig_path" {} 2 | variable "vs_name" {} 3 | variable "vs_username" {} 4 | variable "vs_generate_password" { 5 | default = true 6 | } 7 | variable "user_namespace" {} 8 | 9 | module "virtualserver_1" { 10 | 11 | source = "../" 12 | 13 | kubeconfig_path = var.kubeconfig_path 14 | vs_name = "hostOne" 15 | vs_username = "onePerson" 16 | vs_generate_password = var.vs_generate_password 17 | user_namespace = var.user_namespace 18 | } 19 | 20 | module "virtualserver_2" { 21 | 22 | source = "../" 23 | 24 | kubeconfig_path = var.kubeconfig_path 25 | vs_name = "hostTwo" 26 | vs_username = "secondPerson" 27 | vs_generate_password = var.vs_generate_password 28 | user_namespace = var.user_namespace 29 | } 30 | 31 | output "vs_one_info" { 32 | value = module.virtualserver_1.vs_network 33 | } 34 | 35 | output "vs_two_info" { 36 | value = module.virtualserver_2.vs_network 37 | } 38 | -------------------------------------------------------------------------------- /virtual-server/examples/terraform/main.tf: -------------------------------------------------------------------------------- 1 | provider "kubernetes" { 2 | config_path = var.kubeconfig_path 3 | } 4 | 5 | provider "kubernetes-alpha" { 6 | config_path = var.kubeconfig_path 7 | } 8 | 9 | resource "random_string" "vs_generate_password" { 10 | count = var.vs_generate_password ? 1 : 0 11 | length = 16 12 | special = true 13 | override_special = "_%@" 14 | } 15 | -------------------------------------------------------------------------------- /virtual-server/examples/terraform/outputs.tf: -------------------------------------------------------------------------------- 1 | output "vs_network" { 2 | value = data.kubernetes_service.vs_loadbalancer.status.0.load_balancer.0.ingress.0.ip 3 | } 4 | 5 | output "vs_password" { 6 | value = var.vs_generate_password ? 
random_string.vs_generate_password[0].result : var.vs_password 7 | } 8 | -------------------------------------------------------------------------------- /virtual-server/examples/terraform/variables.tf: -------------------------------------------------------------------------------- 1 | variable "kubeconfig_path" { 2 | description = "Path to kubeconfig" 3 | default = "~/.kube/config" 4 | } 5 | 6 | variable "user_namespace" { 7 | description = "Namespace Virtual Server will be installed to" 8 | } 9 | 10 | variable "vs_name" { 11 | description = "Virtual Server hostname" 12 | default = "MY-VS" 13 | } 14 | 15 | variable "vs_username" { 16 | description = "Virtual Server username" 17 | } 18 | 19 | variable "vs_password" { 20 | type = string 21 | default = "null" 22 | description = "User provided password (vs_generate_password must be set to false)" 23 | } 24 | 25 | variable "vs_generate_password" { 26 | type = bool 27 | default = true 28 | description = "Generate password" 29 | } 30 | 31 | variable "vs_memory" { 32 | description = "Virtual Server RAM" 33 | default = "16Gi" 34 | } 35 | 36 | variable "vs_root_storage" { 37 | description = "Virtual Server root device storage (i.e 80Gi)" 38 | default = "80Gi" 39 | } 40 | 41 | variable "vs_os_type" { 42 | default = "linux" 43 | } 44 | 45 | variable "vs_image" { 46 | description = "OS image" 47 | default = "ubuntu2004-docker-master-20210601-ord1" 48 | } 49 | 50 | variable "vs_gpu" { 51 | description = "GPU" 52 | default = "Quadro_RTX_4000" 53 | } 54 | 55 | variable "vs_gpu_enable" { 56 | default = true 57 | } 58 | 59 | variable "vs_cpu_count" { 60 | default = 3 61 | } 62 | 63 | variable "vs_gpu_count" { 64 | default = 1 65 | } 66 | 67 | variable "vs_region" { 68 | description = "Region default from vs_regions map" 69 | default = "ORD1" 70 | } 71 | 72 | variable "vs_running" { 73 | description = "Running virtual server on provisioning" 74 | default = true 75 | } 76 | 77 | variable "vs_public_networking" { 78 | default = true 79 | } 80 | 81 | variable "vs_attach_loadbalancer" { 82 | description = "Attach Service LoadBalancer IP directly to VS (vs_tcp_ports and vs_udp_ports must be empty)." 83 | default = false 84 | } 85 | 86 | variable "vs_tcp_ports" { 87 | type = list(any) 88 | default = [22, 443, 60443, 4172, 3389] 89 | } 90 | 91 | variable "vs_udp_ports" { 92 | type = list(any) 93 | default = [4172, 3389] 94 | } 95 | -------------------------------------------------------------------------------- /virtual-server/examples/terraform/vs.tf: -------------------------------------------------------------------------------- 1 | resource "kubernetes_manifest" "virtualserver" { 2 | provider = kubernetes-alpha 3 | 4 | manifest = { 5 | "apiVersion" = "virtualservers.coreweave.com/v1alpha1" 6 | "kind" = "VirtualServer" 7 | "metadata" = { 8 | "name" = var.vs_name 9 | "namespace" = var.user_namespace 10 | } 11 | "spec" = { 12 | "initializeRunning" = var.vs_running 13 | "network" = { 14 | "directAttachLoadBalancerIP" = var.vs_attach_loadbalancer 15 | "public" = var.vs_public_networking 16 | "tcp" = { 17 | "ports" = var.vs_tcp_ports 18 | } 19 | "udp" = { 20 | "ports" = var.vs_udp_ports 21 | } 22 | } 23 | "os" = { 24 | "type" = var.vs_os_type 25 | } 26 | "region" = var.vs_region 27 | "resources" = { 28 | "cpu" = { 29 | "count" = var.vs_cpu_count 30 | } 31 | "gpu" = { 32 | "count" = var.vs_gpu_count 33 | "type" = var.vs_gpu_enable ? 
var.vs_gpu : "Quadro_RTX_4000" 34 | } 35 | "memory" = var.vs_memory 36 | } 37 | "storage" = { 38 | "root" = { 39 | "size" = var.vs_root_storage 40 | "source" = { 41 | "pvc" = { 42 | "name" = var.vs_image 43 | "namespace" = "vd-images" 44 | } 45 | } 46 | "storageClassName" = "block-nvme-${var.vs_region}" 47 | } 48 | } 49 | "users" = [ 50 | { 51 | "username" = var.vs_username 52 | "password" = var.vs_generate_password ? random_string.vs_generate_password[0].result : var.vs_password 53 | }, 54 | ] 55 | 56 | } 57 | } 58 | } 59 | 60 | data "kubernetes_service" "vs_loadbalancer" { 61 | depends_on = [kubernetes_manifest.virtualserver] 62 | metadata { 63 | name = "${var.vs_name}-tcp" 64 | namespace = var.user_namespace 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /virtual-server/pvc-clone.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Clone the disk PVC for a given VM instance to a new PVC. 3 | set -e -o pipefail -u 4 | 5 | if [ $# -ne 2 ]; then 6 | echo "Usage: $0 " 7 | exit 1 8 | fi 9 | 10 | SRC="$1" 11 | DST="$2" 12 | 13 | 14 | get_field() { 15 | kubectl get $1 $2 -o=jsonpath='{'"$3"'}' 16 | } 17 | 18 | if kubectl get vmi $SRC &>/dev/null; then 19 | echo "Found running VM instance: $SRC" 20 | read -p "Stop it? [y/N] " STOP 21 | 22 | if [ "$(get_field vmi $SRC ".metadata.annotations.vs\.coreweave\.com/vmi")" == "true" ]; then 23 | SRC_PVC=$(get_field vmi $SRC ".spec.volumes..dataVolume.name") 24 | else 25 | SRC_PVC=$(get_field vmi $SRC ".spec.volumes[?(@.name=='dv')].persistentVolumeClaim.claimName") 26 | fi 27 | 28 | if [[ "$STOP" =~ ^[yY]$ ]]; then 29 | virtctl stop $SRC 30 | 31 | echo -n "Waiting for $SRC to stop..." 32 | while kubectl get vmi $SRC &>/dev/null; do 33 | sleep 1 34 | echo -n "." 35 | done 36 | echo " stopped." 37 | else 38 | echo "ERROR: cannot clone pvc of a running VM" 39 | exit 1 40 | fi 41 | 42 | elif kubectl get pvc $SRC &>/dev/null; then 43 | 44 | SRC_PVC="$SRC" 45 | 46 | else 47 | echo "ERROR: Did not find PVC or VM instance named: $SRC" 48 | exit 1 49 | fi 50 | 51 | SRC_PVC_CLASS=$(get_field pvc $SRC_PVC ".spec.storageClassName") 52 | SRC_PVC_SIZE=$(get_field pvc $SRC_PVC ".spec.resources.requests.storage") 53 | 54 | REGION=${SRC_PVC_CLASS//*-} 55 | 56 | if [ "$REGION" == "replica" ]; then 57 | REGION="ord1" 58 | fi 59 | 60 | DST_PVC="${DST}-$(date '+%Y%m%d')-block-${REGION}" 61 | 62 | cat <