├── .gitignore ├── README.md ├── argo-workflow ├── README.md ├── argo-screenshot.png ├── argo.png └── gpu-say-workflow.yaml ├── cuda-ssh ├── README.md ├── sshd-data-pvc.yaml ├── sshd-deployment.yaml ├── sshd-root-pvc.yaml └── sshd-service.yaml ├── finetuner-workflow ├── finetune-pvc.yaml ├── finetune-role.yaml ├── finetune-workflow.yaml └── finetuner │ ├── Dockerfile │ ├── compiler_wrapper.f95 │ ├── ds_config.json │ ├── evaluator.py │ ├── finetuner.py │ ├── inference.py │ ├── requirements-precompilable.txt │ ├── requirements.txt │ └── utils.py ├── getting-started └── k8ctl_setup.ps1 ├── kubeflow └── training-operator │ ├── gpt-neox │ ├── 01-pvc.yaml │ ├── 02-finetune-role.yaml │ ├── 03-wanbd-secret.yaml │ └── 04-finetune-workflow.yaml │ └── resnet50 │ ├── Dockerfile.mpi │ ├── Dockerfile.pytorch │ ├── k8s │ ├── imagenet-download-job.yaml │ ├── imagenet-mpijob.yaml │ ├── imagenet-pytorchjob.yaml │ ├── kaggle-secret.yaml │ ├── model-pvc.yaml │ └── wanbd-secret.yaml │ ├── resnet50_horovod.py │ ├── resnet50_pytorch.py │ └── util.py ├── online-inference ├── README.md ├── bloom-176b-deepspeed │ ├── 00-pvc.yaml │ ├── 01-download-job.yaml │ ├── 02-inference-service.yaml │ ├── Dockerfile │ ├── Dockerfile.downloader │ ├── downloader │ │ ├── download.py │ │ └── requirements.txt │ └── files │ │ ├── isvc-patch.txt │ │ └── requirements.txt ├── bloom-176b │ ├── 00-bloom-176b-pvc.yaml │ ├── 01-bloom-176b-download-job.yaml │ ├── 02-bloom-176b-inferenceservice.yaml │ └── model │ │ ├── Dockerfile │ │ ├── bloom.py │ │ ├── requirements.txt │ │ └── scripts │ │ └── download_model ├── custom-basnet │ ├── README.md │ ├── basnet-inferenceservice.yaml │ ├── client │ │ ├── .DS_Store │ │ ├── Dockerfile │ │ ├── expected_output.png │ │ ├── images │ │ │ ├── .DS_Store │ │ │ ├── cut_mask.png │ │ │ ├── output.png │ │ │ └── test.png │ │ ├── main.py │ │ └── requirements.txt │ └── object-detector-inferenceservice.yaml ├── custom-pytorch-aitextgen │ ├── README.md │ ├── aitextgen-inferenceservice.yaml │ └── custom-predictor │ │ ├── Dockerfile │ │ ├── model.py │ │ └── requirements.txt ├── custom-sentiment │ ├── README.md │ ├── custom-predictor │ │ ├── Dockerfile │ │ ├── model.py │ │ └── requirements.txt │ ├── image-secrets-serviceaccount.patch.yaml │ ├── model-storage-pvc.yaml │ ├── sample.json │ ├── sentiment-inferenceservice.yaml │ └── sleep-deployment.yaml ├── dalle-mini │ ├── 00-model-pvc.yaml │ ├── 01-model-download-job.yaml │ ├── 02-inference-service.yaml │ ├── Dockerfile │ ├── Dockerfile.downloader │ ├── downloader │ │ └── download.py │ └── model │ │ ├── requirements.txt │ │ └── service.py ├── fastertransformer │ ├── README.md │ ├── build │ │ └── Dockerfile │ ├── client │ │ ├── Dockerfile │ │ ├── example.py │ │ ├── gpt_bpe │ │ │ ├── gpt2-merges.txt │ │ │ ├── gpt2-vocab.json │ │ │ └── gpt_token_encoder.py │ │ ├── hf_tokenizer │ │ │ ├── 20B_tokenizer.json │ │ │ └── hf_tokenize.py │ │ ├── requirements.txt │ │ └── sample_request.json │ ├── download-weights-job-gpt-neox.yml │ ├── download-weights-job-gptj.yml │ ├── ft-inference-service-gptj.yml │ ├── ft-inference-service-neox.yml │ └── model-storage-pvc.yml ├── hf-llm │ ├── .dockerignore │ ├── 00-optional-s3-secret.yaml │ ├── 01-optional-s3-serialize-job.yaml │ ├── 02-inference-service.yaml │ ├── Dockerfile │ ├── serializer │ │ ├── requirements.txt │ │ └── serialize.py │ └── service │ │ ├── requirements.txt │ │ └── service.py ├── image-classifier │ ├── jupyter │ │ ├── inception.ipynb │ │ ├── model-storage-pvc.yaml │ │ ├── tensorflow-deployment.yaml │ │ └── 
tensorflow-service.yaml │ ├── service │ │ ├── classifier-inferenceservice.yaml │ │ ├── predict_url.sh │ │ └── test_base64.sh │ └── transformer │ │ ├── Dockerfile │ │ ├── main.py │ │ ├── requirements.txt │ │ ├── test_base64.sh │ │ └── transformer.py ├── overview.png ├── stable-diffusion │ ├── 00-optional-s3-secret.yaml │ ├── 01-optional-s3-serialize-job.yaml │ ├── 02-inference-service.yaml │ ├── Dockerfile │ ├── README.md │ ├── serializer │ │ ├── requirements.txt │ │ └── serialize.py │ └── service │ │ ├── requirements.txt │ │ └── service.py ├── tensorizer-isvc │ ├── README.md │ ├── benchmark │ │ ├── inputs.txt │ │ ├── load_test.py │ │ └── locustfile.py │ ├── model-download │ │ ├── Dockerfile │ │ ├── model-download-job.yaml │ │ ├── model_download.py │ │ └── requirements.txt │ ├── pvc.yaml │ └── tensorizer_hf_isvc │ │ ├── flask │ │ ├── Dockerfile │ │ ├── flask_api.py │ │ ├── hf-isvc.yaml │ │ ├── requirements.txt │ │ └── tensorizer-isvc.yaml │ │ ├── kserve │ │ ├── Dockerfile │ │ ├── hf-isvc.yaml │ │ ├── kserve_api.py │ │ ├── requirements.txt │ │ └── tensorizer-isvc.yaml │ │ └── load_model.py └── vllm │ ├── 00-s3-secret.yaml │ ├── 01-s3-serialize-job.yaml │ ├── 02-inference-service.yaml │ ├── Dockerfile │ └── README.md ├── sd-dreambooth-workflow ├── db-finetune-pvc.yaml ├── db-workflow-event-binding.yaml ├── db-workflow-template.yaml ├── huggingface-secret.yaml ├── inference-role.yaml └── wandb-secret.yaml ├── sd-finetuner-workflow ├── huggingface-secret.yaml ├── inference-role.yaml ├── sd-finetune-pvc.yaml ├── sd-finetune-workflow-event-binding.yaml ├── sd-finetune-workflow-template.yaml ├── sd-finetuner │ ├── Dockerfile │ ├── accelerate_config.yaml │ ├── datasets.py │ ├── finetuner.py │ └── requirements.txt └── wandb-secret.yaml ├── spark ├── cpu-pod-template.yaml ├── docker │ ├── Dockerfile │ ├── download_imgdataset.py │ └── requirements.txt ├── example-spark-submit.sh ├── jupyter │ ├── interactive-example.ipynb │ └── jupyter-service.yaml ├── spark-pvc.yaml ├── spark-role.yaml └── wandb-secret.yaml ├── tensorflow-jupyter ├── README.md ├── jupyter-pvc.yaml ├── screenshot.png ├── tensorflow-deployment.yaml └── tensorflow-service.yaml └── virtual-server ├── clone_block_volume.sh ├── examples ├── curl │ ├── README.md │ ├── run.sh │ └── virtual-server.json ├── go │ ├── .gitignore │ ├── Makefile │ ├── README.md │ ├── go.mod │ ├── go.sum │ └── main.go ├── kubectl │ ├── README.md │ ├── virtual-server-block-pvc.yaml │ ├── virtual-server-cloudinit.yaml │ ├── virtual-server-direct-attach-lb.yaml │ ├── virtual-server-ephemeral-disk.yaml │ ├── virtual-server-ephemeral-root-disk.yaml │ ├── virtual-server-shared-pvc.yaml │ ├── virtual-server-static-mac.yaml │ ├── virtual-server-windows-cpu-only.yaml │ ├── virtual-server-windows-internal-ip-only.yaml │ ├── virtual-server-windows.yaml │ └── virtual-server.yaml ├── nodejs │ ├── Readme.md │ ├── client.js │ ├── main.js │ ├── package-lock.json │ ├── package.json │ └── util.js ├── python │ ├── .gitignore │ ├── README.md │ ├── kubevirtclient.py │ ├── main.py │ └── vsclient.py └── terraform │ ├── README.md │ ├── examples │ └── module-use.tf │ ├── main.tf │ ├── outputs.tf │ ├── variables.tf │ └── vs.tf └── pvc-clone.sh /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CoreWeave Kubernetes Cloud 2 | Documentation lives at 
[https://docs.coreweave.com](https://docs.coreweave.com). This repository contains related examples. 3 | 4 | -------------------------------------------------------------------------------- /argo-workflow/README.md: -------------------------------------------------------------------------------- 1 | ![Argo](argo.png) 2 | ![Screenshot](argo-screenshot.png) 3 | 4 | ### Introduction 5 | [Argo Workflows](https://argoproj.github.io/argo-workflows/) is a great tool to orchestrate parallel execution of GPU jobs. It manages retries and parallelism for you, and allows you to submit workflows via CLI, [Rest API](https://github.com/argoproj/argo/blob/master/examples/rest-examples.md) and the [Kubernetes API](https://github.com/argoproj/argo/blob/master/docs/rest-api.md). 6 | 7 | ### Getting Started 8 | 9 | Please visit the [CoreWeave Docs](https://docs.coreweave.com/workflows/argo). 10 | -------------------------------------------------------------------------------- /argo-workflow/argo-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coreweave/kubernetes-cloud/ed5c832f666badc124f0a12d9c60260920ee9089/argo-workflow/argo-screenshot.png -------------------------------------------------------------------------------- /argo-workflow/argo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coreweave/kubernetes-cloud/ed5c832f666badc124f0a12d9c60260920ee9089/argo-workflow/argo.png -------------------------------------------------------------------------------- /argo-workflow/gpu-say-workflow.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: gpu-say 5 | spec: 6 | entrypoint: main 7 | activeDeadlineSeconds: 300 # Cancel operation if not finished in 5 minutes 8 | ttlSecondsAfterFinished: 86400 # Clean out old workflows after a day 9 | # Parameters can be passed/overridden via the argo CLI. 
10 | # To override the printed message, run `argo submit` with the -p option: 11 | # $ argo submit examples/arguments-parameters.yaml -p messages='["CoreWeave", "Is", "Fun"]' 12 | arguments: 13 | parameters: 14 | - name: messages 15 | value: '["Argo", "Is", "Awesome"]' 16 | 17 | templates: 18 | - name: main 19 | steps: 20 | - - name: echo 21 | template: gpu-echo 22 | arguments: 23 | parameters: 24 | - name: message 25 | value: "{{item}}" 26 | withParam: "{{workflow.parameters.messages}}" 27 | 28 | - name: gpu-echo 29 | inputs: 30 | parameters: 31 | - name: message 32 | retryStrategy: 33 | limit: 1 34 | script: 35 | image: nvidia/cuda:10.2-runtime-ubuntu18.04 36 | command: [bash] 37 | source: | 38 | nvidia-smi 39 | echo "Input was: {{inputs.parameters.message}}" 40 | 41 | resources: 42 | requests: 43 | memory: 128Mi 44 | cpu: 500m # Half a core 45 | limits: 46 | nvidia.com/gpu: 1 # Allocate one GPU 47 | affinity: 48 | nodeAffinity: 49 | requiredDuringSchedulingIgnoredDuringExecution: 50 | # This will REQUIRE the Pod to be run on a system with a GPU with 8, 10 or 11GB VRAM 51 | nodeSelectorTerms: 52 | - matchExpressions: 53 | - key: gpu.nvidia.com/vram 54 | operator: In 55 | values: 56 | - "8" 57 | - "10" 58 | - "11" 59 | -------------------------------------------------------------------------------- /cuda-ssh/README.md: -------------------------------------------------------------------------------- 1 | ## CUDA Development Toolkit with SSH Server 2 | 3 | The guide for this example can be found in the [Documentation](https://docs.coreweave.com/coreweave-kubernetes/examples/cuda-ssh). -------------------------------------------------------------------------------- /cuda-ssh/sshd-data-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: sshd-data-pv-claim 5 | spec: 6 | # https://docs.coreweave.com/storage/storage 7 | storageClassName: block-hdd-ord1 8 | accessModes: 9 | - ReadWriteOnce 10 | resources: 11 | requests: 12 | storage: 500Gi 13 | -------------------------------------------------------------------------------- /cuda-ssh/sshd-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: sshd 5 | spec: 6 | strategy: 7 | type: Recreate 8 | replicas: 1 9 | selector: 10 | matchLabels: 11 | app.kubernetes.io/name: sshd 12 | template: 13 | metadata: 14 | labels: 15 | app.kubernetes.io/name: sshd 16 | spec: 17 | terminationGracePeriodSeconds: 10 18 | initContainers: 19 | - name: init 20 | image: ghcr.io/coreweave/ml-containers/cuda-ssh:209c517-torch-ceeb8c2-nccl-cuda11.8.0-nccl2.16.2-1-torch2.0.1-vision0.15.2-audio2.0.2 21 | command: ["/bin/bash", "-c"] 22 | args: 23 | - | 24 | if [ ! 
-f /target/initialized ]; then 25 | dpkg-reconfigure openssh-server && \ 26 | cp -ax / /target && \ 27 | echo 'Initialization complete' && \ 28 | touch /target/initialized; 29 | fi 30 | resources: 31 | requests: 32 | cpu: 1 33 | memory: 1Gi 34 | volumeMounts: 35 | - name: root-storage 36 | mountPath: /target 37 | 38 | containers: 39 | - name: sshd 40 | command: ["/usr/bin/tini", "--"] 41 | args: ["service", "ssh", "start", "-D"] 42 | tty: true 43 | image: ghcr.io/coreweave/ml-containers/cuda-ssh:209c517-torch-ceeb8c2-nccl-cuda11.8.0-nccl2.16.2-1-torch2.0.1-vision0.15.2-audio2.0.2 44 | ports: 45 | - name: sshd 46 | containerPort: 22 47 | protocol: TCP 48 | volumeMounts: 49 | - name: data-storage 50 | mountPath: /mnt/data 51 | - name: root-storage 52 | mountPath: /bin 53 | subPath: bin 54 | - name: root-storage 55 | mountPath: /boot 56 | subPath: boot 57 | - name: root-storage 58 | mountPath: /etc 59 | subPath: etc 60 | - name: root-storage 61 | mountPath: /home 62 | subPath: home 63 | - name: root-storage 64 | mountPath: /lib 65 | subPath: lib 66 | - name: root-storage 67 | mountPath: /lib64 68 | subPath: lib64 69 | - name: root-storage 70 | mountPath: /opt 71 | subPath: opt 72 | - name: root-storage 73 | mountPath: /root 74 | subPath: root 75 | - name: root-storage 76 | mountPath: /sbin 77 | subPath: sbin 78 | - name: root-storage 79 | mountPath: /srv 80 | subPath: srv 81 | - name: root-storage 82 | mountPath: /usr 83 | subPath: usr 84 | - name: root-storage 85 | mountPath: /var 86 | subPath: var 87 | - name: run-lock 88 | mountPath: /run/lock 89 | 90 | resources: 91 | requests: 92 | cpu: 2500m # The CPU unit is milli-cores. 500m is 0.5 cores 93 | memory: 64Gi 94 | limits: 95 | cpu: 7000m 96 | memory: 128Gi 97 | nvidia.com/gpu: 6 98 | # GPUs can only be allocated as a limit, which both reserves and limits the number of GPUs the Pod will have access to 99 | # Making individual Pods resource light is advantageous for bin-packing. Since this Pod is for general purpose interactive testing 100 | # we allocate 6 GPUs to it 101 | 102 | # Node affinity can be used to require / prefer the Pods to be scheduled on a node with a specific hardware type 103 | # No affinity allows scheduling on all hardware types that can fulfill the resource request. 104 | # In this example, without affinity, any NVIDIA GPU would be allowed to run the Pod. 
105 | # Read more about affinity at: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity 106 | affinity: 107 | nodeAffinity: 108 | # This will REQUIRE the Pod to be run on a system with an RTX A5000 GPU 109 | requiredDuringSchedulingIgnoredDuringExecution: 110 | nodeSelectorTerms: 111 | - matchExpressions: 112 | - key: gpu.nvidia.com/class 113 | operator: In 114 | values: 115 | - RTX_A5000 116 | - key: topology.kubernetes.io/region 117 | operator: In 118 | values: 119 | - ORD1 120 | 121 | volumes: 122 | - name: root-storage 123 | persistentVolumeClaim: 124 | claimName: sshd-root-pv-claim 125 | - name: data-storage 126 | persistentVolumeClaim: 127 | claimName: sshd-data-pv-claim 128 | - name: run-lock 129 | emptyDir: 130 | medium: Memory 131 | restartPolicy: Always 132 | -------------------------------------------------------------------------------- /cuda-ssh/sshd-root-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: sshd-root-pv-claim 5 | spec: 6 | # https://docs.coreweave.com/storage/storage 7 | storageClassName: block-nvme-ord1 8 | accessModes: 9 | - ReadWriteOnce 10 | resources: 11 | requests: 12 | storage: 200Gi 13 | -------------------------------------------------------------------------------- /cuda-ssh/sshd-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | annotations: 5 | metallb.universe.tf/address-pool: public-ord1 6 | # Setting a sharing key might save public IP addresses 7 | # See https://metallb.universe.tf/usage/#ip-address-sharing for more detail 8 | metallb.universe.tf/allow-shared-ip: example-1 9 | name: sshd 10 | spec: 11 | type: LoadBalancer 12 | externalTrafficPolicy: Local 13 | ports: 14 | - name: sshd 15 | port: 22 16 | protocol: TCP 17 | targetPort: sshd 18 | selector: 19 | app.kubernetes.io/name: sshd 20 | -------------------------------------------------------------------------------- /finetuner-workflow/finetune-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: finetune-data 5 | spec: 6 | storageClassName: shared-hdd-ord1 7 | accessModes: 8 | - ReadWriteMany 9 | resources: 10 | requests: 11 | storage: 2000Gi 12 | -------------------------------------------------------------------------------- /finetuner-workflow/finetune-role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: finetune 5 | --- 6 | apiVersion: rbac.authorization.k8s.io/v1 7 | kind: Role 8 | metadata: 9 | name: role:finetune 10 | rules: 11 | - apiGroups: 12 | - "" 13 | resources: 14 | - pods 15 | verbs: 16 | - 'patch' 17 | - apiGroups: 18 | - serving.kubeflow.org 19 | resources: 20 | - inferenceservices 21 | verbs: 22 | - '*' 23 | - apiGroups: 24 | - serving.knative.dev 25 | resources: 26 | - services 27 | - revisions 28 | verbs: 29 | - '*' 30 | --- 31 | apiVersion: rbac.authorization.k8s.io/v1 32 | kind: RoleBinding 33 | metadata: 34 | name: rolebinding:finetune-finetune 35 | roleRef: 36 | apiGroup: rbac.authorization.k8s.io 37 | kind: Role 38 | name: role:finetune 39 | subjects: 40 | - kind: ServiceAccount 41 | name: finetune 42 | -------------------------------------------------------------------------------- 
/finetuner-workflow/finetuner/Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:1.2 2 | 3 | ARG BASE_IMAGE=ghcr.io/coreweave/ml-containers/torch:afecfe9-base-cuda11.8.0-torch2.0.0-vision0.15.1 4 | 5 | # Dependencies requiring NVCC are built ahead of time in a separate stage 6 | # so that the ~2 GiB dev library installations don't have to be included 7 | # in the final finetuner image. 8 | # gcc-10/g++-10/lld do not need to be installed here, but they improve the build. 9 | # gfortran-10 is just for compiler_wrapper.f95. 10 | FROM ${BASE_IMAGE} as builder 11 | RUN apt-get install -y --no-install-recommends \ 12 | cuda-nvcc-11-8 cuda-nvml-dev-11-8 libcurand-dev-11-8 \ 13 | libcublas-dev-11-8 libcusparse-dev-11-8 \ 14 | libcusolver-dev-11-8 cuda-nvprof-11-8 \ 15 | cuda-profiler-api-11-8 \ 16 | ninja-build \ 17 | gcc-10 g++-10 gfortran-10 lld && \ 18 | apt-get clean && \ 19 | update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 10 && \ 20 | update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 10 && \ 21 | update-alternatives --install \ 22 | /usr/bin/gfortran gfortran /usr/bin/gfortran-10 10 && \ 23 | update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld 1 24 | RUN mkdir /wheels 25 | WORKDIR /wheels 26 | COPY compiler_wrapper.f95 . 27 | COPY requirements-precompilable.txt . 28 | RUN gfortran -O3 ./compiler_wrapper.f95 -o ./compiler && \ 29 | python3 -m pip install -U --no-cache-dir \ 30 | packaging setuptools wheel pip && \ 31 | DS_BUILD_UTILS=1 DS_BUILD_CPU_ADAM=1 \ 32 | CC=$(realpath ./compiler) python3 -m pip wheel \ 33 | --no-cache-dir --no-build-isolation --no-deps \ 34 | -r requirements-precompilable.txt && \ 35 | rm ./compiler_wrapper.f95 ./compiler ./requirements-precompilable.txt 36 | 37 | FROM ${BASE_IMAGE} 38 | RUN mkdir /app 39 | WORKDIR /app 40 | RUN --mount=type=bind,from=builder,source=/wheels,target=. \ 41 | pip3 install --no-cache-dir ./*.whl 42 | COPY requirements.txt . 43 | COPY requirements-precompilable.txt . 44 | RUN pip3 install --no-cache-dir -r requirements.txt 45 | COPY ds_config.json . 46 | COPY finetuner.py . 47 | COPY evaluator.py . 48 | COPY inference.py . 49 | COPY utils.py . 50 | CMD [ "/usr/bin/python3", "finetuner.py" ] 51 | -------------------------------------------------------------------------------- /finetuner-workflow/finetuner/compiler_wrapper.f95: -------------------------------------------------------------------------------- 1 | PROGRAM compiler_wrapper 2 | ! Wraps GCC invocations, 3 | ! replacing -D__AVX512__ and -D__SCALAR__ preprocessor definitions 4 | ! with -D__AVX256__, and -march=native with -march=skylake, 5 | ! for better reproducibility and compatibility. 6 | IMPLICIT NONE 7 | INTEGER :: i, exitcode = 0, full_length = 0, truncated = 0 8 | CHARACTER(len=:), ALLOCATABLE :: arg, command 9 | ALLOCATE(CHARACTER(len=128) :: arg) 10 | command = "gcc" 11 | 12 | DO i = 1, COMMAND_ARGUMENT_COUNT() 13 | DO 14 | CALL GET_COMMAND_ARGUMENT(i, arg, full_length, truncated) 15 | IF (truncated == 0) THEN 16 | EXIT 17 | ELSEIF (truncated == -1) THEN 18 | DEALLOCATE(arg) 19 | ALLOCATE(CHARACTER(len=full_length) :: arg) 20 | ELSE 21 | CALL EXIT(95) 22 | END IF 23 | END DO 24 | IF (arg == "-march=native") THEN 25 | command = command // " '-march=skylake'" 26 | ELSEIF (arg == "-D__AVX512__" .OR. 
arg == "-D__SCALAR__") THEN 27 | command = command // " '-D__AVX256__'" 28 | ELSE 29 | command = command // shell_escaped(arg) 30 | END IF 31 | END DO 32 | CALL SYSTEM(command, exitcode) 33 | IF (exitcode > 255) THEN 34 | exitcode = MAX(IAND(exitcode, 255), 1) 35 | ENDIF 36 | CALL EXIT(exitcode) 37 | 38 | 39 | CONTAINS 40 | FUNCTION shell_escaped(str) result(out) 41 | ! Turns [str] into [ 'str'] and replaces all 42 | ! internal ['] characters with ['"'"'] 43 | IMPLICIT NONE 44 | CHARACTER(len=*), INTENT(IN) :: str 45 | CHARACTER(len=:), ALLOCATABLE :: out 46 | INTEGER :: i, out_i, old_len, out_len 47 | 48 | old_len = LEN_TRIM(str) 49 | ! Figure out the new length to allocate by scanning `str`. 50 | ! This always needs to add at least [ '] at the beginning 51 | ! and ['] at the end, so the length increases by at least 3. 52 | out_len = old_len + 3 53 | DO i = 1, old_len 54 | IF (str(i:i) == "'") THEN 55 | out_len = out_len + 4 56 | END IF 57 | END DO 58 | ALLOCATE(CHARACTER(len=out_len) :: out) 59 | 60 | ! Copy over the string, performing necessary escapes. 61 | out(1:2) = " '" 62 | out_i = 3 63 | DO i = 1, old_len 64 | IF (str(i:i) == "'") THEN 65 | ! Escape internal single-quotes 66 | out(out_i:out_i + 4) = '''"''"''' 67 | out_i = out_i + 5 68 | ELSE 69 | ! No escaping needed 70 | out(out_i:out_i) = str(i:i) 71 | out_i = out_i + 1 72 | END IF 73 | END DO 74 | out(out_i:out_i) = "'" 75 | END FUNCTION 76 | END PROGRAM 77 | -------------------------------------------------------------------------------- /finetuner-workflow/finetuner/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "optimizer": { 11 | "type": "AdamW", 12 | "params": { 13 | "lr": "auto", 14 | "betas": "auto", 15 | "eps": "auto", 16 | "weight_decay": "auto" 17 | } 18 | }, 19 | "scheduler": { 20 | "type": "WarmupLR", 21 | "params": { 22 | "warmup_min_lr": "auto", 23 | "warmup_max_lr": "auto", 24 | "warmup_num_steps": "auto" 25 | } 26 | }, 27 | "zero_optimization": { 28 | "stage": 3, 29 | "allgather_partitions": true, 30 | "allgather_bucket_size": 2e8, 31 | "overlap_comm": true, 32 | "reduce_scatter": true, 33 | "reduce_bucket_size": 2e8, 34 | "contiguous_gradients": true, 35 | "offload_optimizer": { 36 | "device": "cpu" 37 | }, 38 | "offload_param": { 39 | "device": "cpu" 40 | }, 41 | "stage3_gather_16bit_weights_on_model_save": true 42 | }, 43 | "gradient_accumulation_steps": "auto", 44 | "gradient_clipping": "auto", 45 | "communication_data_type": "fp32", 46 | "steps_per_print": 1000000000000000, 47 | "train_batch_size": "auto", 48 | "train_micro_batch_size_per_gpu": "auto", 49 | "wall_clock_breakdown": false 50 | } 51 | -------------------------------------------------------------------------------- /finetuner-workflow/finetuner/inference.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | 3 | import torch 4 | import uvicorn 5 | from fastapi import FastAPI 6 | from fastapi.middleware.cors import CORSMiddleware 7 | from pydantic import BaseModel 8 | from transformers import pipeline 9 | 10 | from utils import DashParser 11 | from utils import validation as val 12 | 13 | parser = DashParser(description="Text model inference HTTP server") 14 | 15 | parser.add_argument( 16 | "--model", 17 | type=str, 18 | default="distilgpt2", 19 
| help="Model to use for inference (directory, or HuggingFace ID) [default = distilgpt2]", 20 | ) 21 | parser.add_argument( 22 | "--device-id", 23 | type=val.non_negative(int, special_val=-1), 24 | default=0, 25 | help="GPU ID to use for inference, or -1 for CPU [default = 0]", 26 | ) 27 | parser.add_argument( 28 | "--port", 29 | type=val.non_negative(int), 30 | default=80, 31 | help="Port to listen on [default = 80 (http)]", 32 | ) 33 | parser.add_argument( 34 | "--ip", 35 | type=str, 36 | default="0.0.0.0", 37 | help="IP address to listen on [default = 0.0.0.0 (all interfaces)]", 38 | ) 39 | 40 | args = parser.parse_args() 41 | 42 | 43 | class Completion(BaseModel): 44 | prompt: str 45 | max_new_tokens: Optional[int] = 10 46 | temperature: Optional[float] = None 47 | top_p: Optional[float] = None 48 | top_k: Optional[int] = None 49 | typical_p: Optional[float] = None 50 | repetition_penalty: Optional[float] = None 51 | do_sample: Optional[bool] = True 52 | penalty_alpha: Optional[float] = None 53 | num_return_sequences: Optional[int] = 1 54 | stop_sequence: Optional[str] = None 55 | bad_words: Optional[List] = None 56 | 57 | 58 | app = FastAPI(title="Inference API") 59 | 60 | app.add_middleware( 61 | CORSMiddleware, 62 | allow_origins=["*"], 63 | allow_methods=["*"], 64 | allow_headers=["*"], 65 | ) 66 | 67 | model = pipeline( 68 | "text-generation", 69 | model=args.model, 70 | torch_dtype=None if args.device_id == -1 else torch.float16, 71 | device=args.device_id, 72 | ) 73 | 74 | 75 | @app.get("/") 76 | def get_health(): 77 | return "OK" 78 | 79 | 80 | @app.post("/completion") 81 | def completion(completion: Completion): 82 | try: 83 | return model( 84 | completion.prompt, 85 | max_new_tokens=completion.max_new_tokens, 86 | temperature=completion.temperature, 87 | top_p=completion.top_p, 88 | top_k=completion.top_k, 89 | repetition_penalty=completion.repetition_penalty, 90 | do_sample=completion.do_sample, 91 | penalty_alpha=completion.penalty_alpha, 92 | num_return_sequences=completion.num_return_sequences, 93 | stop_sequence=completion.stop_sequence, 94 | ) 95 | except Exception as e: 96 | return {"error": str(e)} 97 | 98 | 99 | if __name__ == "__main__": 100 | uvicorn.run("inference:app", host=args.ip, port=args.port) 101 | -------------------------------------------------------------------------------- /finetuner-workflow/finetuner/requirements-precompilable.txt: -------------------------------------------------------------------------------- 1 | deepspeed==0.9.2 2 | flash-attn==1.0.4 3 | einops==0.6.1 -------------------------------------------------------------------------------- /finetuner-workflow/finetuner/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers~=4.28.1 2 | numpy~=1.24.2 3 | wandb~=0.14.0 4 | torch==2.0.0 5 | psutil==5.9.4 6 | accelerate~=0.17.1 7 | tensorizer==1.1.0 8 | fastapi==0.85.1 9 | uvicorn==0.19.0 10 | -r requirements-precompilable.txt -------------------------------------------------------------------------------- /kubeflow/training-operator/gpt-neox/01-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: neox-checkpoints 5 | spec: 6 | storageClassName: shared-nvme-las1 7 | accessModes: 8 | - ReadWriteMany 9 | resources: 10 | requests: 11 | storage: 512Gi 12 | --- 13 | apiVersion: v1 14 | kind: PersistentVolumeClaim 15 | metadata: 16 | name: neox-data 17 | spec: 18 | 
storageClassName: shared-hdd-las1 19 | accessModes: 20 | - ReadWriteMany 21 | resources: 22 | requests: 23 | storage: 64Gi 24 | -------------------------------------------------------------------------------- /kubeflow/training-operator/gpt-neox/02-finetune-role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: finetune 5 | --- 6 | apiVersion: rbac.authorization.k8s.io/v1 7 | kind: Role 8 | metadata: 9 | name: role:finetune 10 | rules: 11 | # Permissions for the config map step 12 | - apiGroups: 13 | - "" 14 | resources: 15 | - "configmaps" 16 | verbs: 17 | - 'patch' 18 | - 'create' 19 | - 'get' 20 | 21 | # Permissions for the finetune step 22 | - apiGroups: 23 | - "kubeflow.org" 24 | resources: 25 | - "mpijobs" 26 | verbs: 27 | - "create" 28 | - "get" 29 | --- 30 | apiVersion: rbac.authorization.k8s.io/v1 31 | kind: RoleBinding 32 | metadata: 33 | name: rolebinding:finetune-finetune 34 | roleRef: 35 | apiGroup: rbac.authorization.k8s.io 36 | kind: Role 37 | name: role:finetune 38 | subjects: 39 | - kind: ServiceAccount 40 | name: finetune 41 | -------------------------------------------------------------------------------- /kubeflow/training-operator/gpt-neox/03-wanbd-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | token: enterYourSecret== 4 | kind: Secret 5 | metadata: 6 | name: wandb-token-secret 7 | type: Opaque 8 | -------------------------------------------------------------------------------- /kubeflow/training-operator/resnet50/Dockerfile.mpi: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:22.12-py3 2 | 3 | RUN HOROVOD_GPU_OPERATIONS=NCCL pip install tensorboardX horovod[pytorch] filelock wandb 4 | RUN mkdir -p /opt/resnet50 5 | 6 | RUN chgrp -R 0 /opt/resnet50 \ 7 | && chmod -R g+rwX /opt/resnet50 8 | 9 | RUN apt-get -qq update && \ 10 | apt-get -qq install -y --allow-change-held-packages --no-install-recommends \ 11 | openssh-server 12 | 13 | # SSH dependencies for MPI 14 | RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ 15 | echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ 16 | sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ 17 | mkdir /var/run/sshd -p 18 | 19 | WORKDIR /opt/resnet50/src 20 | ADD resnet50_horovod.py /opt/resnet50/src/resnet50.py 21 | ADD util.py /opt/resnet50/src/util.py 22 | -------------------------------------------------------------------------------- /kubeflow/training-operator/resnet50/Dockerfile.pytorch: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:22.12-py3 2 | 3 | RUN pip install tensorboardX filelock wandb 4 | RUN mkdir -p /opt/resnet50 5 | 6 | WORKDIR /opt/resnet50/src 7 | ADD resnet50_pytorch.py /opt/resnet50/src/resnet50.py 8 | ADD util.py /opt/resnet50/src/util.py 9 | 10 | RUN chgrp -R 0 /opt/resnet50 \ 11 | && chmod -R g+rwX /opt/resnet50 12 | 13 | ENTRYPOINT ["python", "/opt/resnet50/src/resnet50.py"] 14 | -------------------------------------------------------------------------------- /kubeflow/training-operator/resnet50/k8s/imagenet-download-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: imagenet-download 5 | spec: 6 | template: 7 | spec: 8 | containers: 9 | - 
name: model-downloader 10 | image: python:3.8 11 | imagePullPolicy: IfNotPresent 12 | command: [ "bash", "-c" ] 13 | args: 14 | - apt-get install unzip; 15 | pip install kaggle; 16 | mkdir /mnt/pvc/dataset; 17 | kaggle competitions download -c imagenet-object-localization-challenge -p /mnt/pvc/dataset; 18 | unzip /mnt/pvc/dataset/imagenet-object-localization-challenge.zip -d /mnt/pvc/dataset; 19 | rm /mnt/pvc/dataset/imagenet-object-localization-challenge.zip; 20 | cd /mnt/pvc/dataset/ILSVRC/Data/CLS-LOC/val/; 21 | wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash 22 | env: 23 | - name: KAGGLE_KEY 24 | valueFrom: 25 | secretKeyRef: 26 | name: kaggle-token-secret 27 | key: token 28 | - name: KAGGLE_USERNAME 29 | value: navarreprattcw 30 | volumeMounts: 31 | - name: kubeflow-imagenet 32 | mountPath: /mnt/pvc 33 | resources: 34 | requests: 35 | cpu: 1 36 | memory: 4Gi 37 | limits: 38 | cpu: 1 39 | memory: 4Gi 40 | volumes: 41 | - name: kubeflow-imagenet 42 | persistentVolumeClaim: 43 | claimName: kubeflow-imagenet 44 | affinity: 45 | nodeAffinity: 46 | requiredDuringSchedulingIgnoredDuringExecution: 47 | nodeSelectorTerms: 48 | - matchExpressions: 49 | - key: topology.kubernetes.io/region 50 | operator: In 51 | values: 52 | - LAS1 53 | restartPolicy: Never 54 | backoffLimit: 2 55 | -------------------------------------------------------------------------------- /kubeflow/training-operator/resnet50/k8s/imagenet-mpijob.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: "kubeflow.org/v2beta1" 2 | kind: "MPIJob" 3 | metadata: 4 | name: "imagenet-16gpu-mpijob" 5 | spec: 6 | slotsPerWorker: 8 7 | runPolicy: 8 | cleanPodPolicy: Running 9 | mpiReplicaSpecs: 10 | Launcher: 11 | replicas: 1 12 | restartPolicy: OnFailure 13 | template: 14 | spec: 15 | containers: 16 | - name: pytorch 17 | image: navarrepratt/pytorch_mpi_resnet50:6 18 | command: 19 | - "mpirun" 20 | - "-np" 21 | - "16" # Total processes = num workers * slots per workers 22 | - "-x" 23 | - "WANDB_API_KEY=$(WANDB_API_KEY)" 24 | - "-x" 25 | - "NCCL_DEBUG=INFO" 26 | - "--allow-run-as-root" 27 | - "python" 28 | - "/opt/resnet50/src/resnet50.py" 29 | - "--data-dir" 30 | - "/mnt/pvc/dataset/ILSVRC/Data/CLS-LOC" 31 | - "--model-dir" 32 | - "/mnt/pvc/mpi/checkpoints" 33 | - "--epochs" 34 | - "10" 35 | - "--batch-size" 36 | - "256" 37 | - "--wandb-project" 38 | - "resnet50-imagenet-horovod" 39 | - "--wandb-run" 40 | - "a40-16gpu" 41 | resources: 42 | requests: 43 | cpu: 2 44 | memory: 128Mi 45 | env: 46 | - name: WANDB_API_KEY 47 | valueFrom: 48 | secretKeyRef: 49 | name: wandb-token-secret 50 | key: token 51 | affinity: 52 | nodeAffinity: 53 | requiredDuringSchedulingIgnoredDuringExecution: 54 | nodeSelectorTerms: 55 | - matchExpressions: 56 | - key: failure-domain.beta.kubernetes.io/region 57 | operator: In 58 | values: 59 | - LAS1 60 | 61 | Worker: 62 | replicas: 2 63 | restartPolicy: OnFailure 64 | template: 65 | spec: 66 | containers: 67 | - name: pytorch 68 | image: navarrepratt/pytorch_mpi_resnet50:6 69 | resources: # Use the full node 70 | requests: 71 | cpu: 90 72 | memory: 700G 73 | nvidia.com/gpu: 8 74 | limits: 75 | cpu: 90 76 | memory: 700G 77 | nvidia.com/gpu: 8 78 | volumeMounts: 79 | - name: kubeflow-resnet50 80 | mountPath: /mnt/pvc 81 | - name: dshm 82 | mountPath: /dev/shm 83 | volumes: 84 | - name: kubeflow-resnet50 85 | persistentVolumeClaim: 86 | claimName: kubeflow-resnet50 87 | - emptyDir: 88 | medium: Memory 89 | name: dshm 
90 | affinity: 91 | nodeAffinity: 92 | requiredDuringSchedulingIgnoredDuringExecution: 93 | nodeSelectorTerms: 94 | - matchExpressions: 95 | - key: gpu.nvidia.com/model 96 | operator: In 97 | values: 98 | - A40 99 | - key: failure-domain.beta.kubernetes.io/region 100 | operator: In 101 | values: 102 | - LAS1 103 | -------------------------------------------------------------------------------- /kubeflow/training-operator/resnet50/k8s/kaggle-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | token: enterYourSecret== 4 | kind: Secret 5 | metadata: 6 | name: kaggle-token-secret 7 | type: Opaque 8 | -------------------------------------------------------------------------------- /kubeflow/training-operator/resnet50/k8s/model-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: kubeflow-imagenet 5 | spec: 6 | storageClassName: shared-hdd-las1 7 | accessModes: 8 | - ReadWriteMany 9 | resources: 10 | requests: 11 | storage: 1000Gi 12 | -------------------------------------------------------------------------------- /kubeflow/training-operator/resnet50/k8s/wanbd-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | token: enterYourSecret== 4 | kind: Secret 5 | metadata: 6 | name: wandb-token-secret 7 | type: Opaque 8 | -------------------------------------------------------------------------------- /online-inference/bloom-176b-deepspeed/00-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: microsoft-bloom-deepspeed-inference-fp16 5 | spec: 6 | storageClassName: shared-nvme-las1 7 | accessModes: 8 | - ReadWriteMany 9 | resources: 10 | requests: 11 | storage: 350Gi 12 | -------------------------------------------------------------------------------- /online-inference/bloom-176b-deepspeed/01-download-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: microsoft-bloom-deepspeed-inference-fp16-download 5 | spec: 6 | template: 7 | spec: 8 | containers: 9 | - name: model-downloader 10 | image: tweldoncw/huggingface-hub-downloader:2 11 | imagePullPolicy: IfNotPresent 12 | command: 13 | - "python3" 14 | - "/app/download.py" 15 | - "--model-id=microsoft/bloom-deepspeed-inference-fp16" 16 | - "--revision=main" 17 | env: 18 | - name: HF_HOME 19 | value: /mnt/models 20 | volumeMounts: 21 | - name: model-cache 22 | mountPath: /mnt/models 23 | resources: 24 | requests: 25 | cpu: 1 26 | memory: 4Gi 27 | limits: 28 | cpu: 1 29 | memory: 4Gi 30 | volumes: 31 | - name: model-cache 32 | persistentVolumeClaim: 33 | claimName: microsoft-bloom-deepspeed-inference-fp16 34 | affinity: 35 | nodeAffinity: 36 | requiredDuringSchedulingIgnoredDuringExecution: 37 | nodeSelectorTerms: 38 | - matchExpressions: 39 | - key: topology.kubernetes.io/region 40 | operator: In 41 | values: 42 | - LAS1 43 | restartPolicy: Never 44 | backoffLimit: 2 45 | -------------------------------------------------------------------------------- /online-inference/bloom-176b-deepspeed/02-inference-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.kubeflow.org/v1beta1 2 | kind: InferenceService 3 | metadata: 4 | 
name: microsoft-bloom-deepspeed-inference-fp16 5 | spec: 6 | predictor: 7 | containerConcurrency: 1 8 | minReplicas: 1 9 | maxReplicas: 1 10 | affinity: 11 | nodeAffinity: 12 | requiredDuringSchedulingIgnoredDuringExecution: 13 | nodeSelectorTerms: 14 | - matchExpressions: 15 | - key: topology.kubernetes.io/region 16 | operator: In 17 | values: 18 | - LAS1 19 | - key: gpu.nvidia.com/class 20 | operator: In 21 | values: 22 | - A100_NVLINK_80GB 23 | containers: 24 | - name: kfserving-container 25 | image: tweldoncw/microsoft-bloom-deepspeed-inference-fp16:7 26 | command: 27 | - "/usr/bin/bash" 28 | - "server.sh" 29 | ports: 30 | - containerPort: 5000 31 | protocol: TCP 32 | env: 33 | - name: STORAGE_URI # Kserve mounts the PVC at /mnt/pvc/ 34 | value: pvc://microsoft-bloom-deepspeed-inference-fp16/ 35 | - name: HF_HOME 36 | value: /mnt/models 37 | resources: 38 | limits: 39 | cpu: 12 40 | memory: 64Gi 41 | nvidia.com/gpu: 8 42 | requests: 43 | cpu: 12 44 | memory: 64Gi 45 | nvidia.com/gpu: 8 46 | -------------------------------------------------------------------------------- /online-inference/bloom-176b-deepspeed/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG CUDA_RELEASE=11.6.2-cudnn8-devel-ubuntu20.04 2 | FROM nvidia/cuda:${CUDA_RELEASE} 3 | ENV DEBIAN_FRONTEND=noninteractive 4 | ADD files/ /app 5 | WORKDIR /app 6 | RUN apt -y update && \ 7 | apt -y upgrade && \ 8 | apt install -y git python3 python3-pip python3-mpi4py && \ 9 | pip install --no-cache-dir -r requirements.txt && \ 10 | pip install --no-cache-dir git+https://github.com/microsoft/DeepSpeed-MII && \ 11 | git clone https://github.com/huggingface/transformers-bloom-inference.git 12 | WORKDIR /app/transformers-bloom-inference/bloom-inference-server 13 | RUN git checkout bd8af12 && \ 14 | git apply /app/isvc-patch.txt && \ 15 | chmod +x server.sh 16 | -------------------------------------------------------------------------------- /online-inference/bloom-176b-deepspeed/Dockerfile.downloader: -------------------------------------------------------------------------------- 1 | FROM python:3.9.13-alpine3.16 2 | RUN mkdir /app 3 | ADD downloader/ /app/ 4 | WORKDIR /app 5 | RUN pip3 install --no-cache-dir --upgrade pip 6 | RUN pip3 install --no-cache-dir -r requirements.txt 7 | CMD ["python3", "/app/download.py"] 8 | -------------------------------------------------------------------------------- /online-inference/bloom-176b-deepspeed/downloader/download.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import huggingface_hub as hf 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('--model-id', required=True) 7 | parser.add_argument('--revision', default="main") 8 | args = parser.parse_args() 9 | 10 | hf.snapshot_download(repo_id=args.model_id, revision=args.revision) 11 | -------------------------------------------------------------------------------- /online-inference/bloom-176b-deepspeed/downloader/requirements.txt: -------------------------------------------------------------------------------- 1 | huggingface_hub==0.8.1 -------------------------------------------------------------------------------- /online-inference/bloom-176b-deepspeed/files/isvc-patch.txt: -------------------------------------------------------------------------------- 1 | diff --git a/bloom-inference-server/models/model.py b/bloom-inference-server/models/model.py 2 | index a16c8bb..0c8d69c 100644 3 | --- 
a/bloom-inference-server/models/model.py 4 | +++ b/bloom-inference-server/models/model.py 5 | @@ -18,7 +18,8 @@ class Model: 6 | raise NotImplementedError("This is a dummy class") 7 | 8 | def generate(self, request: GenerateRequest) -> GenerateResponse: 9 | - input_tokens = self.tokenizer(request.text, return_tensors="pt", padding=True) 10 | + input_tokens = self.tokenizer( 11 | + request.text, return_tensors="pt", padding=True) 12 | 13 | for t in input_tokens: 14 | if torch.is_tensor(input_tokens[t]): 15 | @@ -58,14 +59,17 @@ class Model: 16 | 17 | input_token_lengths = [x.shape[0] for x in input_tokens.input_ids] 18 | output_token_lengths = [x.shape[0] for x in output_tokens] 19 | - generated_tokens = [o - i for i, o in zip(input_token_lengths, output_token_lengths)] 20 | + generated_tokens = [o - i for i, 21 | + o in zip(input_token_lengths, output_token_lengths)] 22 | 23 | if request.remove_input_from_output: 24 | # the generate method's output includes input too. Remove input if 25 | # that is requested by the user 26 | - output_tokens = [x[-i:] for x, i in zip(output_tokens, generated_tokens)] 27 | + output_tokens = [x[-i:] 28 | + for x, i in zip(output_tokens, generated_tokens)] 29 | 30 | - output_text = self.tokenizer.batch_decode(output_tokens, skip_special_tokens=True) 31 | + output_text = self.tokenizer.batch_decode( 32 | + output_tokens, skip_special_tokens=True) 33 | 34 | return GenerateResponse(text=output_text, num_generated_tokens=generated_tokens) 35 | 36 | @@ -79,14 +83,31 @@ class Model: 37 | 38 | 39 | def get_downloaded_model_path(model_name: str): 40 | - f = partial( 41 | - snapshot_download, 42 | - repo_id=model_name, 43 | - allow_patterns=["*"], 44 | - local_files_only=is_offline_mode(), 45 | - cache_dir=os.getenv("TRANSFORMERS_CACHE", None), 46 | - ) 47 | - # download only on 1 process 48 | - run_rank_n(f, barrier=True) 49 | - # now since the snapshot is downloaded, pass the model_path to all processes 50 | - return f() 51 | + # Modified to not use snapshot_download() which requires write permissions. 52 | + # InferenceServices mount PVC's as read-only. 
53 | + model_id_split = model_name.split('/') 54 | + model_org = model_id_split[0] 55 | + model_repo = model_id_split[1] 56 | + 57 | + model_directory = ( 58 | + "models" + 59 | + "--" + 60 | + model_org + 61 | + "--" + 62 | + model_repo 63 | + ) 64 | + 65 | + HF_HOME = os.getenv('HF_HOME', '/mnt/models') 66 | + HUB_CACHE = os.path.join(HF_HOME, "hub") 67 | + MODEL_REVISION = os.getenv('MODEL_REVISION', 'main') 68 | + 69 | + model_path = os.path.join(HUB_CACHE, model_directory) 70 | + 71 | + model_ref_path = os.path.join(model_path, 'refs', MODEL_REVISION) 72 | + 73 | + with open(model_ref_path, 'r') as f: 74 | + model_git_ref = f.readlines()[0] 75 | + 76 | + model_snapshot_path = os.path.join(model_path, "snapshots", model_git_ref) 77 | + 78 | + return model_snapshot_path 79 | diff --git a/bloom-inference-server/server.sh b/bloom-inference-server/server.sh 80 | old mode 100644 81 | new mode 100755 82 | index 92179fb..0dcf756 83 | --- a/bloom-inference-server/server.sh 84 | +++ b/bloom-inference-server/server.sh 85 | @@ -1,5 +1,5 @@ 86 | -export MODEL_NAME=bigscience/bloom 87 | -export DEPLOYMENT_FRAMEWORK=hf_accelerate 88 | +export MODEL_NAME=microsoft/bloom-deepspeed-inference-fp16 89 | +export DEPLOYMENT_FRAMEWORK=ds_inference 90 | export DTYPE=fp16 91 | 92 | # for more information on gunicorn see https://docs.gunicorn.org/en/stable/settings.html 93 | -------------------------------------------------------------------------------- /online-inference/bloom-176b-deepspeed/files/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision 3 | torchaudio 4 | --extra-index-url https://download.pytorch.org/whl/cu116 5 | transformers==4.21.3 6 | accelerate==0.12.0 7 | deepspeed>=0.7.3 8 | flask==2.2.2 9 | flask_api==3.0.post1 10 | gunicorn==20.1.0 11 | pydantic==1.10.2 12 | huggingface_hub>=0.9.0 13 | -------------------------------------------------------------------------------- /online-inference/bloom-176b/00-bloom-176b-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: model-cache 5 | spec: 6 | storageClassName: shared-nvme-ord1 7 | accessModes: 8 | - ReadWriteMany 9 | resources: 10 | requests: 11 | storage: 350Gi 12 | 13 | 14 | -------------------------------------------------------------------------------- /online-inference/bloom-176b/01-bloom-176b-download-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: bloom-176b-download 5 | spec: 6 | template: 7 | spec: 8 | containers: 9 | - name: model-downloader 10 | image: tweldoncw/bloom-176b:1 11 | imagePullPolicy: IfNotPresent 12 | command: ["bash", "-c"] 13 | args: 14 | - 'download_model bloom /mnt/pvc' 15 | volumeMounts: 16 | - name: cache 17 | mountPath: /mnt/pvc 18 | resources: 19 | requests: 20 | cpu: 1 21 | memory: 4Gi 22 | limits: 23 | cpu: 1 24 | memory: 4Gi 25 | volumes: 26 | - name: cache 27 | persistentVolumeClaim: 28 | claimName: model-cache 29 | affinity: 30 | nodeAffinity: 31 | requiredDuringSchedulingIgnoredDuringExecution: 32 | nodeSelectorTerms: 33 | - matchExpressions: 34 | - key: topology.kubernetes.io/region 35 | operator: In 36 | values: 37 | - ORD1 38 | restartPolicy: Never 39 | backoffLimit: 2 40 | -------------------------------------------------------------------------------- /online-inference/bloom-176b/02-bloom-176b-inferenceservice.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: serving.kubeflow.org/v1beta1 2 | kind: InferenceService 3 | metadata: 4 | name: bloom-176b 5 | spec: 6 | predictor: 7 | containerConcurrency: 1 8 | minReplicas: 1 9 | maxReplicas: 1 10 | affinity: 11 | nodeAffinity: 12 | requiredDuringSchedulingIgnoredDuringExecution: 13 | nodeSelectorTerms: 14 | - matchExpressions: 15 | - key: gpu.nvidia.com/class 16 | operator: In 17 | values: 18 | - A100_PCIE_80GB 19 | - key: topology.kubernetes.io/region 20 | operator: In 21 | values: 22 | - ORD1 23 | containers: 24 | - name: kserve-container 25 | image: tweldoncw/bloom-176b:1 26 | command: 27 | - "python3" 28 | - "/workspace/bloom.py" 29 | env: 30 | # The following values are defaults which may be changed as needed 31 | - name: MODEL_PATH 32 | value: "/mnt/pvc/bloom" 33 | - name: STORAGE_URI # Kserve mounts the PVC at /mnt/pvc/ 34 | value: pvc://model-cache/ 35 | - name: MODEL_DOWNLOAD_TIMEOUT 36 | value: "3600" 37 | # The following values are defaults which may be changed as needed here, as well in each predictor request. 38 | - name: MIN_LENGTH 39 | value: "1" 40 | - name: MAX_LENGTH 41 | value: "40" 42 | - name: TEMPERATURE 43 | value: "1.0" 44 | - name: TOP_K 45 | value: "50" 46 | - name: TOP_P 47 | value: "1.0" 48 | - name: REPETITION_PENALTY 49 | value: "1.125" 50 | resources: 51 | requests: 52 | cpu: 12 53 | memory: 64Gi 54 | nvidia.com/gpu: 5 55 | limits: 56 | cpu: 12 57 | memory: 64Gi 58 | nvidia.com/gpu: 5 59 | -------------------------------------------------------------------------------- /online-inference/bloom-176b/model/Dockerfile: -------------------------------------------------------------------------------- 1 | # PyTorch and Hugging Face 2 | FROM pytorch/pytorch:1.12.0-cuda11.3-cudnn8-runtime AS pytorch-huggingface 3 | 4 | # Upgrade packages 5 | RUN apt update && apt upgrade -y 6 | 7 | RUN apt update && apt install -y curl git wget zip tree 8 | 9 | ADD requirements.txt /tmp/ 10 | RUN pip3 install -r /tmp/requirements.txt && \ 11 | # Remove Apache Log4j 2 CVE-2021-44228, ray 1.9.1 has not upgraded log4j as they promised \ 12 | rm -rf /opt/conda/lib/python3.7/site-packages/ray/jars 13 | 14 | ADD scripts/ /usr/bin/ 15 | ADD bloom.py /workspace 16 | 17 | #RUN mkdir -p /inference 18 | #WORKDIR /inference 19 | # 20 | #ADD huggingface/wiki_corpus.txt huggingface/wiki_corpus.py ./ 21 | #ADD huggingface/huggingface.py ./ 22 | -------------------------------------------------------------------------------- /online-inference/bloom-176b/model/bloom.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import re 4 | import logging 5 | import kserve 6 | from typing import Dict 7 | 8 | from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline 9 | import torch 10 | 11 | mem_map = {0: '71GIB', 1: '71GIB', 2: '71GIB', 3: '71GIB', 4: '71GIB'} 12 | 13 | model_id = os.getenv('MODEL_ID', "bigscience/bloom") 14 | 15 | options = { 16 | 'MODEL_PATH': os.getenv('MODEL_PATH', "/mnt/pvc/bloom"), 17 | 'MODEL_NAME': re.sub(r'[^\w-]', '-', model_id).lower(), 18 | 'MODEL_TYPE': os.getenv('MODEL_TYPE', 'text-generation'), 19 | #'DEVICE_MAP': os.getenv('DEVICE_MAP', "auto"), 20 | 'MODEL_DOWNLOAD_TIMEOUT': int(os.getenv('MODEL_DOWNLOAD_TIMEOUT', 300)) 21 | } 22 | 23 | model_params = { 24 | 'MIN_LENGTH': int(os.getenv('MIN_LENGTH', 1)), 25 | 'MAX_LENGTH': int(os.getenv('MAX_LENGTH', 40)), 26 | 'TEMPERATURE': float(os.getenv('TEMPERATURE', 1.0)), 
27 | 'TOP_K': int(os.getenv('TOP_K', 50)), 28 | 'TOP_P': float(os.getenv('TOP_P', 1.0)), 29 | 'REPETITION_PENALTY': float(os.getenv('REPETITION_PENALTY', 1.0)), 30 | } 31 | 32 | logging.basicConfig(level=kserve.constants.KSERVE_LOGLEVEL) 33 | logger = logging.getLogger(options['MODEL_NAME']) 34 | 35 | class Model(kserve.Model): 36 | def __init__(self, name: str): 37 | super().__init__(name) 38 | self.name = name 39 | self.ready = False 40 | self.model = None 41 | self.tokenizer = None 42 | self.generator = None 43 | self.model_name = options['MODEL_NAME'] 44 | 45 | def load(self): 46 | self.model = AutoModelForCausalLM.from_pretrained(options["MODEL_PATH"], device_map="auto", max_memory=mem_map, torch_dtype=torch.bfloat16, local_files_only=True) 47 | self.model.bfloat16().eval() 48 | self.tokenizer = AutoTokenizer.from_pretrained(options["MODEL_PATH"], local_files_only=True) 49 | self.generator = pipeline( 50 | options['MODEL_TYPE'], 51 | model=self.model, 52 | tokenizer=self.tokenizer, 53 | device_map="auto", 54 | ) 55 | self.ready = True 56 | 57 | def predict(self, request: Dict) -> Dict: 58 | request_params = model_params.copy() 59 | 60 | if 'parameters' in request: 61 | parameters = request['parameters'] 62 | for k, pv in parameters.items(): 63 | pk = k.upper() 64 | if pk in request_params: 65 | logger.debug(f'Parameter {pk} changed from {request_params[pk]} to {pv}') 66 | request_params[pk] = pv 67 | 68 | return {'predictions': self.generator( 69 | request['instances'], 70 | #do_sample=True, 71 | min_length=request_params['MIN_LENGTH'], 72 | max_length=request_params['MAX_LENGTH'], 73 | temperature=request_params['TEMPERATURE'], 74 | top_k=request_params['TOP_K'], 75 | top_p=request_params['TOP_P'], 76 | repetition_penalty=request_params['REPETITION_PENALTY'] 77 | )} 78 | 79 | @staticmethod 80 | def is_ready(): 81 | ready_path = os.path.join(options['MODEL_PATH'], '.ready.txt') 82 | logger.info(f'Waiting for download to be ready ...') 83 | interval_time = 10 84 | intervals = options['MODEL_DOWNLOAD_TIMEOUT'] // interval_time 85 | for i in range(intervals): 86 | time.sleep(interval_time) 87 | if os.path.exists(ready_path): 88 | logger.info('Download ready') 89 | return 90 | raise Exception(f'Download timeout {interval_time * intervals}!') 91 | 92 | if __name__ == '__main__': 93 | Model.is_ready() 94 | with torch.no_grad(): 95 | model = Model(options['MODEL_NAME']) 96 | model.load() 97 | kserve.ModelServer().start([model]) 98 | -------------------------------------------------------------------------------- /online-inference/bloom-176b/model/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.11.0 2 | git+https://github.com/huggingface/transformers.git@ccc0897 # For device_map in pipelines() support 3 | kserve==0.8.0.2 -------------------------------------------------------------------------------- /online-inference/bloom-176b/model/scripts/download_model: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Author: Marcin Gucki 4 | # https://github.com/coreweave/ml-images/blob/master/huggingface/download_huggingface.sh 5 | 6 | set -ex 7 | 8 | if [[ $# -ne 2 ]]; then 9 | echo "Invalid number of arguments" 10 | echo "Usage: ./download_huggingface.sh " 11 | echo " - model_name - model name to download e.g. NovelAI/genji-python-6b" 12 | echo " - save_path - base directory where the model is saved, e.g. 
/mnt/pvc" 13 | exit 1 14 | fi 15 | 16 | MODEL_NAME=$1 17 | SAVE_PATH=$2 18 | 19 | BLOBSTORE_PREFIX="inference" 20 | PATH=${PATH}:"${CURRENT_DIR}/scripts/bin" 21 | 22 | echo "SAVE_PATH: ${SAVE_PATH}" 23 | echo "MODEL_NAME: ${MODEL_NAME}" 24 | 25 | mkdir -pv "${SAVE_PATH}/${MODEL_NAME}" 26 | 27 | function download_file { 28 | local FILE_PATH=$1 29 | local DIR_PATH=$(dirname "${FILE_PATH}") 30 | mkdir -p "${DIR_PATH}" 31 | curl "http://blobstore.s3.ord1.coreweave.com/inference/${FILE_PATH}" --output "${FILE_PATH}" 32 | } 33 | 34 | function download { 35 | echo "Downloading model ${MODEL_NAME} into ${SAVE_PATH}" 36 | 37 | pushd "${SAVE_PATH}" 38 | mkdir -p "${MODEL_NAME}" 39 | pushd "${MODEL_NAME}" 40 | FILE_LIST=($(curl --insecure "http://blobstore.s3.ord1.coreweave.com/inference/${MODEL_NAME}/files.txt" | awk '{print $4;}')) 41 | popd 42 | 43 | for file in "${FILE_LIST[@]}";do 44 | relative_path=${file#"s3://blobstore/inference/"} 45 | download_file "${relative_path}" 46 | done 47 | 48 | popd 49 | } 50 | 51 | function set_ready { 52 | echo "Save .ready.txt in ${SAVE_PATH}/${MODEL_NAME}" 53 | pushd "${SAVE_PATH}/${MODEL_NAME}" 54 | touch ".ready.txt" 55 | tree 56 | popd 57 | } 58 | 59 | date 60 | download 61 | set_ready 62 | date 63 | 64 | exit 0 65 | -------------------------------------------------------------------------------- /online-inference/custom-basnet/README.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | This example demonstrates deploying an auto-scaling Inference service from a pre-existing docker image. This can be useful when deploying off-the-shelf models that aren't available as ie. Tensorflow SavedModels. One example of this is the [IBM COCO Based Object Detector](https://github.com/IBM/MAX-Object-Detector). An [example InferenceService](./object-detector-inferenceservice.yaml) for that also exists in this repository. The rest of this example will focus on a [public wrapped version](https://github.com/cyrildiagne/basnet-http) of the [BASNet object detection model](https://github.com/NathanUA/BASNet). This example and the test client is based on work by [Cyril Diagne](https://twitter.com/cyrildiagne/status/1256916982764646402). 4 | 5 | **Input** 6 | ![input](./client/images/test.png) 7 | 8 | 9 | **Output** 10 | ![output](./client/expected_output.png) 11 | 12 | ### Getting Started 13 | 14 | After installing `kubectl` and adding your CoreWeave Cloud access credentials, the following steps will deploy the Inference Service. Clone all the files in this repository to follow along. 15 | 16 | 1. Apply the resources. This can be used to both create and update existing manifests 17 | ```bash 18 | $ kubectl apply -f basnet-inferenceservice.yaml 19 | inferenceservice.serving.kubeflow.org/basnet configured 20 | ``` 21 | 22 | 2. List pods to see that the Transformer and Predictor have launched successfully 23 | ```bash 24 | $ kubectl get pods 25 | NAME READY STATUS RESTARTS AGE 26 | basnet-predictor-default-sj9kr-deployment-76b67d669-4gjrp 2/2 Running 0 34s 27 | ``` 28 | If the predictor fails to init, look in the logs for clues `kubectl logs basnet-predictor-default-sj9kr-deployment-76b67d669-4gjrp kfserving-container`. 29 | 30 | 3. Once all the Pods are running, we can get the API endpoint for our model. Since this model doesn't adhere to the [Tensorflow V1 HTTP API](https://www.tensorflow.org/tfx/serving/api_rest#predict_api), we can't use the API endpoint provided by `kubectl get inferenceservices`. 
We have to hit up the predictor directly. 31 | ```bash 32 | $ kubectl get ksvc 33 | NAME URL LATESTCREATED LATESTREADY READY REASON 34 | basnet-predictor-default https://basnet-predictor-default.tenant-test.knative.chi.coreweave.com basnet-predictor-default-sj9kr basnet-predictor-default-sj9kr True 35 | ``` 36 | The URL in the output is the public API URL for your newly deployed model. 37 | 38 | 4. Enter the client directory. You can either run the test client locally or in docker. The output will be in `images/output.png`. 39 | ```bash 40 | $ cd client/ 41 | $ export SERVICE_URL=https://basnet-predictor-default.tenant-test.knative.chi.coreweave.com 42 | $ docker build -t test .; docker run --rm -it -v $(pwd)/images:/app/images test --basnet_service_host $SERVICE_URL 43 | INFO:root: > sending to BASNet... 44 | INFO:root:200 45 | INFO:root: > saving results... 46 | INFO:root: > opening mask... 47 | INFO:root: > compositing final image... 48 | INFO:root: > saving final image... 49 | $ open images/output.png 50 | ``` 51 | 52 | 5. Remove the inference service 53 | ```bash 54 | $ kubectl delete inferenceservices basnet 55 | inferenceservice.serving.kubeflow.org "basnet" deleted 56 | ``` -------------------------------------------------------------------------------- /online-inference/custom-basnet/basnet-inferenceservice.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.kubeflow.org/v1beta1 2 | kind: InferenceService 3 | metadata: 4 | labels: 5 | qos.coreweave.cloud/latency: low 6 | name: basnet 7 | spec: 8 | predictor: 9 | maxReplicas: 20 10 | minReplicas: 1 11 | containerConcurrency: 1 12 | containers: 13 | - name: kfserving-container 14 | image: docker.io/cyrildiagne/basnet-http 15 | ports: 16 | - containerPort: 80 17 | protocol: TCP 18 | resources: 19 | limits: 20 | cpu: "3" 21 | memory: 8Gi 22 | nvidia.com/gpu: "1" 23 | requests: 24 | cpu: 500m 25 | memory: 4Gi 26 | affinity: 27 | nodeAffinity: 28 | requiredDuringSchedulingIgnoredDuringExecution: 29 | nodeSelectorTerms: 30 | - matchExpressions: 31 | - key: gpu.nvidia.com/class 32 | operator: In 33 | values: 34 | - Quadro_RTX_5000 35 | -------------------------------------------------------------------------------- /online-inference/custom-basnet/client/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coreweave/kubernetes-cloud/ed5c832f666badc124f0a12d9c60260920ee9089/online-inference/custom-basnet/client/.DS_Store -------------------------------------------------------------------------------- /online-inference/custom-basnet/client/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7-slim 2 | 3 | #ARG DEBIAN_FRONTEND=noninteractive 4 | #RUN apt-get update && apt-get install -y build-essential 5 | 6 | ENV APP_HOME /app 7 | WORKDIR $APP_HOME 8 | 9 | # Install production dependencies. 
10 | COPY requirements.txt ./ 11 | RUN pip install --no-cache-dir -r ./requirements.txt 12 | 13 | # Copy local code to container image 14 | COPY main.py ./ 15 | 16 | ENTRYPOINT ["python", "main.py"] 17 | -------------------------------------------------------------------------------- /online-inference/custom-basnet/client/expected_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coreweave/kubernetes-cloud/ed5c832f666badc124f0a12d9c60260920ee9089/online-inference/custom-basnet/client/expected_output.png -------------------------------------------------------------------------------- /online-inference/custom-basnet/client/images/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coreweave/kubernetes-cloud/ed5c832f666badc124f0a12d9c60260920ee9089/online-inference/custom-basnet/client/images/.DS_Store -------------------------------------------------------------------------------- /online-inference/custom-basnet/client/images/cut_mask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coreweave/kubernetes-cloud/ed5c832f666badc124f0a12d9c60260920ee9089/online-inference/custom-basnet/client/images/cut_mask.png -------------------------------------------------------------------------------- /online-inference/custom-basnet/client/images/output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coreweave/kubernetes-cloud/ed5c832f666badc124f0a12d9c60260920ee9089/online-inference/custom-basnet/client/images/output.png -------------------------------------------------------------------------------- /online-inference/custom-basnet/client/images/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coreweave/kubernetes-cloud/ed5c832f666badc124f0a12d9c60260920ee9089/online-inference/custom-basnet/client/images/test.png -------------------------------------------------------------------------------- /online-inference/custom-basnet/client/main.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | import argparse 4 | import io 5 | from PIL import Image 6 | 7 | logging.basicConfig(level=logging.INFO) 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--basnet_service_host', required=True, help="The BASNet service host") 11 | args = parser.parse_args() 12 | 13 | # Send to BASNet service. 14 | logging.info(' > sending to BASNet...') 15 | source = open('images/test.png', 'rb') 16 | files = {'data': source } 17 | res = requests.post(args.basnet_service_host, files=files) 18 | logging.info(res.status_code) 19 | 20 | # Save mask locally. 21 | logging.info(' > saving results...') 22 | with open('images/cut_mask.png', 'wb') as f: 23 | f.write(res.content) 24 | # shutil.copyfileobj(res.raw, f) 25 | 26 | logging.info(' > opening mask...') 27 | mask = Image.open('images/cut_mask.png').convert("L").resize((512, 512)) 28 | 29 | # Convert string data to PIL Image. 30 | logging.info(' > compositing final image...') 31 | ref = Image.open(source).resize((512, 512)) 32 | empty = Image.new("RGBA", ref.size, 0) 33 | img = Image.composite(ref, empty, mask) 34 | 35 | # Save locally. 
36 | logging.info(' > saving final image...') 37 | img.save('images/output.png') 38 | -------------------------------------------------------------------------------- /online-inference/custom-basnet/client/requirements.txt: -------------------------------------------------------------------------------- 1 | requests==2.23.0 2 | Pillow==7.1.2 3 | -------------------------------------------------------------------------------- /online-inference/custom-basnet/object-detector-inferenceservice.yaml: -------------------------------------------------------------------------------- 1 | # This is a CPU only export of the model, for demonstration purposes only 2 | apiVersion: serving.kubeflow.org/v1alpha2 3 | kind: InferenceService 4 | metadata: 5 | labels: 6 | qos.coreweave.cloud/latency: low 7 | name: object-detector 8 | spec: 9 | default: 10 | predictor: 11 | custom: 12 | container: 13 | image: codait/max-object-detector 14 | name: kfserving-container 15 | ports: 16 | - containerPort: 80 17 | resources: 18 | limits: 19 | cpu: "3" 20 | memory: 8Gi 21 | requests: 22 | cpu: "1" 23 | memory: 4Gi 24 | -------------------------------------------------------------------------------- /online-inference/custom-pytorch-aitextgen/aitextgen-inferenceservice.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.kubeflow.org/v1beta1 2 | kind: InferenceService 3 | metadata: 4 | labels: 5 | qos.coreweave.cloud/latency: low 6 | name: aitextgen 7 | spec: 8 | predictor: 9 | maxReplicas: 10 10 | minReplicas: 3 11 | containerConcurrency: 1 12 | containers: 13 | - name: kfserving-container 14 | image: coreweave/aitextgen-model:11 15 | resources: 16 | limits: 17 | cpu: "3" 18 | memory: 18Gi 19 | nvidia.com/gpu: "1" 20 | requests: 21 | cpu: "1" 22 | memory: 10Gi 23 | affinity: 24 | nodeAffinity: 25 | requiredDuringSchedulingIgnoredDuringExecution: 26 | nodeSelectorTerms: 27 | - matchExpressions: 28 | - key: gpu.nvidia.com/class 29 | operator: In 30 | values: 31 | - Quadro_RTX_5000 32 | -------------------------------------------------------------------------------- /online-inference/custom-pytorch-aitextgen/custom-predictor/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.5-cuda10.1-cudnn7-devel 2 | 3 | ARG DEBIAN_FRONTEND=noninteractive 4 | RUN apt-get update && apt-get install -y build-essential git 5 | 6 | ENV APP_HOME /app 7 | WORKDIR $APP_HOME 8 | 9 | RUN git clone https://github.com/NVIDIA/apex 10 | RUN cd apex && /opt/conda/bin/python -u -c 'import sys, setuptools, tokenize; sys.argv[0] = '"'"'./setup.py'"'"'; __file__='"'"'.//setup.py'"'"';f=getattr(tokenize, '"'"'open'"'"', open)(__file__);code=f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' --cpp_ext --cuda_ext install --record /tmp/install-record.txt --single-version-externally-managed --compile --install-headers /opt/conda/include/python3.7m/apex 11 | RUN cd apex && pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ 12 | 13 | # Install production dependencies. 
14 | COPY requirements.txt ./ 15 | RUN pip install --no-cache-dir -r ./requirements.txt 16 | 17 | # Copy local code to container image 18 | COPY *.py ./ 19 | 20 | CMD ["python", "model.py"] 21 | -------------------------------------------------------------------------------- /online-inference/custom-pytorch-aitextgen/custom-predictor/model.py: -------------------------------------------------------------------------------- 1 | import kfserving 2 | from typing import List, Dict 3 | 4 | from aitextgen import aitextgen 5 | 6 | class Model(kfserving.KFModel): 7 | def __init__(self, name: str): 8 | super().__init__(name) 9 | self.name = name 10 | self.ready = False 11 | 12 | def load(self): 13 | self.ai = aitextgen(tf_gpt2="1558M", to_gpu=True, to_fp16=True) 14 | self.ready = True 15 | 16 | def predict(self, request: Dict) -> Dict: 17 | payload = request["text"] 18 | 19 | prediction = self.ai.generate_one(prompt=payload, max_length=request.get("length", 64)) 20 | 21 | return { 'prediction': prediction } 22 | 23 | if __name__ == "__main__": 24 | model = Model('aitextgen') 25 | model.load() 26 | kfserving.KFServer(workers=1).start([model]) 27 | -------------------------------------------------------------------------------- /online-inference/custom-pytorch-aitextgen/custom-predictor/requirements.txt: -------------------------------------------------------------------------------- 1 | kfserving==0.5.1 2 | aitextgen 3 | tensorflow 4 | 5 | -------------------------------------------------------------------------------- /online-inference/custom-sentiment/custom-predictor/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.5-cuda10.1-cudnn7-runtime 2 | 3 | ARG DEBIAN_FRONTEND=noninteractive 4 | RUN apt-get update && apt-get install -y build-essential 5 | 6 | ENV APP_HOME /app 7 | WORKDIR $APP_HOME 8 | 9 | # Install production dependencies. 10 | COPY requirements.txt ./ 11 | RUN pip install --no-cache-dir -r ./requirements.txt 12 | 13 | # Copy local code to container image 14 | COPY *.py ./ 15 | 16 | CMD ["python", "model.py"] 17 | -------------------------------------------------------------------------------- /online-inference/custom-sentiment/custom-predictor/model.py: -------------------------------------------------------------------------------- 1 | import kfserving 2 | from typing import List, Dict 3 | 4 | from fastai.text import load_learner 5 | 6 | class Model(kfserving.KFModel): 7 | def __init__(self, name: str): 8 | super().__init__(name) 9 | self.name = name 10 | self.ready = False 11 | 12 | def load(self): 13 | self.model = load_learner("/mnt/models") 14 | self.ready = True 15 | 16 | def predict(self, request: Dict) -> Dict: 17 | # Request and response follows the Tensorflow V1 HTTP API, 18 | # but does not have to. 
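        # Illustrative shapes only (the response label below is assumed, not taken from the repo;
        # the request matches sample.json in this directory):
        #   request  = {"instances": ["CoreWeave is my favourite cloud"]}
        #   response = {"predictions": ["positive"]}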
19 | # No batching, grab the first instance only 20 | payload = request["instances"][0] 21 | 22 | predictions = self.model.predict(payload) 23 | prediction = predictions[0].obj 24 | 25 | return { 'predictions': [prediction] } 26 | 27 | if __name__ == "__main__": 28 | model = Model('sentiment') 29 | model.load() 30 | kfserving.KFServer(workers=1).start([model]) 31 | -------------------------------------------------------------------------------- /online-inference/custom-sentiment/custom-predictor/requirements.txt: -------------------------------------------------------------------------------- 1 | kfserving==0.5.1 2 | fastai==1.0.61 3 | torch==1.5.0 4 | -------------------------------------------------------------------------------- /online-inference/custom-sentiment/image-secrets-serviceaccount.patch.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | imagePullSecrets: 4 | - name: docker-hub 5 | -------------------------------------------------------------------------------- /online-inference/custom-sentiment/model-storage-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: model-storage 5 | spec: 6 | # https://docs.coreweave.com/coreweave-kubernetes/storage 7 | storageClassName: shared-nvme-ord1 8 | accessModes: 9 | - ReadWriteMany 10 | resources: 11 | requests: 12 | storage: 30Gi 13 | -------------------------------------------------------------------------------- /online-inference/custom-sentiment/sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "instances": ["CoreWeave is my favourite cloud"] 3 | } 4 | -------------------------------------------------------------------------------- /online-inference/custom-sentiment/sentiment-inferenceservice.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.kubeflow.org/v1beta1 2 | kind: InferenceService 3 | metadata: 4 | labels: 5 | qos.coreweave.cloud/latency: low 6 | name: sentiment 7 | spec: 8 | predictor: 9 | maxReplicas: 10 10 | minReplicas: 0 11 | containerConcurrency: 1 12 | containers: 13 | - name: kfserving-container 14 | image: coreweave/fastai-sentiment:4 15 | env: 16 | - name: STORAGE_URI 17 | value: pvc://model-storage/sentiment 18 | resources: 19 | limits: 20 | cpu: "3" 21 | memory: 8Gi 22 | nvidia.com/gpu: "1" 23 | requests: 24 | cpu: "1" 25 | memory: 6Gi 26 | affinity: 27 | nodeAffinity: 28 | requiredDuringSchedulingIgnoredDuringExecution: 29 | nodeSelectorTerms: 30 | - matchExpressions: 31 | - key: gpu.nvidia.com/class 32 | operator: In 33 | values: 34 | - Quadro_RTX_5000 35 | -------------------------------------------------------------------------------- /online-inference/custom-sentiment/sleep-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: sleep 5 | spec: 6 | replicas: 1 7 | revisionHistoryLimit: 1 8 | selector: 9 | matchLabels: 10 | app.kubernetes.io/name: sleep 11 | strategy: 12 | type: Recreate 13 | template: 14 | metadata: 15 | labels: 16 | app.kubernetes.io/name: sleep 17 | spec: 18 | containers: 19 | - name: sleep 20 | image: banst/awscli:1.18.56 21 | # Simple way of keeping an idle container running 22 | command: [sleep] 23 | args: ["86400d"] 24 | imagePullPolicy: IfNotPresent 25 | resources: 26 | 
requests: 27 | cpu: 50m 28 | memory: 10Mi 29 | limits: 30 | cpu: 1 31 | memory: 128Mi 32 | volumeMounts: 33 | - name: model-storage 34 | mountPath: /models 35 | 36 | volumes: 37 | - name: model-storage 38 | persistentVolumeClaim: 39 | claimName: model-storage 40 | 41 | affinity: 42 | nodeAffinity: 43 | requiredDuringSchedulingIgnoredDuringExecution: 44 | nodeSelectorTerms: 45 | - matchExpressions: 46 | - key: topology.kubernetes.io/region 47 | operator: In 48 | values: 49 | - ORD1 50 | -------------------------------------------------------------------------------- /online-inference/dalle-mini/00-model-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: dalle-mini-model-cache 5 | spec: 6 | storageClassName: shared-nvme-ord1 7 | accessModes: 8 | - ReadWriteMany 9 | resources: 10 | requests: 11 | storage: "30Gi" -------------------------------------------------------------------------------- /online-inference/dalle-mini/01-model-download-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: dalle-mini-download 5 | #name: dalle-mega-download 6 | spec: 7 | template: 8 | spec: 9 | containers: 10 | - name: model-downloader 11 | image: tweldoncw/model-downloader:6 12 | imagePullPolicy: IfNotPresent 13 | command: 14 | - "python3" 15 | - "/app/download.py" 16 | - "--model-id=dalle-mini/dalle-mini" 17 | #- "--model-id=dalle-mini/dalle-mega" 18 | - "--model-cache=/mnt/pvc" 19 | volumeMounts: 20 | - name: model-cache 21 | mountPath: /mnt/pvc 22 | resources: 23 | requests: 24 | cpu: 1 25 | memory: 4Gi 26 | limits: 27 | cpu: 1 28 | memory: 4Gi 29 | volumes: 30 | - name: model-cache 31 | persistentVolumeClaim: 32 | claimName: dalle-mini-model-cache 33 | affinity: 34 | nodeAffinity: 35 | requiredDuringSchedulingIgnoredDuringExecution: 36 | nodeSelectorTerms: 37 | - matchExpressions: 38 | - key: topology.kubernetes.io/region 39 | operator: In 40 | values: 41 | - ORD1 42 | restartPolicy: Never 43 | backoffLimit: 2 44 | -------------------------------------------------------------------------------- /online-inference/dalle-mini/02-inference-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.kubeflow.org/v1beta1 2 | kind: InferenceService 3 | metadata: 4 | #name: dalle-mega 5 | name: dalle-mini 6 | spec: 7 | predictor: 8 | containerConcurrency: 1 9 | minReplicas: 1 10 | maxReplicas: 1 11 | affinity: 12 | nodeAffinity: 13 | requiredDuringSchedulingIgnoredDuringExecution: 14 | nodeSelectorTerms: 15 | - matchExpressions: 16 | - key: gpu.nvidia.com/class 17 | operator: In 18 | values: 19 | - RTX_A6000 20 | - key: topology.kubernetes.io/region 21 | operator: In 22 | values: 23 | - ORD1 24 | containers: 25 | - name: kserve-container 26 | image: tweldoncw/dalle-mini:7 27 | command: 28 | - "python3" 29 | - "/app/service.py" 30 | env: 31 | - name: MODEL_ID 32 | #value: "dalle-mini/dalle-mega" 33 | value: "dalle-mini/dalle-mini" 34 | - name: MODEL_CACHE 35 | value: "/mnt/models" 36 | - name: STORAGE_URI # Kserve mounts the PVC at /mnt/models/ 37 | value: pvc://dalle-mini-model-cache/ 38 | # The following env vars are the default model parameters, which can be changed as needed. 
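        # Assumed semantics, for orientation only: TOP_K and TOP_P control top-k and nucleus
        # sampling, TEMPERATURE scales the sampling distribution, and CONDITION_SCALE is the
        # classifier-free guidance ("super conditioning") weight used by DALL-E Mini;
        # service.py presumably reads these environment variables at startup.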
39 | - name: TOP_K 40 | value: "50" 41 | - name: TOP_P 42 | value: "1.0" 43 | - name: TEMPERATURE 44 | value: "1.0" 45 | - name: CONDITION_SCALE 46 | value: "10.0" 47 | resources: 48 | requests: 49 | cpu: 6 50 | memory: 48Gi 51 | nvidia.com/gpu: 1 52 | limits: 53 | cpu: 6 54 | memory: 48Gi 55 | nvidia.com/gpu: 1 56 | -------------------------------------------------------------------------------- /online-inference/dalle-mini/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG MODEL=dalle-mini 2 | ARG CUDA_RELEASE=12.2.0-devel-ubuntu20.04 3 | FROM nvidia/cuda:${CUDA_RELEASE} AS base 4 | ENV DEBIAN_FRONTEND=noninteractive 5 | RUN apt-mark unhold $(apt-mark showhold) 6 | RUN apt update && apt upgrade -y 7 | 8 | RUN apt install -y python3 python3-pip git 9 | RUN mkdir -p /app 10 | ADD model/ /app 11 | WORKDIR /app 12 | RUN pip3 install -r requirements.txt 13 | 14 | CMD ["python3", "/app/service.py"] -------------------------------------------------------------------------------- /online-inference/dalle-mini/Dockerfile.downloader: -------------------------------------------------------------------------------- 1 | FROM python:3.9.13-alpine3.16 2 | RUN mkdir /app 3 | RUN pip3 install huggingface_hub 4 | ADD downloader/download.py /app 5 | CMD ["python3", "/app/download.py"] -------------------------------------------------------------------------------- /online-inference/dalle-mini/downloader/download.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import tempfile 3 | import logging 4 | import shutil 5 | from huggingface_hub import snapshot_download 6 | import os 7 | 8 | logger = logging.getLogger("downloader") 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("--model-id", type=str, default="dalle-mini/dalle-mini") 12 | parser.add_argument("--model-cache", type=str, default="/model-cache") 13 | args = parser.parse_args() 14 | 15 | logger.info(f'Downloading {args.model_id}...') 16 | 17 | tmpdir = tempfile.TemporaryDirectory(dir=args.model_cache) 18 | model = snapshot_download(repo_id=args.model_id, cache_dir=tmpdir.name) 19 | model_dir = os.path.join(args.model_cache, args.model_id) 20 | os.makedirs(model_dir) 21 | 22 | os.chdir(model) 23 | for file in os.listdir(model): 24 | os.getcwd() 25 | src = os.readlink(os.path.join(model, file)) 26 | dest = os.path.join(model_dir, file) 27 | logger.info(f'moving {src} to {dest}') 28 | shutil.move(src, dest) 29 | 30 | ready = os.path.join(model_dir, '.ready.txt') 31 | with open(ready, 'w') as ready_file: 32 | pass 33 | 34 | tmpdir.cleanup() 35 | 36 | logger.info(f'Download complete') -------------------------------------------------------------------------------- /online-inference/dalle-mini/model/requirements.txt: -------------------------------------------------------------------------------- 1 | -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html 2 | jax[cuda]==0.3.15 3 | dalle-mini==0.1.1 4 | git+https://github.com/patil-suraj/vqgan-jax.git@10ef240 5 | jupyter==1.0.0 6 | jupyterlab==3.4.4 7 | ipywidgets==7.7.1 8 | tqdm==4.64.0 9 | kserve==0.9.0 10 | msrest==0.7.1 11 | 12 | -------------------------------------------------------------------------------- /online-inference/fastertransformer/README.md: -------------------------------------------------------------------------------- 1 | Please refer to [the 
documentation](https://docs.coreweave.com/machine-learning-and-ai/inference/examples/triton-inference/triton-inference-server-fastertransformer) for usage instructions. 2 | -------------------------------------------------------------------------------- /online-inference/fastertransformer/build/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Rahul Talari (rtalari@coreweave.com) 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Base Image 16 | ARG TRITON_VERSION=22.04 17 | ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3 18 | FROM ${BASE_IMAGE} as server-builder 19 | 20 | # Get NIVIDIA keys to authenticate 21 | RUN export this_distro="$(cat /etc/os-release | grep '^ID=' | awk -F'=' '{print $2}')" \ 22 | && export this_version="$(cat /etc/os-release | grep '^VERSION_ID=' | awk -F'=' '{print $2}' | sed 's/[^0-9]*//g')" \ 23 | && apt-key adv --fetch-keys "https://developer.download.nvidia.com/compute/cuda/repos/${this_distro}${this_version}/x86_64/7fa2af80.pub" \ 24 | && apt-key adv --fetch-keys "https://developer.download.nvidia.com/compute/cuda/repos/${this_distro}${this_version}/x86_64/3bf863cc.pub" 25 | 26 | # Run updates and install packages for build 27 | RUN apt-get update && \ 28 | apt-get install -y --no-install-recommends \ 29 | openssh-server zsh tmux mosh locales-all clangd sudo \ 30 | zip unzip wget build-essential autoconf autogen gdb \ 31 | python3.8 python3-pip python3-dev rapidjson-dev \ 32 | xz-utils zstd libz-dev && \ 33 | apt-get clean && \ 34 | rm -rf /var/lib/apt/lists/* 35 | 36 | # Setup workdir for build 37 | WORKDIR /workspace/build/ 38 | 39 | # CMake 40 | RUN CMAKE_VERSION=3.18 && \ 41 | CMAKE_BUILD=3.18.4 && \ 42 | wget -nv https://cmake.org/files/v${CMAKE_VERSION}/cmake-${CMAKE_BUILD}.tar.gz && \ 43 | tar -xf cmake-${CMAKE_BUILD}.tar.gz && \ 44 | cd cmake-${CMAKE_BUILD} && \ 45 | ./bootstrap --parallel=$(grep -c ^processor /proc/cpuinfo) -- -DCMAKE_USE_OPENSSL=OFF && \ 46 | make -j"$(grep -c ^processor /proc/cpuinfo)" install && \ 47 | cd /workspace/build/ && \ 48 | rm -rf /workspace/build/cmake-${CMAKE_BUILD} 49 | 50 | # backend build 51 | WORKDIR /workspace/build/triton-experiments 52 | 53 | RUN git clone https://github.com/triton-inference-server/fastertransformer_backend.git 54 | RUN mv /workspace/build/triton-experiments/fastertransformer_backend/cmake /workspace/build/triton-experiments 55 | RUN mv /workspace/build/triton-experiments/fastertransformer_backend/src /workspace/build/triton-experiments 56 | RUN mv /workspace/build/triton-experiments/fastertransformer_backend/CMakeLists.txt /workspace/build/triton-experiments 57 | 58 | ARG FORCE_BACKEND_REBUILD=0 59 | RUN mkdir build -p && \ 60 | cd build && \ 61 | cmake \ 62 | -D CMAKE_EXPORT_COMPILE_COMMANDS=1 \ 63 | -D CMAKE_BUILD_TYPE=Release \ 64 | -D CMAKE_INSTALL_PREFIX=/opt/tritonserver \ 65 | -D TRITON_COMMON_REPO_TAG="r${NVIDIA_TRITON_SERVER_VERSION}" \ 66 | -D 
TRITON_CORE_REPO_TAG="r${NVIDIA_TRITON_SERVER_VERSION}" \ 67 | -D TRITON_BACKEND_REPO_TAG="r${NVIDIA_TRITON_SERVER_VERSION}" \ 68 | .. && \ 69 | make -j"$(grep -c ^processor /proc/cpuinfo)" install 70 | 71 | # ================================= 72 | # Runner Image 73 | # ================================= 74 | 75 | FROM ${BASE_IMAGE} as server 76 | 77 | # TODO: Change to PARALLEL and see performance metrics 78 | ENV NCCL_LAUNCH_MODE=PARALLEL 79 | 80 | COPY --from=server-builder /opt/tritonserver/backends/fastertransformer /opt/tritonserver/backends/fastertransformer -------------------------------------------------------------------------------- /online-inference/fastertransformer/client/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Rahul Talari (rtalari@coreweave.com) 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Base Image 16 | ARG TRITON_VERSION=22.05 17 | ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3 18 | FROM ${BASE_IMAGE} as server-builder 19 | 20 | # Get NIVIDIA keys to authenticate 21 | RUN export this_distro="$(cat /etc/os-release | grep '^ID=' | awk -F'=' '{print $2}')" \ 22 | && export this_version="$(cat /etc/os-release | grep '^VERSION_ID=' | awk -F'=' '{print $2}' | sed 's/[^0-9]*//g')" \ 23 | && apt-key adv --fetch-keys "https://developer.download.nvidia.com/compute/cuda/repos/${this_distro}${this_version}/x86_64/7fa2af80.pub" \ 24 | && apt-key adv --fetch-keys "https://developer.download.nvidia.com/compute/cuda/repos/${this_distro}${this_version}/x86_64/3bf863cc.pub" 25 | 26 | # Run updates and install packages for build 27 | RUN apt-get update && \ 28 | apt-get install -y --no-install-recommends \ 29 | python3.8 python3-pip python3-dev && \ 30 | apt-get clean && \ 31 | rm -rf /var/lib/apt/lists/* 32 | 33 | # Setup workdir for build 34 | WORKDIR /workspace 35 | 36 | ADD gpt_bpe gpt_bpe 37 | ADD hf_tokenizer hf_tokenizer 38 | ADD example.py example.py 39 | ADD sample_request.json sample_request.json 40 | ADD requirements.txt requirements.txt 41 | RUN pip3 install torch==1.9.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html 42 | RUN pip3 install -r requirements.txt 43 | 44 | ENTRYPOINT [ "python3", "example.py" ] 45 | 46 | -------------------------------------------------------------------------------- /online-inference/fastertransformer/client/hf_tokenizer/hf_tokenize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from pathlib import Path 16 | from tokenizers import Tokenizer 17 | from typing import List, Union 18 | 19 | class HFTokenizer: 20 | def __init__(self, vocab_file): 21 | self.tokenizer = Tokenizer.from_file(vocab_file) 22 | 23 | def tokenize(self, text: str): 24 | return self.tokenizer.encode(text).ids 25 | 26 | def tokenize_batch(self, text_batch: Union[List[str], str]): 27 | return self.tokenizer.encode_batch(text_batch) 28 | 29 | def detokenize(self, token_ids): 30 | return self.tokenizer.decode(token_ids) -------------------------------------------------------------------------------- /online-inference/fastertransformer/client/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.1 2 | aiosignal==1.2.0 3 | async-timeout==4.0.2 4 | attrs==22.1.0 5 | Brotli==1.0.9 6 | certifi==2022.6.15 7 | charset-normalizer==2.1.0 8 | frozenlist==1.3.1 9 | gevent==21.12.0 10 | geventhttpclient==2.0 11 | greenlet==1.1.2 12 | grpcio==1.41.0 13 | idna==3.3 14 | multidict==6.0.2 15 | numpy==1.23.1 16 | protobuf==3.19.4 17 | python-rapidjson==1.8 18 | regex==2022.7.25 19 | six==1.16.0 20 | tritonclient==2.24.0 21 | tokenizers==0.12.1 22 | typing_extensions==4.3.0 23 | yarl==1.8.1 24 | zope.event==4.5.0 25 | zope.interface==5.4.0 -------------------------------------------------------------------------------- /online-inference/fastertransformer/client/sample_request.json: -------------------------------------------------------------------------------- 1 | { 2 | "request": [ 3 | { 4 | "name": "input_ids", 5 | "data": [], 6 | "dtype": "int32" 7 | }, 8 | { 9 | "name": "input_lengths", 10 | "data": [], 11 | "dtype": "int32" 12 | }, 13 | { 14 | "name": "request_output_len", 15 | "data": [[64]], 16 | "dtype": "int32" 17 | }, 18 | { 19 | "name": "beam_search_diversity_rate", 20 | "data": [[0]], 21 | "dtype": "float32" 22 | }, 23 | { 24 | "name": "temperature", 25 | "data": [[1.0]], 26 | "dtype": "float32" 27 | }, 28 | { 29 | "name": "len_penalty", 30 | "data": [[1.0]], 31 | "dtype": "float32" 32 | }, 33 | { 34 | "name": "repetition_penalty", 35 | "data": [[1.0]], 36 | "dtype": "float32" 37 | }, 38 | { 39 | "name": "random_seed", 40 | "data": [[0]], 41 | "dtype": "uint64" 42 | }, 43 | { 44 | "name": "is_return_log_probs", 45 | "data": [[false]], 46 | "dtype": "bool" 47 | }, 48 | { 49 | "name": "beam_width", 50 | "data": [[1]], 51 | "dtype": "int32" 52 | }, 53 | { 54 | "name": "runtime_top_k", 55 | "data": [[10]], 56 | "dtype": "int32" 57 | }, 58 | { 59 | "name": "runtime_top_p", 60 | "data": [[0.0]], 61 | "dtype": "float32" 62 | }, 63 | { 64 | "name": "stop_words_list", 65 | "data": [[[0], [-1]]], 66 | "dtype": "int32" 67 | }, 68 | { 69 | "name": "bad_words_list", 70 | "data": [[[0], [-1]]], 71 | "dtype": "int32" 72 | } 73 | ] 74 | } 75 | -------------------------------------------------------------------------------- /online-inference/fastertransformer/ft-inference-service-gptj.yml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.kubeflow.org/v1beta1 
2 | kind: InferenceService 3 | metadata: 4 | labels: 5 | qos.coreweave.cloud/latency: low 6 | name: fastertransformer-triton-gptj 7 | spec: 8 | predictor: 9 | maxReplicas: 1 10 | minReplicas: 1 11 | containerConcurrency: 1 12 | containers: 13 | - name: gptj-ft 14 | image: rtalaricw/gptj_ft:v1.2-22.04-new 15 | command: ["/opt/tritonserver/bin/tritonserver"] 16 | args: ["--model-repository=/mnt/pvc/gptj-store/triton-model-store"] 17 | env: 18 | - name: STORAGE_URI 19 | value: pvc://model-storage/ 20 | ports: 21 | # Uncomment to use GRPC 22 | # - containerPort: 8001 23 | # name: h2c 24 | # protocol: TCP 25 | - containerPort: 8000 26 | protocol: TCP 27 | resources: 28 | requests: 29 | cpu: 4 30 | memory: 8Gi 31 | nvidia.com/gpu: 1 32 | limits: 33 | cpu: 4 34 | memory: 8Gi 35 | nvidia.com/gpu: 1 36 | affinity: 37 | nodeAffinity: 38 | requiredDuringSchedulingIgnoredDuringExecution: 39 | nodeSelectorTerms: 40 | - matchExpressions: 41 | - key: gpu.nvidia.com/class 42 | operator: In 43 | values: 44 | - RTX_A5000 45 | - key: topology.kubernetes.io/region 46 | operator: In 47 | values: 48 | - LAS1 -------------------------------------------------------------------------------- /online-inference/fastertransformer/ft-inference-service-neox.yml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.kubeflow.org/v1beta1 2 | kind: InferenceService 3 | metadata: 4 | labels: 5 | qos.coreweave.cloud/latency: low 6 | name: fastertransformer-triton-neox 7 | spec: 8 | predictor: 9 | maxReplicas: 1 10 | minReplicas: 1 11 | containerConcurrency: 1 12 | containers: 13 | - name: gpt-neox-ft 14 | image: rtalaricw/gptj_ft:v1.2-22.04-new 15 | command: ["/opt/tritonserver/bin/tritonserver"] 16 | args: ["--model-repository=/mnt/pvc/gpt-neox/triton-model-store"] 17 | env: 18 | - name: STORAGE_URI 19 | value: pvc://model-storage/ 20 | ports: 21 | # Uncomment to use GRPC 22 | # - containerPort: 8001 23 | # name: h2c 24 | # protocol: TCP 25 | - containerPort: 8000 26 | protocol: TCP 27 | resources: 28 | requests: 29 | cpu: 4 30 | memory: 64Gi 31 | nvidia.com/gpu: 1 32 | limits: 33 | cpu: 4 34 | memory: 64Gi 35 | nvidia.com/gpu: 1 36 | affinity: 37 | nodeAffinity: 38 | requiredDuringSchedulingIgnoredDuringExecution: 39 | nodeSelectorTerms: 40 | - matchExpressions: 41 | - key: gpu.nvidia.com/class 42 | operator: In 43 | values: 44 | - RTX_A6000 45 | - key: topology.kubernetes.io/region 46 | operator: In 47 | values: 48 | - LAS1 -------------------------------------------------------------------------------- /online-inference/fastertransformer/model-storage-pvc.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: model-storage 5 | spec: 6 | # https://docs.coreweave.com/coreweave-kubernetes/storage 7 | storageClassName: shared-nvme-ord1 8 | accessModes: 9 | - ReadWriteMany 10 | resources: 11 | requests: 12 | storage: 150Gi 13 | 14 | -------------------------------------------------------------------------------- /online-inference/hf-llm/.dockerignore: -------------------------------------------------------------------------------- 1 | * 2 | !serializer/requirements.txt 3 | !serializer/*.py 4 | !service/requirements.txt 5 | !service/*.py 6 | -------------------------------------------------------------------------------- /online-inference/hf-llm/00-optional-s3-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 
| data: 3 | access_key: Replace_this_with_your_base64_encoded_access_key 4 | secret_key: Replace_this_with_your_base64_encoded_secret_key 5 | host_url: Replace_this_with_your_base64_encoded_host_url 6 | kind: Secret 7 | metadata: 8 | name: s3-credentials 9 | type: Opaque 10 | -------------------------------------------------------------------------------- /online-inference/hf-llm/01-optional-s3-serialize-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: hf-llm-serializer 5 | spec: 6 | template: 7 | spec: 8 | containers: 9 | - name: model-serializer 10 | image: ghcr.io/coreweave/ml-containers/hf-llm-inference:073f175 11 | imagePullPolicy: IfNotPresent 12 | command: 13 | - "python3" 14 | - "/app/serialize.py" 15 | - "--hf-model-id=distilgpt2" 16 | - "--precision=float16" 17 | - "--dest-bucket=your-bucket-here" 18 | env: 19 | - name: S3_KEY 20 | valueFrom: 21 | secretKeyRef: 22 | name: s3-credentials 23 | key: access_key 24 | - name: S3_SECRET 25 | valueFrom: 26 | secretKeyRef: 27 | name: s3-credentials 28 | key: secret_key 29 | - name: S3_HOST 30 | valueFrom: 31 | secretKeyRef: 32 | name: s3-credentials 33 | key: host_url 34 | resources: 35 | requests: 36 | cpu: 2 37 | memory: 16Gi 38 | limits: 39 | cpu: 2 40 | memory: 16Gi 41 | affinity: 42 | nodeAffinity: 43 | requiredDuringSchedulingIgnoredDuringExecution: 44 | nodeSelectorTerms: 45 | - matchExpressions: 46 | - key: topology.kubernetes.io/region 47 | operator: In 48 | values: 49 | - ORD1 50 | restartPolicy: Never 51 | backoffLimit: 2 52 | -------------------------------------------------------------------------------- /online-inference/hf-llm/02-inference-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.knative.dev/v1 2 | kind: Service 3 | metadata: 4 | name: hf-llm 5 | annotations: 6 | networking.knative.dev/ingress-class: kourier.ingress.networking.knative.dev 7 | labels: 8 | knative.coreweave.cloud/ingress: kourier.ingress.networking.knative.dev 9 | spec: 10 | template: 11 | metadata: 12 | annotations: 13 | autoscaling.knative.dev/minScale: "1" 14 | autoscaling.knative.dev/maxScale: "1" 15 | spec: 16 | affinity: 17 | nodeAffinity: 18 | requiredDuringSchedulingIgnoredDuringExecution: 19 | nodeSelectorTerms: 20 | - matchExpressions: 21 | - key: gpu.nvidia.com/class 22 | operator: In 23 | values: 24 | - Quadro_RTX_5000 25 | - key: topology.kubernetes.io/region 26 | operator: In 27 | values: 28 | - ORD1 29 | containers: 30 | - name: kfserving-container 31 | image: ghcr.io/coreweave/ml-containers/hf-llm-inference:073f175 32 | command: 33 | - "python3" 34 | - "/app/service.py" 35 | - "--model-uri=s3://tensorized/EleutherAI/pythia-70m" 36 | - "--precision=float16" 37 | - "--port=80" 38 | env: 39 | - name: S3_KEY 40 | valueFrom: 41 | secretKeyRef: 42 | name: s3-credentials 43 | key: access_key 44 | optional: true 45 | - name: S3_SECRET 46 | valueFrom: 47 | secretKeyRef: 48 | name: s3-credentials 49 | key: secret_key 50 | optional: true 51 | - name: S3_HOST 52 | valueFrom: 53 | secretKeyRef: 54 | name: s3-credentials 55 | key: host_url 56 | optional: true 57 | ports: 58 | - protocol: TCP 59 | containerPort: 80 60 | livenessProbe: 61 | httpGet: 62 | path: / 63 | port: 80 64 | initialDelaySeconds: 30 65 | periodSeconds: 30 66 | readinessProbe: 67 | httpGet: 68 | path: / 69 | port: 80 70 | initialDelaySeconds: 30 71 | periodSeconds: 30 72 | resources: 73 | requests: 74 | cpu: 
4 75 | memory: 16Gi 76 | nvidia.com/gpu: 1 77 | limits: 78 | cpu: 4 79 | memory: 16Gi 80 | nvidia.com/gpu: 1 81 | -------------------------------------------------------------------------------- /online-inference/hf-llm/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ghcr.io/coreweave/ml-containers/torch:afecfe9-base-cuda12.0.1-torch2.0.0-vision0.15.1 2 | ENV DEBIAN_FRONTEND=noninteractive 3 | 4 | RUN apt-get -qq update && \ 5 | apt-get -qq install --no-install-recommends -y git curl && \ 6 | apt-get clean 7 | 8 | ADD service/ /app/ 9 | COPY serializer/serialize.py /app/serialize.py 10 | WORKDIR /app 11 | 12 | RUN pip3 install --no-cache-dir --upgrade pip && \ 13 | pip3 install --no-cache-dir -r requirements.txt 14 | -------------------------------------------------------------------------------- /online-inference/hf-llm/serializer/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=2.0.0,<=2.3.0 2 | transformers==4.36.2 3 | tensorizer==2.7.1 4 | -------------------------------------------------------------------------------- /online-inference/hf-llm/serializer/serialize.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from argparse import ArgumentParser 4 | 5 | import torch 6 | from tensorizer import TensorSerializer, stream_io 7 | from transformers import AutoModelForCausalLM 8 | 9 | logging.basicConfig(level=logging.INFO) 10 | logger = logging.getLogger(__file__) 11 | 12 | s3_access_key_default = os.getenv("S3_KEY") or None 13 | s3_secret_access_key_default = os.getenv("S3_SECRET") or None 14 | s3_endpoint_default = os.getenv("S3_HOST") or "object.ord1.coreweave.com" 15 | 16 | parser = ArgumentParser() 17 | parser.add_argument("--hf-model-id", default="distilgpt2", type=str) 18 | parser.add_argument( 19 | "--precision", choices=["float16", "float32"], default="float16", type=str 20 | ) 21 | parser.add_argument("--dest-bucket", required=True, type=str) 22 | parser.add_argument( 23 | "--s3-access-key", 24 | default=s3_access_key_default, 25 | required=s3_access_key_default is None, 26 | type=str, 27 | ) 28 | parser.add_argument( 29 | "--s3-secret-access-key", 30 | default=s3_secret_access_key_default, 31 | required=s3_secret_access_key_default is None, 32 | type=str, 33 | ) 34 | parser.add_argument("--s3-endpoint", default=s3_endpoint_default, type=str) 35 | args = parser.parse_args() 36 | 37 | 38 | def save_artifact_s3(model, path): 39 | serializer = TensorSerializer( 40 | stream_io.open_stream( 41 | path_uri=path, 42 | mode="wb", 43 | s3_access_key_id=args.s3_access_key, 44 | s3_secret_access_key=args.s3_secret_access_key, 45 | s3_endpoint=args.s3_endpoint, 46 | s3_config_path=None, 47 | ) 48 | ) 49 | serializer.write_module(model) 50 | serializer.close() 51 | logger.info(f"Tensorized S3 artifact written to {path}") 52 | 53 | 54 | if __name__ == "__main__": 55 | model_id = args.hf_model_id 56 | model = AutoModelForCausalLM.from_pretrained( 57 | model_id, 58 | torch_dtype=torch.float16 59 | if args.precision == "float16" 60 | else torch.float32, 61 | ) 62 | 63 | model_file = "fp16/model.tensors" if args.precision == "float16" else "" 64 | uri = "s3://" + "/".join((args.dest_bucket, model_id, model_file)) 65 | 66 | save_artifact_s3(model, uri) 67 | -------------------------------------------------------------------------------- /online-inference/hf-llm/service/requirements.txt: 
-------------------------------------------------------------------------------- 1 | torch>=2.0.0,<=2.3.0 2 | transformers==4.36.2 3 | tensorizer==2.7.1 4 | fastapi==0.105.0 5 | uvicorn==0.24.0 6 | -------------------------------------------------------------------------------- /online-inference/image-classifier/jupyter/model-storage-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: model-storage 5 | spec: 6 | # Available shared filesystem storage classes. 7 | # Only use shared filesystems when mounting on multiple nodes is a requirement. 8 | # Regular storage classes provide better performance. 9 | # 10 | # sharedfs-hdd-replicated - HDD Backend shared filesystem with replicas 11 | # sharedfs-ssd-replicated - SSD Backed shared filesystem with replicas 12 | storageClassName: sharedfs-hdd-replicated 13 | accessModes: 14 | - ReadWriteMany 15 | resources: 16 | requests: 17 | storage: 30Gi 18 | -------------------------------------------------------------------------------- /online-inference/image-classifier/jupyter/tensorflow-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: tensorflow-jupyter 5 | spec: 6 | strategy: 7 | type: Recreate 8 | # Replicas controls the number of instances of the Pod to maintain running at all times 9 | replicas: 1 10 | selector: 11 | matchLabels: 12 | app.kubernetes.io/name: tensorflow-jupyter 13 | template: 14 | metadata: 15 | labels: 16 | app.kubernetes.io/name: tensorflow-jupyter 17 | spec: 18 | containers: 19 | - name: tf 20 | image: tensorflow/tensorflow:2.0.1-gpu-py3-jupyter 21 | 22 | ports: 23 | - name: notebook 24 | containerPort: 8888 25 | protocol: TCP 26 | 27 | readinessProbe: 28 | tcpSocket: 29 | port: notebook 30 | initialDelaySeconds: 5 31 | periodSeconds: 10 32 | livenessProbe: 33 | httpGet: 34 | path: / 35 | port: notebook 36 | initialDelaySeconds: 15 37 | periodSeconds: 15 38 | failureThreshold: 3 39 | timeoutSeconds: 10 40 | 41 | volumeMounts: 42 | - name: storage 43 | mountPath: /tf/notebooks 44 | - name: model-storage 45 | mountPath: /models 46 | 47 | resources: 48 | requests: 49 | cpu: 500m # The CPU unit is mili-cores. 500m is 0.5 cores 50 | memory: 2048Mi 51 | limits: 52 | # GPUs can only be allocated as a limit, which both reserves and limits the number of GPUs the Pod will have access to 53 | # Making individual Pods resource light is advantageous for bin-packing. In the case of Jupyter, we stick to two GPUs for 54 | # demonstration purposes 55 | nvidia.com/gpu: 1 56 | 57 | # Node affinity can be used to require / prefer the Pods to be scheduled on a node with a specific hardware type 58 | # No affinity allows scheduling on all hardware types that can fulfill the resource request. 59 | # In this example, without affinity, any NVIDIA GPU would be allowed to run the Pod. 
60 | # Read more about affinity at: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity 61 | affinity: 62 | nodeAffinity: 63 | # This will REQUIRE the Pod to be run on a system with a GPU with 8GB VRAM 64 | requiredDuringSchedulingIgnoredDuringExecution: 65 | nodeSelectorTerms: 66 | - matchExpressions: 67 | - key: gpu.nvidia.com/vram 68 | operator: In 69 | values: 70 | - "8" 71 | 72 | preferredDuringSchedulingIgnoredDuringExecution: 73 | - weight: 10 74 | preference: 75 | matchExpressions: 76 | - key: cpu.coreweave.cloud/family 77 | operator: In 78 | values: 79 | - i5 80 | - i7 81 | - i9 82 | - xeon 83 | - ryzen 84 | 85 | volumes: 86 | - name: storage 87 | persistentVolumeClaim: 88 | claimName: jupyter-pv-claim 89 | - name: model-storage 90 | persistentVolumeClaim: 91 | claimName: model-storage 92 | restartPolicy: Always 93 | -------------------------------------------------------------------------------- /online-inference/image-classifier/jupyter/tensorflow-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | annotations: 5 | metallb.universe.tf/address-pool: public 6 | # Setting a sharing key might save public IP addresses 7 | # See https://metallb.universe.tf/usage/#ip-address-sharing for more detail 8 | metallb.universe.tf/allow-shared-ip: example-1 9 | name: tensorflow-jupyter 10 | spec: 11 | type: LoadBalancer 12 | externalTrafficPolicy: Local 13 | ports: 14 | - name: notebook 15 | port: 8888 16 | protocol: TCP 17 | targetPort: notebook 18 | selector: 19 | app.kubernetes.io/name: tensorflow-jupyter 20 | -------------------------------------------------------------------------------- /online-inference/image-classifier/service/classifier-inferenceservice.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.kubeflow.org/v1beta1 2 | kind: InferenceService 3 | metadata: 4 | name: classifier 5 | annotations: 6 | os.coreweave.cloud/latency: low 7 | spec: 8 | predictor: 9 | # Max one request processed at the same time per container (GPU) 10 | minReplicas: 0 # Allow scale to zero 11 | maxReplicas: 3 12 | containerConcurrency: 1 13 | tensorflow: 14 | # The PVC and path inside the PVC to the model. The path is what we put after /models/ in export_dir in the notebook. 
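      # For example, an export_dir of /models/inception/ in the notebook (assumed here for
      # illustration) maps to storageUri: pvc://model-storage/inception/ on the model-storage PVC below.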
15 | storageUri: pvc://model-storage/inception/ 16 | runtimeVersion: "2.1.0-gpu" 17 | resources: 18 | requests: 19 | cpu: 1 20 | memory: 6Gi 21 | limits: 22 | cpu: 3 23 | memory: 10Gi 24 | nvidia.com/gpu: 1 25 | affinity: 26 | nodeAffinity: 27 | requiredDuringSchedulingIgnoredDuringExecution: 28 | nodeSelectorTerms: 29 | - matchExpressions: 30 | - key: gpu.nvidia.com/class 31 | operator: In 32 | values: 33 | - Tesla_V100 34 | 35 | transformer: 36 | minReplicas: 1 37 | maxReplicas: 2 38 | containers: 39 | - image: coreweave/inception-transformer:0.11 # Docker image of the code found in transformer/ 40 | name: user-container 41 | resources: 42 | requests: 43 | cpu: 200m 44 | memory: 64Mi 45 | limits: 46 | cpu: 3 47 | memory: 8Gi 48 | -------------------------------------------------------------------------------- /online-inference/image-classifier/service/predict_url.sh: -------------------------------------------------------------------------------- 1 | curl -v -d "{\"instances\": [{\"url\":\"$1\"}]}" $SERVICE_URL:predict 2 | -------------------------------------------------------------------------------- /online-inference/image-classifier/transformer/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7-slim 2 | 3 | RUN apt update && apt install -y git 4 | 5 | RUN pip install --upgrade pip 6 | RUN pip install 'git+git://github.com/coreweave/kfserving#egg=kfserving&subdirectory=python/kfserving' 7 | 8 | ADD requirements.txt . 9 | RUN pip install -r requirements.txt 10 | 11 | RUN mkdir -p /transformer/ 12 | WORKDIR /transformer 13 | 14 | COPY *.py ./ 15 | 16 | ENTRYPOINT ["python", "main.py"] 17 | -------------------------------------------------------------------------------- /online-inference/image-classifier/transformer/main.py: -------------------------------------------------------------------------------- 1 | import kfserving 2 | import argparse 3 | from transformer import Transformer 4 | 5 | parser = argparse.ArgumentParser(parents=[kfserving.kfserver.parser]) 6 | parser.add_argument('--model_name', default="model", 7 | help='The name that the model is served under.') 8 | parser.add_argument('--predictor_host', help='The URL for the model predict function', required=True) 9 | 10 | args, _ = parser.parse_known_args() 11 | 12 | if __name__ == "__main__": 13 | transformer = Transformer(args.model_name, predictor_host=args.predictor_host) 14 | kfserver = kfserving.KFServer(workers=4) 15 | kfserver.start(models=[transformer]) 16 | -------------------------------------------------------------------------------- /online-inference/image-classifier/transformer/requirements.txt: -------------------------------------------------------------------------------- 1 | pillow==7.1.2 2 | dill==0.3.1.1 3 | msgpack==0.6.2 4 | numpy==1.18.0 5 | requests==2.22.0 6 | -------------------------------------------------------------------------------- /online-inference/image-classifier/transformer/transformer.py: -------------------------------------------------------------------------------- 1 | import kfserving 2 | from typing import List, Dict 3 | import logging 4 | import requests 5 | import numpy as np 6 | import base64 7 | 8 | # The signature name is defined at time of export, in signature_def_map supplied to builder 9 | # Tensorflows default is serving_default 10 | SERVING_SIGNATURE_NAME = 'serving_default' 11 | 12 | logging.basicConfig(level=kfserving.constants.KFSERVING_LOGLEVEL) 13 | 14 | class Transformer(kfserving.KFModel): 15 | def 
__init__(self, name: str, predictor_host: str): 16 | super().__init__(name) 17 | self.predictor_host = predictor_host 18 | 19 | self.labels = requests.get( 20 | "https://storage.googleapis.com/download.tensorflow.org/data/ImageNetLabels.txt" 21 | ).text.split("\n") 22 | 23 | 24 | # Accept input either in base64 format or as a url 25 | def encode(self, input): 26 | if 'b64' in input: 27 | b64 = input['b64'] 28 | else: 29 | image = requests.get(input["url"]).content 30 | b64 = base64.b64encode(image).decode("utf-8") 31 | 32 | # Input name is defined when exporting the module 33 | # Tensorflow Serving decodes base64 encoded images when sent in an object with the b64 key. 34 | # https://towardsdatascience.com/serving-image-based-deep-learning-models-with-tensorflow-servings-restful-api-d365c16a7dc4 35 | return {"image_bytes": {"b64": b64 } } 36 | 37 | # Match up the most likely prediction to the labels 38 | def decode(self, prediction): 39 | return { 40 | 'class': self.labels[np.argmax(prediction)], 41 | 'score': max(prediction) 42 | } 43 | 44 | def preprocess(self, inputs: Dict) -> Dict: 45 | return {'signature_name': SERVING_SIGNATURE_NAME, 'instances': [self.encode(instance) for instance in inputs['instances']]} 46 | 47 | 48 | def postprocess(self, inputs: List) -> List: 49 | return {'predictions': [self.decode(prediction) for prediction in inputs['predictions']]} -------------------------------------------------------------------------------- /online-inference/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coreweave/kubernetes-cloud/ed5c832f666badc124f0a12d9c60260920ee9089/online-inference/overview.png -------------------------------------------------------------------------------- /online-inference/stable-diffusion/00-optional-s3-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | access_key: Replace_this_with_your_access_key 4 | kind: Secret 5 | metadata: 6 | name: s3-access-key 7 | type: Opaque 8 | --- 9 | apiVersion: v1 10 | data: 11 | secret_key: Replace_this_with_your_secret_key 12 | kind: Secret 13 | metadata: 14 | name: s3-secret-key 15 | type: Opaque 16 | --- 17 | apiVersion: v1 18 | data: 19 | url: Replace_this_with_your_host_url 20 | kind: Secret 21 | metadata: 22 | name: s3-host-url 23 | type: Opaque -------------------------------------------------------------------------------- /online-inference/stable-diffusion/01-optional-s3-serialize-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: stable-diffusion-serializer 5 | spec: 6 | template: 7 | spec: 8 | containers: 9 | - name: model-serializer 10 | image: ghcr.io/coreweave/ml-containers/sd-inference:amercurio-sd-overhaul-7d29c61 11 | imagePullPolicy: IfNotPresent 12 | command: 13 | - "python3" 14 | - "/app/serialize.py" 15 | - "--hf-model-id=runwayml/stable-diffusion-v1-5" 16 | - "--precision=float16" 17 | - "--dest-bucket=your-bucket-here" 18 | env: 19 | - name: AWS_KEY 20 | valueFrom: 21 | secretKeyRef: 22 | name: s3-access-key 23 | key: access_key 24 | - name: AWS_SECRET 25 | valueFrom: 26 | secretKeyRef: 27 | name: s3-secret-key 28 | key: secret_key 29 | - name: AWS_HOST 30 | valueFrom: 31 | secretKeyRef: 32 | name: s3-host-url 33 | key: url 34 | resources: 35 | requests: 36 | cpu: 2 37 | memory: 16Gi 38 | limits: 39 | cpu: 2 40 | memory: 16Gi 41 | affinity: 42 | 
nodeAffinity: 43 | requiredDuringSchedulingIgnoredDuringExecution: 44 | nodeSelectorTerms: 45 | - matchExpressions: 46 | - key: topology.kubernetes.io/region 47 | operator: In 48 | values: 49 | - ORD1 50 | restartPolicy: Never 51 | backoffLimit: 2 52 | -------------------------------------------------------------------------------- /online-inference/stable-diffusion/02-inference-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.knative.dev/v1 2 | kind: Service 3 | metadata: 4 | name: sd 5 | annotations: 6 | networking.knative.dev/ingress-class: kourier.ingress.networking.knative.dev 7 | labels: 8 | knative.coreweave.cloud/ingress: kourier.ingress.networking.knative.dev 9 | spec: 10 | template: 11 | metadata: 12 | annotations: 13 | autoscaling.knative.dev/minScale: "1" 14 | autoscaling.knative.dev/maxScale: "1" 15 | spec: 16 | affinity: 17 | nodeAffinity: 18 | requiredDuringSchedulingIgnoredDuringExecution: 19 | nodeSelectorTerms: 20 | - matchExpressions: 21 | - key: gpu.nvidia.com/class 22 | operator: In 23 | values: 24 | - Quadro_RTX_5000 25 | - key: topology.kubernetes.io/region 26 | operator: In 27 | values: 28 | - ORD1 29 | containers: 30 | - name: kfserving-container 31 | image: ghcr.io/coreweave/ml-containers/sd-inference:amercurio-sd-overhaul-7d29c61 32 | command: 33 | - "python3" 34 | - "/app/service.py" 35 | - "--model-uri=s3://tensorized/runwayml/stable-diffusion-v1-5" 36 | - "--precision=float16" 37 | - "--port=80" 38 | env: 39 | - name: AWS_KEY 40 | valueFrom: 41 | secretKeyRef: 42 | name: s3-access-key 43 | key: access_key 44 | optional: true 45 | - name: AWS_SECRET 46 | valueFrom: 47 | secretKeyRef: 48 | name: s3-secret-key 49 | key: secret_key 50 | optional: true 51 | - name: AWS_HOST 52 | valueFrom: 53 | secretKeyRef: 54 | name: s3-host-url 55 | key: url 56 | optional: true 57 | ports: 58 | - protocol: TCP 59 | containerPort: 80 60 | livenessProbe: 61 | httpGet: 62 | path: / 63 | port: 80 64 | initialDelaySeconds: 30 65 | periodSeconds: 30 66 | readinessProbe: 67 | httpGet: 68 | path: / 69 | port: 80 70 | initialDelaySeconds: 30 71 | periodSeconds: 30 72 | resources: 73 | requests: 74 | cpu: 4 75 | memory: 16Gi 76 | nvidia.com/gpu: 1 77 | limits: 78 | cpu: 4 79 | memory: 16Gi 80 | nvidia.com/gpu: 1 81 | -------------------------------------------------------------------------------- /online-inference/stable-diffusion/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ghcr.io/coreweave/ml-containers/torch:afecfe9-base-cuda11.8.0-torch2.0.0-vision0.15.1 2 | ENV DEBIAN_FRONTEND=noninteractive 3 | 4 | RUN apt update && apt upgrade -y && \ 5 | apt update && apt install -y python3 python3-pip git curl && \ 6 | apt clean 7 | 8 | ADD service/ /app/ 9 | COPY serializer/serialize.py /app/serialize.py 10 | WORKDIR /app 11 | 12 | RUN pip3 install --no-cache-dir --upgrade pip && \ 13 | pip3 install --no-cache-dir -r requirements.txt 14 | -------------------------------------------------------------------------------- /online-inference/stable-diffusion/README.md: -------------------------------------------------------------------------------- 1 | # Stable Diffusion 2 | Please refer to [CoreWeave Docs](https://docs.coreweave.com/machine-learning-and-ai/inference/examples/pytorch-jax/hugging-face/pytorch-hugging-face-diffusers-stable-diffusion-text-to-image) for a deployment tutorial. 
3 | -------------------------------------------------------------------------------- /online-inference/stable-diffusion/serializer/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.0.0 2 | transformers==4.33.1 3 | diffusers==0.20.2 4 | tensorizer==2.3.0 5 | -------------------------------------------------------------------------------- /online-inference/stable-diffusion/serializer/serialize.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import logging 4 | from tensorizer import TensorSerializer, stream_io 5 | from diffusers import StableDiffusionPipeline 6 | from argparse import ArgumentParser 7 | 8 | logging.basicConfig(level=logging.INFO) 9 | logger = logging.getLogger(__file__) 10 | 11 | parser = ArgumentParser() 12 | parser.add_argument("--hf-model-id", default="runwayml/stable-diffusion-v1-5", type=str) 13 | parser.add_argument("--precision", choices=["float16", "float32"], default="float16", type=str) 14 | parser.add_argument("--dest-bucket", default=None, required=True, type=str) 15 | parser.add_argument("--s3-access-key", default=os.getenv("AWS_KEY"), required=False, type=str) 16 | parser.add_argument("--s3-secret-access-key", default=os.getenv("AWS_SECRET"), required=False, type=str) 17 | parser.add_argument("--s3-endpoint", default=os.getenv("AWS_HOST", "object.ord1.coreweave.com"), required=False, type=str) 18 | args = parser.parse_args() 19 | 20 | def save_artifact(model, path, sub_path): 21 | serializer = TensorSerializer(path + sub_path) 22 | serializer.write_module(model) 23 | serializer.close() 24 | 25 | def save_artifact_s3(model, path, sub_path): 26 | serializer = TensorSerializer( 27 | stream_io.open_stream( 28 | path_uri = path + sub_path, 29 | mode = 'wb', 30 | s3_access_key_id = args.s3_access_key, 31 | s3_secret_access_key = args.s3_secret_access_key, 32 | s3_endpoint = args.s3_endpoint, 33 | s3_config_path=None 34 | ) 35 | ) 36 | serializer.write_module(model) 37 | serializer.close() 38 | logger.info(f"Tensorized S3 artifact written to {path + sub_path}") 39 | 40 | if __name__ == '__main__': 41 | model_id = args.hf_model_id 42 | model = StableDiffusionPipeline.from_pretrained( 43 | model_id, 44 | torch_dtype=torch.float16 if args.precision == "float16" else torch.float32 45 | ) 46 | 47 | BASE_S3_URL = f"s3://{args.dest_bucket}/" 48 | 49 | dtype_str = "/fp16" if args.precision == "float16" else "" 50 | 51 | save_artifact_s3(model.vae, BASE_S3_URL + model_id + dtype_str, '/vae.tensors') 52 | save_artifact_s3(model.unet, BASE_S3_URL + model_id + dtype_str, '/unet.tensors') 53 | save_artifact_s3(model.text_encoder, BASE_S3_URL + model_id + dtype_str, '/text_encoder.tensors') 54 | 55 | logger.info(f"Wrote tensorized S3 artifact to: {BASE_S3_URL + model_id}") 56 | -------------------------------------------------------------------------------- /online-inference/stable-diffusion/service/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.0.0 2 | transformers==4.33.1 3 | diffusers==0.20.2 4 | tensorizer==2.3.0 5 | numpy==1.24.2 6 | scipy==1.10.1 7 | fastapi==0.85.1 8 | uvicorn==0.16.0 9 | -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/README.md: -------------------------------------------------------------------------------- 1 | # GPT-J InferenceService with Tensorizer & HuggingFace 2 | 3 | The following 
instructions will guide you through setting up an 4 | [InferenceService](https://docs.coreweave.com/coreweave-machine-learning-and-ai/how-to-guides-and-tutorials/examples) 5 | with [Tensorizer](https://github.com/coreweave/tensorizer) 6 | or [HuggingFace Transformers](https://huggingface.co/docs/transformers/index) 7 | serving [GPT-J-6B](https://huggingface.co/EleutherAI/gpt-j-6b). 8 | 9 | From the root of `tensorizer-isvc`: 10 | 11 | - Provision a [PVC](https://kubernetes.io/docs/concepts/storage/persistent-volumes/) 12 | - `kubectl apply -f pvc.yaml` 13 | - Download the model to the PVC 14 | - `kubectl apply -f model-download/model-download-job.yaml` 15 | - Run the HuggingFace InferenceService (currently using KServe) 16 | - `kubectl apply -f tensorizer_hf_isvc/kserve/hf-isvc.yaml` 17 | - Or, run the Tensorizer InferenceService (currently using KServe) 18 | - `kubectl apply -f tensorizer_hf_isvc/kserve/tensorizer-isvc.yaml` 19 | - View the InferenceService deployment information and URL 20 | - `kubectl get isvc` 21 | - `http://` may be required in place of `https://` when connecting to the displayed URL 22 | - Test the InferenceService 23 | - The KServe services use [KServe's V1 protocol](https://kserve.github.io/website/0.10/modelserving/data_plane/v1_protocol/): 24 | ```bash 25 | curl http://<url>/v1/models/gptj:predict -X POST -H 'Content-Type: application/json' -d '{"instances": ["Hello!"]}' 26 | ``` 27 | - The Flask services simply encode queries into the URL path component: 28 | ```bash 29 | curl http://<url>/predict/Hello%21 30 | ``` 31 | - Run the benchmark 32 | - `python benchmark/load_test.py --kserve --url=<url> --requests=<number of requests>` 33 | - `load_test.py` defaults to running async requests with [`aiohttp`](https://pypi.org/project/aiohttp/) 34 | - `--sync` may be added to the command line to instead send requests sequentially 35 | using [`requests`](https://pypi.org/project/requests/) 36 | - Delete the InferenceService 37 | - `kubectl delete -f tensorizer_hf_isvc/<...>/<...>-isvc.yaml` 38 | - Use the same manifest file that was used with `kubectl apply` 39 | 40 | Each InferenceService manifest (`*-isvc.yaml`) runs a container defined 41 | in a Dockerfile in its same directory, such as `tensorizer_hf_isvc/kserve/Dockerfile`. 42 | These may be changed and rebuilt to customize the behavior of the InferenceService. 43 | 44 | > Note: The build context for each Dockerfile is its parent directory, so the build commands look like: 45 | > ```bash 46 | > docker build ./tensorizer_hf_isvc -f ./tensorizer_hf_isvc/kserve/Dockerfile 47 | > docker build ./tensorizer_hf_isvc -f ./tensorizer_hf_isvc/flask/Dockerfile 48 | > ``` 49 | -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/benchmark/inputs.txt: -------------------------------------------------------------------------------- 1 | Hello, how are you? 2 | What up dig dog? 3 | You are a killer!
4 | Live a good life 5 | Life is great 6 | Chilling on a roof 7 | Love you 8 | Mox is cute 9 | You are my enemy 10 | Change is required 11 | Love the life 12 | -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/benchmark/locustfile.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import random 3 | import urllib.parse 4 | 5 | from locust import HttpUser, task 6 | 7 | inputs_file_path = os.path.join( 8 | os.path.dirname(os.path.abspath(__file__)), "inputs.txt" 9 | ) 10 | 11 | with open(inputs_file_path, "r", encoding="utf-8") as inputs_file: 12 | inputs = [line.strip() for line in inputs_file] 13 | 14 | 15 | def random_inference_url() -> str: 16 | query = urllib.parse.quote(random.choice(inputs)) 17 | return f"/predict/{query}" 18 | 19 | 20 | class QuickstartUser(HttpUser): 21 | @task 22 | def predict(self): 23 | with self.client.get(random_inference_url()) as response: 24 | if response.status_code != 200: 25 | response.failure("Could not return response") 26 | -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/model-download/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ghcr.io/coreweave/ml-containers/torch:bb02bee-base-cuda11.8.0-torch2.0.0-vision0.15.1-audio2.0.1 2 | 3 | # Install cURL, for tensorizer 4 | RUN apt-get -qq update && \ 5 | apt-get -qq install --no-install-recommends -y curl && \ 6 | apt-get clean 7 | 8 | RUN mkdir -p /downloader/ 9 | WORKDIR /downloader 10 | 11 | COPY requirements.txt . 12 | RUN pip install --no-cache-dir --upgrade pip && \ 13 | pip install --no-cache-dir -r requirements.txt 14 | 15 | COPY model_download.py ./ 16 | 17 | ENTRYPOINT ["python", "model_download.py"] 18 | -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/model-download/model-download-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: model-download 5 | spec: 6 | template: 7 | spec: 8 | containers: 9 | - name: model-downloader 10 | image: rtalaricw/model-download-gptj:v2.0 11 | imagePullPolicy: IfNotPresent 12 | volumeMounts: 13 | - name: model-cache 14 | mountPath: /mnt 15 | resources: 16 | requests: 17 | cpu: 2 18 | memory: 40Gi 19 | limits: 20 | cpu: 2 21 | memory: 40Gi 22 | volumes: 23 | - name: model-cache 24 | persistentVolumeClaim: 25 | claimName: model-storage 26 | affinity: 27 | nodeAffinity: 28 | requiredDuringSchedulingIgnoredDuringExecution: 29 | nodeSelectorTerms: 30 | - matchExpressions: 31 | - key: topology.kubernetes.io/region 32 | operator: In 33 | values: 34 | - LAS1 35 | 36 | restartPolicy: Never 37 | backoffLimit: 1 -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/model-download/model_download.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tensorizer import TensorSerializer 3 | from transformers import AutoModelForCausalLM, AutoTokenizer 4 | 5 | model_ref = "EleutherAI/gpt-j-6b" 6 | 7 | tokenizer = AutoTokenizer.from_pretrained(model_ref) 8 | tokenizer.save_pretrained("/mnt") 9 | del tokenizer 10 | 11 | model = AutoModelForCausalLM.from_pretrained( 12 | model_ref, 13 | revision="float16", 14 | torch_dtype=torch.float16, 15 | 
low_cpu_mem_usage=True, 16 | ) 17 | # If only the tensorized model is desired, instead of saving the whole 18 | # PyTorch model, only the PretrainedConfig for the model need be saved 19 | # with the tokenizer and .tensors file. 20 | # model.config.save_pretrained("/mnt") 21 | model.save_pretrained("/mnt") 22 | 23 | serializer = TensorSerializer("/mnt/gptj.tensors") 24 | serializer.write_module(model, remove_tensors=True) 25 | serializer.close() 26 | -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/model-download/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.27.1 2 | tensorizer==1.1.0 3 | accelerate==0.19.0 4 | -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: model-storage 5 | spec: 6 | storageClassName: shared-nvme-las1 7 | accessModes: 8 | - ReadWriteMany 9 | resources: 10 | requests: 11 | storage: 200Gi -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/tensorizer_hf_isvc/flask/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ghcr.io/coreweave/ml-containers/torch:bb02bee-base-cuda11.8.0-torch2.0.0-vision0.15.1-audio2.0.1 2 | 3 | RUN mkdir -p /transformer/ 4 | WORKDIR /transformer 5 | 6 | COPY flask/requirements.txt . 7 | 8 | RUN pip install --no-cache-dir --upgrade pip && \ 9 | pip install --no-cache-dir -r requirements.txt 10 | 11 | COPY flask/flask_api.py . 12 | COPY load_model.py . 
13 | 14 | ENTRYPOINT ["python", "-m", "gunicorn", "-w1", "-b0.0.0.0", "flask_api:app", "--timeout", "300"] 15 | -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/tensorizer_hf_isvc/flask/flask_api.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | from flask import Flask, Response 5 | from load_model import load_model_based_on_type 6 | from transformers import AutoTokenizer 7 | 8 | MODEL_LOAD_TYPE = os.getenv("MODEL_LOAD_TYPE") 9 | 10 | 11 | class Transformer: 12 | def __init__(self): 13 | self.model = load_model_based_on_type(model_load_type=MODEL_LOAD_TYPE) 14 | 15 | self.model.eval() 16 | torch.manual_seed(100) 17 | 18 | self.tokenizer = AutoTokenizer.from_pretrained("/mnt/pvc") 19 | self.eos = self.tokenizer.eos_token_id 20 | 21 | def encode(self, text): 22 | input_ids = self.tokenizer.encode(text, return_tensors="pt").to("cuda") 23 | 24 | return input_ids 25 | 26 | # Match up the most likely prediction to the labels 27 | def decode(self, input_ids): 28 | with torch.no_grad(): 29 | output_ids = self.model.generate( 30 | input_ids, 31 | max_new_tokens=50, 32 | do_sample=True, 33 | pad_token_id=self.eos, 34 | ) 35 | 36 | print(f"tensor output IDs: {output_ids}") 37 | 38 | output = self.tokenizer.decode(output_ids[0], skip_special_tokens=True) 39 | 40 | print(f"tensor output: {output}\n", flush=True) 41 | 42 | return output 43 | 44 | 45 | llm = Transformer() 46 | app = Flask(__name__) 47 | 48 | 49 | @app.route("/") 50 | def index(): 51 | return Response(status=200) 52 | 53 | 54 | @app.route("/predict/") 55 | def predict(text): 56 | input_ids = llm.encode(text) 57 | output = llm.decode(input_ids) 58 | 59 | return Response(output, mimetype="text/plain", status=200) 60 | 61 | 62 | if __name__ == "__main__": 63 | app.run(host="0.0.0.0", port=8000) 64 | -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/tensorizer_hf_isvc/flask/hf-isvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.kubeflow.org/v1beta1 2 | kind: InferenceService 3 | metadata: 4 | labels: 5 | qos.coreweave.cloud/latency: low 6 | name: flask-hf-gptj 7 | spec: 8 | predictor: 9 | maxReplicas: 100 10 | minReplicas: 1 11 | containerConcurrency: 1 12 | containers: 13 | - name: flask-hf-gptj 14 | image: rtalaricw/gptj-hf-tensorizer-pvc:v2.1 15 | ports: 16 | - protocol: TCP 17 | containerPort: 8000 18 | env: 19 | - name: STORAGE_URI 20 | value: pvc://model-storage/ 21 | - name: MODEL_LOAD_TYPE 22 | value: hf 23 | - name: PYTHONUNBUFFERED 24 | value: "1" 25 | resources: 26 | requests: 27 | cpu: 8 28 | memory: 64Gi 29 | nvidia.com/gpu: 1 30 | limits: 31 | cpu: 8 32 | memory: 64Gi 33 | nvidia.com/gpu: 1 34 | affinity: 35 | nodeAffinity: 36 | requiredDuringSchedulingIgnoredDuringExecution: 37 | nodeSelectorTerms: 38 | - matchExpressions: 39 | - key: gpu.nvidia.com/class 40 | operator: In 41 | values: 42 | - A40 43 | - key: topology.kubernetes.io/region 44 | operator: In 45 | values: 46 | - LAS1 -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/tensorizer_hf_isvc/flask/requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==2.3.2 2 | gunicorn==20.1.0 3 | transformers==4.27.1 4 | tensorizer==1.1.0 5 | 
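(Illustrative sketch, not part of the repository files above.) The `flask_api.py` service defined above serves plain-text completions from `GET /predict/<text>` and a health check at `/`. A minimal client might look like the following, assuming a placeholder host in place of the URL reported by `kubectl get isvc`:

```python
# Hypothetical client for the Flask InferenceService above.
# BASE_URL is a placeholder; substitute the URL shown by `kubectl get isvc`.
import urllib.parse

import requests

BASE_URL = "http://flask-hf-gptj.example.com"  # placeholder host


def generate(prompt: str) -> str:
    # flask_api.py takes the prompt URL-encoded into the path of /predict/<text>
    url = f"{BASE_URL}/predict/{urllib.parse.quote(prompt)}"
    # The gunicorn worker is started with a 300-second timeout, so allow long requests
    response = requests.get(url, timeout=300)
    response.raise_for_status()
    return response.text  # the service returns the completion as plain text


if __name__ == "__main__":
    print(generate("Hello!"))
```

The prompt is URL-encoded into the path because the Flask route accepts the text as a path component rather than a JSON body.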
-------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/tensorizer_hf_isvc/flask/tensorizer-isvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.kubeflow.org/v1beta1 2 | kind: InferenceService 3 | metadata: 4 | labels: 5 | qos.coreweave.cloud/latency: low 6 | name: flask-tensorizer-gptj 7 | spec: 8 | predictor: 9 | maxReplicas: 100 10 | minReplicas: 1 11 | containerConcurrency: 1 12 | containers: 13 | - name: flask-tensorizer-gptj 14 | image: rtalaricw/gptj-hf-tensorizer-pvc:v2.1 15 | ports: 16 | - protocol: TCP 17 | containerPort: 8000 18 | env: 19 | - name: STORAGE_URI 20 | value: pvc://model-storage/ 21 | - name: MODEL_LOAD_TYPE 22 | value: tensorizer 23 | - name: PYTHONUNBUFFERED 24 | value: "1" 25 | resources: 26 | requests: 27 | cpu: 8 28 | memory: 64Gi 29 | nvidia.com/gpu: 1 30 | limits: 31 | cpu: 8 32 | memory: 64Gi 33 | nvidia.com/gpu: 1 34 | affinity: 35 | nodeAffinity: 36 | requiredDuringSchedulingIgnoredDuringExecution: 37 | nodeSelectorTerms: 38 | - matchExpressions: 39 | - key: gpu.nvidia.com/class 40 | operator: In 41 | values: 42 | - A40 43 | - key: topology.kubernetes.io/region 44 | operator: In 45 | values: 46 | - LAS1 -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/tensorizer_hf_isvc/kserve/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ghcr.io/coreweave/ml-containers/torch:bb02bee-base-cuda11.8.0-torch2.0.0-vision0.15.1-audio2.0.1 2 | 3 | RUN mkdir -p /transformer/ 4 | WORKDIR /transformer 5 | 6 | COPY kserve/requirements.txt . 7 | 8 | RUN pip install --no-cache-dir --upgrade pip && \ 9 | pip install --no-cache-dir -r requirements.txt 10 | 11 | COPY kserve/kserve_api.py . 12 | COPY load_model.py . 
13 | 14 | ENTRYPOINT ["python", "kserve_api.py"] 15 | -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/tensorizer_hf_isvc/kserve/hf-isvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.kubeflow.org/v1beta1 2 | kind: InferenceService 3 | metadata: 4 | labels: 5 | qos.coreweave.cloud/latency: low 6 | name: kserve-hf-gptj 7 | spec: 8 | predictor: 9 | maxReplicas: 100 10 | minReplicas: 1 11 | containerConcurrency: 1 12 | containers: 13 | - name: kserve-hf-gptj 14 | image: rtalaricw/gptj-hf-tensorizer-pvc-kserve:v2.1 15 | env: 16 | - name: STORAGE_URI 17 | value: pvc://model-storage/ 18 | - name: MODEL_LOAD_TYPE 19 | value: hf 20 | - name: PYTHONUNBUFFERED 21 | value: "1" 22 | resources: 23 | requests: 24 | cpu: 8 25 | memory: 64Gi 26 | nvidia.com/gpu: 1 27 | limits: 28 | cpu: 8 29 | memory: 64Gi 30 | nvidia.com/gpu: 1 31 | affinity: 32 | nodeAffinity: 33 | requiredDuringSchedulingIgnoredDuringExecution: 34 | nodeSelectorTerms: 35 | - matchExpressions: 36 | - key: gpu.nvidia.com/class 37 | operator: In 38 | values: 39 | - A40 40 | - key: topology.kubernetes.io/region 41 | operator: In 42 | values: 43 | - LAS1 -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/tensorizer_hf_isvc/kserve/kserve_api.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import Dict 4 | 5 | import kserve 6 | import kserve.errors 7 | import torch 8 | from load_model import load_model_based_on_type 9 | from transformers import AutoTokenizer 10 | 11 | MODEL_NAME = "gptj" 12 | MODEL_LOAD_TYPE = os.getenv("MODEL_LOAD_TYPE") 13 | 14 | logging.basicConfig(level=kserve.constants.KSERVE_LOGLEVEL) 15 | logger = logging.getLogger(MODEL_NAME) 16 | logger.info(f"Model Name: {MODEL_NAME}") 17 | 18 | 19 | class Model(kserve.Model): 20 | def __init__(self, name: str): 21 | super().__init__(name) 22 | self.name = name 23 | self.model = None 24 | self.tokenizer = None 25 | self.eos = None 26 | self.ready = False 27 | 28 | def load(self): 29 | logger.info(f"Loading {MODEL_NAME}") 30 | 31 | self.model = load_model_based_on_type(model_load_type=MODEL_LOAD_TYPE) 32 | 33 | self.model.eval() 34 | torch.manual_seed(100) 35 | 36 | self.tokenizer = AutoTokenizer.from_pretrained("/mnt/pvc") 37 | self.eos = self.tokenizer.eos_token_id 38 | 39 | self.ready = True 40 | 41 | def validate(self, payload: Dict): 42 | # Ensure that the request has the appropriate type to process 43 | if not isinstance(payload, Dict): 44 | raise kserve.errors.InvalidInput("Expected payload to be a dict") 45 | return super().validate(payload) 46 | 47 | def predict(self, payload: Dict, headers: Dict[str, str] = None) -> Dict: 48 | inputs = payload.get("instances") or ["Please input some text"] 49 | outputs = [] 50 | for text in inputs: 51 | input_ids = self.tokenizer.encode(text, return_tensors="pt").to( 52 | "cuda" 53 | ) 54 | 55 | with torch.no_grad(): 56 | output_ids = self.model.generate( 57 | input_ids, 58 | max_new_tokens=50, 59 | do_sample=True, 60 | pad_token_id=self.eos, 61 | ) 62 | 63 | print(f"tensor output IDs: {output_ids}") 64 | 65 | output = self.tokenizer.decode( 66 | output_ids[0], skip_special_tokens=True 67 | ) 68 | outputs.append(output) 69 | 70 | print(f"tensor output: {output}\n", flush=True) 71 | 72 | return {"predictions": outputs} 73 | 74 | 75 | if __name__ == "__main__": 76 | 
model = Model(name=MODEL_NAME) 77 | model.load() 78 | kserve.ModelServer().start([model]) 79 | -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/tensorizer_hf_isvc/kserve/requirements.txt: -------------------------------------------------------------------------------- 1 | kserve==0.10.1 2 | transformers==4.27.1 3 | tensorizer==1.1.0 4 | -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/tensorizer_hf_isvc/kserve/tensorizer-isvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.kubeflow.org/v1beta1 2 | kind: InferenceService 3 | metadata: 4 | labels: 5 | qos.coreweave.cloud/latency: low 6 | name: kserve-tensorizer-gptj 7 | spec: 8 | predictor: 9 | maxReplicas: 100 10 | minReplicas: 1 11 | containerConcurrency: 1 12 | containers: 13 | - name: kserve-tensorizer-gptj 14 | image: rtalaricw/gptj-hf-tensorizer-pvc-kserve:v2.1 15 | env: 16 | - name: STORAGE_URI 17 | value: pvc://model-storage/ 18 | - name: MODEL_LOAD_TYPE 19 | value: tensorizer 20 | - name: PYTHONUNBUFFERED 21 | value: "1" 22 | resources: 23 | requests: 24 | cpu: 8 25 | memory: 64Gi 26 | nvidia.com/gpu: 1 27 | limits: 28 | cpu: 8 29 | memory: 64Gi 30 | nvidia.com/gpu: 1 31 | affinity: 32 | nodeAffinity: 33 | requiredDuringSchedulingIgnoredDuringExecution: 34 | nodeSelectorTerms: 35 | - matchExpressions: 36 | - key: gpu.nvidia.com/class 37 | operator: In 38 | values: 39 | - A40 40 | - key: topology.kubernetes.io/region 41 | operator: In 42 | values: 43 | - LAS1 -------------------------------------------------------------------------------- /online-inference/tensorizer-isvc/tensorizer_hf_isvc/load_model.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Literal 3 | 4 | import torch 5 | from tensorizer import TensorDeserializer 6 | from tensorizer.utils import convert_bytes, get_mem_usage, no_init_or_tensor 7 | from transformers import AutoConfig, AutoModelForCausalLM, GPTJForCausalLM 8 | 9 | DEVICE = "cuda" 10 | 11 | 12 | def load_model_based_on_type( 13 | model_load_type: Literal["tensorizer", "hf"] = "tensorizer", 14 | model_path: str = "/mnt/pvc", 15 | ): 16 | """ 17 | Loads the model using Tensorizer or HuggingFace. 18 | 19 | Args: 20 | model_load_type: Method to load the model [Options: "tensorizer", "hf"] 21 | model_path: Path to the model files 22 | """ 23 | if model_load_type not in ("tensorizer", "hf"): 24 | raise ValueError( 25 | 'model_load_type must be either "tensorizer" or "hf";' 26 | f" got {model_load_type}" 27 | ) 28 | 29 | if model_load_type == "hf": 30 | start = time.time() 31 | model = GPTJForCausalLM.from_pretrained( 32 | model_path, torch_dtype=torch.float16 33 | ).to(DEVICE) 34 | duration = time.time() - start 35 | print( 36 | f"Deserialized model in {duration:0.2f}s" 37 | " using HuggingFace Transformers" 38 | ) 39 | 40 | return model 41 | 42 | # If the config file were not pre-downloaded along with the HuggingFace 43 | # model as in this example, this could use a HuggingFace model reference 44 | # instead of a path for a small download of just the relevant config file. 45 | # model_ref = "EleutherAI/gpt-j-6B" 46 | config = AutoConfig.from_pretrained(model_path) 47 | 48 | # This ensures that the model is not initialized. 
49 | with no_init_or_tensor(): 50 | model = AutoModelForCausalLM.from_config(config) 51 | 52 | before_mem = get_mem_usage() 53 | 54 | # Lazy load the tensors from PVC into the model. 55 | start = time.time() 56 | deserializer = TensorDeserializer( 57 | f"{model_path}/gptj.tensors", plaid_mode=True 58 | ) 59 | deserializer.load_into_module(model) 60 | end = time.time() 61 | 62 | # Brag about how fast we are. 63 | total_bytes_str = convert_bytes(deserializer.total_tensor_bytes) 64 | duration = end - start 65 | per_second = convert_bytes(deserializer.total_tensor_bytes / duration) 66 | after_mem = get_mem_usage() 67 | deserializer.close() 68 | print( 69 | f"Deserialized {total_bytes_str} in {duration:0.2f}s, {per_second}/s" 70 | " using Tensorizer" 71 | ) 72 | print(f"Memory usage before: {before_mem}") 73 | print(f"Memory usage after: {after_mem}") 74 | 75 | return model 76 | -------------------------------------------------------------------------------- /online-inference/vllm/00-s3-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Secret 3 | metadata: 4 | name: s3-credentials 5 | type: Opaque 6 | data: 7 | access_key: 8 | secret_key: 9 | host_url: 10 | -------------------------------------------------------------------------------- /online-inference/vllm/01-s3-serialize-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: vllm-serializer 5 | spec: 6 | template: 7 | spec: 8 | containers: 9 | - name: model-serializer 10 | image: rtalaricw/vllm:0.1 11 | imagePullPolicy: IfNotPresent 12 | command: 13 | - "/bin/sh" 14 | - "-c" 15 | - | 16 | cd /app/vllm && python3 -m examples.tensorize_vllm_model --model EleutherAI/pythia-70m serialize --serialized-directory s3://model-store/ --suffix vllm 17 | env: 18 | - name: S3_ACCESS_KEY_ID 19 | valueFrom: 20 | secretKeyRef: 21 | name: s3-credentials 22 | key: access_key 23 | - name: S3_SECRET_ACCESS_KEY 24 | valueFrom: 25 | secretKeyRef: 26 | name: s3-credentials 27 | key: secret_key 28 | - name: S3_ENDPOINT_URL 29 | valueFrom: 30 | secretKeyRef: 31 | name: s3-credentials 32 | key: host_url 33 | resources: 34 | requests: 35 | cpu: "2" 36 | memory: 16Gi 37 | nvidia.com/gpu: "1" 38 | limits: 39 | cpu: "2" 40 | memory: 16Gi 41 | nvidia.com/gpu: "1" 42 | affinity: 43 | nodeAffinity: 44 | requiredDuringSchedulingIgnoredDuringExecution: 45 | nodeSelectorTerms: 46 | - matchExpressions: 47 | - key: topology.kubernetes.io/region 48 | operator: In 49 | values: 50 | - ORD1 51 | - key: gpu.nvidia.com/class 52 | operator: In 53 | values: 54 | - RTX_A5000 55 | 56 | restartPolicy: Never 57 | backoffLimit: 1 58 | -------------------------------------------------------------------------------- /online-inference/vllm/02-inference-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: serving.knative.dev/v1 2 | kind: Service 3 | metadata: 4 | name: vllm-inference-service 5 | annotations: 6 | networking.knative.dev/ingress-class: kourier.ingress.networking.knative.dev 7 | labels: 8 | knative.coreweave.cloud/ingress: kourier.ingress.networking.knative.dev 9 | spec: 10 | template: 11 | metadata: 12 | annotations: 13 | autoscaling.knative.dev/minScale: "1" 14 | autoscaling.knative.dev/maxScale: "1" 15 | spec: 16 | affinity: 17 | nodeAffinity: 18 | requiredDuringSchedulingIgnoredDuringExecution: 19 | nodeSelectorTerms: 20 | - matchExpressions: 21 | - 
key: gpu.nvidia.com/class 22 | operator: In 23 | values: 24 | - RTX_A5000 25 | - key: topology.kubernetes.io/region 26 | operator: In 27 | values: 28 | - ORD1 29 | containers: 30 | - name: kfserving-container 31 | image: rtalaricw/vllm:0.1 32 | command: 33 | - "/bin/sh" 34 | - "-c" 35 | - | 36 | python -m vllm.entrypoints.openai.api_server \ 37 | --model EleutherAI/pythia-70m \ 38 | --model-loader-extra-config '{"tensorizer_uri": "s3://model-store/vllm/EleutherAI/pythia-70m/vllm/model.tensors"}' \ 39 | --load-format tensorizer \ 40 | --port 80 41 | env: 42 | - name: S3_ACCESS_KEY_ID 43 | valueFrom: 44 | secretKeyRef: 45 | name: s3-credentials 46 | key: access_key 47 | - name: S3_SECRET_ACCESS_KEY 48 | valueFrom: 49 | secretKeyRef: 50 | name: s3-credentials 51 | key: secret_key 52 | - name: S3_ENDPOINT_URL 53 | valueFrom: 54 | secretKeyRef: 55 | name: s3-credentials 56 | key: host_url 57 | ports: 58 | - protocol: TCP 59 | containerPort: 80 60 | livenessProbe: 61 | httpGet: 62 | path: /v1/models 63 | port: 80 64 | initialDelaySeconds: 30 65 | periodSeconds: 30 66 | readinessProbe: 67 | httpGet: 68 | path: /health 69 | port: 80 70 | initialDelaySeconds: 30 71 | periodSeconds: 30 72 | resources: 73 | requests: 74 | cpu: 4 75 | memory: 16Gi 76 | nvidia.com/gpu: 1 77 | limits: 78 | cpu: 4 79 | memory: 16Gi 80 | nvidia.com/gpu: 1 81 | -------------------------------------------------------------------------------- /online-inference/vllm/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ghcr.io/coreweave/ml-containers/torch-extras:c1bf355-nccl-cuda12.2.2-ubuntu22.04-nccl2.19.3-1-torch2.2.2-vision0.17.2-audio2.2.2 2 | ENV DEBIAN_FRONTEND=noninteractive 3 | 4 | RUN apt-get -qq update && \ 5 | apt-get -qq install --no-install-recommends -y git curl libsodium23 && \ 6 | apt-get clean 7 | 8 | RUN pip install git+https://github.com/coreweave/vllm.git@sangstar/tensorizer-update#egg=vllm[tensorizer] 9 | 10 | WORKDIR /app 11 | 12 | RUN git clone -b sangstar/tensorizer-update https://github.com/coreweave/vllm.git 13 | RUN cd vllm && python3 setup.py build_ext --inplace 14 | -------------------------------------------------------------------------------- /online-inference/vllm/README.md: -------------------------------------------------------------------------------- 1 | This folder contains instructions to run the vLLM inference server. 2 | 3 | Some of the features include: 4 | 5 | 1. Serialize a [vLLM-supported model](https://github.com/vllm-project/vllm?tab=readme-ov-file#about) from the HuggingFace Model Hub. 6 | 2. Tensorizer support for fast model deserialization and loading from vLLM 7 | 8 | To run the example: 9 | 10 | 1. Run ```kubectl apply -f 00-optional-s3-secret.yaml``` and replace ```access_key```, ```secret_key``` and ```host_url``` 11 | 2. Run ```kubectl apply -f 01-optional-s3-serialize-job.yaml``` and replace ```--model EleutherAI/pythia-70m```, ```--serialized-directory s3://my-bucket/``` and optionally ```--suffix vllm``` 12 | 3. Run ```kubectl apply -f 02-inference-service.yaml``` and replace ```--model EleutherAI/pythia-70m``` and ```--model-loader-extra-config '{"tensorizer_uri": "s3://model-store/vllm/EleutherAI/pythia-70m/vllm/model.tensors"}'``` with your serialized model path 13 | 14 | You should have an inference service running a container with an OpenAI compatible server. 15 | 16 | To interact with the client, you can ```kubectl get ksvc``` to find your inference service named: ```vllm-inference-service``` to get the URL. 
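(Illustrative sketch, not part of this folder.) Once that URL is known, a completion request can be sent to the server's OpenAI-compatible `/v1/completions` endpoint. The base URL below is a placeholder, and the `model` field must match the `--model` flag passed in `02-inference-service.yaml`:

```python
# Hypothetical request to the vLLM OpenAI-compatible server deployed above.
# BASE_URL is a placeholder; use the URL reported by `kubectl get ksvc`.
import requests

BASE_URL = "http://vllm-inference-service.example.com"  # placeholder host

payload = {
    "model": "EleutherAI/pythia-70m",  # must match the --model flag in the service
    "prompt": "Hello, my name is",
    "max_tokens": 32,
}

response = requests.post(f"{BASE_URL}/v1/completions", json=payload, timeout=60)
response.raise_for_status()
print(response.json()["choices"][0]["text"])
```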
17 | 18 | The URL will be ```:80/```. 19 | 20 | You can use the OpenAI Python client or CURL to interact with it. More information about the client can be found here: https://docs.vllm.ai/en/latest/getting_started/quickstart.html 21 | -------------------------------------------------------------------------------- /sd-dreambooth-workflow/db-finetune-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: db-finetune-data 5 | spec: 6 | storageClassName: shared-hdd-las1 7 | accessModes: 8 | - ReadWriteMany 9 | resources: 10 | requests: 11 | storage: 2000Gi 12 | -------------------------------------------------------------------------------- /sd-dreambooth-workflow/db-workflow-event-binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: WorkflowEventBinding 3 | metadata: 4 | name: db-finetune-event-binding 5 | spec: 6 | event: 7 | selector: discriminator == "db-finetune" 8 | submit: 9 | workflowTemplateRef: 10 | name: db-finetune-template 11 | arguments: 12 | parameters: 13 | - name: run_name 14 | valueFrom: 15 | event: payload.run_name 16 | - name: instance_dataset 17 | valueFrom: 18 | event: payload.instance_dataset 19 | - name: instance_prompt 20 | valueFrom: 21 | event: payload.instance_prompt 22 | - name: class_dataset 23 | valueFrom: 24 | event: payload.class_dataset 25 | - name: class_prompt 26 | valueFrom: 27 | event: payload.class_prompt 28 | - name: output 29 | valueFrom: 30 | event: payload.output 31 | - name: num_class_images 32 | valueFrom: 33 | event: "payload.num_class_images == null ? 100: payload.num_class_images" 34 | - name: run_inference 35 | valueFrom: 36 | event: "payload.run_inference == null ? true : payload.run_inference" 37 | - name: inference_only 38 | valueFrom: 39 | event: "payload.inference_only == null ? 
false : payload.inference_only" 40 | -------------------------------------------------------------------------------- /sd-dreambooth-workflow/huggingface-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | token: enterYourSecret== 4 | kind: Secret 5 | metadata: 6 | name: huggingface-token-secret 7 | type: Opaque 8 | -------------------------------------------------------------------------------- /sd-dreambooth-workflow/inference-role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: inference 5 | --- 6 | apiVersion: rbac.authorization.k8s.io/v1 7 | kind: Role 8 | metadata: 9 | name: role:inference 10 | rules: 11 | - apiGroups: 12 | - serving.kubeflow.org 13 | resources: 14 | - inferenceservices 15 | verbs: 16 | - '*' 17 | - apiGroups: 18 | - serving.knative.dev 19 | resources: 20 | - services 21 | - revisions 22 | verbs: 23 | - '*' 24 | --- 25 | apiVersion: rbac.authorization.k8s.io/v1 26 | kind: RoleBinding 27 | metadata: 28 | name: rolebinding:inference-inference 29 | roleRef: 30 | apiGroup: rbac.authorization.k8s.io 31 | kind: Role 32 | name: role:inference 33 | subjects: 34 | - kind: ServiceAccount 35 | name: inference 36 | 37 | -------------------------------------------------------------------------------- /sd-dreambooth-workflow/wandb-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | token: enterYourSecret== 4 | kind: Secret 5 | metadata: 6 | name: wandb-token-secret 7 | type: Opaque 8 | -------------------------------------------------------------------------------- /sd-finetuner-workflow/huggingface-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | token: enterYourSecret== 4 | kind: Secret 5 | metadata: 6 | name: huggingface-token-secret 7 | type: Opaque 8 | -------------------------------------------------------------------------------- /sd-finetuner-workflow/inference-role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: inference 5 | --- 6 | apiVersion: rbac.authorization.k8s.io/v1 7 | kind: Role 8 | metadata: 9 | name: role:inference 10 | rules: 11 | - apiGroups: 12 | - serving.kubeflow.org 13 | resources: 14 | - inferenceservices 15 | verbs: 16 | - '*' 17 | - apiGroups: 18 | - serving.knative.dev 19 | resources: 20 | - services 21 | - revisions 22 | verbs: 23 | - '*' 24 | --- 25 | apiVersion: rbac.authorization.k8s.io/v1 26 | kind: RoleBinding 27 | metadata: 28 | name: rolebinding:inference-inference 29 | roleRef: 30 | apiGroup: rbac.authorization.k8s.io 31 | kind: Role 32 | name: role:inference 33 | subjects: 34 | - kind: ServiceAccount 35 | name: inference 36 | -------------------------------------------------------------------------------- /sd-finetuner-workflow/sd-finetune-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: sd-finetune-data 5 | spec: 6 | storageClassName: shared-hdd-ord1 7 | accessModes: 8 | - ReadWriteMany 9 | resources: 10 | requests: 11 | storage: 2000Gi 12 | -------------------------------------------------------------------------------- /sd-finetuner-workflow/sd-finetune-workflow-event-binding.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: WorkflowEventBinding 3 | metadata: 4 | name: sd-finetune-event-binding 5 | spec: 6 | event: 7 | selector: discriminator == "sd-finetune" 8 | submit: 9 | workflowTemplateRef: 10 | name: sd-finetune-template 11 | arguments: 12 | parameters: 13 | - name: run_name 14 | valueFrom: 15 | event: payload.run_name 16 | - name: dataset 17 | valueFrom: 18 | event: payload.dataset 19 | - name: run_inference 20 | valueFrom: 21 | event: "payload.run_inference == null ? true : payload.run_inference" 22 | - name: inference_only 23 | valueFrom: 24 | event: "payload.inference_only == null ? false : payload.inference_only" 25 | -------------------------------------------------------------------------------- /sd-finetuner-workflow/sd-finetuner/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gooseai/torch-base:6cfdc11 2 | RUN apt-get install -y cuda-nvcc-11-3 cuda-nvml-dev-11-3 libcurand-dev-11-3 \ 3 | libcublas-dev-11-3 libcusparse-dev-11-3 \ 4 | libcusolver-dev-11-3 cuda-nvprof-11-3 \ 5 | ninja-build && \ 6 | apt-get clean 7 | RUN mkdir /app 8 | WORKDIR /app 9 | COPY requirements.txt . 10 | RUN pip3 install --no-cache-dir -r requirements.txt 11 | RUN pip3 install --no-cache-dir torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116 --upgrade 12 | COPY accelerate_config.yaml . 13 | COPY datasets.py . 14 | COPY finetuner.py . 15 | CMD [ "/usr/bin/python3", "-m", "accelerate.commands.launch", "finetuner.py" ] 16 | -------------------------------------------------------------------------------- /sd-finetuner-workflow/sd-finetuner/accelerate_config.yaml: -------------------------------------------------------------------------------- 1 | command_file: null 2 | commands: null 3 | compute_environment: LOCAL_MACHINE 4 | deepspeed_config: {} 5 | distributed_type: MULTI_GPU 6 | downcast_bf16: 'no' 7 | fsdp_config: {} 8 | gpu_ids: all 9 | machine_rank: 0 10 | main_process_ip: null 11 | main_process_port: null 12 | main_training_function: main 13 | megatron_lm_config: {} 14 | mixed_precision: 'no' 15 | num_machines: 1 16 | num_processes: 1 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_name: null 20 | tpu_zone: null 21 | use_cpu: false 22 | -------------------------------------------------------------------------------- /sd-finetuner-workflow/sd-finetuner/requirements.txt: -------------------------------------------------------------------------------- 1 | diffusers==0.14.0 2 | numpy==1.23.4 3 | wandb==0.13.4 4 | torch 5 | torchvision 6 | transformers>=4.21.0 7 | huggingface-hub>=0.10.0 8 | Pillow==9.2.0 9 | tqdm==4.64.1 10 | ftfy==6.1.1 11 | bitsandbytes 12 | pynvml~=11.4.1 13 | psutil~=5.9.0 14 | accelerate==0.15.0 15 | -------------------------------------------------------------------------------- /sd-finetuner-workflow/wandb-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | token: enterYourSecret== 4 | kind: Secret 5 | metadata: 6 | name: wandb-token-secret 7 | type: Opaque 8 | -------------------------------------------------------------------------------- /spark/cpu-pod-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: cpu-job 5 | spec: 6 | terminationGracePeriodSeconds: 10 7 | containers: 8 | - name: cpu-job 9 | volumeMounts: 
10 | - mountPath: /dev/shm 11 | name: dshm 12 | - name: spark-pvc 13 | mountPath: /mnt/pvc 14 | readOnly: false 15 | 16 | affinity: 17 | nodeAffinity: 18 | requiredDuringSchedulingIgnoredDuringExecution: 19 | nodeSelectorTerms: 20 | - matchExpressions: 21 | - key: topology.kubernetes.io/region 22 | operator: In 23 | values: 24 | - "LGA1" 25 | - key: node.coreweave.cloud/cpu 26 | operator: In 27 | values: 28 | - amd-epyc-rome 29 | - amd-epyc-milan 30 | - intel-xeon-v3 31 | - intel-xeon-v4 32 | volumes: 33 | - name: dshm 34 | emptyDir: 35 | medium: Memory 36 | - name: spark-pvc 37 | persistentVolumeClaim: 38 | claimName: spark-pvc 39 | readOnly: false 40 | restartPolicy: Always 41 | -------------------------------------------------------------------------------- /spark/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG SPARK_VERSION="v3.4.0" 2 | FROM apache/spark-py:$SPARK_VERSION 3 | 4 | USER 0 5 | 6 | RUN mkdir /app 7 | 8 | ARG MSCOCO_SOURCE=https://huggingface.co/datasets/ChristophSchuhmann/MS_COCO_2017_URL_TEXT/resolve/main/mscoco.parquet 9 | RUN wget $MSCOCO_SOURCE -O /app/mscoco.parquet 10 | 11 | ADD requirements.txt /app/requirements.txt 12 | RUN pip install -r /app/requirements.txt 13 | 14 | ADD download_imgdataset.py /app/download_imgdataset.py 15 | -------------------------------------------------------------------------------- /spark/docker/download_imgdataset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | from img2dataset import download 5 | from pyspark.sql import SparkSession 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--url-list", type=Path, default=Path("/mnt/pvc/mscoco.parquet"), help="Path to the url list file") 9 | parser.add_argument("--output", type=Path, default=Path("/mnt/pvc/mscoco"), help="Path to output folder") 10 | parser.add_argument("--thread-count", "-t", type=int, default=16, help="Number of threads for img2dataset") 11 | args = parser.parse_args() 12 | 13 | args.output.mkdir(parents=True, exist_ok=True) 14 | 15 | if not args.url_list.exists(): 16 | raise ValueError(f"The URL list does not exist at: {args.url_list}") 17 | 18 | # All options are specified in the spark submit command. 
Any options specified here will override the spark submit conf 19 | spark = SparkSession.builder.getOrCreate() 20 | 21 | download( 22 | thread_count=args.thread_count, # Process count will be num executors * num cores per executor 23 | url_list=str(args.url_list), 24 | image_size=256, 25 | output_folder=str(args.output), 26 | output_format="webdataset", 27 | input_format="parquet", 28 | url_col="URL", 29 | caption_col="TEXT", 30 | subjob_size=1000, 31 | distributor="pyspark", 32 | ) 33 | -------------------------------------------------------------------------------- /spark/docker/requirements.txt: -------------------------------------------------------------------------------- 1 | img2dataset==1.41.0 2 | -------------------------------------------------------------------------------- /spark/example-spark-submit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Replace this command with your desired namespace if you don't want to use your default namespace 4 | NAMESPACE=$(kubectl config view --minify -o jsonpath='{..namespace}') 5 | echo "Using the namespace: $NAMESPACE" 6 | 7 | $SPARK_HOME/bin/spark-submit \ 8 | --master k8s://https://k8s.ord1.coreweave.com \ 9 | --deploy-mode cluster \ 10 | --name download-mscoco-16-64 \ 11 | --conf spark.driver.cores=16 \ 12 | --conf spark.kubernetes.driver.limit.cores=16 \ 13 | --conf spark.driver.memory="64G" \ 14 | --conf spark.executor.cores=16 \ 15 | --conf spark.kubernetes.executor.limit.cores=16 \ 16 | --conf spark.executor.memory="64G" \ 17 | --conf spark.executor.instances=1 \ 18 | --conf spark.kubernetes.driver.container.image=navarrepratt/spark-download-imgdataset:1.0.2 \ 19 | --conf spark.kubernetes.executor.container.image=navarrepratt/spark-download-imgdataset:1.0.2 \ 20 | --conf spark.kubernetes.driver.podTemplateFile=./cpu-pod-template.yaml \ 21 | --conf spark.kubernetes.executor.podTemplateFile=./cpu-pod-template.yaml \ 22 | --conf spark.kubernetes.namespace="$NAMESPACE" \ 23 | --conf spark.kubernetes.authenticate.driver.serviceAccountName=spark-sa \ 24 | local:///app/download_imgdataset.py --output /mnt/pvc/mscoco -t 2048 25 | -------------------------------------------------------------------------------- /spark/jupyter/jupyter-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: spark-jupyter 5 | spec: 6 | type: ClusterIP 7 | clusterIP: None 8 | ports: 9 | - name: notebook 10 | port: 8888 11 | protocol: TCP 12 | - name: spark-ui 13 | port: 4040 14 | protocol: TCP 15 | - name: blockmanager 16 | port: 7777 17 | protocol: TCP 18 | - name: driver 19 | port: 2222 20 | protocol: TCP 21 | selector: 22 | app.kubernetes.io/name: spark-jupyter 23 | --- 24 | apiVersion: apps/v1 25 | kind: Deployment 26 | metadata: 27 | name: spark-jupyter 28 | spec: 29 | strategy: 30 | type: Recreate 31 | replicas: 1 32 | selector: 33 | matchLabels: 34 | app.kubernetes.io/name: spark-jupyter 35 | template: 36 | metadata: 37 | labels: 38 | app.kubernetes.io/name: spark-jupyter 39 | spec: 40 | serviceAccountName: spark-sa 41 | containers: 42 | - name: jupyter 43 | image: jupyter/all-spark-notebook:python-3.10 44 | command: 45 | - "jupyter" 46 | - "lab" 47 | - "--ip" 48 | - "0.0.0.0" 49 | - "--no-browser" 50 | - "--allow-root" 51 | - "--notebook-dir" 52 | - "/mnt/pvc" 53 | - "--LabApp.token=''" 54 | 55 | securityContext: 56 | runAsUser: 0 57 | 58 | ports: 59 | - name: notebook 60 | containerPort: 8888 
61 | protocol: TCP 62 | - name: blockmanager 63 | containerPort: 7777 64 | protocol: TCP 65 | - name: driver 66 | containerPort: 2222 67 | protocol: TCP 68 | - name: spark-ui 69 | containerPort: 4040 70 | protocol: TCP 71 | 72 | readinessProbe: 73 | tcpSocket: 74 | port: notebook 75 | initialDelaySeconds: 5 76 | periodSeconds: 10 77 | livenessProbe: 78 | httpGet: 79 | path: / 80 | port: notebook 81 | initialDelaySeconds: 15 82 | periodSeconds: 15 83 | failureThreshold: 3 84 | timeoutSeconds: 10 85 | 86 | volumeMounts: 87 | - name: storage 88 | mountPath: /mnt/pvc 89 | 90 | env: 91 | - name: WANDB_API_KEY 92 | valueFrom: 93 | secretKeyRef: 94 | name: wandb-token-secret 95 | key: token 96 | 97 | resources: 98 | requests: 99 | cpu: "4" 100 | memory: 16Gi 101 | limits: 102 | cpu: "4" 103 | memory: 16Gi 104 | affinity: 105 | nodeAffinity: 106 | requiredDuringSchedulingIgnoredDuringExecution: 107 | nodeSelectorTerms: 108 | - matchExpressions: 109 | - key: topology.kubernetes.io/region 110 | operator: In 111 | values: 112 | - "LGA1" 113 | volumes: 114 | - name: storage 115 | persistentVolumeClaim: 116 | claimName: spark-pvc 117 | restartPolicy: Always -------------------------------------------------------------------------------- /spark/spark-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: spark-pvc 5 | spec: 6 | storageClassName: shared-nvme-lga1 7 | accessModes: 8 | - ReadWriteMany 9 | resources: 10 | requests: 11 | storage: "400Gi" 12 | -------------------------------------------------------------------------------- /spark/spark-role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: spark-sa 5 | --- 6 | apiVersion: rbac.authorization.k8s.io/v1 7 | kind: Role 8 | metadata: 9 | name: role:spark 10 | rules: 11 | - apiGroups: 12 | - "" 13 | resources: 14 | - configmaps 15 | - pods 16 | - services 17 | - persistentvolumeclaims 18 | verbs: 19 | - '*' 20 | --- 21 | apiVersion: rbac.authorization.k8s.io/v1 22 | kind: RoleBinding 23 | metadata: 24 | name: spark 25 | roleRef: 26 | apiGroup: rbac.authorization.k8s.io 27 | kind: Role 28 | name: role:spark 29 | subjects: 30 | - kind: ServiceAccount 31 | name: spark-sa 32 | -------------------------------------------------------------------------------- /spark/wandb-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | token: enterYourSecret== 4 | kind: Secret 5 | metadata: 6 | name: wandb-token-secret 7 | type: Opaque 8 | -------------------------------------------------------------------------------- /tensorflow-jupyter/jupyter-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: jupyter-pv-claim 5 | spec: 6 | # Available storage classes at time of writing are 7 | # block-nvme-lga1 - New York - NVMe Storage with 3 Replicas 8 | # block-hdd-lga1 - New York - HDD Storage with 3 Replicas 9 | # Other data centers currently available [ewr1, las1] 10 | storageClassName: block-nvme-lga1 11 | accessModes: 12 | - ReadWriteOnce 13 | resources: 14 | requests: 15 | storage: 10Gi 16 | -------------------------------------------------------------------------------- /tensorflow-jupyter/screenshot.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/coreweave/kubernetes-cloud/ed5c832f666badc124f0a12d9c60260920ee9089/tensorflow-jupyter/screenshot.png -------------------------------------------------------------------------------- /tensorflow-jupyter/tensorflow-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: tensorflow-jupyter 5 | spec: 6 | strategy: 7 | type: Recreate 8 | # Replicas controls the number of instances of the Pod to maintain running at all times 9 | replicas: 1 10 | selector: 11 | matchLabels: 12 | app.kubernetes.io/name: tensorflow-jupyter 13 | template: 14 | metadata: 15 | labels: 16 | app.kubernetes.io/name: tensorflow-jupyter 17 | spec: 18 | containers: 19 | - name: tf 20 | image: tensorflow/tensorflow:2.12.0-gpu-jupyter 21 | 22 | ports: 23 | - name: notebook 24 | containerPort: 8888 25 | protocol: TCP 26 | 27 | readinessProbe: 28 | tcpSocket: 29 | port: notebook 30 | initialDelaySeconds: 5 31 | periodSeconds: 10 32 | livenessProbe: 33 | httpGet: 34 | path: / 35 | port: notebook 36 | initialDelaySeconds: 15 37 | periodSeconds: 15 38 | failureThreshold: 3 39 | timeoutSeconds: 10 40 | 41 | volumeMounts: 42 | - name: storage 43 | mountPath: /tf/notebooks 44 | 45 | resources: 46 | requests: 47 | cpu: 500m # The CPU unit is milli-cores. 500m is 0.5 cores 48 | memory: 16Gi 49 | limits: 50 | cpu: 2000m 51 | memory: 16Gi 52 | # GPUs can only be allocated as a limit, which both reserves and limits the number of GPUs the Pod will have access to 53 | # Making individual Pods resource light is advantageous for bin-packing. In the case of Jupyter, we stick to two GPUs for 54 | # demonstration purposes 55 | nvidia.com/gpu: 2 56 | 57 | # Node affinity can be used to require / prefer the Pods to be scheduled on a node with a specific hardware type 58 | # No affinity allows scheduling on all hardware types that can fulfill the resource request. 59 | # In this example, without affinity, any NVIDIA GPU would be allowed to run the Pod. 60 | # Read more about affinity at: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity 61 | affinity: 62 | nodeAffinity: 63 | # This will REQUIRE the Pod to be run on a system with an NVIDIA A40 GPU 64 | requiredDuringSchedulingIgnoredDuringExecution: 65 | nodeSelectorTerms: 66 | - matchExpressions: 67 | - key: gpu.nvidia.com/class 68 | operator: In 69 | values: 70 | - A40 71 | - key: failure-domain.beta.kubernetes.io/region 72 | operator: In 73 | values: 74 | - LGA1 75 | 76 | # As ML testing doesn't require a lot of network throughput, we try to play nice and only schedule 77 | # the Pod on systems with only 1G network connections. We also desire decent CPUs. This is a preference, not a requirement. 78 | # If systems with i5 / i9 / Xeon CPUs and/or 1G ethernet are not available to fulfill the requested resources, the Pods 79 | # will be scheduled on higher end systems. 
80 | preferredDuringSchedulingIgnoredDuringExecution: 81 | # - weight: 10 82 | # preference: 83 | # matchExpressions: 84 | # - key: cpu.atlantic.cloud/family 85 | # operator: In 86 | # values: 87 | # - i7 88 | # - i5 89 | # - i9 90 | # - xeon 91 | - weight: 10 92 | preference: 93 | matchExpressions: 94 | - key: ethernet.atlantic.cloud/speed 95 | operator: In 96 | values: 97 | - 1G 98 | volumes: 99 | - name: storage 100 | persistentVolumeClaim: 101 | claimName: jupyter-pv-claim 102 | restartPolicy: Always 103 | -------------------------------------------------------------------------------- /tensorflow-jupyter/tensorflow-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | annotations: 5 | metallb.universe.tf/address-pool: public 6 | # Setting a sharing key might save public IP addresses 7 | # See https://metallb.universe.tf/usage/#ip-address-sharing for more detail 8 | metallb.universe.tf/allow-shared-ip: example-1 9 | name: tensorflow-jupyter 10 | spec: 11 | type: LoadBalancer 12 | externalTrafficPolicy: Local 13 | ports: 14 | - name: notebook 15 | port: 8888 16 | protocol: TCP 17 | targetPort: notebook 18 | selector: 19 | app.kubernetes.io/name: tensorflow-jupyter 20 | -------------------------------------------------------------------------------- /virtual-server/examples/curl/README.md: -------------------------------------------------------------------------------- 1 | ## Virtual Server `curl` Example 2 | 3 | This is an example implementation in `bash` of a Kubernetes client interacting with a Virtual Server resource on CoreWeave Cloud using `curl`. The example script provided creates, lists, and deletes a simple Ubuntu 20.04 Virtual Server with 2 CPU cores and 2Gi of memory. 4 | 5 | ## Usage 6 | 7 | ### Dependencies 8 | Before invoking the script, the `jq` and `curl` commands must be installed and available from the `PATH`. 9 | 10 | ### Environment variables 11 | In invoking this script, `TOKEN` and `NAMESPACE` will be exported as environment variables. The value of `NAMESPACE` should be set to the desired namespace. The value of `TOKEN` should be replaced with the value of `'token:'` generated in the `kubeconfig` file. 12 | 13 | > ℹ️ [See more about how to generate the kubeconfig file.](https://docs.coreweave.com/coreweave-kubernetes/getting-started#obtain-access-credentials) 14 | 15 | 16 | ### Running the script 17 | 18 | The script is invoked like so: 19 | 20 | ```bash 21 | TOKEN= NAMESPACE= ./run.sh 22 | ``` 23 | 24 | ## Implementation Breakdown 25 | 26 | The implementation consists of a few simple `curl` calls to two APIs: 27 | 28 | 1. **[Kubevirt](https://kubevirt.io/)** - An open-source project that allows running virtual systems on the Kubernetes cluster. 29 | 1. **[Virtual Server](https://docs.coreweave.com/virtual-servers/getting-started)** - A Kubernetes Custom Resource that allows deploying a virtual system and interacting with Kubevirt with ease. 30 | 31 | > 💡 **Additional resources** 32 | > 33 | > The latest resource details, such as statuses and conditions, are available on [Virtual Servers reference API](https://pkg.go.dev/github.com/coreweave/virtual-server/api/v1alpha1#VirtualServerConditionType) 34 | > The general description of Kubernetes RESTful API is available in [the official documentation of the Kubernetes API Overview](https://kubernetes.io/docs/reference/using-api/). 
Basic concepts of the API are described in [the official documentation of the Kubernetes API Concepts](https://kubernetes.io/docs/reference/using-api/api-concepts/). 35 | 36 | ## Virtual Server functions 37 | 38 | - `create_vs()` - creates the Virtual Server 39 | - `delete_vs()` - deletes the Virtual Server 40 | - `list_vs()` - lists of all the Virtual Servers in the designated namespace 41 | - `get_vs()` - prints formatted JSON details about the Virtual Server 42 | - `wait_until_vs_status()` - loops until the expected condition is met. 43 | 44 | ## Kubevirt functions 45 | 46 | **VM** 47 | - `start_vm()` - starts a Virtual Machine and creates a Virtual Machine Instance 48 | - `stop_vm()` - stops the Virtual Machine, then the deletes Virtual Machine Instance 49 | - `list_vm()` - lists all the Virtual Machines in namespace 50 | - `get_vm()` - prints formatted JSON details about the Virtual Machine 51 | 52 | **VMI** 53 | - `list_vmi()` - lists all the Virtual Machine Instances in the designated namespace 54 | - `get_vmi()` - prints formatted JSON details about Virtual Machine Instance 55 | 56 | > 💡 **Additional resources** 57 | > 58 | > The [Kubevirt Python client](https://github.com/kubevirt/client-python#documentation-for-api-endpoints) can list all of the Kubevirt RESTful API, both for VMs and VMIs. 59 | -------------------------------------------------------------------------------- /virtual-server/examples/curl/virtual-server.json: -------------------------------------------------------------------------------- 1 | { 2 | "apiVersion": "virtualservers.coreweave.com/v1alpha1", 3 | "kind": "VirtualServer", 4 | "metadata": { 5 | "name": "vs-example" 6 | }, 7 | "spec": { 8 | "region": "ORD1", 9 | "os": { 10 | "type": "linux" 11 | }, 12 | "initializeRunning": true, 13 | "resources": { 14 | "cpu": { 15 | "count": 2, 16 | "type": "amd-epyc-rome" 17 | }, 18 | "memory": "2Gi" 19 | }, 20 | "storage": { 21 | "root": { 22 | "size": "40Gi", 23 | "storageClassName": "block-nvme-ord1", 24 | "source": { 25 | "pvc": { 26 | "namespace": "vd-images", 27 | "name": "ubuntu2004-nvidia-515-86-01-1-docker-master-20221205-ord1" 28 | } 29 | } 30 | } 31 | }, 32 | "users": [ 33 | { 34 | "username": "myuser", 35 | "password": "password1234" 36 | } 37 | ], 38 | "network": { 39 | "public": true, 40 | "tcp": { 41 | "ports": [ 42 | 22 43 | ] 44 | } 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /virtual-server/examples/go/.gitignore: -------------------------------------------------------------------------------- 1 | vs 2 | -------------------------------------------------------------------------------- /virtual-server/examples/go/Makefile: -------------------------------------------------------------------------------- 1 | # Go parameters 2 | GOCMD=go 3 | GOMOD=$(GOCMD) mod 4 | GOBUILD=$(GOCMD) build 5 | GOCLEAN=$(GOCMD) clean 6 | BINARY_NAME=vs 7 | 8 | all: install run 9 | 10 | install: 11 | $(GOMOD) download 12 | 13 | run: 14 | export GO111MODULES=on 15 | $(GOBUILD) -o $(BINARY_NAME) -v ./... 16 | ./$(BINARY_NAME) 17 | 18 | clean: 19 | $(GOCLEAN) 20 | rm -rf $(BINARY_NAME) 21 | -------------------------------------------------------------------------------- /virtual-server/examples/go/README.md: -------------------------------------------------------------------------------- 1 | # Go example 2 | 3 | An example Go implementation of a kubernetes client that interacts with the CoreWeave VirtualServer resource as well as the kubevirt subresource api. 
4 | 5 | The Go example illustrates the following: 6 | 1. Builds a VirtualServer definition based on the API at https://github.com/coreweave/virtual-server. 7 | 2. Builds a Service and a PVC to be used as a Floating IP and an additional filesystem, respectively. 8 | 3. Removes an existing VirtualServer. 9 | 4. Creates a new VirtualServer. The instance is started automatically. 10 | 5. Waits for the VirtualServer to reach a ready status. 11 | 6. Creates a floating service when the environment variable `FLOATING_SERVICE_NAME` is specified. 12 | 7. Stops the instance and waits until it is fully stopped. 13 | 8. Deletes the VirtualServer. 14 | 15 | ## Run 16 | 17 | The first run takes longer while all necessary packages are downloaded. 18 | 19 | Be sure to use secure credentials for `USERNAME` and `PASSWORD`, as they will be used to create a user in your Virtual Server. 20 | ``` 21 | USERNAME= PASSWORD= KUBECONFIG=/home//.kubeconfig NAMESPACE= make 22 | ``` 23 | 24 | To create a floating service, the environment variable `FLOATING_SERVICE_NAME` must be specified: 25 | ``` 26 | FLOATING_SERVICE_NAME= PASSWORD= KUBECONFIG=/home//.kubeconfig NAMESPACE= make 27 | ``` 28 | -------------------------------------------------------------------------------- /virtual-server/examples/go/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/coreweave/kubernetes-cloud/virtual-server/examples/go 2 | 3 | go 1.13 4 | 5 | replace ( 6 | github.com/go-kit/kit => github.com/go-kit/kit v0.3.0 7 | github.com/openshift/api => github.com/openshift/api v0.0.0-20210105115604-44119421ec6b 8 | github.com/openshift/client-go => github.com/openshift/client-go v0.0.0-20210112165513-ebc401615f47 9 | github.com/operator-framework/operator-lifecycle-manager => github.com/operator-framework/operator-lifecycle-manager v0.17.0 10 | github.com/operator-framework/operator-registry => github.com/operator-framework/operator-registry v1.16.1 11 | k8s.io/api => k8s.io/api v0.20.2 12 | k8s.io/apimachinery => k8s.io/apimachinery v0.20.2 13 | k8s.io/client-go => k8s.io/client-go v0.20.2 14 | k8s.io/cluster-bootstrap => k8s.io/cluster-bootstrap v0.16.4 15 | kubevirt.io/containerized-data-importer => kubevirt.io/containerized-data-importer v1.26.1 16 | sigs.k8s.io/structured-merge-diff => sigs.k8s.io/structured-merge-diff v1.0.1-0.20191108220359-b1b620dd3f06 17 | ) 18 | 19 | require ( 20 | github.com/coreweave/virtual-server v1.15.0 21 | github.com/spf13/pflag v1.0.5 22 | k8s.io/api v0.20.2 23 | k8s.io/apimachinery v0.20.2 24 | kubevirt.io/client-go v0.39.0 25 | sigs.k8s.io/controller-runtime v0.8.3 26 | ) 27 | -------------------------------------------------------------------------------- /virtual-server/examples/kubectl/README.md: -------------------------------------------------------------------------------- 1 | This directory contains several example manifests for `VirtualServer`. 2 | 3 | To run any of the examples, issue: `kubectl apply -f ` 4 | 5 | CoreWeave provides base images for different operating systems, including images pre-loaded with NVIDIA drivers and remote desktop software. Refer to the [System Images Documentation](https://docs.coreweave.com/virtual-servers/coreweave-system-images) to learn how to list these images via CLI. 6 | 7 | - [virtual-server-direct-attach-lb.yaml](virtual-server-direct-attach-lb.yaml) shows how to directly attach the Load Balancer IP to a Virtual Server. This gives the VS an unfiltered public IP that is also visible to the VM itself.
This provides a classic VPS-style experience. 8 | 9 | - [virtual-server-windows-internal-ip-only.yaml](virtual-server-windows-internal-ip-only.yaml) creates a Windows Virtual Server with no public IP (STATIC internal IP only) - useful for servers that will only be accessed in your namespace, such as Domain Controllers. 10 | 11 | - [virtual-server-windows-cpu-only.yaml](virtual-server-windows-cpu-only.yaml) creates a Windows Virtual Server with no GPU - CPU compute only. 12 | 13 | - [virtual-server-shared-pvc.yaml](virtual-server-shared-pvc.yaml) attaches a shared `PVC` to the Virtual Server. The `PVC` is already formatted and mounted as `/mnt/shared-pvc`. 14 | 15 | - [virtual-server-ephemeral-root-disk.yaml](virtual-server-ephemeral-root-disk.yaml) boots a Virtual Server from a root-disk image in ephemeral mode. Changes to the VM root disk will be written to local node ephemeral storage, and lost on restart. Useful for ephemeral tasks such as pixel-streaming and data-processing. 16 | 17 | - [virtual-server-windows.yaml](virtual-server-windows.yaml) creates a Windows 10 Virtual Server. To get the external IP for remote desktop access via the `RDP` protocol, issue the following command: 18 | ``` 19 | kubectl get svc vs-windows10-tcp -o jsonpath="{.status.loadBalancer.ingress[*].ip}" 20 | ``` 21 | 22 | - [virtual-server-block-pvc.yaml](virtual-server-block-pvc.yaml) attaches an additional block `PVC` disk to the virtual machine. The new disk is raw and needs to be formatted. 23 | 24 | ``` 25 | 26 | myuser@vs-ubuntu2004-block-pvc:~$ sudo mkfs.ext4 /dev/vdb 27 | mke2fs 1.45.5 (07-Jan-2020) 28 | Discarding device blocks: done 29 | Creating filesystem with 5242880 4k blocks and 1310720 inodes 30 | Filesystem UUID: 0a05b295-9518-41f3-8b64-18d5902d419e 31 | Superblock backups stored on blocks: 32 | 32768, 98304, 163840, 229376, 294912, 819200, 884736, 1605632, 2654208, 33 | 4096000 34 | 35 | Allocating group tables: done 36 | Writing inode tables: done 37 | Creating journal (32768 blocks): done 38 | Writing superblocks and filesystem accounting information: done 39 | 40 | ``` 41 | 42 | Now, the disk is ready: 43 | ``` 44 | myuser@vs-ubuntu2004-block-pvc:~$ sudo mkdir /mnt/vdb && sudo mount /dev/vdb /mnt/vdb 45 | myuser@vs-ubuntu2004-block-pvc:~$ df -h 46 | Filesystem Size Used Avail Use% Mounted on 47 | udev 7.9G 0 7.9G 0% /dev 48 | tmpfs 1.6G 1.1M 1.6G 1% /run 49 | /dev/vda1 39G 2.6G 37G 7% / 50 | tmpfs 7.9G 0 7.9G 0% /dev/shm 51 | tmpfs 5.0M 0 5.0M 0% /run/lock 52 | tmpfs 7.9G 0 7.9G 0% /sys/fs/cgroup 53 | /dev/vda15 105M 7.8M 97M 8% /boot/efi 54 | /dev/loop0 71M 71M 0 100% /snap/lxd/19647 55 | /dev/loop1 56M 56M 0 100% /snap/core18/1988 56 | /dev/loop2 33M 33M 0 100% /snap/snapd/11107 57 | /dev/loop3 33M 33M 0 100% /snap/snapd/12704 58 | /dev/loop4 56M 56M 0 100% /snap/core18/2128 59 | /dev/loop5 71M 71M 0 100% /snap/lxd/21029 60 | tmpfs 1.6G 0 1.6G 0% /run/user/1001 61 | /dev/vdb 20G 45M 19G 1% /mnt/vdb 62 | ``` 63 | 64 | Additional examples and documentation: 65 | 66 | - [Kubernetes documentation](https://docs.coreweave.com/coreweave-kubernetes/getting-started) 67 | - [VirtualServer documentation](https://docs.coreweave.com/virtual-servers/getting-started) 68 | - [Advanced Label selectors](https://docs.coreweave.com/coreweave-kubernetes/label-selectors) 69 | - [CPU and GPU Availability](https://docs.coreweave.com/coreweave-kubernetes/node-types) 70 | - [Storage](https://docs.coreweave.com/coreweave-kubernetes/storage) 71 | 72 | --------------------------------------------------------------------------------
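After applying one of these manifests, it can help to watch the Virtual Server come up before connecting. A minimal sketch, assuming `kubectl` is pointed at your namespace, that the CRD's `virtualserver` resource name is registered, and using the `example-vs` name from [virtual-server.yaml](virtual-server.yaml); the `<name>-tcp` Service name is an assumption based on the pattern used elsewhere in these examples:

```bash
# Apply an example manifest
kubectl apply -f virtual-server.yaml

# Watch the VirtualServer resource until its status reports Ready
kubectl get virtualserver example-vs -w

# Once provisioned, the associated TCP Service exposes the external IP
kubectl get svc example-vs-tcp -o jsonpath="{.status.loadBalancer.ingress[*].ip}"
```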
/virtual-server/examples/kubectl/virtual-server-block-pvc.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: PersistentVolumeClaim 4 | metadata: 5 | name: vs-block-pvc 6 | spec: 7 | accessModes: 8 | - ReadWriteOnce 9 | storageClassName: block-nvme-ord1 10 | volumeMode: Block 11 | resources: 12 | requests: 13 | storage: 20Gi 14 | --- 15 | apiVersion: virtualservers.coreweave.com/v1alpha1 16 | kind: VirtualServer 17 | metadata: 18 | name: vs-ubuntu2004-block-pvc 19 | spec: 20 | region: ORD1 21 | os: 22 | type: linux 23 | resources: 24 | gpu: 25 | type: Quadro_RTX_4000 26 | count: 1 27 | cpu: 28 | count: 3 29 | memory: 16Gi 30 | storage: 31 | root: 32 | size: 40Gi 33 | storageClassName: block-nvme-ord1 34 | source: 35 | pvc: 36 | namespace: vd-images 37 | name: ubuntu2004-nvidia-510-47-03-1-docker-master-20220517-ord1 38 | additionalDisks: 39 | - name: additional-block-volume 40 | spec: 41 | persistentVolumeClaim: 42 | claimName: vs-block-pvc 43 | # users: 44 | # - username: SET YOUR USERNAME HERE 45 | # password: SET YOUR PASSWORD HERE 46 | # To use key-based authentication replace and uncomment ssh-rsa below with your public ssh key 47 | # sshpublickey: | 48 | # ssh-rsa AAAAB3NzaC1yc2EAAAA ... user@hostname 49 | network: 50 | public: true 51 | tcp: 52 | ports: 53 | - 22 54 | cloudInit: | 55 | # The disk_setup directive instructs Cloud-init to partition a disk. 56 | disk_setup: 57 | /dev/vdb: 58 | table_type: gpt 59 | layout: True 60 | overwrite: False 61 | # fs_setup describes the how the file systems are supposed to look. 62 | fs_setup: 63 | - label: None 64 | filesystem: ext4 65 | device: /dev/vdb 66 | partition: 'auto' 67 | # 'mounts' contains a list of lists; the inner list are entries for an /etc/fstab line 68 | mounts: 69 | - [ vdb, /mnt/block-pvc, auto, "defaults" ] 70 | initializeRunning: true 71 | -------------------------------------------------------------------------------- /virtual-server/examples/kubectl/virtual-server-cloudinit.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: virtualservers.coreweave.com/v1alpha1 2 | kind: VirtualServer 3 | metadata: 4 | name: example-vs 5 | spec: 6 | region: ORD1 7 | os: 8 | type: linux 9 | resources: 10 | gpu: 11 | type: Quadro_RTX_4000 12 | count: 1 13 | cpu: 14 | count: 4 15 | memory: 16Gi 16 | storage: 17 | root: 18 | size: 40Gi 19 | storageClassName: block-nvme-ord1 20 | source: 21 | pvc: 22 | namespace: vd-images 23 | name: ubuntu2004-nvidia-510-47-03-1-docker-master-20220421-ord1 24 | # Change user name and pasword 25 | # User is on the sudoers list 26 | # users: 27 | # - username: SET YOUR USERNAME HERE 28 | # password: SET YOUR PASSWORD HERE 29 | # To use key-based authentication replace and uncomment ssh-rsa below with your public ssh key 30 | # sshpublickey: | 31 | # ssh-rsa AAAAB3NzaC1yc2EAAAA ... user@hostname 32 | network: 33 | public: true 34 | tcp: 35 | ports: 36 | - 22 37 | - 443 38 | - 60443 39 | - 4172 40 | - 3389 41 | udp: 42 | ports: 43 | - 4172 44 | - 3389 45 | cloudInit: | 46 | # Write a simple script 47 | write_files: 48 | - content: | 49 | #!/bin/bash 50 | echo "Hello world!" 
51 | path: /home/myuser/script.sh 52 | permissions: '0744' 53 | owner: myuser:myuser 54 | # Update packages 55 | package_update: true 56 | # Install packages 57 | packages: 58 | - curl 59 | - git 60 | # Run additional commands 61 | runcmd: 62 | - [df, -h] 63 | - [git, version] 64 | - [curl, --version ] 65 | - [bash, /home/myuser/script.sh] 66 | initializeRunning: true 67 | -------------------------------------------------------------------------------- /virtual-server/examples/kubectl/virtual-server-direct-attach-lb.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: virtualservers.coreweave.com/v1alpha1 2 | kind: VirtualServer 3 | metadata: 4 | name: example-vs-direct 5 | spec: 6 | region: ORD1 7 | os: 8 | type: linux 9 | resources: 10 | gpu: 11 | type: Quadro_RTX_4000 12 | count: 1 13 | cpu: 14 | count: 4 15 | memory: 16Gi 16 | storage: 17 | root: 18 | size: 40Gi 19 | storageClassName: block-nvme-ord1 20 | source: 21 | pvc: 22 | namespace: vd-images 23 | name: ubuntu2004-nvidia-510-47-03-1-docker-master-20220421-ord1 24 | # Change user name and pasword 25 | # User is on the sudoers list 26 | # users: 27 | # - username: SET YOUR USERNAME HERE 28 | # password: SET YOUR PASSWORD HERE 29 | # To use key-based authentication replace and uncomment ssh-rsa below with your public ssh key 30 | # sshpublickey: | 31 | # ssh-rsa AAAAB3NzaC1yc2EAAAA ... user@hostname 32 | network: 33 | public: true 34 | directAttachLoadBalancerIP: true 35 | -------------------------------------------------------------------------------- /virtual-server/examples/kubectl/virtual-server-ephemeral-disk.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: virtualservers.coreweave.com/v1alpha1 2 | kind: VirtualServer 3 | metadata: 4 | name: vs-ubuntu2004-ephemeral-disk 5 | spec: 6 | region: ORD1 7 | os: 8 | type: linux 9 | resources: 10 | cpu: 11 | count: 2 12 | type: intel-xeon-v4 13 | memory: 4Gi 14 | storage: 15 | root: 16 | size: 40Gi 17 | storageClassName: block-nvme-ord1 18 | source: 19 | pvc: 20 | namespace: vd-images 21 | name: ubuntu2004-docker-master-20220708-ord1 22 | additionalDisks: 23 | - name: ephemeral-disk 24 | spec: 25 | emptyDisk: 26 | capacity: 10Gi 27 | # users: 28 | # - username: SET YOUR USERNAME HERE 29 | # password: SET YOUR PASSWORD HERE 30 | # To use key-based authentication replace and uncomment ssh-rsa below with your public ssh key 31 | # sshpublickey: | 32 | # ssh-rsa AAAAB3NzaC1yc2EAAAA ... user@hostname 33 | network: 34 | public: true 35 | tcp: 36 | ports: 37 | - 22 38 | # Format and mount the ephemeral disk 39 | cloudInit: | 40 | bootcmd: 41 | - test "$(lsblk /dev/vdb)" && mkfs.ext4 /dev/vdb 42 | - mkdir -p /mnt/vdb 43 | mounts: 44 | - [ "/dev/vdb", "/mnt/vdb", "ext4", "defaults,nofail", "0", "2" ] 45 | runcmd: 46 | - [df, -h] 47 | initializeRunning: true 48 | -------------------------------------------------------------------------------- /virtual-server/examples/kubectl/virtual-server-ephemeral-root-disk.yaml: -------------------------------------------------------------------------------- 1 | ### 2 | ## Ephemeral Root Disks 3 | # Many use cases, such as data processing or pixel-streaming is ephemeral. VM instances are short-lived and deleted on shut-down. 4 | # In these instances, leveraging ephemeral root disks will speed up instantiation as well as lower costs. 
5 | # Ephemeral root-disks don't require a new root volume to be allocated, removing a time-consuming step in the instantiation process. 6 | # Ephemeral disks are still writeable; modifications made at run-time are temporarily stored on the ephemeral disk of the serving node. 7 | # All changes written to the root disk are lost when the VM is shut down. A shared filesystem volume or NFS/SMB/Object storage should be used 8 | # to store persistent data in, e.g., data-processing use cases. 9 | # 10 | # To launch a VS using an ephemeral root disk, the source image needs to be cloned into a `ReadOnlyMany` type volume. 11 | ### 12 | --- 13 | apiVersion: v1 14 | kind: PersistentVolumeClaim 15 | metadata: 16 | name: image-rox 17 | spec: 18 | accessModes: 19 | - ReadOnlyMany 20 | dataSource: 21 | kind: PersistentVolumeClaim 22 | name: # This name will be the same name as a DataVolume/VirtualServer used as the source. 23 | resources: 24 | requests: 25 | storage: 40Gi # Must match the size of the source volume 26 | storageClassName: block-nvme-ord1 27 | volumeMode: Block 28 | --- 29 | apiVersion: virtualservers.coreweave.com/v1alpha1 30 | kind: VirtualServer 31 | metadata: 32 | name: example-vs 33 | spec: 34 | region: ORD1 35 | os: 36 | type: linux 37 | resources: 38 | gpu: 39 | type: Quadro_RTX_4000 40 | count: 1 41 | cpu: 42 | count: 4 43 | memory: 16Gi 44 | storage: 45 | root: 46 | size: 40Gi 47 | storageClassName: block-nvme-ord1 48 | ephemeral: true 49 | source: 50 | pvc: 51 | namespace: tenant-example # Replace with your namespace 52 | name: image-rox 53 | # Change user name and password 54 | # User is on the sudoers list 55 | # users: 56 | # - username: SET YOUR USERNAME HERE 57 | # password: SET YOUR PASSWORD HERE 58 | # To use key-based authentication replace and uncomment ssh-rsa below with your public ssh key 59 | # sshpublickey: | 60 | # ssh-rsa AAAAB3NzaC1yc2EAAAA ... user@hostname 61 | network: 62 | public: false 63 | -------------------------------------------------------------------------------- /virtual-server/examples/kubectl/virtual-server-shared-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: vs-shared-pvc 5 | spec: 6 | accessModes: 7 | - ReadWriteMany 8 | storageClassName: shared-nvme-ord1 9 | volumeMode: Filesystem 10 | resources: 11 | requests: 12 | storage: 20Gi 13 | --- 14 | apiVersion: virtualservers.coreweave.com/v1alpha1 15 | kind: VirtualServer 16 | metadata: 17 | name: vs-ubuntu2004-shared-pvc 18 | spec: 19 | region: ORD1 20 | os: 21 | type: linux 22 | resources: 23 | gpu: 24 | type: Quadro_RTX_4000 25 | count: 1 26 | cpu: 27 | count: 3 28 | memory: 16Gi 29 | storage: 30 | root: 31 | size: 40Gi 32 | storageClassName: block-nvme-ord1 33 | source: 34 | pvc: 35 | namespace: vd-images 36 | name: ubuntu2004-nvidia-510-47-03-1-docker-master-20220517-ord1 37 | filesystems: 38 | - name: shared-pvc 39 | spec: 40 | persistentVolumeClaim: 41 | claimName: vs-shared-pvc 42 | # Change user name and password 43 | # User is on the sudoers list 44 | # users: 45 | # - username: SET YOUR USERNAME HERE 46 | # password: SET YOUR PASSWORD HERE 47 | # To use key-based authentication replace and uncomment ssh-rsa below with your public ssh key 48 | # sshpublickey: | 49 | # ssh-rsa AAAAB3NzaC1yc2EAAAA ...
user@hostname 50 | network: 51 | public: true 52 | tcp: 53 | ports: 54 | - 22 55 | cloudInit: | 56 | # Write a simple script 57 | write_files: 58 | - content: | 59 | #!/bin/bash 60 | echo "Hello world!" 61 | path: /home/myuser/script.sh 62 | permissions: '0744' 63 | owner: myuser:myuser 64 | # Update packages 65 | package_update: true 66 | # Install packages 67 | packages: 68 | - curl 69 | - git 70 | # Run additional commands 71 | runcmd: 72 | - [df, -h] 73 | - [git, version] 74 | - [curl, --version ] 75 | - [bash, /home/myuser/script.sh] 76 | initializeRunning: true 77 | -------------------------------------------------------------------------------- /virtual-server/examples/kubectl/virtual-server-static-mac.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: virtualservers.coreweave.com/v1alpha1 2 | kind: VirtualServer 3 | metadata: 4 | name: example-vs-static-mac 5 | spec: 6 | region: ORD1 7 | os: 8 | type: linux 9 | resources: 10 | cpu: 11 | count: 2 12 | type: amd-epyc-rome 13 | memory: 2Gi 14 | storage: 15 | root: 16 | size: 40Gi 17 | storageClassName: block-nvme-ord1 18 | source: 19 | pvc: 20 | namespace: vd-images 21 | name: ubuntu2004-docker-master-20220103-ord1 22 | # Change user name and pasword 23 | # User is on the sudoers list 24 | # users: 25 | # - username: SET YOUR USERNAME HERE 26 | # password: SET YOUR PASSWORD HERE 27 | # To use key-based authentication replace and uncomment ssh-rsa below with your public ssh key 28 | # sshpublickey: | 29 | # ssh-rsa AAAAB3NzaC1yc2EAAAA ... user@hostname 30 | network: 31 | macAddress: A2-1F-EE-09-06-5D 32 | public: true 33 | tcp: 34 | ports: 35 | - 22 36 | initializeRunning: true 37 | 38 | -------------------------------------------------------------------------------- /virtual-server/examples/kubectl/virtual-server-windows-cpu-only.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: virtualservers.coreweave.com/v1alpha1 2 | kind: VirtualServer 3 | metadata: 4 | name: vs-windows-cpu 5 | spec: 6 | region: LAS1 7 | os: 8 | type: windows 9 | resources: 10 | cpu: 11 | # Reference CPU instance label selectors here: 12 | # https://docs.coreweave.com/resources/resource-based-pricing#cpu-only-instance-resource-pricing 13 | type: amd-epyc-rome 14 | count: 3 15 | memory: 16Gi 16 | storage: 17 | root: 18 | size: 80Gi 19 | storageClassName: block-nvme-las1 20 | source: 21 | pvc: 22 | namespace: vd-images 23 | # Reference querying source image here: 24 | # https://docs.coreweave.com/virtual-servers/root-disk-lifecycle-management/exporting-coreweave-images-to-a-writable-pvc#identifying-source-image 25 | name: winserver2019std-master-20210813-las1 26 | # Change user name and pasword 27 | # users: 28 | # - username: SET YOUR USERNAME HERE 29 | # password: SET YOUR PASSWORD HERE 30 | network: 31 | public: true 32 | tcp: 33 | ports: 34 | - 443 35 | - 60443 36 | - 4172 37 | - 3389 38 | udp: 39 | ports: 40 | - 4172 41 | - 3389 42 | initializeRunning: true 43 | -------------------------------------------------------------------------------- /virtual-server/examples/kubectl/virtual-server-windows-internal-ip-only.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: virtualservers.coreweave.com/v1alpha1 2 | kind: VirtualServer 3 | metadata: 4 | name: vs-windows-internal 5 | spec: 6 | region: ORD1 7 | os: 8 | type: windows 9 | resources: 10 | cpu: 11 | # Reference CPU instance label selectors here: 12 | # 
https://docs.coreweave.com/resources/resource-based-pricing#cpu-only-instance-resource-pricing 13 | type: amd-epyc-rome 14 | count: 4 15 | memory: 16Gi 16 | storage: 17 | root: 18 | size: 80Gi 19 | storageClassName: block-nvme-ord1 20 | source: 21 | pvc: 22 | namespace: vd-images 23 | # Reference querying source image here: 24 | # https://docs.coreweave.com/virtual-servers/root-disk-lifecycle-management/exporting-coreweave-images-to-a-writable-pvc#identifying-source-image 25 | name: winserver2019std-master-20210819-ord1 26 | # Change user name and pasword 27 | # users: 28 | # - username: SET YOUR USERNAME HERE 29 | # password: SET YOUR PASSWORD HERE 30 | network: 31 | directAttachLoadBalancerIP: true 32 | public: false 33 | initializeRunning: true -------------------------------------------------------------------------------- /virtual-server/examples/kubectl/virtual-server-windows.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: virtualservers.coreweave.com/v1alpha1 2 | kind: VirtualServer 3 | metadata: 4 | name: vs-windows10 5 | spec: 6 | region: LAS1 7 | os: 8 | type: windows 9 | resources: 10 | gpu: 11 | type: Quadro_RTX_4000 12 | count: 1 13 | cpu: 14 | count: 3 15 | memory: 16Gi 16 | storage: 17 | root: 18 | size: 80Gi 19 | storageClassName: block-nvme-las1 20 | source: 21 | pvc: 22 | namespace: vd-images 23 | # Reference querying source image here: 24 | # https://docs.coreweave.com/virtual-servers/root-disk-lifecycle-management/exporting-coreweave-images-to-a-writable-pvc#identifying-source-image 25 | name: win10-master-20210722-las1 26 | # Change user name and pasword 27 | # users: 28 | # - username: SET YOUR USERNAME HERE 29 | # password: SET YOUR PASSWORD HERE 30 | network: 31 | public: true 32 | tcp: 33 | ports: 34 | - 443 35 | - 60443 36 | - 4172 37 | - 3389 38 | udp: 39 | ports: 40 | - 4172 41 | - 3389 42 | initializeRunning: true 43 | -------------------------------------------------------------------------------- /virtual-server/examples/kubectl/virtual-server.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: virtualservers.coreweave.com/v1alpha1 2 | kind: VirtualServer 3 | metadata: 4 | name: example-vs 5 | spec: 6 | region: ORD1 7 | os: 8 | type: linux 9 | resources: 10 | gpu: 11 | type: Quadro_RTX_4000 12 | count: 1 13 | cpu: 14 | count: 4 15 | memory: 16Gi 16 | storage: 17 | root: 18 | size: 40Gi 19 | storageClassName: block-nvme-ord1 20 | source: 21 | pvc: 22 | namespace: vd-images 23 | name: ubuntu2004-nvidia-510-47-03-1-docker-master-20220421-ord1 24 | # Change user name and pasword 25 | # User is on the sudoers list 26 | # users: 27 | # - username: SET YOUR USERNAME HERE 28 | # password: SET YOUR PASSWORD HERE 29 | # To use key-based authentication replace and uncomment ssh-rsa below with your public ssh key 30 | # sshpublickey: | 31 | # ssh-rsa AAAAB3NzaC1yc2EAAAA ... 
user@hostname 32 | network: 33 | public: true 34 | tcp: 35 | ports: 36 | - 22 37 | -------------------------------------------------------------------------------- /virtual-server/examples/nodejs/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "virtual-server-client-example", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "client.js", 6 | "scripts": { 7 | "start": "node main.js" 8 | }, 9 | "author": "Yitzy Dier", 10 | "license": "ISC", 11 | "dependencies": { 12 | "kubernetes-client": "^9.0.0" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /virtual-server/examples/nodejs/util.js: -------------------------------------------------------------------------------- 1 | // Validates whether a quantity is a valid k8s resource.Quantity 2 | const k8sValidateQuantity = (size) => /^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$/.test(size) 3 | // Create a new blank VirtualServer Manifest object 4 | const newVirtualServerManifest = ({name, namespace}) => ({ 5 | apiVersion: "virtualservers.coreweave.com/v1alpha1", 6 | kind: "VirtualServer", 7 | metadata: { 8 | name, 9 | namespace 10 | }, 11 | spec: { 12 | affinity: {}, 13 | region: "", 14 | os: { 15 | definition: "a", 16 | type: "" 17 | }, 18 | resources: { 19 | definition: "a", 20 | gpu: { 21 | type: null, 22 | count: null, 23 | }, 24 | cpu: { 25 | type: null, 26 | count: null, 27 | }, 28 | memory: "" 29 | }, 30 | storage: { 31 | root: { 32 | size: "", 33 | source: { 34 | pvc: { 35 | namespace: "", 36 | name: "" 37 | }, 38 | storageClassName: "", 39 | volumeMode: null, 40 | accessMode: null 41 | } 42 | }, 43 | additionalDisks: [ 44 | 45 | ], 46 | filesystems: [ 47 | 48 | ], 49 | swap: null 50 | }, 51 | users: [ 52 | 53 | ], 54 | network: { 55 | tcp: [], 56 | udp: [], 57 | directAttachLoadBalancerIP: false, 58 | floatingIPs: [] 59 | }, 60 | initializeRunning: false 61 | } 62 | }) 63 | 64 | module.exports = { 65 | k8sValidateQuantity, 66 | newVirtualServerManifest 67 | } -------------------------------------------------------------------------------- /virtual-server/examples/python/.gitignore: -------------------------------------------------------------------------------- 1 | .*/ 2 | __pycache__/ 3 | -------------------------------------------------------------------------------- /virtual-server/examples/python/README.md: -------------------------------------------------------------------------------- 1 | # Python example 2 | 3 | An example Python implementation of a Kubernetes client that interacts with the CoreWeave VirtualServer resource as well as the Kubevirt subresource API. 4 | 5 | The Python example illustrates the following: 6 | 1. Removes an existing Virtual Server. 7 | 2. Creates a new Virtual Server based on the `my_virtualserver` example configuration. 8 | 3. Waits for the Virtual Server to reach a ready status. 9 | 4. Stops the Virtual Server instance and waits until it is stopped. 10 | 5. Deletes the Virtual Server instance. 11 | 12 | To work around unresolved issues with resource paths in the native Python client for Kubevirt (https://github.com/kubevirt/client-python), we introduced the `KubeVirtClient` class for basic operations on Kubevirt VirtualMachine resources. 13 | 14 | The `VSClient` class performs basic operations on the Virtual Server resource.
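For orientation before the install and run steps below, here is a condensed sketch of how these classes are driven - the same calls `main.py` makes. The `my_virtualserver` manifest dict is elided here; it is defined in full in `main.py`.

```python
import os

from vsclient import VSClient

# Same defaults as main.py
name = 'my-virtual-server'
namespace = os.environ.get('NAMESPACE', 'default')

vsclient = VSClient()

# Create the Virtual Server from a manifest dict (see `my_virtualserver` in main.py),
# then block until it reports a Ready status
vsclient.create(my_virtualserver)
print(vsclient.ready(namespace, name))

# Stop the underlying Virtual Machine through the Kubevirt subresource API
vsclient.kubevirt_api.stop(namespace, name)
print(vsclient.ready(namespace, name, expected_state='Stopped'))

# Clean up
vsclient.delete(namespace, name)
```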
15 | 16 | ## Install 17 | 18 | ``` 19 | virtualenv -p python3 .venv && source ./.venv/bin/activate 20 | pip install kubernetes 21 | ``` 22 | 23 | ## Run 24 | 25 | ``` 26 | Be sure to set secure credentials for your USERNAME and PASSWORD, as they will be used to create a user in your Virtual Server 27 | USERNAME= PASSWORD= NAMESPACE= KUBECONFIG=$HOME/.kube/config python3 main.py 28 | ``` 29 | -------------------------------------------------------------------------------- /virtual-server/examples/python/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import sys 4 | 5 | from kubernetes.client.rest import ApiException 6 | from vsclient import VSClient 7 | 8 | name = 'my-virtual-server' 9 | namespace = os.environ.get('NAMESPACE', 'default') 10 | username = os.environ.get('USERNAME') 11 | password = os.environ.get('PASSWORD') 12 | 13 | if username == None or password == None: 14 | print('USERNAME and PASSWORD environment variables are required') 15 | sys.exit() 16 | 17 | my_virtualserver = { 18 | 'apiVersion': f'{VSClient.GROUP}/{VSClient.VERSION}', 19 | 'kind': 'VirtualServer', 20 | 'metadata': {'name': name, 'namespace': namespace}, 21 | 'spec': { 22 | 'region': 'ORD1', # ord1, ewr1, ewr2 23 | 'os': { 24 | 'type': 'linux', 25 | }, 26 | 'resources': { 27 | 'gpu': { 28 | 'type': 'Quadro_RTX_4000', 29 | 'count': 1 30 | }, 31 | 'cpu': { 32 | # GPU type and CPU type are mutually exclusive i.e. CPU type cannot be specified when GPU type is selected. 33 | # CPU is selected automatically based on GPU type. 34 | # 'type': 'amd-epyc-rome', 35 | 'count': 2, 36 | }, 37 | 'memory': '16Gi' 38 | }, 39 | # Add user 40 | # SSH public key is optional and allows to login without a password 41 | # Public key is located in $HOME/.ssh/id_rsa.pub 42 | # publicKey = `ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDEQCQpab6UWuA ... user@hostname` 43 | 'users': [ 44 | { 45 | 'username': username, 46 | 'password': password, 47 | # SSHPublicKey: publicKey 48 | } 49 | ], 50 | # Add cloud config 51 | # more examples on https://cloudinit.readthedocs.io/en/latest/topics/examples.html 52 | 'cloudInit': """ 53 | # Update packages 54 | package_update: true 55 | # Install packages 56 | packages: 57 | - curl 58 | - git 59 | # Run additional commands 60 | runcmd: 61 | - [df, -h] 62 | - [git, version] 63 | - [curl, --version ] 64 | """, 65 | 'storage': { 66 | 'root': { 67 | 'size': '40Gi', 68 | 'source': { 69 | 'pvc': { 70 | 'name': 'ubuntu2004-nvidia-515-86-01-1-docker-master-20221205-ord1', 71 | 'namespace': 'vd-images' 72 | } 73 | }, 74 | 'storageClassName': 'block-nvme-ord1', 75 | 'volumeMode': 'Block', 76 | 'accessMode': 'ReadWriteOnce' 77 | } 78 | }, 79 | 'network': { 80 | 'tcp': { 81 | 'ports': [22, 443, 60443, 4172, 3389] 82 | }, 83 | 'udp': { 84 | 'ports': [4172, 3389] 85 | } 86 | }, 87 | 'initializeRunning': True 88 | } 89 | } 90 | 91 | 92 | vsclient = VSClient() 93 | 94 | try: 95 | vsclient.delete(namespace, name) 96 | except ApiException as e: 97 | if e.status == 404: 98 | print(f'VirtualServer {name} in namespace {namespace} already deleted') 99 | else: 100 | print(f'VirtualServer delete exception {e}') 101 | exit(1) 102 | 103 | # Create virtual server 104 | print(vsclient.create(my_virtualserver)) 105 | print(f'VirtualServer status: {vsclient.ready(namespace, name)}') 106 | 107 | # Stop the Virtual Machine Instance to apply changes. 
108 | print(vsclient.kubevirt_api.stop(namespace, name)) 109 | print(f'VirtualServer status: {vsclient.ready(namespace, name, expected_state="Stopped")}') 110 | 111 | # Update the manifest and attach directly to Load Balancer 112 | my_virtualserver['spec']['network']['tcp']['ports'] = [] 113 | my_virtualserver['spec']['network']['udp']['ports'] = [] 114 | my_virtualserver['spec']['network']['directAttachLoadBalancerIP'] = True 115 | print(vsclient.update(my_virtualserver)) 116 | 117 | print(vsclient.kubevirt_api.start(namespace, name)) 118 | print(f'VirtualServer status: {vsclient.ready(namespace, name)}') 119 | 120 | # Delete virtual server 121 | vsclient.delete(namespace, name) 122 | 123 | exit(0) 124 | -------------------------------------------------------------------------------- /virtual-server/examples/terraform/README.md: -------------------------------------------------------------------------------- 1 | # Deploying Virtual Servers to Kubernetes with Terraform 2 | 3 | This [Terraform](https://terraform.io) module uses the [kubernetes provider](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs) to deploy `VirtualServers` to [CoreWeave Cloud](https://coreweave.com). 4 | 5 | ## Setup 6 | 7 | This module requires your `user_namespace`, `kubeconfig_path`, your desired desktop `vs_username` (and you can optionally supply `vs_password`, or set `vs_generate_password` to `true`), `vs_image` (defaults to Ubuntu 20.04), `vs_gpu_enable` (and `vs_gpu_count`), and your desired `vs_name` to set your system hostname. 8 | 9 | ## Installation 10 | 11 | Run: 12 | 13 | ```bash 14 | terraform plan 15 | terraform apply -auto-approve 16 | ```17 | 18 | This module will output the network and credential information for the system, consumable by another module via the `vs_network` and `vs_password` outputs. 19 | 20 | ## Examples 21 | 22 | In the `examples/` directory is a sample Terraform plan that demonstrates consuming the module to create two Virtual Server instances.
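The variables noted under Setup can also be supplied through a `terraform.tfvars` file instead of `-var` flags; a minimal sketch, where every value below is a placeholder to replace with your own:

```hcl
kubeconfig_path      = "~/.kube/config"
user_namespace       = "tenant-example"
vs_name              = "my-vs"
vs_username          = "myuser"
vs_generate_password = true
```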
23 | -------------------------------------------------------------------------------- /virtual-server/examples/terraform/examples/module-use.tf: -------------------------------------------------------------------------------- 1 | variable "kubeconfig_path" {} 2 | variable "vs_name" {} 3 | variable "vs_username" {} 4 | variable "vs_generate_password" { 5 | default = true 6 | } 7 | variable "user_namespace" {} 8 | 9 | module "virtualserver_1" { 10 | 11 | source = "../" 12 | 13 | kubeconfig_path = var.kubeconfig_path 14 | vs_name = "hostOne" 15 | vs_username = "onePerson" 16 | vs_generate_password = var.vs_generate_password 17 | user_namespace = var.user_namespace 18 | } 19 | 20 | module "virtualserver_2" { 21 | 22 | source = "../" 23 | 24 | kubeconfig_path = var.kubeconfig_path 25 | vs_name = "hostTwo" 26 | vs_username = "secondPerson" 27 | vs_generate_password = var.vs_generate_password 28 | user_namespace = var.user_namespace 29 | } 30 | 31 | output "vs_one_info" { 32 | value = module.virtualserver_1.vs_network 33 | } 34 | 35 | output "vs_two_info" { 36 | value = module.virtualserver_2.vs_network 37 | } 38 | -------------------------------------------------------------------------------- /virtual-server/examples/terraform/main.tf: -------------------------------------------------------------------------------- 1 | provider "kubernetes" { 2 | config_path = var.kubeconfig_path 3 | } 4 | 5 | provider "kubernetes-alpha" { 6 | config_path = var.kubeconfig_path 7 | } 8 | 9 | resource "random_string" "vs_generate_password" { 10 | count = var.vs_generate_password ? 1 : 0 11 | length = 16 12 | special = true 13 | override_special = "_%@" 14 | } 15 | -------------------------------------------------------------------------------- /virtual-server/examples/terraform/outputs.tf: -------------------------------------------------------------------------------- 1 | output "vs_network" { 2 | value = data.kubernetes_service.vs_loadbalancer.status.0.load_balancer.0.ingress.0.ip 3 | } 4 | 5 | output "vs_password" { 6 | value = var.vs_generate_password ? 
random_string.vs_generate_password[0].result : var.vs_password 7 | } 8 | -------------------------------------------------------------------------------- /virtual-server/examples/terraform/variables.tf: -------------------------------------------------------------------------------- 1 | variable "kubeconfig_path" { 2 | description = "Path to kubeconfig" 3 | default = "~/.kube/config" 4 | } 5 | 6 | variable "user_namespace" { 7 | description = "Namespace Virtual Server will be installed to" 8 | } 9 | 10 | variable "vs_name" { 11 | description = "Virtual Server hostname" 12 | default = "MY-VS" 13 | } 14 | 15 | variable "vs_username" { 16 | description = "Virtual Server username" 17 | } 18 | 19 | variable "vs_password" { 20 | type = string 21 | default = "null" 22 | description = "User provided password (vs_generate_password must be set to false)" 23 | } 24 | 25 | variable "vs_generate_password" { 26 | type = bool 27 | default = true 28 | description = "Generate password" 29 | } 30 | 31 | variable "vs_memory" { 32 | description = "Virtual Server RAM" 33 | default = "16Gi" 34 | } 35 | 36 | variable "vs_root_storage" { 37 | description = "Virtual Server root device storage (i.e 80Gi)" 38 | default = "80Gi" 39 | } 40 | 41 | variable "vs_os_type" { 42 | default = "linux" 43 | } 44 | 45 | variable "vs_image" { 46 | description = "OS image" 47 | default = "ubuntu2004-docker-master-20210601-ord1" 48 | } 49 | 50 | variable "vs_gpu" { 51 | description = "GPU" 52 | default = "Quadro_RTX_4000" 53 | } 54 | 55 | variable "vs_gpu_enable" { 56 | default = true 57 | } 58 | 59 | variable "vs_cpu_count" { 60 | default = 3 61 | } 62 | 63 | variable "vs_gpu_count" { 64 | default = 1 65 | } 66 | 67 | variable "vs_region" { 68 | description = "Region default from vs_regions map" 69 | default = "ORD1" 70 | } 71 | 72 | variable "vs_running" { 73 | description = "Running virtual server on provisioning" 74 | default = true 75 | } 76 | 77 | variable "vs_public_networking" { 78 | default = true 79 | } 80 | 81 | variable "vs_attach_loadbalancer" { 82 | description = "Attach Service LoadBalancer IP directly to VS (vs_tcp_ports and vs_udp_ports must be empty)." 83 | default = false 84 | } 85 | 86 | variable "vs_tcp_ports" { 87 | type = list(any) 88 | default = [22, 443, 60443, 4172, 3389] 89 | } 90 | 91 | variable "vs_udp_ports" { 92 | type = list(any) 93 | default = [4172, 3389] 94 | } 95 | -------------------------------------------------------------------------------- /virtual-server/examples/terraform/vs.tf: -------------------------------------------------------------------------------- 1 | resource "kubernetes_manifest" "virtualserver" { 2 | provider = kubernetes-alpha 3 | 4 | manifest = { 5 | "apiVersion" = "virtualservers.coreweave.com/v1alpha1" 6 | "kind" = "VirtualServer" 7 | "metadata" = { 8 | "name" = var.vs_name 9 | "namespace" = var.user_namespace 10 | } 11 | "spec" = { 12 | "initializeRunning" = var.vs_running 13 | "network" = { 14 | "directAttachLoadBalancerIP" = var.vs_attach_loadbalancer 15 | "public" = var.vs_public_networking 16 | "tcp" = { 17 | "ports" = var.vs_tcp_ports 18 | } 19 | "udp" = { 20 | "ports" = var.vs_udp_ports 21 | } 22 | } 23 | "os" = { 24 | "type" = var.vs_os_type 25 | } 26 | "region" = var.vs_region 27 | "resources" = { 28 | "cpu" = { 29 | "count" = var.vs_cpu_count 30 | } 31 | "gpu" = { 32 | "count" = var.vs_gpu_count 33 | "type" = var.vs_gpu_enable ? 
var.vs_gpu : "Quadro_RTX_4000" 34 | } 35 | "memory" = var.vs_memory 36 | } 37 | "storage" = { 38 | "root" = { 39 | "size" = var.vs_root_storage 40 | "source" = { 41 | "pvc" = { 42 | "name" = var.vs_image 43 | "namespace" = "vd-images" 44 | } 45 | } 46 | "storageClassName" = "block-nvme-${var.vs_region}" 47 | } 48 | } 49 | "users" = [ 50 | { 51 | "username" = var.vs_username 52 | "password" = var.vs_generate_password ? random_string.vs_generate_password[0].result : var.vs_password 53 | }, 54 | ] 55 | 56 | } 57 | } 58 | } 59 | 60 | data "kubernetes_service" "vs_loadbalancer" { 61 | depends_on = [kubernetes_manifest.virtualserver] 62 | metadata { 63 | name = "${var.vs_name}-tcp" 64 | namespace = var.user_namespace 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /virtual-server/pvc-clone.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Clone the disk PVC for a given VM instance to a new PVC. 3 | set -e -o pipefail -u 4 | 5 | if [ $# -ne 2 ]; then 6 | echo "Usage: $0 " 7 | exit 1 8 | fi 9 | 10 | SRC="$1" 11 | DST="$2" 12 | 13 | 14 | get_field() { 15 | kubectl get $1 $2 -o=jsonpath='{'"$3"'}' 16 | } 17 | 18 | if kubectl get vmi $SRC &>/dev/null; then 19 | echo "Found running VM instance: $SRC" 20 | read -p "Stop it? [y/N] " STOP 21 | 22 | if [ "$(get_field vmi $SRC ".metadata.annotations.vs\.coreweave\.com/vmi")" == "true" ]; then 23 | SRC_PVC=$(get_field vmi $SRC ".spec.volumes..dataVolume.name") 24 | else 25 | SRC_PVC=$(get_field vmi $SRC ".spec.volumes[?(@.name=='dv')].persistentVolumeClaim.claimName") 26 | fi 27 | 28 | if [[ "$STOP" =~ ^[yY]$ ]]; then 29 | virtctl stop $SRC 30 | 31 | echo -n "Waiting for $SRC to stop..." 32 | while kubectl get vmi $SRC &>/dev/null; do 33 | sleep 1 34 | echo -n "." 35 | done 36 | echo " stopped." 37 | else 38 | echo "ERROR: cannot clone pvc of a running VM" 39 | exit 1 40 | fi 41 | 42 | elif kubectl get pvc $SRC &>/dev/null; then 43 | 44 | SRC_PVC="$SRC" 45 | 46 | else 47 | echo "ERROR: Did not find PVC or VM instance named: $SRC" 48 | exit 1 49 | fi 50 | 51 | SRC_PVC_CLASS=$(get_field pvc $SRC_PVC ".spec.storageClassName") 52 | SRC_PVC_SIZE=$(get_field pvc $SRC_PVC ".spec.resources.requests.storage") 53 | 54 | REGION=${SRC_PVC_CLASS//*-} 55 | 56 | if [ "$REGION" == "replica" ]; then 57 | REGION="ord1" 58 | fi 59 | 60 | DST_PVC="${DST}-$(date '+%Y%m%d')-block-${REGION}" 61 | 62 | cat <