├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── argo-validation-controller ├── Dockerfile ├── README.md ├── argo-validation-controller.go ├── argo-validation-deployment.yaml └── certificates │ ├── README.md │ └── csr.conf ├── crds.yaml ├── deployment.yaml ├── examples └── hparam.yaml ├── garbage-collection ├── README.md ├── garbage-collection-deployment.yaml └── gc_cleanup.py ├── hyperparam-controller ├── README.md ├── __init__.py ├── api.py └── controller.py └── tensorboard-spawner ├── spawner.py ├── tb-deploy ├── Dockerfile └── download.sh ├── tb-deployment.yaml └── tb-service.yaml /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # certs 107 | *.pem 108 | *.csr -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3 2 | 3 | RUN pip install kubernetes Flask 4 | ENV PYTHONUNBUFFERED=0 5 | COPY . 
/app
6 | WORKDIR /app
7 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 GitHub
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Argo-ML
2 |
3 | Controllers, wrappers and miscellaneous utilities that make it easier to use Argo in ML scenarios. There are three major architectural patterns in this repo.
4 |
5 | ## Controller
6 |
7 | A controller, also known as an operator, manages Kubernetes [Custom Resources](https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/custom-resources/), or CRs. Our controllers, written in Python, are built around a main loop over Kubernetes events. An event is emitted every time a resource is added, updated or deleted.
8 | The Kubernetes Python client lets us watch for these events, which provides a convenient way to write management code for regular or custom resources.
9 |
10 | An example loop looks like this:
11 |
12 | ```
13 | group = "mycustomapi"
14 | version = "v1"
15 | plural = "mycustomresource"
16 |
17 | for event in watch.stream(custom_api.list_namespaced_custom_object, group, version, namespace, plural):
18 |     if event['type'] == 'ADDED':
19 |         do_something_when_resource_is_created(event['object'])
20 | ```
21 |
22 |
23 | ## Admission Controller
24 |
25 | Although it is also called a controller, it does not follow the pattern above. [Admission Controllers](https://kubernetes.io/docs/reference/access-authn-authz/admission-controllers/) are API endpoints that are triggered when a user (or the system) attempts to create a particular resource. There are two types of admission controllers:
26 |
27 | * A validating admission controller is a great place to put resource validation. Whenever a resource is created, the Kube API calls this endpoint and expects either a validation success message or an error, which is passed back to the user.
28 | * A mutating admission controller allows us to add custom logic that modifies a resource on creation. For example, we could add a common secret to every Pod (see the sketch below).
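
As a concrete, simplified illustration, the sketch below shows roughly what a validating admission webhook looks like as a small Flask app. It is not part of this repo — the `is_valid` helper and the `/validate` route are placeholders — but the request/response shape follows the `AdmissionReview` objects the Kube API sends and expects back:

```
from flask import Flask, request, jsonify

app = Flask(__name__)


def is_valid(obj):
    # Placeholder validation logic - replace with real checks
    if 'spec' not in obj:
        return False, "resource has no .spec"
    return True, ""


@app.route("/validate", methods=["POST"])
def validate():
    review = request.json
    obj = review['request']['object']
    ok, reason = is_valid(obj)
    response = {'uid': review['request']['uid'], 'allowed': ok}
    if not ok:
        # The Kube API surfaces this message back to the user running kubectl
        response['status'] = {'message': reason}
    # The API server expects the same AdmissionReview back with .response filled in
    review['response'] = response
    return jsonify(review)


if __name__ == '__main__':
    # Real webhooks must be served over HTTPS; plain HTTP is only for local testing
    app.run(host="0.0.0.0", port=8443)
```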
29 |
30 | ## APIs
31 |
32 | Just regular REST APIs.
33 |
34 |
35 | # Components
36 |
37 | ## Argo hyperparam workflow
38 |
39 | This is a controller that takes a `HyperparamWorkflow` custom resource - a resource similar to the original Argo `Workflow`, with new fields that define a hyperparameter search space. The controller then generates a `Workflow` with the list of all hyperparam combinations as a parameter.
40 |
41 | ## Argo validation controller
42 |
43 | Argo workflows have a specific syntax. You can validate it if you create a `Workflow` with the `argo` CLI tool, but that won't be the case for our custom wrapper resources. The Argo validation controller allows us to validate workflow syntax in these wrappers.
44 |
45 | ## Tensorboard spawner
46 |
47 | A small API that takes a workflow name as input, lists its artifacts named `tensorboard` and spawns a Tensorboard instance for them.
48 |
49 | ## Garbage collector
50 |
51 | A small utility tool to delete old pods produced by workflows.
52 |
--------------------------------------------------------------------------------
/argo-validation-controller/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM golang:1.11.5
2 |
3 | LABEL AUTHOR=inc0/awmatheson
4 | LABEL REPO=github/argo-ml
5 |
6 | RUN mkdir -p /go/src/github.com/github/argo-ml
7 | COPY . /go/src/github.com/github/argo-ml
8 | RUN go get github.com/github/argo-ml
9 | RUN go build src/github.com/github/argo-ml/argo-validation-controller.go
10 |
11 | CMD ["./argo-validation-controller"]
12 |
--------------------------------------------------------------------------------
/argo-validation-controller/README.md:
--------------------------------------------------------------------------------
1 | # Argo validation controller
2 |
3 | This is the only (so far) Go application in this repo. The reason for breaking out of the Python standard is to be able to reuse Argo's own validation code. Currently this service validates `HyperparamWorkflow`, but the goal is to extend it to every `Workflow` wrapper CRD we maintain.
4 |
5 | This service provides an HTTPS API that follows the validating admission controller requirements in K8s. After the `ValidatingWebhookConfiguration` (defined in `argo-validation-deployment.yaml`) is added, Kubernetes will call this API and expect a serialized `AdmissionReview` object in return. This review will either allow the resource to be created or deny it and pass the error back to the user.
6 |
7 | ## Deployment
8 |
9 | This service requires HTTPS, so it requires certificates. K8s allows you to sign TLS certs using its internal CA; you can find full instructions in the [certificate README](https://github.com/github/argo-ml/blob/master/argo-validation-controller/certificates/README.md).
10 |
11 | After the certificates are created and saved as a `Secret`, build and push the image from `Dockerfile` and run `kubectl apply -f argo-validation-deployment.yaml` to deploy the service and configure Kubernetes to call it.
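
Before wiring up the webhook configuration, it can be handy to smoke-test the endpoint by hand. The snippet below is only a rough sketch of such a test: it assumes the service has been port-forwarded locally (for example `kubectl -n argo port-forward svc/argo-validation-controller 8443:8443`), and both the minimal `HyperParamWorkflow` body and `verify=False` (the cert is signed by the in-cluster CA) are illustrative choices, not part of this repo:

```
import json
import requests

review = {
    "apiVersion": "admission.k8s.io/v1beta1",
    "kind": "AdmissionReview",
    "request": {
        "uid": "smoke-test-uid",
        "object": {
            "apiVersion": "argoproj.io/v1alpha1",
            "kind": "HyperParamWorkflow",
            "metadata": {"name": "smoke-test"},
            # Deliberately minimal spec - validation is expected to complain
            "spec": {"hyperparams": {}, "algorithm": "grid",
                     "entrypoint": "missing", "templates": []},
        },
    },
}

# verify=False only because the server cert is signed by the in-cluster CA
resp = requests.post("https://localhost:8443/", json=review, verify=False)
print(json.dumps(resp.json()["response"], indent=2))  # allowed true/false plus message
```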
-------------------------------------------------------------------------------- /argo-validation-controller/argo-validation-controller.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "crypto/tls" 6 | "encoding/json" 7 | "errors" 8 | "fmt" 9 | "io/ioutil" 10 | "log" 11 | "net/http" 12 | 13 | wfv1 "github.com/argoproj/argo/pkg/apis/workflow/v1alpha1" 14 | "github.com/argoproj/argo/workflow/validate" 15 | "k8s.io/api/admission/v1beta1" 16 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 17 | ) 18 | 19 | func main() { 20 | sCert, _ := tls.LoadX509KeyPair("/certificates/server-cert.pem", "/certificates/server-key.pem") 21 | srv := &http.Server{ 22 | Addr: ":8443", 23 | Handler: &handler{}, 24 | } 25 | srv.TLSConfig = &tls.Config{ 26 | Certificates: []tls.Certificate{sCert}, 27 | } 28 | log.Print("Starting the service...") 29 | log.Fatal(srv.ListenAndServeTLS("", "")) 30 | } 31 | 32 | type handler struct{} 33 | 34 | func (h *handler) ServeHTTP(w http.ResponseWriter, req *http.Request) { 35 | 36 | b, err := ioutil.ReadAll(req.Body) 37 | defer req.Body.Close() 38 | if err != nil { 39 | http.Error(w, err.Error(), 500) 40 | return 41 | } 42 | 43 | validationError, allowed := handleAdmission(b) 44 | 45 | w.Header().Set("Content-Type", "application/json") 46 | reviewStatus := v1beta1.AdmissionResponse{} 47 | 48 | if allowed { 49 | reviewStatus.Allowed = true 50 | } else { 51 | reviewStatus.Allowed = false 52 | reviewStatus.Result = &metav1.Status{ 53 | Message: fmt.Sprintf("%s", validationError), 54 | } 55 | 56 | } 57 | validationRequest := &v1beta1.AdmissionReview{} 58 | _ = json.Unmarshal(b, validationRequest) 59 | validationRequest.Response = &reviewStatus 60 | output, _ := json.Marshal(validationRequest) 61 | w.Write(output) 62 | } 63 | 64 | func handleAdmission(b []byte) (string, bool) { 65 | log.Printf("Request received: %s", b) 66 | validationRequest := &v1beta1.AdmissionReview{} 67 | err := json.Unmarshal(b, validationRequest) 68 | if err != nil { 69 | return fmt.Sprintf("Error while unmarshalling AdmissionReview: %s", err), false 70 | } 71 | wf, err := getResource(validationRequest) 72 | 73 | if err != nil { 74 | return fmt.Sprintf("Error while generating workflow: %s", err), false 75 | } 76 | 77 | err = validateWF(wf) 78 | 79 | if err != nil { 80 | return fmt.Sprintf("Validation error: %s", err), false 81 | } 82 | return "", true 83 | } 84 | 85 | // function to ping the hparam api 86 | func getResource(validationRequest *v1beta1.AdmissionReview) ([]byte, error) { 87 | hparam, err := json.Marshal(validationRequest.Request.Object) 88 | if err != nil { 89 | log.Printf("Error processing validation request: %s\n", err) 90 | r := []byte("") 91 | return r, err 92 | } 93 | response, err := http.Post("http://argo-hyperparam-controller:5000/workflow", "application/json", bytes.NewBuffer(hparam)) 94 | if err != nil { 95 | r := []byte("") 96 | return r, err 97 | } else if response.StatusCode != 200 { 98 | resp, _ := ioutil.ReadAll(response.Body) 99 | err = errors.New(fmt.Sprintf("The HTTP request code is not 200: %s\n", resp)) 100 | r := []byte("") 101 | return r, err 102 | } else { 103 | data, _ := ioutil.ReadAll(response.Body) 104 | return data, nil 105 | } 106 | } 107 | 108 | // function to validate the results using argoproj validation code 109 | func validateWF(jsonStr []byte) error { 110 | wf := &wfv1.Workflow{} 111 | err := json.Unmarshal(jsonStr, wf) 112 | if err != nil { 113 | return err 114 | } 115 | return 
validate.ValidateWorkflow(wf, validate.ValidateOpts{}) 116 | } 117 | -------------------------------------------------------------------------------- /argo-validation-controller/argo-validation-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: argo-validation-controller 5 | namespace: argo 6 | spec: 7 | ports: 8 | - name: webhook 9 | port: 8443 10 | targetPort: 8443 11 | selector: 12 | name: argo-validation-controller 13 | --- 14 | apiVersion: apps/v1beta1 15 | kind: Deployment 16 | metadata: 17 | name: argo-validation-controller 18 | namespace: argo 19 | labels: 20 | name: argo-validation-controller 21 | spec: 22 | replicas: 3 23 | template: 24 | metadata: 25 | name: argo-validation-controller 26 | labels: 27 | name: argo-validation-controller 28 | spec: 29 | containers: 30 | - name: webhook 31 | image: analytics-kubelet-055005d.private-us-east-1.github.net:30550/argo-validation:latest 32 | imagePullPolicy: Never 33 | volumeMounts: 34 | - name: webhook-certs 35 | mountPath: /certificates/ 36 | readOnly: true 37 | securityContext: 38 | readOnlyRootFilesystem: true 39 | ports: 40 | - name: http 41 | containerPort: 8443 42 | protocol: TCP 43 | volumes: 44 | - name: webhook-certs 45 | secret: 46 | secretName: argo-validation-certs 47 | --- 48 | kind: ValidatingWebhookConfiguration 49 | metadata: 50 | name: scheduling-admission 51 | webhooks: 52 | - name: argo-validation-controller.argo 53 | rules: 54 | - apiGroups: 55 | - "argoproj.io" 56 | apiVersions: 57 | - v1alpha1 58 | operations: 59 | - CREATE 60 | resources: 61 | - hyperparamworkflows 62 | failurePolicy: Ignore 63 | clientConfig: 64 | service: 65 | name: argo-validation-controller 66 | namespace: argo 67 | path: "/" 68 | caBundle: ${CA_BUNDLE} -------------------------------------------------------------------------------- /argo-validation-controller/certificates/README.md: -------------------------------------------------------------------------------- 1 | # Generating certificates 2 | 3 | ## Prepare csr.conf 4 | 5 | Important - change namespace and name of services 6 | 7 | ## Generate certificates 8 | 9 | ``` 10 | openssl genrsa -out server-key.pem 2048 11 | openssl req -new -key server-key.pem -subj "/CN=argo-validation-controller.argo.svc.cluster.local" -out server.csr -config csr.conf 12 | ``` 13 | 14 | ## Prepare CertificateSigningRequest 15 | 16 | ``` 17 | CSR=$(cat server.csr | base64 | tr -d '\n') 18 | ``` 19 | 20 | ``` 21 | cat <> 39 | ``` 40 | 41 | ## Approve CSR 42 | 43 | ``` 44 | kubectl certificate approve argo-validation-controller 45 | ``` 46 | 47 | Get your approved certificate 48 | 49 | ``` 50 | serverCert=$(kubectl get csr argo-validation-controller -o jsonpath='{.status.certificate}') 51 | ``` 52 | 53 | Create file with approved cert 54 | 55 | ``` 56 | echo ${serverCert} | openssl base64 -d -A -out ${tmpdir}/server-cert.pem 57 | ``` 58 | 59 | ## Prepare manifest.yaml 60 | 61 | You need Kubernetes CA certificate for it 62 | 63 | ``` 64 | CA_BUNDLE=$(kubectl get configmap -n kube-system extension-apiserver-authentication -o=jsonpath='{.data.client-ca-file}' | base64 | tr -d '\n') 65 | ``` 66 | 67 | Paste contents of `$CA_BUNDLE` to manifest.yaml 68 | 69 | ``` 70 | kind: ValidatingWebhookConfiguration 71 | metadata: 72 | name: scheduling-admission 73 | webhooks: 74 | - name: argo-validation-controller.argo 75 | rules: 76 | - apiGroups: 77 | - "argoproj.io" 78 | apiVersions: 79 | - v1alpha1 80 | operations: 81 | 
- CREATE 82 | resources: 83 | - hyperparamworkflows 84 | failurePolicy: Ignore 85 | clientConfig: 86 | service: 87 | name: argo-validation-controller 88 | namespace: argo 89 | path: "/" 90 | caBundle: ${CA_BUNDLE} 91 | ``` 92 | 93 | ## set up secrets 94 | 95 | `kubectl -n argo create secret generic argo-validation-certs --from-file="server-cert.pem" --from-file="server-key.pem"` 96 | -------------------------------------------------------------------------------- /argo-validation-controller/certificates/csr.conf: -------------------------------------------------------------------------------- 1 | [req] 2 | req_extensions = v3_req 3 | distinguished_name = req_distinguished_name 4 | [req_distinguished_name] 5 | [ v3_req ] 6 | basicConstraints = CA:FALSE 7 | keyUsage = nonRepudiation, digitalSignature, keyEncipherment 8 | extendedKeyUsage = serverAuth 9 | subjectAltName = @alt_names 10 | [alt_names] 11 | DNS.1 = argo-validation-controller 12 | DNS.2 = argo-validation-controller.argo 13 | DNS.3 = argo-validation-controller.argo.svc 14 | DNS.4 = argo-validation-controller.argo.svc.cluster.local 15 | -------------------------------------------------------------------------------- /crds.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: apiextensions.k8s.io/v1beta1 3 | kind: CustomResourceDefinition 4 | metadata: 5 | name: hyperparamworkflows.argoproj.io 6 | spec: 7 | group: argoproj.io 8 | version: v1alpha1 9 | scope: Namespaced 10 | names: 11 | plural: hyperparamworkflows 12 | kind: HyperParamWorkflow 13 | singular: hyperparamworkflow 14 | shortNames: 15 | - hparam 16 | -------------------------------------------------------------------------------- /deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: argo-hyperparam-controller 5 | namespace: argo 6 | spec: 7 | ports: 8 | - name: webhook 9 | port: 5000 10 | targetPort: 5000 11 | selector: 12 | name: argo-hyperparam-controller 13 | --- 14 | apiVersion: apps/v1 15 | kind: Deployment 16 | metadata: 17 | name: argo-hyperparam-controller 18 | namespace: argo 19 | labels: 20 | app: argo-hyperparam-controller 21 | spec: 22 | replicas: 1 23 | selector: 24 | matchLabels: 25 | app: argo-hyperparam-controller 26 | template: 27 | metadata: 28 | labels: 29 | app: argo-hyperparam-controller 30 | spec: 31 | containers: 32 | - name: argo-hyperparam-controller 33 | image: analytics-kubelet-055005d.private-us-east-1.github.net:30550/argo-ml:latest 34 | command: ["python", "hyperparam-controller/controller.py"] 35 | - name: argo-hyperparam-validation 36 | image: analytics-kubelet-055005d.private-us-east-1.github.net:30550/argo-ml:latest 37 | command: ["python", "hyperparam-controller/api.py"] 38 | ports: 39 | - name: http 40 | containerPort: 5000 41 | protocol: TCP -------------------------------------------------------------------------------- /examples/hparam.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: argoproj.io/v1alpha1 3 | kind: HyperParamWorkflow 4 | metadata: 5 | name: example-hparam-sweep 6 | spec: 7 | hyperparams: 8 | # This setup will create 12 models - LR between 0.1 and 0.5 x 3 types of models 9 | learning-rate: 10 | range: # Ranges will start from min and go to max with step 11 | min: 0.1 12 | max: 0.5 13 | step: 0.1 14 | model: 15 | values: # Values will iterate over flat list 16 | - RandomForest 17 | - SVM 18 | - 
LogisticRegression 19 | algorithm: grid 20 | 21 | entrypoint: hparam-example 22 | templates: 23 | - name: hparam-example 24 | parallelism: 3 # This will allow only 3 nodes to run at same time, good for resource conservation 25 | steps: 26 | - - name: train 27 | template: train 28 | arguments: 29 | parameters: 30 | - {name: learning-rate, value: "{{item.learning-rate}}"} 31 | - {name: model, value: "{{item.model}}"} 32 | withParam: "{{workflow.parameters.hyperparams}}" 33 | 34 | - name: train 35 | inputs: 36 | parameters: 37 | - name: learning-rate 38 | - name: model 39 | container: 40 | image: docker/whalesay:latest 41 | command: [sh, -c] 42 | args: ["cowsay $LR"] 43 | resources: 44 | requests: 45 | nvidia.com/gpu: 1 # requesting 1 GPU 46 | limits: 47 | nvidia.com/gpu: 1 48 | env: 49 | - name: LR 50 | value: "{{inputs.parameters.learning-rate}}" -------------------------------------------------------------------------------- /garbage-collection/README.md: -------------------------------------------------------------------------------- 1 | ## Garbage Collection Argo ML 2 | 3 | When using argo for Machine Learning, you can run into a problem of a number of pods left after successfully competing. You may have different workflow types that require the pods to persist for different lengths of time for whatever reason. This utility allows you deploy an easy cronjob that will clean up old pods depending on the criteria you set. 4 | 5 | Here is an example of deleting all the [scheduled workflows](link to cron workflows) that are over 10 days old 6 | 7 | ```bash 8 | python gc_cleanup.py --label_selector cronWorkflow --max_age_hrs 240 9 | ``` 10 | 11 | And then to clear out all the non-labeled "adhoc" workflows 12 | 13 | ```bash 14 | python gc_cleanup.py --label_selector cronWorkflow --max_age_hrs 240 --adhoc 15 | ``` 16 | **If you do not include the labels and starts_with lists when specifying adhoc, they will be deleted since adhoc is not an actual labeled workflow or type of workflow** 17 | 18 | 19 | optional arguments: 20 | 21 | -n NAMESPACE, --namespace NAMESPACE 22 | The custom resource's namespace. The default is "default" 23 | -grp GROUP, --group GROUP 24 | The custom resource's group name. The default is "argoproj.io" 25 | -version VERSION 26 | The custom resource's version. The default is "v1alpha1" 27 | -p PLURAL, --plural PLURAL 28 | The custom resource's plural name to filter by. for example Workflow would be workflows. The default is "workflows" 29 | --starts_with STARTS_WITH [STARTS_WITH ...] 30 | A list of specific names filtering for workflows that start with 31 | --label_selector LABEL_SELECTOR [LABEL_SELECTOR ...] 32 | A list of labels to filter by 33 | --adhoc 34 | This flag will cause the workflows filtered by the label_selector and starts_with to be ignored if set 35 | --max_age_hrs MAX_AGE_HRS 36 | The maximum age to keep workflows for in hours. 
Default is 168 37 | -------------------------------------------------------------------------------- /garbage-collection/garbage-collection-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1beta1 2 | kind: CronJob 3 | metadata: 4 | name: garbage-collection 5 | spec: 6 | schedule: "@daily" 7 | jobTemplate: 8 | spec: 9 | template: 10 | spec: 11 | containers: 12 | - name: garbage-collection 13 | image: analytics-kubelet-055005d.private-us-east-1.github.net:30550/argo-ml:latest 14 | args: 15 | - python 16 | - gc_cleanup.py --max_age_hrs 168 --adhoc 17 | restartPolicy: OnFailure 18 | -------------------------------------------------------------------------------- /garbage-collection/gc_cleanup.py: -------------------------------------------------------------------------------- 1 | # argo garbage collection 2 | 3 | import os 4 | import argparse 5 | from datetime import datetime, tzinfo, timedelta 6 | import logging 7 | 8 | 9 | from kubernetes import client, config, watch 10 | from kubernetes.client.rest import ApiException 11 | 12 | 13 | config.load_incluster_config() 14 | 15 | 16 | api_client = client.ApiClient() 17 | custom_api = client.CustomObjectsApi(api_client) 18 | v1_api = client.CoreV1Api(api_client) 19 | 20 | 21 | def get_pods(workflow,namespace): 22 | # check if pods exist in workflow and collect them 23 | nodes = list(workflow['status']['nodes'].keys()) 24 | pods = [] 25 | for node in nodes: 26 | try: 27 | api_response = v1_api.read_namespaced_pod_status(node, namespace) 28 | pods.append(node) 29 | except ApiException as e: 30 | pass 31 | return pods 32 | 33 | 34 | def delete_pods(pod,namespace,body): 35 | try: 36 | api_response = v1_api.delete_namespaced_pod(pod, namespace, body=body, propagation_policy='Background') 37 | logging.info("{} deleted".format(pod)) 38 | return api_response 39 | except ApiException as e: 40 | logging.info("Exception when calling CoreV1Api->delete_namespaced_pod: %s\n" % e) 41 | 42 | 43 | def check_filters(key,workflow,filter_words): 44 | for word in filter_words: 45 | if key.startswith(word): 46 | return True 47 | elif word in workflow['metadata']['labels']: 48 | return True 49 | return False 50 | 51 | 52 | def clean_up(args): 53 | # body object for kubernetes api 54 | body = client.V1DeleteOptions() 55 | # get all workflows 56 | try: 57 | workflows = custom_api.list_namespaced_custom_object(args.group, args.version, args.namespace, args.plural) 58 | except ApiException as e: 59 | logging.warning("Exception when calling CustomObjectsApi->list_namespaced_custom_object: %s\n" % e) 60 | 61 | # track workflows expired, workflows not expired and pods deleted for logging 62 | workflows_expired = [] 63 | workflows_not_expired = [] 64 | pods_deleted = [] 65 | for workflow in workflows['items']: 66 | key = workflow['metadata']['name'] 67 | try: 68 | finished_at = datetime.strptime(workflow['status']['finishedAt'], '%Y-%m-%dT%H:%M:%SZ') 69 | except TypeError: 70 | logging.info('could not read workflow {}'.format(key)) 71 | continue 72 | time_since_completion = (datetime.utcnow() - finished_at).total_seconds()/60/60 73 | # Get specific metadata based on workflow type 74 | if args.adhoc: 75 | exists = check_filters(key, workflow, args.starts_with + args.label_selector) 76 | if not exists and int(time_since_completion) > int(args.max_age_hrs): 77 | workflows_expired.append(key) 78 | pods = get_pods(workflow,args.namespace) 79 | for pod in pods: 80 | if not args.dry_run: 81 | 
delete_pods(pod,args.namespace,body) 82 | else: 83 | logging.info("dry_run flag set, would have deleted {}".format(pod)) 84 | else: 85 | workflows_not_expired.append(key) 86 | else: 87 | exists = check_filters(key, workflow, args.starts_with + args.label_selector) 88 | if exists and int(time_since_completion) > int(args.max_age_hrs): 89 | workflows_expired.append(key) 90 | pods = get_pods(workflow, args.namespace) 91 | for pod in pods: 92 | if not args.dry_run: 93 | delete_pods(pod,args.namespace,body) 94 | else: 95 | logging.info("dry_run flag set, would have deleted {}".format(pod)) 96 | else: 97 | workflows_not_expired.append(key) 98 | logging.info("expired workflows: {}".format(workflows_expired)) 99 | 100 | 101 | def main(): 102 | logging.basicConfig(level=logging.INFO) 103 | logging.getLogger().setLevel(logging.INFO) 104 | 105 | # initiate the parser 106 | parser = argparse.ArgumentParser(description = 'a garbage collection utility for cleaning up argo workflows') 107 | parser.add_argument("-n", "--namespace", default="default", type=str, help=("The custom resource's namespace.")) 108 | parser.add_argument("-grp", "--group", default="argoproj.io", type=str, help=("The custom resource's group name.")) 109 | parser.add_argument("-version", default="v1alpha1", type=str, help=("The custom resource's version.")) 110 | parser.add_argument("-p", "--plural", default="workflows", type=str, help=("The custom resource's plural name to filter by.")) 111 | parser.add_argument("--starts_with", nargs='+', default = [], type=str, help=("A list of specific names filtering for workflows that start with")) 112 | parser.add_argument("--label_selector", nargs='+', default = [], type=str, help=("A list of labels to filter by.")) 113 | parser.add_argument("--adhoc", action='store_true', help=("This flag will cause the workflows filtered by the label_selector and starts_with to be ignored if set")) 114 | parser.add_argument("--max_age_hrs", default=168, type=int, help=("enter the maximum age to keep workflows for in hours")) 115 | parser.add_argument("--dry_run", action='store_true', help=("Triggers a dry run delete")) 116 | 117 | args = parser.parse_args() 118 | logging.info(args) 119 | clean_up(args) 120 | 121 | if __name__ == "__main__": 122 | main() 123 | -------------------------------------------------------------------------------- /hyperparam-controller/README.md: -------------------------------------------------------------------------------- 1 | # Argo Hyperparameter tuning 2 | 3 | Hyperparameter tuning controller manages custom resource `HyperParamWorkflow`. This is wrapper resource around `Workflow` that unrolls defined hyperparam search space according to algorithm chosen and creates list of runs. List then will be passed to workflow as parameter and can be used, for example, with `with_items` clause. 
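
To make the unrolling concrete, here is a simplified sketch (not the controller code itself — see `controller.py` below) of how a spec with one `range` and one `values` hyperparameter expands into the flat list of runs that gets attached to the generated `Workflow` as the `hyperparams` parameter and iterated over with `withParam`:

```
import itertools
import json

hyperparams = {
    "learning-rate": {"range": {"min": 0.1, "max": 0.5, "step": 0.1}},
    "model": {"values": ["RandomForest", "SVM", "LogisticRegression"]},
}

def unroll(spec):
    # A range becomes [min, min+step, ...] up to max; values are taken as-is
    if "range" in spec:
        out, v = [], spec["range"]["min"]
        while v <= spec["range"]["max"]:
            out.append(round(v, 10))
            v += spec["range"]["step"]
        return out
    return list(spec["values"])

unrolled = {name: unroll(spec) for name, spec in hyperparams.items()}
keys, values = zip(*unrolled.items())
experiments = [dict(zip(keys, combo)) for combo in itertools.product(*values)]

# This JSON list is what the generated Workflow receives as its `hyperparams` parameter
print(len(experiments), "runs")
print(json.dumps(experiments[:2], indent=2))
```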
4 | 5 | You can list hyperparameter runs with 6 | 7 | ``` 8 | kubectl get hparam 9 | ``` 10 | 11 | ## Structure of HyperparamWorkflow 12 | 13 | Example hparam workflow 14 | 15 | ``` 16 | apiVersion: argoproj.io/v1alpha1 17 | kind: HyperParamWorkflow 18 | metadata: 19 | name: example-hparam-sweep 20 | spec: 21 | hyperparams: 22 | # This setup will create 12 models - LR between 0.1 and 0.5 x 3 types of models 23 | learning-rate: 24 | range: # Ranges will start from min and go to max with step 25 | min: 0.1 26 | max: 0.5 27 | step: 0.1 28 | model: 29 | values: # Values will iterate over flat list 30 | - RandomForest 31 | - SVM 32 | - LogisticRegression 33 | algorithm: grid 34 | 35 | entrypoint: hparam-example 36 | templates: 37 | - name: hparam-example 38 | parallelism: 3 # This will allow only 3 nodes to run at same time, good for resource conservation 39 | steps: 40 | - - name: train 41 | template: train 42 | arguments: 43 | parameters: 44 | - {name: learning-rate, value: "{{item.learning-rate}}"} 45 | - {name: model, value: "{{item.model}}"} 46 | withParam: "{{workflow.parameters.hyperparams}}" 47 | 48 | - name: train 49 | inputs: 50 | parameters: 51 | - name: learning-rate 52 | - name: model 53 | container: 54 | image: docker/whalesay:latest 55 | command: [sh, -c] 56 | args: ["cowsay $LR"] 57 | resources: 58 | requests: 59 | nvidia.com/gpu: 1 # requesting 1 GPU 60 | limits: 61 | nvidia.com/gpu: 1 62 | env: 63 | - name: LR 64 | value: "{{inputs.parameters.learning-rate}}" 65 | ``` 66 | 67 | This looks like regular Argo workflow with few additional fields: 68 | 69 | `hyperparams` - this field defines list of hyperparameters we want to optimize. There are two ways to specify parameter search space: 70 | 71 | * `values` - each value in list will be hyperparameter 72 | * `range` - hyperparameters will be all values between `min` and `max` with `step` 73 | 74 | `algorithm` - Algorithm used for generating hyperparams, currently we only support `grid` which means every combination -------------------------------------------------------------------------------- /hyperparam-controller/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/github/argo-ml/0e754d3c4a64c15bc318b703716737264964ec7d/hyperparam-controller/__init__.py -------------------------------------------------------------------------------- /hyperparam-controller/api.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request, jsonify 2 | from pprint import pprint 3 | from controller import grid_search, generate_workflow 4 | 5 | 6 | app = Flask(__name__) 7 | 8 | 9 | @app.route("/workflow", methods=["POST"]) 10 | def workflow(): 11 | hyperparam = request.json 12 | pprint(hyperparam) 13 | if hyperparam['spec']['algorithm'] == 'grid': 14 | experiments = grid_search(hyperparam['spec']['hyperparams']) 15 | else: 16 | return "Algorithm not supported: {}".format(hyperparam['spec']['algorithm']), 400 17 | return jsonify(generate_workflow(hyperparam, experiments)) 18 | 19 | 20 | 21 | if __name__ == '__main__': 22 | app.run(host="0.0.0.0", debug=True) -------------------------------------------------------------------------------- /hyperparam-controller/controller.py: -------------------------------------------------------------------------------- 1 | from pprint import pprint 2 | import itertools 3 | import json, yaml 4 | from kubernetes import client, config 5 | from kubernetes import watch as kwatch 6 | 
from kubernetes.config.config_exception import ConfigException 7 | import logging 8 | 9 | logging.basicConfig(level=logging.DEBUG) 10 | logger = logging.getLogger(__name__) 11 | 12 | try: 13 | config.load_incluster_config() 14 | except ConfigException: 15 | logger.debug("loading local kube config") 16 | config.load_kube_config() 17 | 18 | def generate_param_combinations(params): 19 | keys, values = zip(*params.items()) 20 | experiments = [dict(zip(keys, v)) for v in itertools.product(*values)] 21 | return experiments 22 | 23 | 24 | def unroll_hparams(hparams): 25 | parameters = {} 26 | 27 | for param in hparams: 28 | parameters[param] = [] 29 | if 'range' in hparams[param]: 30 | rang = hparams[param]['range'] 31 | i = rang['min'] 32 | while abs(i) <= abs(rang['max']): 33 | parameters[param].append(i) 34 | i += rang['step'] 35 | if 'values' in hparams[param]: 36 | parameters[param].extend(hparams[param]['values']) 37 | 38 | return parameters 39 | 40 | 41 | def grid_search(hparams): 42 | unrolled = unroll_hparams(hparams) 43 | return generate_param_combinations(unrolled) 44 | 45 | 46 | def generate_workflow(wf, experiments): 47 | wf['kind'] = "Workflow" 48 | del wf['spec']['algorithm'] 49 | del wf['spec']['hyperparams'] 50 | 51 | for i in ['selfLink', 'uid', 'creationTimestamp', 'generation', 'resourceVersion']: 52 | if i in wf['metadata']: 53 | del wf['metadata'][i] 54 | 55 | wf['spec']['arguments'] = wf['spec'].get('arguments', {}) 56 | wf['spec']['arguments']['parameters'] = wf['spec']['arguments'].get('parameters', []) 57 | 58 | wf['spec']['arguments']['parameters'].append( 59 | { 60 | 'name': 'hyperparams', 61 | 'value': json.dumps(experiments) 62 | } 63 | ) 64 | pprint(wf['metadata']) 65 | return wf 66 | 67 | def main(): 68 | group = "argoproj.io" 69 | version = "v1alpha1" 70 | plural = "hyperparamworkflows" 71 | 72 | 73 | namespace = 'default' 74 | api_client = client.ApiClient() 75 | custom_api = client.CustomObjectsApi(api_client) 76 | 77 | 78 | watch = kwatch.Watch(return_type=object) 79 | 80 | logger.info("Starting loop") 81 | for event in watch.stream(custom_api.list_cluster_custom_object, group, version, plural): 82 | logger.debug(event) 83 | if event['type'] == 'ADDED': 84 | namespace = event['metadata']['namespace'] 85 | hparams = event['raw_object']['spec']['hyperparams'] 86 | if event['raw_object']['spec']['algorithm'] == 'grid': 87 | experiments = grid_search(hparams) 88 | wf = generate_workflow(event['raw_object'], experiments) 89 | try: 90 | resp = custom_api.create_namespaced_custom_object(group, version, namespace, "workflows", wf, pretty=True) 91 | except client.rest.ApiException: 92 | continue 93 | logger.info(yaml.dump(wf)) 94 | if event['type'] == 'DELETED': 95 | namespace = event['metadata']['namespace'] 96 | # TODO: This would be better managed with resource owners 97 | name = event['raw_object']['metadata']['name'] 98 | custom_api.delete_namespaced_custom_object(group, version, namespace, "workflows", name=name, body=client.V1DeleteOptions()) 99 | 100 | 101 | if __name__ == "__main__": 102 | main() -------------------------------------------------------------------------------- /tensorboard-spawner/spawner.py: -------------------------------------------------------------------------------- 1 | from flask import Flask 2 | from flask import request 3 | from flask import jsonify 4 | from kubernetes import client, config 5 | from kubernetes.client.rest import ApiException 6 | import json 7 | import yaml 8 | from jinja2 import Template 9 | import time 10 | 11 | 12 | 
app = Flask(__name__) 13 | 14 | config.load_incluster_config() 15 | api_client = client.ApiClient() 16 | 17 | custom_api = client.CustomObjectsApi(api_client) 18 | 19 | 20 | def get_tensorboard_artifacts(wf): 21 | artifacts = [] 22 | for name, node in wf["status"]["nodes"].items(): 23 | if not "outputs" in node: 24 | continue 25 | afs = node["outputs"].get("artifacts") 26 | if not afs: 27 | continue 28 | for af in afs: 29 | if af["name"] != "tensorboard": 30 | continue 31 | artifacts.append(af) 32 | return artifacts 33 | 34 | 35 | @app.route("/tb", methods=["GET"]) 36 | def workflow(): 37 | group = "argoproj.io" 38 | version = "v1alpha1" 39 | plural = "workflows" 40 | namespace = "default" 41 | workflow = request.args["wf"] 42 | try: 43 | wf = custom_api.get_namespaced_custom_object(group, version, namespace, plural, workflow) 44 | except client.rest.ApiException as e: 45 | if e.status == 404: 46 | return "Workflow not found", 404 47 | raise 48 | tb_artifacts = get_tensorboard_artifacts(wf) 49 | logs = [a['s3']['key'] for a in tb_artifacts] 50 | 51 | with open('/app/tensorboard-spawner/tb-deployment.yaml') as f: 52 | tpl = Template(f.read()) 53 | deploy = yaml.safe_load( 54 | tpl.render(workflow=workflow, logs=logs) 55 | ) 56 | with open('/app/tensorboard-spawner/tb-service.yaml') as f: 57 | tpl = Template(f.read()) 58 | svc = yaml.safe_load( 59 | tpl.render(workflow=workflow, logs=logs) 60 | ) 61 | core_api = client.CoreV1Api() 62 | app_api = client.AppsV1Api() 63 | 64 | try: 65 | s = core_api.read_namespaced_service(namespace=namespace, name="tensorboard-{}".format(workflow)) 66 | except ApiException as e: 67 | if e.status != 404: 68 | raise 69 | else: 70 | return jsonify(s.spec.ports[0].node_port) 71 | 72 | svc_resp = core_api.create_namespaced_service(namespace, svc) 73 | deploy_resp = app_api.create_namespaced_deployment(namespace, deploy) 74 | time.sleep(1) # wait a second for nodeport to appear 75 | 76 | s = core_api.read_namespaced_service(namespace=namespace, name="tensorboard-{}".format(workflow)) 77 | return jsonify(s.spec.ports[0].node_port) 78 | 79 | 80 | if __name__ == '__main__': 81 | app.run(host="0.0.0.0", debug=True) -------------------------------------------------------------------------------- /tensorboard-spawner/tb-deploy/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow 2 | 3 | RUN apt-get update && apt-get -y install wget 4 | RUN wget https://dl.min.io/client/mc/release/linux-amd64/mc && chmod +x ./mc && mv ./mc /usr/local/bin 5 | COPY download.sh / 6 | RUN chmod +x /download.sh 7 | 8 | ENTRYPOINT [] 9 | -------------------------------------------------------------------------------- /tensorboard-spawner/tb-deploy/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | set -x 5 | 6 | mkdir -p /downloaded 7 | 8 | for log in "$@" 9 | do 10 | mc cp "dumpster/github-virga/$log" "/downloaded/$log" 11 | dirname=`echo $log | cut -d/ -f2` 12 | mkdir -p "/logs/$dirname" 13 | tar -xvzf "/downloaded/$log" -C "/logs/$dirname" 14 | done -------------------------------------------------------------------------------- /tensorboard-spawner/tb-deployment.yaml: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | apiVersion: apps/v1 4 | kind: Deployment 5 | metadata: 6 | name: tensorboard-{{ workflow }} 7 | labels: 8 | app: tensorboard{{ workflow }} 9 | spec: 10 | replicas: 1 11 | selector: 12 | 
matchLabels: 13 | app: tensorboard-{{ workflow }} 14 | template: 15 | metadata: 16 | labels: 17 | app: tensorboard-{{ workflow }} 18 | spec: 19 | initContainers: 20 | - name: downloader 21 | image: analytics-kubelet-055005d.private-us-east-1.github.net:30550/tensorboard 22 | command: 23 | - "/download.sh" 24 | {% for log in logs %} 25 | - "{{ log }}" 26 | {% endfor %} 27 | volumeMounts: 28 | - name: logs 29 | mountPath: /logs 30 | env: 31 | - name: MC_HOSTS_dumpster 32 | valueFrom: 33 | secretKeyRef: 34 | name: s3host 35 | key: s3host 36 | containers: 37 | - name: tensorboard 38 | image: analytics-kubelet-055005d.private-us-east-1.github.net:30550/tensorboard 39 | command: ["tensorboard", "--logdir=/logs"] 40 | ports: 41 | - name: http 42 | containerPort: 6006 43 | protocol: TCP 44 | volumeMounts: 45 | - name: logs 46 | mountPath: /logs 47 | volumes: 48 | - name: logs 49 | emptyDir: {} -------------------------------------------------------------------------------- /tensorboard-spawner/tb-service.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | name: tensorboard-{{ workflow }} 6 | label: 7 | virga: tensorboard 8 | spec: 9 | ports: 10 | - name: tb 11 | port: 6006 12 | targetPort: 6006 13 | type: NodePort 14 | selector: 15 | app: tensorboard-{{ workflow }} --------------------------------------------------------------------------------
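
For completeness, here is a sketch of how a client would call the spawner once `spawner.py` is running in the cluster. The service name `tensorboard-spawner` and port 5000, as well as the workflow name, are assumptions for illustration — point the URL at wherever the spawner is actually exposed (e.g. a Service or a port-forward):

```
import requests

SPAWNER_URL = "http://tensorboard-spawner:5000"   # hypothetical address of spawner.py
workflow_name = "example-hparam-sweep-abc123"     # hypothetical finished Workflow name

resp = requests.get(f"{SPAWNER_URL}/tb", params={"wf": workflow_name})
resp.raise_for_status()

# The spawner replies with the NodePort of the tensorboard-<workflow> Service;
# Tensorboard is then reachable on any node at that port once the pod is running.
node_port = resp.json()
print(f"Tensorboard for {workflow_name} exposed on NodePort {node_port}")
```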