├── testing
    ├── __init__.py
    ├── version.json
    ├── test-infra
    │   ├── environments
    │   │   ├── prow
    │   │   │   ├── spec.json
    │   │   │   ├── params.libsonnet
    │   │   │   ├── main.jsonnet
    │   │   │   └── .metadata
    │   │   │   │   └── k.libsonnet
    │   │   └── base.libsonnet
    │   ├── .ksonnet
    │   │   └── registries
    │   │   │   ├── kubeflow
    │   │   │       └── 5c35580d76092788b089cb447be3f3097cffe60b.yaml
    │   │   │   └── incubator
    │   │   │       └── ea3408d44c2d8ea4d321364e5533d5c60e74bce0.yaml
    │   ├── components
    │   │   ├── workflows.jsonnet
    │   │   ├── argo.jsonnet
    │   │   ├── params.libsonnet
    │   │   ├── nfs-jupyter.jsonnet
    │   │   ├── workflows.libsonnet
    │   │   └── argo.libsonnet
    │   ├── app.yaml
    │   └── debug_pod.yaml
    ├── argo_client_test.py
    ├── bootstrap.sh
    ├── Makefile
    ├── argo_client.py
    ├── checkout.sh
    ├── run_e2e_workflow_test.py
    ├── Dockerfile
    ├── prow_artifacts_test.py
    ├── README.md
    ├── run_e2e_workflow.py
    ├── prow_artifacts.py
    └── test_deploy.py
├── .gitmodules
├── .travis.yml
├── components
    ├── tf-controller
    │   ├── README.md
    │   ├── Makefile
    │   └── deploy_crd.yaml
    ├── k8s-model-server
    │   ├── inception-client
    │   │   ├── images
    │   │   │   └── sleeping-pepper.jpg
    │   │   ├── requirements.txt
    │   │   ├── run.sh
    │   │   ├── Dockerfile
    │   │   └── label.py
    │   └── docker
    │   │   ├── Makefile
    │   │   └── Dockerfile
    └── jupyterhub
    │   ├── docker
    │       ├── Makefile
    │       └── Dockerfile
    │   └── README.md
├── kubeflow
    ├── registry.yaml
    ├── generate_docs.py
    ├── README.md
    ├── core
    │   ├── parts.yaml
    │   ├── README.md
    │   ├── prototypes
    │   │   └── all.jsonnet
    │   ├── nfs.libsonnet
    │   ├── jupyterhub.libsonnet
    │   └── tf-job.libsonnet
    ├── tf-job
    │   ├── parts.yaml
    │   ├── tf-job.libsonnet
    │   ├── prototypes
    │   │   ├── tf-job.jsonnet
    │   │   └── tf-cnn-benchmarks.jsonnet
    │   └── README.md
    └── tf-serving
    │   ├── parts.yaml
    │   ├── prototypes
    │       └── tf-serving-all-features.jsonnet
    │   ├── README.md
    │   └── tf-serving.libsonnet
├── tf-controller-examples
    └── tf-cnn
    │   ├── README.md
    │   ├── Dockerfile.template
    │   ├── Dockerfile.cpu
    │   ├── Dockerfile.gpu
    │   ├── tf_job_gpu.yaml
    │   ├── tf_job_gpu_distributed.yaml
    │   ├── tf_job_cpu.yaml
    │   ├── tf_job_cpu_distributed.yaml
    │   ├── Makefile
    │   ├── launcher.py
    │   └── create_job_specs.py
├── .gitignore
├── Makefile
├── CONTRIBUTING.md
├── README.md
└── LICENSE


/testing/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/testing/version.json:
--------------------------------------------------------------------------------
1 | {"image": "gcr.io/mlkube-testing/kubeflow-testing:v20180104-ce39a55-e3b0c4"}
2 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "tensorflow_k8s"]
2 | 	path = tensorflow_k8s
3 | 	url = https://github.com/tensorflow/k8s.git
4 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | script:
2 |   - make
3 | 
4 | notifications:
5 |   email:
6 |     on_success: never
7 |     on_failure: never
8 | 


--------------------------------------------------------------------------------
/testing/test-infra/environments/prow/spec.json:
--------------------------------------------------------------------------------
1 | {
2 |   "server": "https://35.196.185.88",
3 |   "namespace": "kubeflow-testing"
4 | }


--------------------------------------------------------------------------------
/components/tf-controller/README.md:
--------------------------------------------------------------------------------
1 | Deployment manifests for Tensorflow Kubernetes controller hosted at https://github.com/tensorflow/k8s


--------------------------------------------------------------------------------
/testing/test-infra/environments/base.libsonnet:
--------------------------------------------------------------------------------
1 | local components = std.extVar("__ksonnet/components");
2 | components + {
3 |   // Insert user-specified overrides here.
4 | }
5 | 


--------------------------------------------------------------------------------
/components/k8s-model-server/inception-client/images/sleeping-pepper.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dynamicwebpaige/kubeflow/master/components/k8s-model-server/inception-client/images/sleeping-pepper.jpg


--------------------------------------------------------------------------------
/kubeflow/registry.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: '0.1'
 2 | kind: ksonnet.io/registry
 3 | libraries:  
 4 |   core:
 5 |     version: master
 6 |     path: core
 7 |   tf-job:
 8 |     version: master
 9 |     path: tf-job
10 |   tf-serving:
11 |     version: master
12 |     path: tf-serving


--------------------------------------------------------------------------------
/testing/test-infra/environments/prow/params.libsonnet:
--------------------------------------------------------------------------------
 1 | local params = import "../../components/params.libsonnet";
 2 | params + {
 3 |   components +: {
 4 |     // Insert component parameter overrides here. Ex:
 5 |     // guestbook +: {
 6 |     //   name: "guestbook-dev",
 7 |     //   replicas: params.global.replicas,
 8 |     // },
 9 |   },
10 | }
11 | 


--------------------------------------------------------------------------------
/testing/test-infra/environments/prow/main.jsonnet:
--------------------------------------------------------------------------------
1 | local base = import "../base.libsonnet";
2 | local k = import "k.libsonnet";
3 | 
4 | base + {
5 |   // Insert user-specified overrides here. For example if a component is named "nginx-deployment", you might have something like:
6 |   //   "nginx-deployment"+: k.deployment.mixin.metadata.labels({foo: "bar"})
7 | }
8 | 


--------------------------------------------------------------------------------
/testing/test-infra/.ksonnet/registries/kubeflow/5c35580d76092788b089cb447be3f3097cffe60b.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: "0.1"
 2 | gitVersion:
 3 |   commitSha: 5c35580d76092788b089cb447be3f3097cffe60b
 4 |   refSpec: master
 5 | kind: ksonnet.io/registry
 6 | libraries:
 7 |   core:
 8 |     path: core
 9 |     version: master
10 |   tf-serving:
11 |     path: tf-serving
12 |     version: master
13 | 


--------------------------------------------------------------------------------
/components/k8s-model-server/inception-client/requirements.txt:
--------------------------------------------------------------------------------
 1 | backports.weakref==1.0.post1
 2 | bleach==1.5.0
 3 | enum34==1.1.6
 4 | funcsigs==1.0.2
 5 | futures==3.2.0
 6 | grpcio==1.8.3
 7 | html5lib==0.9999999
 8 | Markdown==2.6.11
 9 | mock==2.0.0
10 | numpy==1.13.3
11 | pbr==3.1.1
12 | protobuf==3.5.1
13 | six==1.11.0
14 | tensorflow==1.4.1
15 | tensorflow-serving-api==1.4.0
16 | tensorflow-tensorboard==0.4.0rc3
17 | Werkzeug==0.14.1
18 | 


--------------------------------------------------------------------------------
/tf-controller-examples/tf-cnn/README.md:
--------------------------------------------------------------------------------
 1 | # Training TF CNN models
 2 | 
 3 | This directory contains code to train convolutional
 4 | neural networks using [tf_cnn_benchmarks](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks)
 5 | which is optimized for performance.
 6 | 
 7 | 
 8 | The jobs can be run on a cluster just by running kubectl
 9 | 
10 | e.g.
11 | 
12 | ```
13 | kubectl create -f tf_job_gpu.yaml
14 | ```
15 | 
16 | By default the examples run using synthetic data and save the trained model
17 | inside the container.


--------------------------------------------------------------------------------
/testing/test-infra/components/workflows.jsonnet:
--------------------------------------------------------------------------------
 1 | local params = std.extVar("__ksonnet/params").components["workflows"];
 2 | 
 3 | local k = import 'k.libsonnet';
 4 | local workflows = import 'workflows.libsonnet';
 5 | local namespace = params.namespace;
 6 | 
 7 | // TODO(jlewi): Can we make name default so some random unique value?
 8 | // I didn't see any routines in the standard library for datetime or random.
 9 | local name = params.name;
10 | 
11 | local prowEnv = workflows.parseEnv(params.prow_env);
12 | local bucket = params.bucket;
13 | std.prune(k.core.v1.list.new([workflows.parts(namespace, name).e2e(prowEnv, bucket),]))


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # pkg and bin directories currently contain build artifacts
 2 | # only so we exclude them.
 3 | bin/
 4 | vendor/
 5 | 
 6 | .vscode/
 7 | 
 8 | # Compiled python files.
 9 | *.pyc
10 | 
11 | # Emacs temporary files
12 | *~
13 | 
14 | # Other temporary files
15 | .DS_Store
16 | 
17 | # Files created by Gogland IDE
18 | .idea/
19 | 
20 | # Exclude wheel files for now.
21 | # The only wheel file is the TF wheel one which is quite large.
22 | # We don't want to check that into source control because it could be
23 | # quite large.
24 | *.whl
25 | 
26 | # Bazel files
27 | **/bazel-*
28 | # Examples egg
29 | examples/tf_sample/tf_sample.egg-info/
30 | examples/.ipynb_checkpoints/
31 | 
32 | **/.ipynb_checkpoints
33 | 


--------------------------------------------------------------------------------
/testing/test-infra/app.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: 0.0.1
 2 | kind: ksonnet.io/app
 3 | libraries:
 4 |   core:
 5 |     gitVersion:
 6 |       commitSha: 5c35580d76092788b089cb447be3f3097cffe60b
 7 |       refSpec: master
 8 |     name: core
 9 |     registry: kubeflow
10 | name: test-infra
11 | registries:
12 |   incubator:
13 |     gitVersion:
14 |       commitSha: ea3408d44c2d8ea4d321364e5533d5c60e74bce0
15 |       refSpec: master
16 |     protocol: github
17 |     uri: github.com/ksonnet/parts/tree/master/incubator
18 |   kubeflow:
19 |     gitVersion:
20 |       commitSha: 5c35580d76092788b089cb447be3f3097cffe60b
21 |       refSpec: master
22 |     protocol: github
23 |     uri: github.com/google/kubeflow/tree/master/kubeflow
24 | version: 0.0.1
25 | 


--------------------------------------------------------------------------------
/testing/test-infra/components/argo.jsonnet:
--------------------------------------------------------------------------------
 1 | local params = std.extVar("__ksonnet/params").components["argo"];
 2 | 
 3 | local k = import 'k.libsonnet';
 4 | local argo = import 'argo.libsonnet';
 5 | local namespace = params.namespace;
 6 | 
 7 | std.prune(k.core.v1.list.new([argo.parts(namespace).crd,
 8 | 							  argo.parts(namespace).config,
 9 | 							  argo.parts(namespace).deploy,
10 | 							  argo.parts(namespace).deployUi,
11 | 	                          argo.parts(namespace).uiService,
12 | 	                          argo.parts(namespace).uiIngress,
13 | 	                          argo.parts(namespace).serviceAccount,
14 | 	                          argo.parts(namespace).roleBinding,
15 | 	                          argo.parts(namespace).defaultRoleBinding,]))


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright 2015 Google Inc. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | 
14 | all: presubmit
15 | 
16 | presubmit:
17 | 	@echo ">> checking file boilerplate"
18 | 	@./build/check_boilerplate.sh
19 | 
20 | TAG?=$(shell git rev-parse HEAD)
21 | .PHONY: all presubmit
22 | 


--------------------------------------------------------------------------------
/kubeflow/generate_docs.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | #
 3 | # This script assumes you ran go install github.com/ksonnet/parts/doc-gen
 4 | 
 5 | import glob
 6 | import os
 7 | import subprocess
 8 | 
 9 | if __name__ == "__main__":
10 |   this_dir = os.path.dirname(__file__)
11 | 
12 |   GOPATH = os.getenv("GOPATH")
13 |   doc_gen = os.path.join(GOPATH, "bin/doc-gen")
14 |   for f in os.listdir(this_dir):
15 |     full_dir = os.path.join(this_dir, f)
16 |     if not os.path.isdir(f):
17 |       continue
18 |     prototypes = glob.glob(os.path.join(full_dir, "prototypes/*.jsonnet"))
19 | 
20 | 
21 |     command = [doc_gen, os.path.join(full_dir, "parts.yaml")]
22 |     command.extend(prototypes)
23 |     with open(os.path.join(full_dir, "README.md"), "w") as hout:
24 |       subprocess.check_call(command, stdout=hout)
25 | 


--------------------------------------------------------------------------------
/testing/argo_client_test.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | 
 3 | import unittest
 4 | 
 5 | from testing import argo_client
 6 | from kubernetes import client as k8s_client
 7 | import mock
 8 | import os
 9 | import yaml
10 | from py import util
11 | 
12 | class ArgoClientTest(unittest.TestCase):
13 |   def setUp(self):
14 |     self.test_dir = os.path.join(os.path.dirname(__file__), "test-data")
15 | 
16 |   def test_wait_for_workflow(self):
17 |     api_client = mock.MagicMock(spec=k8s_client.ApiClient)
18 | 
19 |     with open(os.path.join(self.test_dir, "successful_workflow.yaml")) as hf:
20 |       response = yaml.load(hf)
21 | 
22 |     api_client.call_api.return_value = response
23 |     result = argo_client.wait_for_workflow(api_client, "some-namespace", "some-set")
24 |     self.assertIsNotNone(result)
25 | 
26 | if __name__ == "__main__":
27 |   unittest.main()


--------------------------------------------------------------------------------
/kubeflow/README.md:
--------------------------------------------------------------------------------
 1 | # Kubeflow Ksonnet Registry
 2 | 
 3 | ## Overview
 4 | 
 5 | This directory contains the Kubeflow ksonnet [registry][2]. If you are unfamiliar with ksonnet, we recommend browsing [the official site][1] to gain more context.
 6 | 
 7 | 
 8 | ## Usage
 9 | 
10 | Please refer to the [Kubeflow user guide](https://github.com/google/kubeflow/blob/master/README.ksonnet.md)
11 | 
12 | ## Library-specific Documentation
13 | 
14 | Each of the libraries in this directory has its own README.md. These are autogenerated from the metadata in their `parts.yaml` file, using the [`doc-gen` script][4].
15 | 
16 | Note that you can use the `ks` commands in your terminal to access this same documentation.
17 | 
18 | [1]: https://ksonnet.io
19 | [2]: https://ksonnet.io/docs/concepts#registry
20 | [3]: https://ksonnet.io/#get-started
21 | [4]: https://github.com/ksonnet/parts/blob/master/doc-gen/main.go
22 | 


--------------------------------------------------------------------------------
/testing/test-infra/.ksonnet/registries/incubator/ea3408d44c2d8ea4d321364e5533d5c60e74bce0.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: "0.1"
 2 | gitVersion:
 3 |   commitSha: ea3408d44c2d8ea4d321364e5533d5c60e74bce0
 4 |   refSpec: master
 5 | kind: ksonnet.io/registry
 6 | libraries:
 7 |   apache:
 8 |     path: apache
 9 |     version: master
10 |   efk:
11 |     path: efk
12 |     version: master
13 |   mariadb:
14 |     path: mariadb
15 |     version: master
16 |   memcached:
17 |     path: memcached
18 |     version: master
19 |   mongodb:
20 |     path: mongodb
21 |     version: master
22 |   mysql:
23 |     path: mysql
24 |     version: master
25 |   nginx:
26 |     path: nginx
27 |     version: master
28 |   node:
29 |     path: node
30 |     version: master
31 |   postgres:
32 |     path: postgres
33 |     version: master
34 |   redis:
35 |     path: redis
36 |     version: master
37 |   tomcat:
38 |     path: tomcat
39 |     version: master
40 | 


--------------------------------------------------------------------------------
/components/k8s-model-server/inception-client/run.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2018 Google Inc.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     https://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | #!/bin/bash
16 | 
17 | SERVER=$1
18 | PORT=$2
19 | 
20 | if [ -z $SERVER ] ; then
21 |   SERVER=$INCEPTION_SERVICE_HOST
22 | fi
23 | 
24 | if [ -z $PORT ] ; then
25 |   PORT=$INCEPTION_SERVICE_PORT
26 | fi
27 | 
28 | python label.py -s $SERVER -p $PORT /data/*.jpg
29 | 


--------------------------------------------------------------------------------
/components/jupyterhub/docker/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     https://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | VERSION=1.0
16 | PROJECT_ID=kubeflow
17 | PROJECT=gcr.io/${PROJECT_ID}
18 | 
19 | all: build
20 | 
21 | build:
22 | 	docker build --pull -t ${PROJECT}/jupyterhub-k8s:${VERSION} .
23 | 
24 | push: build
25 | 	gcloud docker -- push ${PROJECT}/jupyterhub-k8s:${VERSION}
26 | 
27 | .PHONY: all build push
28 | 


--------------------------------------------------------------------------------
/components/k8s-model-server/docker/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     https://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | VERSION=1.0
16 | PROJECT_ID=kubeflow
17 | PROJECT=gcr.io/${PROJECT_ID}
18 | 
19 | all: build
20 | 
21 | build:
22 | 	docker build --pull -t ${PROJECT}/model-server:${VERSION} .
23 | 
24 | push: build
25 | 	gcloud docker -- push ${PROJECT}/model-server:${VERSION}
26 | 
27 | .PHONY: all build push
28 | 


--------------------------------------------------------------------------------
/kubeflow/core/parts.yaml:
--------------------------------------------------------------------------------
 1 | {
 2 |    "name": "core",
 3 |    "apiVersion": "0.0.1",
 4 |    "kind": "ksonnet.io/parts",
 5 |    "description": "Core components of Kubeflow.\n",
 6 |    "author": "kubeflow team <kubeflow-team@google.com>",
 7 |    "contributors": [
 8 |       {
 9 |          "name": "Jeremy Lewi",
10 |          "email": "jlewi@google.com"
11 |       }
12 |    ],
13 |    "repository": {
14 |       "type": "git",
15 |       "url": "https://github.com/google/kubeflow"
16 |    },
17 |    "bugs": {
18 |       "url": "https://github.com/google/kubeflow/issues"
19 |    },
20 |    "keywords": [
21 |       "kubeflow",
22 |       "tensorflow"
23 |    ],
24 |    "quickStart": {
25 |       "prototype": "io.ksonnet.pkg.kubeflow",
26 |       "componentName": "core",
27 |       "flags": {
28 |          "name": "core",
29 |          "namespace": "default",
30 |          "disks": ""
31 |       },
32 |       "comment": "Core Kubeflow components."
33 |    },
34 |    "license": "Apache 2.0"
35 | }
36 | 


--------------------------------------------------------------------------------
/kubeflow/tf-job/parts.yaml:
--------------------------------------------------------------------------------
 1 | {
 2 |    "name": "tf-job",
 3 |    "apiVersion": "0.0.1",
 4 |    "kind": "ksonnet.io/parts",
 5 |    "description": "Prototypes for running TensorFlow jobs.\n",
 6 |    "author": "kubeflow team <kubeflow-team@google.com>",
 7 |    "contributors": [
 8 |       {
 9 |          "name": "Jeremy Lewi",
10 |          "email": "jlewi@google.com"
11 |       }
12 |    ],
13 |    "repository": {
14 |       "type": "git",
15 |       "url": "https://github.com/google/kubeflow"
16 |    },
17 |    "bugs": {
18 |       "url": "https://github.com/google/kubeflow/issues"
19 |    },
20 |    "keywords": [
21 |       "kubeflow",
22 |       "tensorflow",
23 |       "database"
24 |    ],
25 |    "quickStart": {
26 |       "prototype": "io.ksonnet.pkg.tf-job",
27 |       "componentName": "tf-job",
28 |       "flags": {
29 |          "name": "tf-job",
30 |          "namespace": "default"
31 |       },
32 |       "comment": "Run TensorFlow Job"
33 |    },
34 |    "license": "Apache 2.0"
35 | }
36 | 


--------------------------------------------------------------------------------
/components/k8s-model-server/inception-client/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Copyright 2018 Google Inc.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     https://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | FROM python:2.7.14
16 | 
17 | RUN pip install --no-cache-dir grpcio tensorflow tensorflow-serving-api
18 | 
19 | RUN mkdir -p /opt/label /data
20 | 
21 | WORKDIR /opt/label
22 | 
23 | COPY label.py ./
24 | COPY run.sh ./
25 | 
26 | ARG IMAGES_DIR=images/
27 | 
28 | ADD $IMAGES_DIR /data/
29 | 
30 | ENTRYPOINT ["bash", "run.sh"]
31 | CMD []
32 | 


--------------------------------------------------------------------------------
/kubeflow/tf-serving/parts.yaml:
--------------------------------------------------------------------------------
 1 | {
 2 |    "name": "tf-serving",
 3 |    "apiVersion": "0.0.1",
 4 |    "kind": "ksonnet.io/parts",
 5 |    "description": "TensorFlow serving is a server for TensorFlow models.\n",
 6 |    "author": "kubeflow team <kubeflow-team@google.com>",
 7 |    "contributors": [
 8 |       {
 9 |          "name": "Jeremy Lewi",
10 |          "email": "jlewi@google.com"
11 |       }
12 |    ],
13 |    "repository": {
14 |       "type": "git",
15 |       "url": "https://github.com/google/kubeflow"
16 |    },
17 |    "bugs": {
18 |       "url": "https://github.com/google/kubeflow/issues"
19 |    },
20 |    "keywords": [
21 |       "kubeflow",
22 |       "tensorflow",
23 |       "database"
24 |    ],
25 |    "quickStart": {
26 |       "prototype": "io.ksonnet.pkg.tf-serving",
27 |       "componentName": "tf-serving",
28 |       "flags": {
29 |          "name": "tf-serving",
30 |          "namespace": "default"
31 |       },
32 |       "comment": "Run TensorFlow Serving"
33 |    },
34 |    "license": "Apache 2.0"
35 | }
36 | 


--------------------------------------------------------------------------------
/testing/test-infra/components/params.libsonnet:
--------------------------------------------------------------------------------
 1 | {
 2 |   global: {
 3 |     // User-defined global parameters; accessible to all component and environments, Ex:
 4 |     // replicas: 4,
 5 |   },
 6 |   components: {
 7 |     // Component-level parameters, defined initially from 'ks prototype use ...'
 8 |     // Each object below should correspond to a component in the components/ directory
 9 |     "argo": {
10 |       namespace: "kubeflow-test-infra",
11 |     },
12 |     "workflows": {
13 |       bucket: "mlkube-testing_temp",
14 |       name: "kubeflow-presubmit-81-2-39b6",
15 |       namespace: "kubeflow-test-infra",
16 |       prow_env: "BUILD_NUMBER=2,JOB_NAME=kubeflow-presubmit,JOB_TYPE=presubmit,PULL_NUMBER=81,REPO_NAME=kubeflow,REPO_OWNER=google",
17 |     },
18 |     "nfs-jupyter": {
19 |       cloud: "",
20 |       disks: "kubeflow-testing",
21 |       name: "nfs-jupyter",
22 |       namespace: "kubeflow-test-infra",
23 |       tfJobImage: "gcr.io/tf-on-k8s-dogfood/tf_operator:v20171214-0bd02ac",
24 |     },
25 |   },
26 | }
27 | 


--------------------------------------------------------------------------------
/kubeflow/tf-serving/prototypes/tf-serving-all-features.jsonnet:
--------------------------------------------------------------------------------
 1 | // @apiVersion 0.1
 2 | // @name io.ksonnet.pkg.tf-serving
 3 | // @description TensorFlow serving
 4 | // @shortDescription A TensorFlow serving deployment
 5 | // @param name string Name to give to each of the components
 6 | // @optionalParam namespace string default Namespace
 7 | // @param model_path string Path to the model. This can be a GCS path.
 8 | 
 9 | // TODO(https://github.com/ksonnet/ksonnet/issues/222): We have to add namespace as an explicit parameter
10 | // because ksonnet doesn't support inheriting it from the environment yet.
11 | 
12 | local k = import 'k.libsonnet';
13 | local tfServing = import 'kubeflow/tf-serving/tf-serving.libsonnet';
14 | 
15 | local name = import 'param://name';
16 | local namespace = import 'param://namespace';
17 | local modelPath = import 'param://model_path';
18 | 
19 | std.prune(k.core.v1.list.new([
20 |   tfServing.parts.deployment.modelServer(name, namespace, modelPath),
21 |   tfServing.parts.deployment.modelService(name, namespace),
22 | ]))
23 | 


--------------------------------------------------------------------------------
/testing/test-infra/debug_pod.yaml:
--------------------------------------------------------------------------------
 1 | # This pod is useful for starting a shell that you can use to interactively debug our tests
 2 | apiVersion: batch/v1
 3 | kind: Job
 4 | metadata:
 5 |   name: test-job
 6 |   namespace: kubeflow-test-infra
 7 | spec:
 8 |   template:
 9 |     spec:
10 |       containers:
11 |       - name: test-container
12 |         image: gcr.io/mlkube-testing/kubeflow-testing:latest
13 |         command: ["tail", "-f", "/dev/null"]
14 |         volumeMounts:
15 |           - mountPath: /mnt/test-data-volume
16 |             name: kubeflow-test-volume
17 |           - mountPath: /secret/gcp-credentials
18 |             name: gcp-credentials
19 |         env:        
20 |           - name: GOOGLE_APPLICATION_CREDENTIALS
21 |             value: /secret/gcp-credentials/key.json
22 |       restartPolicy: Never
23 |       volumes:      
24 |         - name: kubeflow-test-volume
25 |           persistentVolumeClaim:
26 |             claimName: kubeflow-testing
27 |         - name: gcp-credentials
28 |           secret:
29 |             secretName: kubeflow-testing-credentials
30 | 
31 |   backoffLimit: 4


--------------------------------------------------------------------------------
/components/jupyterhub/docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     https://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | FROM python:3.6
16 | 
17 | RUN apt-get update && \
18 |     apt-get install -y npm nodejs-legacy
19 | 
20 | RUN npm install -g configurable-http-proxy && \
21 |     pip3 install --no-cache-dir \
22 |                  notebook \
23 |                  jupyterhub==0.8.1 \
24 |                  jupyterhub-kubespawner==0.7.1 \
25 |                  jupyterhub-dummyauthenticator \
26 |                  oauthenticator
27 | 
28 | ENTRYPOINT jupyterhub
29 | 


--------------------------------------------------------------------------------
/tf-controller-examples/tf-cnn/Dockerfile.template:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     https://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Docker image for running examples in Tensorflow models.
16 | # base_image depends on whether we are running on GPUs or non-GPUs
17 | FROM {{base_image}}
18 | 
19 | RUN apt-get update && apt-get install -y --no-install-recommends \
20 |     ca-certificates \
21 |     build-essential \
22 |     git
23 | 
24 | RUN mkdir -p /opt
25 | 
26 | RUN git clone https://github.com/tensorflow/benchmarks.git /opt/tf-benchmarks
27 | 
28 | COPY launcher.py /opt
29 | RUN chmod u+x /opt/*
30 | ENTRYPOINT ["/opt/launcher.py"]
31 | 


--------------------------------------------------------------------------------
/tf-controller-examples/tf-cnn/Dockerfile.cpu:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     https://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Docker image for running examples in Tensorflow models.
16 | # base_image depends on whether we are running on GPUs or non-GPUs
17 | FROM tensorflow/tensorflow@sha256:5edc0446cc989ad75bc30631f89f20694fe5bf5226f665d47e5c7f35a3b18484
18 | 
19 | RUN apt-get update && apt-get install -y --no-install-recommends \
20 |     ca-certificates \
21 |     build-essential \
22 |     git
23 | 
24 | RUN mkdir -p /opt
25 | 
26 | RUN git clone https://github.com/tensorflow/benchmarks.git /opt/tf-benchmarks
27 | 
28 | COPY launcher.py /opt
29 | RUN chmod u+x /opt/*
30 | ENTRYPOINT ["/opt/launcher.py"]
31 | 


--------------------------------------------------------------------------------
/tf-controller-examples/tf-cnn/Dockerfile.gpu:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     https://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Docker image for running examples in Tensorflow models.
16 | # base_image depends on whether we are running on GPUs or non-GPUs
17 | FROM tensorflow/tensorflow@sha256:bfadad8f2c80424d8d6059d3b8cd6947bf23111dc786fc33db72b56b632a1f28
18 | 
19 | RUN apt-get update && apt-get install -y --no-install-recommends \
20 |     ca-certificates \
21 |     build-essential \
22 |     git
23 | 
24 | RUN mkdir -p /opt
25 | 
26 | RUN git clone https://github.com/tensorflow/benchmarks.git /opt/tf-benchmarks
27 | 
28 | COPY launcher.py /opt
29 | RUN chmod u+x /opt/*
30 | ENTRYPOINT ["/opt/launcher.py"]
31 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # How to Contribute
 2 | 
 3 | We'd love to accept your patches and contributions to this project. There are
 4 | just a few small guidelines you need to follow.
 5 | 
 6 | ## Contributor License Agreement
 7 | 
 8 | Contributions to this project must be accompanied by a Contributor License
 9 | Agreement. You (or your employer) retain the copyright to your contribution,
10 | this simply gives us permission to use and redistribute your contributions as
11 | part of the project. Head over to <https://cla.developers.google.com/> to see
12 | your current agreements on file or to sign a new one.
13 | 
14 | You generally only need to submit a CLA once, so if you've already submitted one
15 | (even if it was for a different project), you probably don't need to do it
16 | again.
17 | 
18 | ## Code reviews
19 | 
20 | All submissions, including submissions by project members, require review. We
21 | use GitHub pull requests for this purpose. Consult
22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
23 | information on using pull requests.
24 | 
25 | ## Get involved
26 | 
27 | * [Slack](http://kubeflow.slack.com/)
28 | * [Twitter](http://twitter.com/kubeflow)
29 | * [Mailing List](https://groups.google.com/forum/#!forum/kubeflow-discuss)
30 | 


--------------------------------------------------------------------------------
/testing/bootstrap.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # This script is used to bootstrap our prow jobs.
 4 | # The point of this script is to check out the google/kubeflow repo
 5 | # at the commit corresponding to the Prow job. We can then
 6 | # invoke the launcher script at that commit to submit and
 7 | # monitor an Argo workflow
 8 | set -xe
 9 | 
10 | mkdir -p /src
11 | git clone https://github.com/google/kubeflow.git /src/google_kubeflow
12 | 
13 | cd /src/google_kubeflow
14 | 
15 | echo Job Name = ${JOB_NAME}
16 | 
17 | # See https://github.com/kubernetes/test-infra/tree/master/prow#job-evironment-variables
18 | if [ ! -z ${PULL_NUMBER} ]; then
19 |  git fetch origin  pull/${PULL_NUMBER}/head:pr
20 |  git checkout ${PULL_PULL_SHA}
21 | else 
22 |  if [ ! -z ${PULL_BASE_SHA} ]; then
23 |  	# Its a post submit; checkout the commit to test.
24 |   	git checkout ${PULL_BASE_SHA}
25 |  fi
26 | fi	
27 | 
28 | # Update submodules.
29 | git submodule init
30 | git submodule update
31 | 
32 | # Print out the commit so we can tell from logs what we checked out.
33 | echo Repo is at `git describe --tags --always --dirty`
34 | git submodule
35 | git status
36 | 
37 | export PYTHONPATH=$PYTHONPATH:/src/google_kubeflow/tensorflow_k8s
38 | cd /src/google_kubeflow
39 | # Invoke the script to run the workflow
40 | python -m testing.run_e2e_workflow  \
41 |   --project=mlkube-testing \
42 |   --zone=us-east1-d \
43 |   --cluster=kubeflow-testing \
44 |   --bucket=kubernetes-jenkins


--------------------------------------------------------------------------------
/testing/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | # Requirements:
16 | #   https://github.com/mattrobenolt/jinja2-cli
17 | #   pip install jinja2-clie
18 | IMG = gcr.io/mlkube-testing/kubeflow-testing
19 | TAG := $(shell date +v%Y%m%d)-$(shell git describe --tags --always --dirty)-$(shell git diff | sha256sum | cut -c -6)
20 | DIR := ${CURDIR}
21 | 
22 | all: build
23 | 
24 | # To build without the cache set the environment variable
25 | # export DOCKER_BUILD_OPTS=--no-cache
26 | build:
27 | 	@echo {\"image\": \"$(IMG):$(TAG)\"} > version.json
28 | 	docker build ${DOCKER_BUILD_OPTS} -t $(IMG):$(TAG) .
29 | 	docker tag $(IMG):$(TAG) $(IMG):latest
30 | 	@echo Built $(IMG):$(TAG) and tagged with latest
31 | 
32 | push: build
33 | 	gcloud docker -- push $(IMG):$(TAG)
34 | 	gcloud docker -- push $(IMG):latest
35 | 	@echo Pushed $(IMG) with :latest and :$(TAG) tags
36 | 


--------------------------------------------------------------------------------
/kubeflow/tf-job/tf-job.libsonnet:
--------------------------------------------------------------------------------
 1 | local k = import 'k.libsonnet';
 2 | 
 3 | {
 4 |   parts:: {    
 5 |     tfJobReplica(replicaType, number, args, image, numGpus=0):: 
 6 |       local baseContainer = {
 7 |                   "image": image, 
 8 |                   "name": "tensorflow",                 
 9 |                 };
10 |       local containerArgs = if std.length(args) > 0 then 
11 |                 {
12 |                   args: args,
13 |                 }
14 |                 else {};               
15 |       local resources =  if numGpus > 0 then {
16 |                   resources: {
17 |                     limits: {
18 |                       "nvidia.com/gpu": numGpus,
19 |                     }
20 |                   }
21 |                 } else {};
22 |     if number > 0 then
23 |     {
24 |           "replicas": number, 
25 |           "template": {
26 |             "spec": {
27 |               "containers": [
28 |                 baseContainer +  containerArgs + resources, 
29 |               ], 
30 |               "restartPolicy": "OnFailure"
31 |             }
32 |           }, 
33 |           "tfReplicaType": replicaType,
34 |     }
35 |     else {},
36 | 
37 |     tfJob(name, namespace, replicas):: {
38 |         "apiVersion": "tensorflow.org/v1alpha1", 
39 |         "kind": "TfJob", 
40 |         "metadata": {
41 |           "name": name,
42 |           "namespace": namespace,
43 |         }, 
44 |         "spec": {
45 |           "replicaSpecs": replicas,
46 |         }
47 |     },
48 |   },
49 | }
50 | 


--------------------------------------------------------------------------------
/components/tf-controller/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     https://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Create a yaml template to deploy the CRD
16 | # Requires the template plugin
17 | # https://github.com/technosophos/helm-template
18 | CHART := https://storage.googleapis.com/tf-on-k8s-dogfood-releases/latest/tf-job-operator-chart-latest.tgz
19 | deploy_config:
20 | 	rm -rf /tmp/tfjob_config_builder
21 | 	mkdir -p /tmp/tfjob_config_builder
22 | 	wget -O /tmp/tfjob_config_builder/tf-job-operator-chart-latest.tgz https://storage.googleapis.com/tf-on-k8s-dogfood-releases/latest/tf-job-operator-chart-latest.tgz
23 | 	tar -C /tmp/tfjob_config_builder -xvf  /tmp/tfjob_config_builder/tf-job-operator-chart-latest.tgz
24 | 	# We set the templates to render because we don't want to render the tests.
25 | 	helm template /tmp/tfjob_config_builder/tf-job-operator-chart --set cloud=gke,rbac.install=true  \
26 | 		-x ./templates/config.yaml -x ./templates/deployment.yaml -x ./templates/rbac.yaml -x ./templates/service-account.yaml > deploy_crd.yaml
27 | 


--------------------------------------------------------------------------------
/components/k8s-model-server/docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     https://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | FROM ubuntu:16.04
16 | 
17 | MAINTAINER Kenneth Owens <kowens@google.com>
18 | 
19 | ENV MS_USER=model-server
20 | 
21 | RUN apt-get update && apt-get install -y \
22 |         build-essential \
23 |         curl \
24 |         libcurl3-dev \
25 |         git \
26 |         libfreetype6-dev \
27 |         libpng12-dev \
28 |         libzmq3-dev \
29 |         pkg-config \
30 |         python-dev \
31 |         python-numpy \
32 |         python-pip \
33 |         software-properties-common \
34 |         swig \
35 |         zip \
36 |         zlib1g-dev && \
37 |     apt-get clean && \
38 |     rm -rf /var/lib/apt/lists/*
39 | 
40 | RUN echo "deb [arch=amd64] http://storage.googleapis.com/tensorflow-serving-apt stable \
41 |  tensorflow-model-server \
42 |  tensorflow-model-server-universal" \
43 |  | tee /etc/apt/sources.list.d/tensorflow-serving.list && \
44 |  curl https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg \
45 |  | apt-key add -
46 | 
47 | RUN apt-get update && apt-get install -y \
48 |         tensorflow-model-server && \
49 |     apt-get clean && \
50 |     rm -rf /var/lib/apt/lists/*
51 | 
52 | RUN set -x \
53 |     && useradd $MS_USER \
54 |     && [ `id -u $MS_USER` -eq 1000 ] \
55 |     && [ `id -g $MS_USER` -eq 1000 ]
56 | 
57 | CMD ["/bin/bash"]
58 | 


--------------------------------------------------------------------------------
/testing/argo_client.py:
--------------------------------------------------------------------------------
 1 | """Some utility functions for working with TfJobs."""
 2 | 
 3 | import datetime
 4 | import json
 5 | import logging
 6 | import time
 7 | 
 8 | from kubernetes import client as k8s_client
 9 | from kubernetes.client.rest import ApiException
10 | 
11 | from py import util
12 | 
13 | GROUP = "argoproj.io"
14 | VERSION = "v1alpha1"
15 | PLURAL = "workflows"
16 | KIND = "Workflow"
17 | 
18 | def log_status(workflow):
19 |   """A callback to use with wait_for_workflow."""
20 |   logging.info("Workflow %s in namespace %s; phase=%s",
21 |            workflow["metadata"]["name"],
22 |            workflow["metadata"]["namespace"],
23 |            workflow["status"]["phase"])
24 | 
25 | def wait_for_workflow(client, namespace, name,
26 |                       timeout=datetime.timedelta(minutes=5),
27 |                       polling_interval=datetime.timedelta(seconds=30),
28 |                       status_callback=None):
29 |   """Wait for the specified workflow to finish.
30 | 
31 |   Args:
32 |     client: K8s api client.
33 |     namespace: namespace for the workflow.
34 |     name: Name of the workflow.
35 |     timeout: How long to wait for the workflow.
36 |     polling_interval: How often to poll for the status of the workflow.
37 |     status_callback: (Optional): Callable. If supplied this callable is
38 |       invoked after we poll the job. Callable takes a single argument which
39 |       is the job.
40 | 
41 |   Raises:
42 |     TimeoutError: If timeout waiting for the job to finish.
43 |   """
44 |   crd_api = k8s_client.CustomObjectsApi(client)
45 |   end_time = datetime.datetime.now() + timeout
46 |   while True:
47 |     results = crd_api.get_namespaced_custom_object(
48 |         GROUP, VERSION, namespace, PLURAL, name)
49 | 
50 |     if status_callback:
51 |       status_callback(results)
52 | 
53 |     if results["status"]["phase"] in ["Failed", "Succeeded"]:
54 |       return results
55 | 
56 |     if datetime.datetime.now() + polling_interval > end_time:
57 |       raise util.TimeoutError(
58 |         "Timeout waiting for workflow {0} in namespace {1} to finish.".format(
59 |           name, namespace))
60 | 
61 |     time.sleep(polling_interval.seconds)
62 | 


--------------------------------------------------------------------------------
/kubeflow/tf-job/prototypes/tf-job.jsonnet:
--------------------------------------------------------------------------------
 1 | // @apiVersion 0.1
 2 | // @name io.ksonnet.pkg.tf-job
 3 | // @description A TensorFlow job (could be training or evaluation).
 4 | // @shortDescription A TensorFlow jjob.
 5 | // @param name string Name to give to each of the components
 6 | // @optionalParam namespace string default Namespace
 7 | // @optionalParam args string null Comma separated list of arguments to pass to the job
 8 | // @optionalParam image string null The docker image to use for the job.
 9 | // @optionalParam image_gpu string null The docker image to use when using GPUs.
10 | // @optionalParam num_masters number 1 The number of masters to use
11 | // @optionalParam num_ps number 0 The number of ps to use
12 | // @optionalParam num_workers number 0 The number of workers to use
13 | // @optionalParam num_gpus number 0 The number of GPUs to attach to workers.
14 | 
15 | // TODO(https://github.com/ksonnet/ksonnet/issues/235): ks param set args won't work if the arg starts with "--".
16 | 
17 | // TODO(https://github.com/ksonnet/ksonnet/issues/222): We have to add namespace as an explicit parameter
18 | // because ksonnet doesn't support inheriting it from the environment yet.
19 | 
20 | local k = import 'k.libsonnet';
21 | local tfJob = import 'kubeflow/tf-job/tf-job.libsonnet';
22 | 
23 | local name = import 'param://name';
24 | local namespace = import 'param://namespace';
25 | 
26 | local argsParam = import 'param://args';
27 | local args = 
28 |     if argsParam == "null" then
29 |     []
30 |     else
31 | 	std.split(argsParam, ',');
32 | 
33 | local image = import 'param://image';
34 | local imageGpu = import 'param://image_gpu';
35 | local numMasters = import 'param://num_masters';
36 | local numPs = import 'param://num_ps';
37 | local numWorkers = import 'param://num_workers';
38 | local numGpus = import 'param://num_gpus';
39 | 
40 | local workerSpec = if numGpus > 0 then
41 |   	tfJob.parts.tfJobReplica("WORKER", numWorkers, args, imageGpu, numGpus)
42 |   	else
43 |   	tfJob.parts.tfJobReplica("WORKER", numWorkers, args, image);
44 | 
45 | std.prune(k.core.v1.list.new([
46 |   tfJob.parts.tfJob(name, namespace, [
47 |   	tfJob.parts.tfJobReplica("MASTER", numMasters, args, image),
48 | 	workerSpec,  	
49 |   	tfJob.parts.tfJobReplica("PS", numPs, args, image)
50 |   ]),
51 | ]))
52 | 


--------------------------------------------------------------------------------
/tf-controller-examples/tf-cnn/tf_job_gpu.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     https://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | apiVersion: tensorflow.org/v1alpha1
16 | kind: TfJob
17 | metadata:
18 |   name: inception-171202-163257-gpu-1
19 |   namespace: default
20 | spec:
21 |   replicaSpecs:
22 |   - replicas: 1
23 |     template:
24 |       spec:
25 |         containers:
26 |         - args:
27 |           - python
28 |           - tf_cnn_benchmarks.py
29 |           - --batch_size=32
30 |           - --model=resnet50
31 |           - --variable_update=parameter_server
32 |           - --flush_stdout=true
33 |           - --num_gpus=1
34 |           image: gcr.io/kubeflow/tf-benchmarks-gpu:v20171202-bdab599-dirty-284af3
35 |           name: tensorflow
36 |           resources:
37 |             limits:
38 |               nvidia.com/gpu: 1
39 |           workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks
40 |         restartPolicy: OnFailure
41 |     tfReplicaType: WORKER
42 |   - replicas: 1
43 |     template:
44 |       spec:
45 |         containers:
46 |         - args:
47 |           - python
48 |           - tf_cnn_benchmarks.py
49 |           - --batch_size=32
50 |           - --model=resnet50
51 |           - --variable_update=parameter_server
52 |           - --flush_stdout=true
53 |           - --num_gpus=1
54 |           image: gcr.io/kubeflow/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3
55 |           name: tensorflow
56 |           workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks
57 |         restartPolicy: OnFailure
58 |     tfReplicaType: PS
59 |   terminationPolicy:
60 |     chief:
61 |       replicaName: WORKER
62 |       replicaIndex: 0
63 |   tfImage: gcr.io/kubeflow/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3
64 | 


--------------------------------------------------------------------------------
/tf-controller-examples/tf-cnn/tf_job_gpu_distributed.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     https://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | apiVersion: tensorflow.org/v1alpha1
16 | kind: TfJob
17 | metadata:
18 |   name: inception-171202-163257-gpu-3
19 |   namespace: default
20 | spec:
21 |   replicaSpecs:
22 |   - replicas: 3
23 |     template:
24 |       spec:
25 |         containers:
26 |         - args:
27 |           - python
28 |           - tf_cnn_benchmarks.py
29 |           - --batch_size=32
30 |           - --model=resnet50
31 |           - --variable_update=parameter_server
32 |           - --flush_stdout=true
33 |           - --num_gpus=1
34 |           image: gcr.io/kubeflow/tf-benchmarks-gpu:v20171202-bdab599-dirty-284af3
35 |           name: tensorflow
36 |           resources:
37 |             limits:
38 |               nvidia.com/gpu: 1
39 |           workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks
40 |         restartPolicy: OnFailure
41 |     tfReplicaType: WORKER
42 |   - replicas: 1
43 |     template:
44 |       spec:
45 |         containers:
46 |         - args:
47 |           - python
48 |           - tf_cnn_benchmarks.py
49 |           - --batch_size=32
50 |           - --model=resnet50
51 |           - --variable_update=parameter_server
52 |           - --flush_stdout=true
53 |           - --num_gpus=1
54 |           image: gcr.io/kubeflow/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3
55 |           name: tensorflow
56 |           workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks
57 |         restartPolicy: OnFailure
58 |     tfReplicaType: PS
59 |   terminationPolicy:
60 |     chief:
61 |       replicaName: WORKER
62 |       replicaIndex: 0
63 |   tfImage: gcr.io/kubeflow/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3
64 | 


--------------------------------------------------------------------------------
/kubeflow/core/README.md:
--------------------------------------------------------------------------------
 1 | # core
 2 | 
 3 | > Core components of Kubeflow.
 4 | 
 5 | 
 6 | * [Quickstart](#quickstart)
 7 | * [Using Prototypes](#using-prototypes)
 8 |   * [io.ksonnet.pkg.kubeflow-core](#io.ksonnet.pkg.kubeflow-core)
 9 | 
10 | ## Quickstart
11 | 
12 | *The following commands use the `io.ksonnet.pkg.kubeflow` prototype to generate Kubernetes YAML for core, and then deploys it to your Kubernetes cluster.*
13 | 
14 | First, create a cluster and install the ksonnet CLI (see root-level [README.md](rootReadme)).
15 | 
16 | If you haven't yet created a [ksonnet application](linkToSomewhere), do so using `ks init <app-name>`.
17 | 
18 | Finally, in the ksonnet application directory, run the following:
19 | 
20 | ```shell
21 | # Expand prototype as a Jsonnet file, place in a file in the
22 | # `components/` directory. (YAML and JSON are also available.)
23 | $ ks prototype use io.ksonnet.pkg.kubeflow-core \
24 |   --name core \
25 |   --namespace default \
26 |   --disks 
27 | 
28 | # Apply to server.
29 | $ ks apply -f core.jsonnet
30 | ```
31 | 
32 | ## Using the library
33 | 
34 | The library files for core define a set of relevant *parts* (_e.g._, deployments, services, secrets, and so on) that can be combined to configure core for a wide variety of scenarios. For example, a database like Redis may need a secret to hold the user password, or it may have no password if it's acting as a cache.
35 | 
36 | This library provides a set of pre-fabricated "flavors" (or "distributions") of core, each of which is configured for a different use case. These are captured as ksonnet *prototypes*, which allow users to interactively customize these distributions for their specific needs.
37 | 
38 | These prototypes, as well as how to use them, are enumerated below.
39 | 
40 | ### io.ksonnet.pkg.kubeflow-core
41 | 
42 | Kubeflow core components
43 | #### Example
44 | 
45 | ```shell
46 | # Expand prototype as a Jsonnet file, place in a file in the
47 | # `components/` directory. (YAML and JSON are also available.)
48 | $ ks prototype use io.ksonnet.pkg.kubeflow-core core \
49 |   --name YOUR_NAME_HERE
50 | ```
51 | 
52 | #### Parameters
53 | 
54 | The available options to pass prototype are:
55 | 
56 | * `--name=<name>`: Name to give to each of the components [string]
57 | 
58 | 
59 | [rootReadme]: https://github.com/ksonnet/mixins
60 | 


--------------------------------------------------------------------------------
/tf-controller-examples/tf-cnn/tf_job_cpu.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     https://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | apiVersion: tensorflow.org/v1alpha1
16 | kind: TfJob
17 | metadata:
18 |   name: inception-171202-163257-cpu-1
19 |   namespace: default
20 | spec:
21 |   replicaSpecs:
22 |   - replicas: 1
23 |     template:
24 |       spec:
25 |         containers:
26 |         - args:
27 |           - python
28 |           - tf_cnn_benchmarks.py
29 |           - --batch_size=32
30 |           - --model=resnet50
31 |           - --variable_update=parameter_server
32 |           - --flush_stdout=true
33 |           - --num_gpus=1
34 |           - --local_parameter_device=cpu
35 |           - --device=cpu
36 |           - --data_format=NHWC
37 |           image: gcr.io/kubeflow/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3
38 |           name: tensorflow
39 |           workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks
40 |         restartPolicy: OnFailure
41 |     tfReplicaType: WORKER
42 |   - replicas: 1
43 |     template:
44 |       spec:
45 |         containers:
46 |         - args:
47 |           - python
48 |           - tf_cnn_benchmarks.py
49 |           - --batch_size=32
50 |           - --model=resnet50
51 |           - --variable_update=parameter_server
52 |           - --flush_stdout=true
53 |           - --num_gpus=1
54 |           - --local_parameter_device=cpu
55 |           - --device=cpu
56 |           - --data_format=NHWC
57 |           image: gcr.io/kubeflow/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3
58 |           name: tensorflow
59 |           workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks
60 |         restartPolicy: OnFailure
61 |     tfReplicaType: PS
62 |   terminationPolicy:
63 |     chief:
64 |       replicaName: WORKER
65 |       replicaIndex: 0
66 |   tfImage: gcr.io/kubeflow/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3
67 | 


--------------------------------------------------------------------------------
/tf-controller-examples/tf-cnn/tf_job_cpu_distributed.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     https://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | apiVersion: tensorflow.org/v1alpha1
16 | kind: TfJob
17 | metadata:
18 |   name: inception-171202-163257-cpu-3
19 |   namespace: default
20 | spec:
21 |   replicaSpecs:
22 |   - replicas: 3
23 |     template:
24 |       spec:
25 |         containers:
26 |         - args:
27 |           - python
28 |           - tf_cnn_benchmarks.py
29 |           - --batch_size=32
30 |           - --model=resnet50
31 |           - --variable_update=parameter_server
32 |           - --flush_stdout=true
33 |           - --num_gpus=1
34 |           - --local_parameter_device=cpu
35 |           - --device=cpu
36 |           - --data_format=NHWC
37 |           image: gcr.io/kubeflow/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3
38 |           name: tensorflow
39 |           workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks
40 |         restartPolicy: OnFailure
41 |     tfReplicaType: WORKER
42 |   - replicas: 1
43 |     template:
44 |       spec:
45 |         containers:
46 |         - args:
47 |           - python
48 |           - tf_cnn_benchmarks.py
49 |           - --batch_size=32
50 |           - --model=resnet50
51 |           - --variable_update=parameter_server
52 |           - --flush_stdout=true
53 |           - --num_gpus=1
54 |           - --local_parameter_device=cpu
55 |           - --device=cpu
56 |           - --data_format=NHWC
57 |           image: gcr.io/kubeflow/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3
58 |           name: tensorflow
59 |           workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks
60 |         restartPolicy: OnFailure
61 |     tfReplicaType: PS
62 |   terminationPolicy:
63 |     chief:
64 |       replicaName: WORKER
65 |       replicaIndex: 0
66 |   tfImage: gcr.io/kubeflow/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3
67 | 


--------------------------------------------------------------------------------
/testing/test-infra/environments/prow/.metadata/k.libsonnet:
--------------------------------------------------------------------------------
 1 | local k8s = import "k8s.libsonnet";
 2 | 
 3 | local apps = k8s.apps;
 4 | local core = k8s.core;
 5 | local extensions = k8s.extensions;
 6 | 
 7 | local hidden = {
 8 |   mapContainers(f):: {
 9 |     local podContainers = super.spec.template.spec.containers,
10 |     spec+: {
11 |       template+: {
12 |         spec+: {
13 |           // IMPORTANT: This overwrites the 'containers' field
14 |           // for this deployment.
15 |           containers: std.map(f, podContainers),
16 |         },
17 |       },
18 |     },
19 |   },
20 | 
21 |   mapContainersWithName(names, f) ::
22 |     local nameSet =
23 |       if std.type(names) == "array"
24 |       then std.set(names)
25 |       else std.set([names]);
26 |     local inNameSet(name) = std.length(std.setInter(nameSet, std.set([name]))) > 0;
27 |     self.mapContainers(
28 |       function(c)
29 |         if std.objectHas(c, "name") && inNameSet(c.name)
30 |         then f(c)
31 |         else c
32 |     ),
33 | };
34 | 
35 | k8s + {
36 |   apps:: apps + {
37 |     v1beta1:: apps.v1beta1 + {
38 |       local v1beta1 = apps.v1beta1,
39 | 
40 |       daemonSet:: v1beta1.daemonSet + {
41 |         mapContainers(f):: hidden.mapContainers(f),
42 |         mapContainersWithName(names, f):: hidden.mapContainersWithName(names, f),
43 |       },
44 | 
45 |       deployment:: v1beta1.deployment + {
46 |         mapContainers(f):: hidden.mapContainers(f),
47 |         mapContainersWithName(names, f):: hidden.mapContainersWithName(names, f),
48 |       },
49 |     },
50 |   },
51 | 
52 |   core:: core + {
53 |     v1:: core.v1 + {
54 |       list:: {
55 |         new(items)::
56 |           {apiVersion: "v1"} +
57 |           {kind: "List"} +
58 |           self.items(items),
59 | 
60 |         items(items):: if std.type(items) == "array" then {items+: items} else {items+: [items]},
61 |       },
62 |     },
63 |   },
64 | 
65 |   extensions:: extensions + {
66 |     v1beta1:: extensions.v1beta1 + {
67 |       local v1beta1 = extensions.v1beta1,
68 | 
69 |       daemonSet:: v1beta1.daemonSet + {
70 |         mapContainers(f):: hidden.mapContainers(f),
71 |         mapContainersWithName(names, f):: hidden.mapContainersWithName(names, f),
72 |       },
73 | 
74 |       deployment:: v1beta1.deployment + {
75 |         mapContainers(f):: hidden.mapContainers(f),
76 |         mapContainersWithName(names, f):: hidden.mapContainersWithName(names, f),
77 |       },
78 |     },
79 |   },
80 | }
81 | 


--------------------------------------------------------------------------------
/kubeflow/tf-serving/README.md:
--------------------------------------------------------------------------------
 1 | # tf-serving
 2 | 
 3 | > TensorFlow serving is a server for TensorFlow models.
 4 | 
 5 | 
 6 | * [Quickstart](#quickstart)
 7 | * [Using Prototypes](#using-prototypes)
 8 |   * [io.ksonnet.pkg.tf-serving](#io.ksonnet.pkg.tf-serving)
 9 | 
10 | ## Quickstart
11 | 
12 | *The following commands use the `io.ksonnet.pkg.tf-serving` prototype to generate Kubernetes YAML for tf-serving, and then deploys it to your Kubernetes cluster.*
13 | 
14 | First, create a cluster and install the ksonnet CLI (see root-level [README.md](rootReadme)).
15 | 
16 | If you haven't yet created a [ksonnet application](linkToSomewhere), do so using `ks init <app-name>`.
17 | 
18 | Finally, in the ksonnet application directory, run the following:
19 | 
20 | ```shell
21 | # Expand prototype as a Jsonnet file, place in a file in the
22 | # `components/` directory. (YAML and JSON are also available.)
23 | $ ks prototype use io.ksonnet.pkg.tf-serving tf-serving \
24 |   --name tf-serving \
25 |   --namespace default
26 | 
27 | # Apply to server.
28 | $ ks apply -f tf-serving.jsonnet
29 | ```
30 | 
31 | ## Using the library
32 | 
33 | The library files for tf-serving define a set of relevant *parts* (_e.g._, deployments, services, secrets, and so on) that can be combined to configure tf-serving for a wide variety of scenarios. For example, a database like Redis may need a secret to hold the user password, or it may have no password if it's acting as a cache.
34 | 
35 | This library provides a set of pre-fabricated "flavors" (or "distributions") of tf-serving, each of which is configured for a different use case. These are captured as ksonnet *prototypes*, which allow users to interactively customize these distributions for their specific needs.
36 | 
37 | These prototypes, as well as how to use them, are enumerated below.
38 | 
39 | ### io.ksonnet.pkg.tf-serving
40 | 
41 | TensorFlow serving
42 | #### Example
43 | 
44 | ```shell
45 | # Expand prototype as a Jsonnet file, place in a file in the
46 | # `components/` directory. (YAML and JSON are also available.)
47 | $ ks prototype use io.ksonnet.pkg.tf-serving tf-serving \
48 |   --name YOUR_NAME_HERE \
49 |   --model_path YOUR_MODEL_PATH_HERE
50 | ```
51 | 
52 | #### Parameters
53 | 
54 | The available options to pass prototype are:
55 | 
56 | * `--name=<name>`: Name to give to each of the components [string]
57 | * `--model_path=<model_path>`: Path to the model. This can be a GCS path. [string]
58 | 
59 | 
60 | [rootReadme]: https://github.com/ksonnet/mixins
61 | 


--------------------------------------------------------------------------------
/testing/checkout.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # This script is used as the first step in our Argo workflows to check out the code
 4 | # corresponding the prow job.
 5 | #
 6 | # TODO(jlewi): Eliminate code duplication with bootstraph.sh my moving shared code into
 7 | # a bash script that can be sourced from multiple scripts.
 8 | #!/bin/bash
 9 | set -xe 
10 | SRC_DIR=$1
11 | 
12 | # Print out env for debugging.
13 | env | sort
14 | 
15 | git clone https://github.com/${REPO_OWNER}/${REPO_NAME}.git /tmp/src
16 | 
17 | # Some git operations are really slow when using NFS.
18 | # We observed clone times increasing from O(30) seconds to O(4 minutes)
19 | # when we switched to NFS.
20 | # As a workaround we clone into a local directory and then move the files onto
21 | # NFS. Copying to NFS is still a bottleneck and increases the run time to O(1. 5 minutes).
22 | # clone --recurse-submodules https://github.com/google/kubeflow.git /tmp/src",
23 | cd /tmp/src
24 | 
25 | # We need to set the preloadindex option; to try to speedup git ops like describe
26 | # and status when using an NFS filesystem.
27 | # See: https://stackoverflow.com/questions/4994772/ways-to-improve-git-status-performance
28 | # unfortunately this doesn't seem to help with sub modules.
29 | git config core.preloadindex true
30 | 
31 | # See https://github.com/kubernetes/test-infra/tree/master/prow#job-evironment-variables
32 | if [ ! -z ${PULL_NUMBER} ]; then
33 |  git fetch origin  pull/${PULL_NUMBER}/head:pr
34 |  if [ ! -z ${PULL_PULL_SHA} ]; then
35 | 	git checkout ${PULL_PULL_SHA}
36 |  else
37 |  	# Checkout the latest commit for this PR since no commit specified.
38 |  	git checkout pr
39 |  fi
40 | else 
41 |  if [ ! -z ${PULL_BASE_SHA} ]; then
42 |   # Its a post submit; checkout the commit to test.
43 |     git checkout ${PULL_BASE_SHA}
44 |  fi
45 | fi  
46 | 
47 | # Update submodules.
48 | git submodule init
49 | git submodule update
50 | 
51 | # TODO(jlewi): As noted above the git operations below are really
52 | # slow when using NFS.
53 | # Print out the git version in the logs
54 | git describe --tags --always --dirty      
55 | git status
56 | 
57 | # Move it to NFS
58 | mkdir -p + ${SRC_DIR}
59 | 
60 | # The period is needed because we want to copy the contents of the src directory
61 | # into srcDir not srcDir/src/.
62 | cp -r /tmp/src/.  ${SRC_DIR}
63 | 
64 | # Make the files world readable/writable.
65 | # This is a hack to make it easy to modify the files from jupyterhub which is using
66 | # a different user/group id.
67 | chmod -R a+rwx  ${SRC_DIR}


--------------------------------------------------------------------------------
/testing/test-infra/components/nfs-jupyter.jsonnet:
--------------------------------------------------------------------------------
 1 | local params = std.extVar("__ksonnet/params").components["nfs-jupyter"];
 2 | // TODO(https://github.com/ksonnet/ksonnet/issues/222): We have to add namespace as an explicit parameter
 3 | // because ksonnet doesn't support inheriting it from the environment yet.
 4 | 
 5 | local k = import 'k.libsonnet';
 6 | local jupyter = import "kubeflow/core/jupyterhub.libsonnet";
 7 | local tfjob = import "kubeflow/core/tf-job.libsonnet";
 8 | local nfs = import "kubeflow/core/nfs.libsonnet";
 9 | 
10 | local name = params.name;
11 | local namespace = params.namespace;
12 | 
13 | // TODO(jlewi): Make this a parameter
14 | local jupyterHubImage = 'gcr.io/kubeflow/jupyterhub:1.0';
15 | local diskParam = params.disks;
16 | 
17 | local diskNames = if diskParam != "null" && std.length(diskParam) > 0 then
18 |   std.split(diskParam, ',')
19 |   else [];
20 | 
21 | local jupyterConfigMap = if std.length(diskNames) == 0 then
22 | 	jupyter.parts(namespace).jupyterHubConfigMap
23 | 	else jupyter.parts(namespace).jupyterHubConfigMapWithVolumes(diskNames);
24 | 
25 | local tfJobImage = params.tfJobImage;
26 | 
27 | // Create a list of the resources needed for a particular disk
28 | local diskToList = function(diskName) [
29 | 	nfs.parts(namespace, name,).diskResources(diskName).storageClass,
30 | 	nfs.parts(namespace, name,).diskResources(diskName).volumeClaim,
31 | 	nfs.parts(namespace, name,).diskResources(diskName).service,
32 | 	nfs.parts(namespace, name,).diskResources(diskName).provisioner];
33 | 
34 | local allDisks = std.flattenArrays(std.map(diskToList, diskNames));
35 | 
36 | local nfsComponents =
37 | 	if std.length(allDisks) > 0 then
38 | 	[nfs.parts(namespace, name).serviceAccount,
39 | 	 nfs.parts(namespace, name).role,
40 | 	 nfs.parts(namespace, name).roleBinding,
41 | 	 nfs.parts(namespace, name).clusterRoleBinding,] + allDisks
42 | 	else 
43 | 	[];
44 | 
45 | // TODO(jlewi): Maybe we should split this into separate components
46 | // for Jupyter and NFS. We always need NFS because its used by our 
47 | // Argo workflows. But Jupyter could be optional.
48 | std.prune(k.core.v1.list.new([
49 | 	// jupyterHub components
50 | 	jupyterConfigMap,
51 |     jupyter.parts(namespace).jupyterHubService, 
52 |     jupyter.parts(namespace).jupyterHubLoadBalancer,
53 |     jupyter.parts(namespace).jupyterHub(jupyterHubImage),
54 |     jupyter.parts(namespace).jupyterHubRole,
55 |     jupyter.parts(namespace).jupyterHubServiceAccount,
56 |     jupyter.parts(namespace).jupyterHubRoleBinding,    
57 | ] + nfsComponents))
58 | 
59 | 


--------------------------------------------------------------------------------
/tf-controller-examples/tf-cnn/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     https://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | REGISTRY := gcr.io/kubeflow
16 | TAG := $(shell date +v%Y%m%d)-$(shell git describe --tags --always --dirty)-$(shell git diff | sha256sum | cut -c -6)
17 | DIR := ${CURDIR}
18 | 
19 | # 1.4 isn't new enough for the tf-benchmarks code
20 | # so we pin to a particular nightly build image.
21 | # CPU_BASE = tensorflow/tensorflow:nightly
22 | CPU_BASE = tensorflow/tensorflow@sha256:5edc0446cc989ad75bc30631f89f20694fe5bf5226f665d47e5c7f35a3b18484
23 | # GPU_BASE = tensorflow/tensorflow:nightly-gpu
24 | GPU_BASE = tensorflow/tensorflow@sha256:bfadad8f2c80424d8d6059d3b8cd6947bf23111dc786fc33db72b56b632a1f28
25 | 
26 | BENCH_MARKS_IMAGE := $(REGISTRY)/tf-benchmarks
27 | 
28 | # The published versions of the example code to use.
29 | PUBLISHED_CPU := gcr.io/kubeflow/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3
30 | PUBLISHED_GPU := gcr.io/kubeflow/tf-benchmarks-gpu:v20171202-bdab599-dirty-284af3
31 | 
32 | # Build the cpu image
33 | build-cpu:
34 | 	jinja2 Dockerfile.template --format=yaml -D base_image=$(CPU_BASE) > Dockerfile.cpu
35 | 	docker build -t $(BENCH_MARKS_IMAGE)-cpu:$(TAG) -f Dockerfile.cpu ./
36 | 	gcloud docker -- push $(BENCH_MARKS_IMAGE)-cpu:$(TAG)
37 | 
38 | build-gpu:
39 | 	jinja2 Dockerfile.template --format=yaml -D base_image=$(GPU_BASE) > Dockerfile.gpu
40 | 	docker build -t $(BENCH_MARKS_IMAGE)-gpu:$(TAG) -f Dockerfile.gpu ./
41 | 	gcloud docker -- push $(BENCH_MARKS_IMAGE)-gpu:$(TAG)
42 | 
43 | build-images: build-cpu build-gpu
44 | 
45 | # Create the templates
46 | build-templates:
47 | 	python create_job_specs.py --cpu_image=$(PUBLISHED_CPU) --gpu_image=$(PUBLISHED_GPU) \
48 | 		--gpu --num_workers=1 --output=tf_job_gpu.yaml
49 | 	python create_job_specs.py --cpu_image=$(PUBLISHED_CPU) --gpu_image=$(PUBLISHED_GPU) \
50 | 		--gpu --num_workers=3 --output=tf_job_gpu_distributed.yaml
51 | 	python create_job_specs.py --cpu_image=$(PUBLISHED_CPU) --gpu_image=$(PUBLISHED_GPU) \
52 | 		--no-gpu --num_workers=1 --output=tf_job_cpu.yaml
53 | 	python create_job_specs.py --cpu_image=$(PUBLISHED_CPU) --gpu_image=$(PUBLISHED_GPU) \
54 | 		--no-gpu --num_workers=3 --output=tf_job_cpu_distributed.yaml
55 | 


--------------------------------------------------------------------------------
/testing/run_e2e_workflow_test.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | import unittest
 4 | import mock
 5 | from testing import run_e2e_workflow
 6 | import tempfile
 7 | 
 8 | from google.cloud import storage  # pylint: disable=no-name-in-module
 9 | 
10 | class TestRunE2eWorkflow(unittest.TestCase):
11 |   @mock.patch("testing.run_e2e_workflow.upload_file_to_gcs")
12 |   @mock.patch("testing.run_e2e_workflow.upload_to_gcs")
13 |   @mock.patch("testing.run_e2e_workflow.util.load_kube_config")
14 |   @mock.patch("testing.run_e2e_workflow.argo_client.wait_for_workflow")
15 |   @mock.patch("testing.run_e2e_workflow.util.configure_kubectl")
16 |   @mock.patch("testing.run_e2e_workflow.util.run")
17 |   def testMainPresubmit(self, mock_run, mock_configure, mock_wait, *unused_mocks):  # pylint: disable=no-self-use
18 |     """Test create started for presubmit job."""
19 | 
20 |     os.environ["REPO_OWNER"] = "fake_org"
21 |     os.environ["REPO_NAME"] = "fake_name"
22 |     os.environ["PULL_NUMBER"] = "77"
23 |     os.environ["PULL_PULL_SHA"] = "123abc"
24 |     os.environ["JOB_NAME"] = "kubeflow-presubmit"
25 |     os.environ["JOB_TYPE"] = "presubmit"
26 |     os.environ["BUILD_NUMBER"] = "1234"
27 | 
28 |     args = ["--project=some-project", "--cluster=some-cluster",
29 |             "--zone=us-east1-d", "--bucket=some-bucket"]
30 |     run_e2e_workflow.main(args)
31 | 
32 |     mock_configure.assert_called_once_with("some-project", "us-east1-d",
33 |                                            "some-cluster",)
34 |     self.assertItemsEqual(
35 |       ["ks", "param", "set", "workflows", "name"],
36 |       mock_run.call_args_list[0][0][0][:-1])
37 |     # Workflow name will have some random salt at the end.
38 |     self.assertRegexpMatches(mock_run.call_args_list[0][0][0][-1],
39 |                              "kubeflow-presubmit-77-[0-9a-z]{4}")
40 | 
41 |     self.assertItemsEqual(
42 |       ["ks", "param", "set", "workflows", "prow_env",
43 |        "BUILD_NUMBER=1234,JOB_NAME=kubeflow-presubmit,JOB_TYPE=presubmit"
44 |        ",PULL_NUMBER=77,PULL_PULL_SHA=123abc,REPO_NAME=fake_name"
45 |        ",REPO_OWNER=fake_org"],
46 |       mock_run.call_args_list[1][0][0])
47 | 
48 |     self.assertItemsEqual(
49 |       ["ks", "param", "set", "workflows", "namespace",
50 |        "kubeflow-test-infra"],
51 |       mock_run.call_args_list[2][0][0])
52 | 
53 |     self.assertItemsEqual(
54 |       ["ks", "param", "set", "workflows", "bucket", "some-bucket"],
55 |       mock_run.call_args_list[3][0][0])
56 | 
57 |     self.assertItemsEqual(
58 |       ["ks", "show", "prow", "-c", "workflows"],
59 |       mock_run.call_args_list[4][0][0])
60 | 
61 |     self.assertItemsEqual(
62 |       ["ks", "apply", "prow", "-c", "workflows"],
63 |       mock_run.call_args_list[5][0][0])
64 | 
65 | 
66 | if __name__ == "__main__":
67 |   unittest.main()
68 | 


--------------------------------------------------------------------------------
/kubeflow/tf-job/README.md:
--------------------------------------------------------------------------------
 1 | # tf-job
 2 | 
 3 | > Prototypes for running TensorFlow jobs.
 4 | 
 5 | 
 6 | * [Quickstart](#quickstart)
 7 | * [Using Prototypes](#using-prototypes)
 8 |   * [io.ksonnet.pkg.tf-job](#io.ksonnet.pkg.tf-job)
 9 |   * [io.ksonnet.pkg.tf-cnn](#io.ksonnet.pkg.tf-cnn)
10 | 
11 | ## Quickstart
12 | 
13 | *The following commands use the `io.ksonnet.pkg.tf-job` prototype to generate Kubernetes YAML for tf-job, and then deploys it to your Kubernetes cluster.*
14 | 
15 | First, create a cluster and install the ksonnet CLI (see root-level [README.md](rootReadme)).
16 | 
17 | If you haven't yet created a [ksonnet application](linkToSomewhere), do so using `ks init <app-name>`.
18 | 
19 | Finally, in the ksonnet application directory, run the following:
20 | 
21 | ```shell
22 | # Expand prototype as a Jsonnet file, place in a file in the
23 | # `components/` directory. (YAML and JSON are also available.)
24 | $ ks prototype use io.ksonnet.pkg.tf-job tf-job \
25 |   --namespace default \
26 |   --name tf-job
27 | 
28 | # Apply to server.
29 | $ ks apply -f tf-job.jsonnet
30 | ```
31 | 
32 | ## Using the library
33 | 
34 | The library files for tf-job define a set of relevant *parts* (_e.g._, deployments, services, secrets, and so on) that can be combined to configure tf-job for a wide variety of scenarios. For example, a database like Redis may need a secret to hold the user password, or it may have no password if it's acting as a cache.
35 | 
36 | This library provides a set of pre-fabricated "flavors" (or "distributions") of tf-job, each of which is configured for a different use case. These are captured as ksonnet *prototypes*, which allow users to interactively customize these distributions for their specific needs.
37 | 
38 | These prototypes, as well as how to use them, are enumerated below.
39 | 
40 | ### io.ksonnet.pkg.tf-job
41 | 
42 | A TensorFlow job (could be training or evaluation).
43 | #### Example
44 | 
45 | ```shell
46 | # Expand prototype as a Jsonnet file, place in a file in the
47 | # `components/` directory. (YAML and JSON are also available.)
48 | $ ks prototype use io.ksonnet.pkg.tf-job tf-job \
49 |   --name YOUR_NAME_HERE
50 | ```
51 | 
52 | #### Parameters
53 | 
54 | The available options to pass prototype are:
55 | 
56 | * `--name=<name>`: Name to give to each of the components [string]
57 | ### io.ksonnet.pkg.tf-cnn
58 | 
59 | A TensorFlow CNN Benchmarking job
60 | #### Example
61 | 
62 | ```shell
63 | # Expand prototype as a Jsonnet file, place in a file in the
64 | # `components/` directory. (YAML and JSON are also available.)
65 | $ ks prototype use io.ksonnet.pkg.tf-cnn tf-job \
66 |   --name YOUR_NAME_HERE
67 | ```
68 | 
69 | #### Parameters
70 | 
71 | The available options to pass prototype are:
72 | 
73 | * `--name=<name>`: Name for the job. [string]
74 | 
75 | 
76 | [rootReadme]: https://github.com/ksonnet/mixins
77 | 


--------------------------------------------------------------------------------
/components/k8s-model-server/inception-client/label.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2018 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | #!/usr/bin/env python2.7
17 | 
18 | """
19 | Runs the Inception model being served on the kubeflow model server on an image
20 | that you specify.
21 | 
22 | Note: This file is a modification of the inception client available on the
23 | TensorFlow Serving GitHub repository:
24 |   https://github.com/tensorflow/serving/blob/master/tensorflow_serving/example/inception_client.py
25 | """
26 | 
27 | from __future__ import print_function
28 | 
29 | # This is a placeholder for a Google-internal import.
30 | 
31 | import argparse
32 | 
33 | from grpc.beta import implementations
34 | import tensorflow as tf
35 | 
36 | from tensorflow_serving.apis import predict_pb2
37 | from tensorflow_serving.apis import prediction_service_pb2
38 | 
39 | 
40 | def main(image_paths, server, port):
41 |   channel = implementations.insecure_channel(server, port)
42 |   stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)
43 | 
44 |   raw_images = []
45 |   for path in image_paths:
46 |     with tf.gfile.Open(path) as img:
47 |       raw_images.append(img.read())
48 | 
49 |   # Send request
50 |   # See prediction_service.proto for gRPC request/response details.
51 |   request = predict_pb2.PredictRequest()
52 |   request.model_spec.name = 'inception'
53 |   request.model_spec.signature_name = 'predict_images'
54 |   request.inputs['images'].CopyFrom(
55 |       tf.make_tensor_proto(raw_images, shape=[len(raw_images)]))
56 |   result = stub.Predict(request, 10.0)  # 10 secs timeout
57 |   print(result)
58 | 
59 | 
60 | if __name__ == '__main__':
61 |   parser = argparse.ArgumentParser('Label an image using Inception')
62 |   parser.add_argument(
63 |       '-s',
64 |       '--server',
65 |       help='URL of host serving the Inception model'
66 |   )
67 |   parser.add_argument(
68 |       '-p',
69 |       '--port',
70 |       type=int,
71 |       default=9000,
72 |       help='Port at which Inception model is being served'
73 |   )
74 |   parser.add_argument(
75 |       'images',
76 |       nargs='+',
77 |       help='Paths (local or GCS) to images you would like to label'
78 |   )
79 | 
80 |   args = parser.parse_args()
81 | 
82 |   main(args.images, args.server, args.port)
83 | 


--------------------------------------------------------------------------------
/testing/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Docker image for running E2E tests using Argo.
 2 | 
 3 | FROM python:2.7-slim
 4 | MAINTAINER Jeremy Lewi
 5 | 
 6 | # Never prompt the user for choices on installation/configuration of packages
 7 | ENV DEBIAN_FRONTEND noninteractive
 8 | ENV TERM linux
 9 | 
10 | # Define en_US.
11 | ENV LANGUAGE=en_US.UTF-8 \
12 |     LANG=en_US.UTF-8 \
13 |     LC_ALL=en_US.UTF-8 \
14 |     LC_CTYPE=en_US.UTF-8 \
15 |     LC_MESSAGES=en_US.UTF-8 \
16 |     LC_ALL=en_US.UTF-8
17 | 
18 | 
19 | # buildDeps should be packages needed only to build some other packages as
20 | # these packages are purged in a later step.
21 | #
22 | # gcc & python-dev are needed so we can install crcmod for gsutil
23 | RUN set -ex \
24 |     && apt-get update -yqq \
25 |     && apt-get install -yqq --no-install-recommends \
26 |         curl \
27 |         locales \
28 |         wget \
29 |         ca-certificates \
30 |         git \
31 |         zip \
32 |         unzip \
33 |         gcc python-dev \
34 |         python-setuptools \
35 |     && apt-get clean \
36 |     && rm -rf \
37 |         /var/lib/apt/lists/* \
38 |         /tmp/* \
39 |         /var/tmp/* \
40 |         /usr/share/man \
41 |         /usr/share/doc \
42 |         /usr/share/doc-base
43 | 
44 | # Set the locale
45 | RUN sed -i 's/^# en_US.UTF-8 UTF-8$/en_US.UTF-8 UTF-8/g' /etc/locale.gen \
46 |     && locale-gen \
47 |     && update-locale LANG=en_US.UTF-8 LC_ALL=en_US.UTF-8
48 | 
49 | # Install go
50 | RUN cd /tmp && \
51 |     wget -O /tmp/go.tar.gz https://redirector.gvt1.com/edgedl/go/go1.9.2.linux-amd64.tar.gz && \
52 |     tar -C /usr/local -xzf go.tar.gz
53 | 
54 | # Install gcloud
55 | ENV PATH=/google-cloud-sdk/bin:/workspace:${PATH} \
56 |     CLOUDSDK_CORE_DISABLE_PROMPTS=1
57 | 
58 | RUN wget -q https://dl.google.com/dl/cloudsdk/channels/rapid/google-cloud-sdk.tar.gz && \
59 |     tar xzf google-cloud-sdk.tar.gz -C / && \
60 |     rm google-cloud-sdk.tar.gz && \
61 |     /google-cloud-sdk/install.sh \
62 |         --disable-installation-options \
63 |         --bash-completion=false \
64 |         --path-update=false \
65 |         --usage-reporting=false && \
66 |     gcloud components install alpha beta kubectl
67 | 
68 | # Install CRCMOD for gsutil
69 | RUN easy_install -U pip && \
70 |     pip install -U crcmod
71 | 
72 | # Install Helm
73 | RUN wget -O /tmp/get_helm.sh \
74 |     https://raw.githubusercontent.com/kubernetes/helm/master/scripts/get && \
75 |     chmod 700 /tmp/get_helm.sh && \
76 |     /tmp/get_helm.sh && \
77 |     rm /tmp/get_helm.sh
78 | 
79 | # Initialize helm
80 | RUN helm init --client-only
81 | 
82 | # Install ksonnet
83 | RUN curl -o /usr/local/bin/ks -L \
84 |     https://github.com/ksonnet/ksonnet/releases/download/v0.8.0/ks-linux-amd64 && \
85 |     chmod a+x /usr/local/bin/ks
86 | 
87 | # Install various python libraries.
88 | RUN  pip install --upgrade six pyyaml google-api-python-client \
89 |      google-cloud-storage google-auth-httplib2 pylint kubernetes==4.0.0 mock retrying
90 | 
91 | COPY bootstrap.sh /usr/local/bin
92 | RUN chmod a+x /usr/local/bin/bootstrap.sh
93 | 
94 | COPY checkout.sh /usr/local/bin
95 | RUN chmod a+x /usr/local/bin/checkout.sh
96 | 
97 | ENTRYPOINT ["/usr/local/bin/bootstrap.sh"]


--------------------------------------------------------------------------------
/tf-controller-examples/tf-cnn/launcher.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # Copyright 2017 Google Inc. All Rights Reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | """A launcher suitable for invoking tf_cnn_benchmarks using TfJob.
18 | 
19 | All the launcher does is turn TF_CONFIG environment variable
20 | into extra arguments to append to the command line.
21 | """
22 | import logging
23 | import json
24 | import os
25 | import subprocess
26 | import sys
27 | import time
28 | 
29 | def run_and_stream(cmd):
30 |   logging.info("Running %s", " ".join(cmd))
31 |   process = subprocess.Popen(cmd, stdout=subprocess.PIPE,
32 |                              stderr=subprocess.STDOUT)
33 | 
34 |   while process.poll() is None:
35 |     process.stdout.flush()
36 |     if process.stderr:
37 |       process.stderr.flush()
38 |     sys.stderr.flush()
39 |     sys.stdout.flush()
40 |     for line in iter(process.stdout.readline, ''):
41 |       process.stdout.flush()
42 |       logging.info(line.strip())
43 | 
44 |   sys.stderr.flush()
45 |   sys.stdout.flush()
46 |   process.stdout.flush()
47 |   if process.stderr:
48 |     process.stderr.flush()
49 |   for line in iter(process.stdout.readline, ''):
50 |     logging.info(line.strip())
51 | 
52 |   if process.returncode != 0:
53 |     raise ValueError("cmd: {0} exited with code {1}".format(
54 |       " ".join(cmd), process.returncode))
55 | 
56 | if __name__ == "__main__":
57 |   logging.getLogger().setLevel(logging.INFO)
58 |   logging.basicConfig(level=logging.INFO,
59 |                       format=('%(levelname)s|%(asctime)s'
60 |                               '|%(pathname)s|%(lineno)d| %(message)s'),
61 |                       datefmt='%Y-%m-%dT%H:%M:%S',
62 |                       )
63 |   logging.info("Launcher started.")
64 |   tf_config = os.environ.get('TF_CONFIG', '{}')
65 |   tf_config_json = json.loads(tf_config)
66 |   cluster = tf_config_json.get('cluster', {})
67 |   job_name = tf_config_json.get('task', {}).get('type', "")
68 |   task_index = tf_config_json.get('task', {}).get('index', "")
69 | 
70 |   command = sys.argv[1:]
71 |   ps_hosts = ",".join(cluster.get("ps", []))
72 |   worker_hosts = ",".join(cluster.get("worker", []))
73 |   command.append("--job_name=" + job_name)
74 |   command.append("--ps_hosts=" + ps_hosts)
75 |   command.append("--worker_hosts=" + worker_hosts)
76 |   command.append("--task_index={0}".format(task_index))
77 | 
78 |   logging.info("Command to run: %s", " ".join(command))
79 |   with open("/opt/run_benchmarks.sh", "w") as hf:
80 |     hf.write("#!/bin/bash\n")
81 |     hf.write(" ".join(command))
82 |     hf.write("\n")
83 | 
84 |   run_and_stream(command)
85 |   logging.info("Finished: %s", " ".join(command))
86 |   # We don't want to terminate because TfJob will
87 |   # just restart the job.
88 |   while True:
89 |     logging.info("Command ran successfully sleep for ever.")
90 |     time.sleep(600)
91 | 


--------------------------------------------------------------------------------
/testing/prow_artifacts_test.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | import unittest
 4 | import mock
 5 | from testing import prow_artifacts
 6 | import tempfile
 7 | 
 8 | from google.cloud import storage  # pylint: disable=no-name-in-module
 9 | 
10 | class TestProw(unittest.TestCase):
11 |   @mock.patch("testing.prow_artifacts.time.time")
12 |   def testCreateStartedPresubmit(self, mock_time):  # pylint: disable=no-self-use
13 |     """Test create started for presubmit job."""
14 |     mock_time.return_value = 1000
15 | 
16 |     os.environ["REPO_OWNER"] = "fake_org"
17 |     os.environ["REPO_NAME"] = "fake_name"
18 |     os.environ["PULL_PULL_SHA"] = "123abc"
19 |     expected = {
20 |         "timestamp": 1000,
21 |         "repos": {
22 |             "fake_org/fake_name": "123abc",
23 |         },
24 |     }
25 | 
26 |     actual = prow_artifacts.create_started()
27 | 
28 |     self.assertEquals(expected, json.loads(actual))
29 | 
30 |   @mock.patch("testing.prow_artifacts.time.time")
31 |   def testCreateFinished(self, mock_time):  # pylint: disable=no-self-use
32 |     """Test create finished job."""
33 |     mock_time.return_value = 1000
34 | 
35 |     expected = {
36 |         "timestamp": 1000,
37 |         "result": "FAILED",
38 |         "metadata": {},
39 |     }
40 | 
41 |     actual = prow_artifacts.create_finished(False)
42 | 
43 |     self.assertEquals(expected, json.loads(actual))
44 | 
45 |   @mock.patch("testing.prow_artifacts.util.run")
46 |   def testCopyArtifactsPresubmit(self, mock_run):  # pylint: disable=no-self-use
47 |     """Test copy artifacts to GCS."""
48 | 
49 |     os.environ["REPO_OWNER"] = "fake_org"
50 |     os.environ["REPO_NAME"] = "fake_name"
51 |     os.environ["PULL_NUMBER"] = "72"
52 |     os.environ["BUILD_NUMBER"] = "100"
53 |     os.environ["PULL_PULL_SHA"] = "123abc"
54 |     os.environ["JOB_NAME"] = "kubeflow-presubmit"
55 | 
56 |     temp_dir = tempfile.mkdtemp(prefix="tmpTestProwTestCreateFinished.")
57 |     args = ["--artifacts_dir=/tmp/some/dir", "copy_artifacts",
58 |             "--bucket=some_bucket"]
59 |     prow_artifacts.main(args)
60 | 
61 |     mock_run.assert_called_once_with(
62 |       ["gsutil", "-m", "rsync", "-r", "/tmp/some/dir",
63 |        "gs://some_bucket/pr-logs/pull/fake_org_fake_name/72/kubeflow-presubmit"
64 |        "/100"],
65 |     )
66 | 
67 |   def testCreateSymlink(self):
68 |     gcs_client = mock.MagicMock(spec=storage.Client)
69 |     mock_bucket = mock.MagicMock(spec=storage.Bucket)
70 |     gcs_client.get_bucket.return_value = mock_bucket
71 |     mock_blob = mock.MagicMock(spec=storage.Blob)
72 |     mock_bucket.blob.return_value = mock_blob
73 |     # We can't add the decorator the instance method because that would
74 |     # interfere with creating gcs_client since storage.Client would then
75 |     # point to the mock and not the actual class.
76 |     with mock.patch("testing.prow_artifacts.storage.Client") as mock_client:
77 |       mock_client.return_value = gcs_client
78 | 
79 |       os.environ["REPO_OWNER"] = "fake_org"
80 |       os.environ["REPO_NAME"] = "fake_name"
81 |       os.environ["PULL_NUMBER"] = "72"
82 |       os.environ["BUILD_NUMBER"] = "100"
83 |       os.environ["PULL_PULL_SHA"] = "123abc"
84 |       os.environ["JOB_NAME"] = "kubeflow-presubmit"
85 | 
86 |       args = ["--artifacts_dir=/tmp/some/dir", "create_pr_symlink",
87 |               "--bucket=some-bucket"]
88 |       prow_artifacts.main(args)
89 | 
90 |       mock_blob.upload_from_string.assert_called_once_with(
91 |         "gs://some-bucket/pr-logs/pull/fake_org_fake_name/72"
92 |         "/kubeflow-presubmit/100")
93 | 
94 | if __name__ == "__main__":
95 |   unittest.main()
96 | 


--------------------------------------------------------------------------------
/kubeflow/tf-serving/tf-serving.libsonnet:
--------------------------------------------------------------------------------
  1 | local k = import 'k.libsonnet';
  2 | local deployment = k.extensions.v1beta1.deployment;
  3 | local container = deployment.mixin.spec.template.spec.containersType;
  4 | local storageClass = k.storage.v1beta1.storageClass;
  5 | local service = k.core.v1.service;
  6 | local networkPolicy = k.extensions.v1beta1.networkPolicy;
  7 | local networkSpec = networkPolicy.mixin.spec;
  8 | 
  9 | {
 10 |   parts:: {
 11 |     deployment:: {
 12 |       local defaults = {
 13 |         image:: "gcr.io/kubeflow/model-server:1.0",
 14 |         imagePullPolicy:: "IfNotPresent",
 15 |         resources:: {
 16 |           "requests": {
 17 |             "memory": "1Gi",
 18 |             "cpu": "1"
 19 |           },
 20 |           "limits": {
 21 |             "memory": "4Gi",
 22 |             "cpu": "4"
 23 |           },
 24 |         },
 25 |       },
 26 | 
 27 |       modelService(name, namespace, labels={app:name}): {
 28 |         "apiVersion": "v1", 
 29 |         "kind": "Service", 
 30 |         "metadata": {
 31 |           "labels": labels,
 32 |           "name": name,
 33 |            namespace: namespace,
 34 |         }, 
 35 |         "spec": {
 36 |           "ports": [
 37 |             {
 38 |               "port": 9000, 
 39 |               "targetPort": 9000,
 40 |             }
 41 |           ],           
 42 |           "selector": labels,
 43 |           "type": "ClusterIP"
 44 |         }
 45 |       },
 46 | 
 47 |       modelServer(name, namespace, modelPath, labels={app:name},):
 48 |         // TODO(jlewi): Allow the model to be served from a PVC.
 49 |         local volume = {
 50 |           name: "redis-data",
 51 |           namespace: namespace,
 52 |           emptyDir: {}
 53 |         };
 54 |         base(name, namespace, modelPath, labels),
 55 | 
 56 |       local base(name, namespace, modelPath, labels) = 
 57 |       {
 58 |         apiVersion: "extensions/v1beta1",
 59 |         kind: "Deployment",
 60 |         metadata: {
 61 |           name: name,
 62 |           namespace: namespace,
 63 |           labels: labels,
 64 |         },
 65 |         spec: {
 66 |           template: {
 67 |             metadata: {
 68 |               labels: labels
 69 |             },
 70 |             spec: {
 71 |               containers: [
 72 |                 {
 73 |                   name: name,
 74 |                   image: defaults.image,
 75 |                   imagePullPolicy: defaults.imagePullPolicy,
 76 |                   // TODO(jlewi): Talk to owensk to figure out why we wrap in a shell.
 77 |                   command: [
 78 |                     "/bin/sh", 
 79 |                     "-c"
 80 |                   ], 
 81 |                   args: [
 82 |                     "/usr/bin/tensorflow_model_server --port=9000 --model_name=" + name + " --model_base_path=" + modelPath,
 83 |                   ], 
 84 |                   env: [],
 85 |                   ports: [
 86 |                     {                      
 87 |                       containerPort: 9000,
 88 |                     },
 89 |                   ],
 90 |                   // TODO(jlewi): We should add readiness and liveness probes. I think the blocker is that
 91 |                   // model-server doesn't have something we can use out of the box.                  
 92 |                   resources: defaults.resources,
 93 |                 },
 94 |               ],
 95 |               // See:  https://github.com/google/kubeflow/tree/master/components/k8s-model-server#set-the-user-optional
 96 |               // The is user and group should be defined in the Docker image.
 97 |               // Per best practices we don't run as the root user.
 98 |               securityContext: {
 99 |                 runAsUser: 1000,
100 |                 fsGroup: 1000,
101 |               },
102 |             },
103 |           },
104 |         },
105 |       },
106 |     },
107 |   },
108 | }
109 | 


--------------------------------------------------------------------------------
/kubeflow/core/prototypes/all.jsonnet:
--------------------------------------------------------------------------------
 1 | // @apiVersion 0.1
 2 | // @name io.ksonnet.pkg.kubeflow-core
 3 | // @description Kubeflow core components
 4 | // @shortDescription Kubeflow core components. This currently includes JupyterHub and the TfJob controller.
 5 | // @param name string Name to give to each of the components
 6 | // @optionalParam namespace string default Namespace
 7 | // @optionalParam disks string null Comma separated list of Google persistent disks to attach to jupyter environments.
 8 | // @optionalParam cloud string null String identifying the cloud to customize the deployment for.
 9 | // @optionalParam tfJobImage string gcr.io/tf-on-k8s-dogfood/tf_operator:v20180117-04425d9-dirty-e3b0c44 The image for the TfJob controller.
10 | // @optionalParam tfDefaultImage string null The default image to use for TensorFlow.
11 | // @optionalParam tfJobUiServiceType string ClusterIP The service type for the UI.
12 | // @optionalParam jupyterHubServiceType string ClusterIP The service type for Jupyterhub.
13 | 
14 | // TODO(https://github.com/ksonnet/ksonnet/issues/222): We have to add namespace as an explicit parameter
15 | // because ksonnet doesn't support inheriting it from the environment yet.
16 | 
17 | local k = import 'k.libsonnet';
18 | local jupyter = import "kubeflow/core/jupyterhub.libsonnet";
19 | local tfjob = import "kubeflow/core/tf-job.libsonnet";
20 | local nfs = import "kubeflow/core/nfs.libsonnet";
21 | 
22 | local name = import 'param://name';
23 | local namespace = import 'param://namespace';
24 | 
25 | local cloud = import 'param://cloud';
26 | 
27 | // TODO(jlewi): Make this a parameter
28 | local jupyterHubImage = 'gcr.io/kubeflow/jupyterhub:1.0';
29 | local diskParam = import 'param://disks';
30 | 
31 | local diskNames = if diskParam != "null" && std.length(diskParam) > 0 then
32 |   std.split(diskParam, ',')
33 |   else [];
34 | 
35 | local jupyterConfigMap = if std.length(diskNames) == 0 then
36 | 	jupyter.parts(namespace).jupyterHubConfigMap
37 | 	else jupyter.parts(namespace).jupyterHubConfigMapWithVolumes(diskNames);
38 | 
39 | local tfJobImage = import 'param://tfJobImage';
40 | local tfDefaultImage = import 'param://tfDefaultImage';
41 | local tfJobUiServiceType = import 'param://tfJobUiServiceType';
42 | local jupyterHubServiceType = import 'param://jupyterHubServiceType';
43 | 
44 | // Create a list of the resources needed for a particular disk
45 | local diskToList = function(diskName) [
46 | 	nfs.parts(namespace, name,).diskResources(diskName).storageClass,
47 | 	nfs.parts(namespace, name,).diskResources(diskName).volumeClaim,
48 | 	nfs.parts(namespace, name,).diskResources(diskName).service,
49 | 	nfs.parts(namespace, name,).diskResources(diskName).provisioner];
50 | 
51 | local allDisks = std.flattenArrays(std.map(diskToList, diskNames));
52 | 
53 | local nfsComponents =
54 | 	if std.length(allDisks) > 0 then
55 | 	[nfs.parts(namespace, name).serviceAccount,
56 | 	 nfs.parts(namespace, name).role,
57 | 	 nfs.parts(namespace, name).roleBinding,
58 | 	 nfs.parts(namespace, name).clusterRoleBinding,] + allDisks
59 | 	else 
60 | 	[];
61 | 
62 | std.prune(k.core.v1.list.new([
63 | 	// jupyterHub components
64 | 	jupyterConfigMap,
65 |     jupyter.parts(namespace).jupyterHubService, 
66 |     jupyter.parts(namespace).jupyterHubLoadBalancer(jupyterHubServiceType),
67 |     jupyter.parts(namespace).jupyterHub(jupyterHubImage),
68 |     jupyter.parts(namespace).jupyterHubRole,
69 |     jupyter.parts(namespace).jupyterHubServiceAccount,
70 |     jupyter.parts(namespace).jupyterHubRoleBinding,    
71 | 
72 |     // TfJob controller
73 |     tfjob.parts(namespace).tfJobDeploy(tfJobImage), 
74 |     tfjob.parts(namespace).configMap(cloud, tfDefaultImage),
75 |     tfjob.parts(namespace).serviceAccount,
76 |     tfjob.parts(namespace).operatorRole,
77 |     tfjob.parts(namespace).operatorRoleBinding,
78 | 
79 |     // TfJob controll ui
80 |     tfjob.parts(namespace).ui(tfJobImage),
81 |     tfjob.parts(namespace).uiService(tfJobUiServiceType),
82 |     tfjob.parts(namespace).uiServiceAccount,
83 |     tfjob.parts(namespace).uiRole,
84 |     tfjob.parts(namespace).uiRoleBinding,
85 | ] + nfsComponents))
86 | 
87 | 


--------------------------------------------------------------------------------
/kubeflow/tf-job/prototypes/tf-cnn-benchmarks.jsonnet:
--------------------------------------------------------------------------------
  1 | // @apiVersion 0.1
  2 | // @name io.ksonnet.pkg.tf-cnn
  3 | // @description A TensorFlow CNN Benchmarking job
  4 | // @shortDescription Run the TensorFlow CNN benchmarking job.
  5 | // @param name string Name for the job.
  6 | // @optionalParam namespace string default Namespace
  7 | // @optionalParam batch_size number 32 The batch size
  8 | // @optionalParam model string resnet50 Which model to use
  9 | // @optionalParam num_gpus number 0 The number of GPUs to attach to workers.
 10 | // @optionalParam image string gcr.io/kubeflow/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3 The docker image to use for the job.
 11 | // @optionalParam image_gpu string gcr.io/kubeflow/tf-benchmarks-gpu:v20171202-bdab599-dirty-284af3 The docker image to use when using GPUs.
 12 | // @optionalParam num_ps number 1 The number of ps to use
 13 | // @optionalParam num_workers number 1 The number of workers to use
 14 | 
 15 | // We need at least 1 parameter server.
 16 | 
 17 | // TODO(jlewi): Should we move this into an examples package?
 18 | 
 19 | // TODO(https://github.com/ksonnet/ksonnet/issues/222): We have to add namespace as an explicit parameter
 20 | // because ksonnet doesn't support inheriting it from the environment yet.
 21 | 
 22 | local k = import 'k.libsonnet';
 23 | local deployment = k.extensions.v1beta1.deployment;
 24 | local container = deployment.mixin.spec.template.spec.containersType;
 25 | local podTemplate = k.extensions.v1beta1.podTemplate;
 26 | 
 27 | local tfJob = import 'kubeflow/tf-job/tf-job.libsonnet';
 28 | 
 29 | local name = import 'param://name';
 30 | local namespace = import 'param://namespace';
 31 | 
 32 | local numGpus = import 'param://num_gpus';
 33 | local batchSize = import 'param://batch_size';
 34 | local model = import 'param://model';
 35 | 
 36 | local args = [
 37 | 	"python",
 38 | 	"tf_cnn_benchmarks.py",
 39 | 	"--batch_size=" + batchSize,
 40 | 	"--model=" + model,
 41 |     "--variable_update=parameter_server",
 42 |     "--flush_stdout=true",
 43 | ] +
 44 | if numGpus == 0 then
 45 |   # We need to set num_gpus=1 even if not using GPUs because otherwise the devie list
 46 |   # is empty because of this code
 47 |   # https://github.com/tensorflow/benchmarks/blob/master/scripts/tf_cnn_benchmarks/benchmark_cnn.py#L775
 48 |   # We won't actually use GPUs because based on other flags no ops will be assigned to GPus.
 49 |   ["--num_gpus=1",
 50 |    "--local_parameter_device=cpu",
 51 |    "--device=cpu",
 52 |    "--data_format=NHWC",]
 53 | else
 54 |   ["--num_gpus=" + numGpus,
 55 |   ]
 56 |  ; 
 57 | 
 58 | local image = import 'param://image';
 59 | local imageGpu = import 'param://image_gpu';
 60 | local numPs = import 'param://num_ps';
 61 | local numWorkers = import 'param://num_workers';
 62 | local numGpus = import 'param://num_gpus';
 63 | 
 64 | local workerSpec = if numGpus > 0 then
 65 |   	tfJob.parts.tfJobReplica("WORKER", numWorkers, args, imageGpu, numGpus)
 66 |   	else
 67 |   	tfJob.parts.tfJobReplica("WORKER", numWorkers, args, image);
 68 | 
 69 | // TODO(jlewi): Look at how the redis prototype modifies a container by
 70 | // using mapContainersWithName. Can we do something similar?
 71 | // https://github.com/ksonnet/parts/blob/9d78d6bb445d530d5b927656d2293d4f12654608/incubator/redis/redis.libsonnet
 72 | local replicas = std.map(function(s)   
 73 |   s + {
 74 |     template+: {
 75 |       spec+:  {
 76 |         // TODO(jlewi): Does this overwrite containers? 
 77 |         containers: [
 78 |           s.template.spec.containers[0] + {
 79 |             workingDir: "/opt/tf-benchmarks/scripts/tf_cnn_benchmarks",
 80 |           },
 81 |         ]
 82 |       }
 83 |     },
 84 |   },
 85 |   std.prune([workerSpec, tfJob.parts.tfJobReplica("PS", numPs, args, image)]));
 86 | 
 87 | local job = 
 88 | 	if numWorkers < 1 then
 89 | 	error "num_workers must be >= 1"
 90 | 	else 
 91 | 	if numPs < 1 then
 92 | 	error "num_ps must be >= 1"
 93 | 	else
 94 | 	tfJob.parts.tfJob(name, namespace, replicas) + {
 95 |     spec+: {
 96 | 				tfImage: image,
 97 | 				terminationPolicy: {chief:{replicaName: "WORKER", replicaIndex: 0 }}
 98 | 		}};
 99 | 
100 | std.prune(k.core.v1.list.new([job]))
101 | 


--------------------------------------------------------------------------------
/components/tf-controller/deploy_crd.yaml:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Google Inc.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     https://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | ---
 16 | # Source: tf-job-operator-chart/templates/config.yaml
 17 | 
 18 | apiVersion: v1
 19 | kind: ConfigMap
 20 | metadata:
 21 |   name: tf-job-operator-config
 22 | data:
 23 |   controller_config_file.yaml: |
 24 |     grpcServerFilePath: /opt/mlkube/grpc_tensorflow_server/grpc_tensorflow_server.py
 25 |     accelerators:
 26 |       alpha.kubernetes.io/nvidia-gpu:
 27 |         volumes:
 28 |           - name: nvidia-libraries
 29 |             mountPath: /usr/local/nvidia/lib64 # This path is special; it is expected to be present in `/etc/ld.so.conf` inside the container image.
 30 |             hostPath: /home/kubernetes/bin/nvidia/lib
 31 |           - name: nvidia-debug-tools # optional
 32 |             mountPath: /usr/local/bin/nvidia
 33 |             hostPath: /home/kubernetes/bin/nvidia/bin
 34 | 
 35 | ---
 36 | # Source: tf-job-operator-chart/templates/service-account.yaml
 37 | 
 38 | apiVersion: v1
 39 | kind: ServiceAccount
 40 | metadata:
 41 |   name: tf-job-operator
 42 |   labels:
 43 |     app: tf-job-operator
 44 | 
 45 | ---
 46 | # Source: tf-job-operator-chart/templates/rbac.yaml
 47 | 
 48 | apiVersion: rbac.authorization.k8s.io/v1beta1
 49 | kind: ClusterRole
 50 | metadata:
 51 |   name: tf-job-operator
 52 |   labels:
 53 |     app: tf-job-operator
 54 | rules:
 55 | - apiGroups:
 56 |   - tensorflow.org
 57 |   resources:
 58 |   - tfjobs
 59 |   verbs:
 60 |   - "*"
 61 | - apiGroups:
 62 |   - apiextensions.k8s.io
 63 |   resources:
 64 |   - customresourcedefinitions
 65 |   verbs:
 66 |   - "*"
 67 | - apiGroups:
 68 |   - storage.k8s.io
 69 |   resources:
 70 |   - storageclasses
 71 |   verbs:
 72 |   - "*"
 73 | - apiGroups:
 74 |   - batch
 75 |   resources:
 76 |   - jobs
 77 |   verbs:
 78 |   - "*"
 79 | - apiGroups:
 80 |   - ""
 81 |   resources:
 82 |   - configmaps
 83 |   - pods
 84 |   - services
 85 |   - endpoints
 86 |   - persistentvolumeclaims
 87 |   - events
 88 |   verbs:
 89 |   - "*"
 90 | - apiGroups:
 91 |   - apps
 92 |   - extensions
 93 |   resources:
 94 |   - deployments
 95 |   verbs:
 96 |   - "*"
 97 | ---
 98 | kind: ClusterRoleBinding
 99 | apiVersion: rbac.authorization.k8s.io/v1beta1
100 | metadata:
101 |   name: tf-job-operator
102 |   labels:
103 |     app: tf-job-operator
104 | subjects:
105 | - kind: ServiceAccount
106 |   name: tf-job-operator
107 |   namespace: default
108 | roleRef:
109 |   apiGroup: rbac.authorization.k8s.io
110 |   kind: ClusterRole
111 |   name: tf-job-operator
112 | 
113 | 
114 | ---
115 | # Source: tf-job-operator-chart/templates/deployment.yaml
116 | apiVersion: extensions/v1beta1
117 | kind: Deployment
118 | metadata:
119 |   name: tf-job-operator
120 | spec:
121 |   replicas: 1
122 |   template:
123 |     metadata:
124 |       labels:
125 |         name: tf-job-operator
126 |     spec:
127 |       serviceAccountName: tf-job-operator
128 |       containers:
129 |       - name: tf-job-operator
130 |         image: gcr.io/tf-on-k8s-dogfood/tf_operator:v20171129-f8ec762
131 |         command:
132 |           - /opt/mlkube/tf_operator
133 |           - --controller_config_file=/etc/config/controller_config_file.yaml
134 |           - -alsologtostderr
135 |           - -v=1
136 |         env:
137 |         - name: MY_POD_NAMESPACE
138 |           valueFrom:
139 |             fieldRef:
140 |               fieldPath: metadata.namespace
141 |         - name: MY_POD_NAME
142 |           valueFrom:
143 |             fieldRef:
144 |               fieldPath: metadata.name
145 | 
146 |         volumeMounts:
147 |           - name: config-volume
148 |             mountPath: /etc/config
149 |       volumes:
150 |         - name: config-volume
151 |           configMap:
152 |             name: tf-job-operator-config
153 | 


--------------------------------------------------------------------------------
/tf-controller-examples/tf-cnn/create_job_specs.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # Copyright 2017 Google Inc. All Rights Reserved.
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #     http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | 
 17 | """A simple script to generate TfJob templates based on various parameters."""
 18 | 
 19 | import argparse
 20 | import datetime
 21 | import logging
 22 | import yaml
 23 | 
 24 | TF_JOB_GROUP = "tensorflow.org"
 25 | TF_JOB_VERSION = "v1alpha1"
 26 | TF_JOB_PLURAL = "tfjobs"
 27 | TF_JOB_KIND = "TfJob"
 28 | 
 29 | # See https://stackoverflow.com/questions/21016220/is-it-possible-to-emit-valid-yaml-with-anchors-references-disabled-using-ruby
 30 | class ExplicitDumper(yaml.SafeDumper):
 31 |   """A dumper that will never emit aliases."""
 32 | 
 33 |   def ignore_aliases(self, data):
 34 |     return True
 35 | 
 36 | if __name__ == "__main__":
 37 |   logging.getLogger().setLevel(logging.INFO) # pylint: disable=too-many-locals
 38 |   parser = argparse.ArgumentParser(description="Create TfJob specs.")
 39 | 
 40 |   parser.add_argument(
 41 |     "--cpu_image",
 42 |       type=str,
 43 |       required=True,
 44 |       help="The docker image for CPU jobs.")
 45 | 
 46 |   parser.add_argument(
 47 |     "--gpu_image",
 48 |       type=str,
 49 |       required=True,
 50 |       help="The docker image for GPU jobs.")
 51 | 
 52 |   parser.add_argument(
 53 |     "--num_workers",
 54 |       type=int,
 55 |       default=1,
 56 |       help="The number of workers to use.")
 57 | 
 58 |   parser.add_argument(
 59 |     "--output",
 60 |       type=str,
 61 |       help="(Optional) the file to write the template to.")
 62 | 
 63 |   parser.add_argument("--gpu", dest="use_gpu", action="store_true",
 64 |                       help="Use gpus.")
 65 |   parser.add_argument("--no-gpu", dest="use_gpu", action="store_false",
 66 |                       help="Do not use gpus.")
 67 | 
 68 |   parser.set_defaults(use_gpu=True)
 69 | 
 70 |   args = parser.parse_args()
 71 | 
 72 |   namespace = "default"
 73 |   job_name = "inception-" + datetime.datetime.now().strftime("%y%m%d-%H%M%S")
 74 |   if args.use_gpu:
 75 |     job_name += "-gpu"
 76 |   else:
 77 |     job_name += "-cpu"
 78 | 
 79 |   job_name += "-{0}".format(args.num_workers)
 80 | 
 81 |   body = {}
 82 |   body['apiVersion'] = TF_JOB_GROUP + "/" + TF_JOB_VERSION
 83 |   body['kind'] = TF_JOB_KIND
 84 |   body['metadata'] = {}
 85 |   body['metadata']['name'] = job_name
 86 |   body['metadata']['namespace'] = namespace
 87 | 
 88 |   clone_on_cpu = not args.use_gpu
 89 | 
 90 |   body["spec"] = {}
 91 |   body["spec"]["replicaSpecs"] = []
 92 | 
 93 |   working_dir = "/opt/tf-benchmarks/scripts/tf_cnn_benchmarks"
 94 | 
 95 |   num_workers = args.num_workers
 96 |   num_ps = 1
 97 | 
 98 |   command = [
 99 |      "python",
100 |      "tf_cnn_benchmarks.py",
101 |      "--batch_size=32",
102 |      "--model=resnet50",
103 |      "--variable_update=parameter_server",
104 |      # tf_cnn_benchmarks uses print for logging and if we
105 |      # don't set flush_stdout the buffer isn't outputted
106 |      # until the program ends..
107 |      "--flush_stdout=true",
108 |   ]
109 | 
110 |   if args.use_gpu:
111 |     command.append("--num_gpus=1")
112 |   else:
113 |     # We need to set num_gpus=1 even if not using GPUs because otherwise the devie list
114 |     # is empty because of this code
115 |     # https://github.com/tensorflow/benchmarks/blob/master/scripts/tf_cnn_benchmarks/benchmark_cnn.py#L775
116 |     command.append("--num_gpus=1")
117 |     command.append("--local_parameter_device=cpu")
118 |     command.append("--device=cpu")
119 |     command.append("--data_format=NHWC")
120 | 
121 |   # Add the master spec.
122 |   # The master only acts as the chief and doesn't do any training so it can always use the CPU image.
123 |   master_spec = {
124 |     "replicas": 1,
125 |     "tfReplicaType": "MASTER",
126 |     "template": {
127 |       "spec": {
128 |         "containers": [
129 |           {
130 |             "image": args.cpu_image,
131 |             "name": "tensorflow",
132 |             "workingDir": working_dir,
133 |             "args": command,
134 |           }
135 |         ],
136 |         "restartPolicy": "OnFailure",
137 |       }
138 |     }
139 |   }
140 | 
141 |   body["spec"]["replicaSpecs"].append(master_spec)
142 | 
143 |   worker_image = args.cpu_image
144 |   if args.use_gpu:
145 |     worker_image = args.gpu_image
146 | 
147 |   worker_spec = {
148 |     "replicas": num_workers,
149 |     "tfReplicaType": "WORKER",
150 |     "template": {
151 |       "spec": {
152 |         "containers": [
153 |           {
154 |             "image": worker_image,
155 |             "name": "tensorflow",
156 |             "workingDir": working_dir,
157 |             "args": command,
158 |           }
159 |         ],
160 |         "restartPolicy": "OnFailure",
161 |       }
162 |     }
163 |   }
164 | 
165 |   if args.use_gpu:
166 |     worker_spec["template"]["spec"]["containers"][0]["resources"] = {
167 |       "limits": {
168 |         "nvidia.com/gpu": 1,
169 |       }
170 |     }
171 | 
172 |   body["spec"]["replicaSpecs"].append(worker_spec)
173 | 
174 |   ps_spec = {
175 |     "replicas": num_ps,
176 |     "tfReplicaType": "PS",
177 |     "template": {
178 |       "spec": {
179 |         "containers": [
180 |           {
181 |             "image": args.cpu_image,
182 |             "name": "tensorflow",
183 |             "workingDir": working_dir,
184 |             "args": command,
185 |           }
186 |         ],
187 |         "restartPolicy": "OnFailure",
188 |       }
189 |     }
190 |   }
191 | 
192 |   body["spec"]["replicaSpecs"].append(ps_spec)
193 | 
194 |   body["spec"]["tfImage"] = args.cpu_image
195 | 
196 |   # Tensorboard is crashing with TF 1.5
197 |   # body["spec"]["tensorBoard"] = {
198 |   #   "logDir": job_dir
199 |   # }
200 | 
201 |   spec = yaml.dump(body, Dumper=ExplicitDumper, default_flow_style=False)
202 | 
203 |   if args.output:
204 |     logging.info("Writing to %s", args.output)
205 |     with open(args.output, "w") as hf:
206 |       hf.write(spec)
207 |   else:
208 |     print(spec)
209 | 


--------------------------------------------------------------------------------
/testing/README.md:
--------------------------------------------------------------------------------
  1 | # Test Infrastructure
  2 | 
  3 | This directory contains the Kubeflow test Infrastructure.
  4 | 
  5 | This is a work in progress see [google/kubeflow#38](https://github.com/google/kubeflow/issues/38)
  6 | 
  7 | The current thinking is this will work as follows
  8 | 
  9 |   * Prow will be used to trigger E2E tests
 10 |   * The E2E test will launch an Argo workflow that describes the tests to run
 11 |   * Each step in the Argo workflow will be a binary invoked inside a container
 12 |   * The Argo workflow will use an NFS volume to attach a shared POSIX compliant filesystem to each step in the 
 13 |     workflow.
 14 |   * Each step in the pipeline can write outputs and junit.xml files to a test directory in the volume
 15 |   * A final step in the Argo pipeline will upload the outputs to GCS so they are available in gubernator
 16 | 
 17 | ## Accessing Argo UI
 18 | 
 19 | You can access the Argo UI over the API Server proxy.
 20 | 
 21 | We currently use the cluster
 22 | 
 23 | ```
 24 | PROJECT=mlkube-testing
 25 | ZONE=us-east1-d
 26 | CLUSTER=kubeflow-testing
 27 | NAMESPACE=kubeflow-test-infra
 28 | ```
 29 | 
 30 | After starting `kubectl proxy` on `127.0.0.1:8001`, you can connect to the argo UI via the local proxy at
 31 | 
 32 | ```
 33 | http://127.0.0.1:8001/api/v1/proxy/namespaces/kubeflow-test-infra/services/argo-ui:80/
 34 | ```
 35 | 
 36 | TODO(jlewi): We can probably make the UI publicly available since I don't think it offers any ability to launch workflows.
 37 | 
 38 | 
 39 | ## Running the tests
 40 | 
 41 | ### Run a presubmit
 42 | 
 43 | ```
 44 | ks param set workflows name e2e-test-pr-`date '+%Y%m%d-%H%M%S'`
 45 | ks param set workflows prow_env REPO_OWNER=google,REPO_NAME=kubeflow,PULL_NUMBER=${PULL_NUMBER},PULL_PULL_SHA=${COMMIT}
 46 | ks param set workflows commit ${COMMIT}
 47 | ks apply prow -c workflows
 48 | ```
 49 | 	* You can set COMMIT to `pr` to checkout the latest change on the PR.
 50 | 
 51 | ### Run a postsubmit
 52 | 
 53 | ```
 54 | ks param set workflows name e2e-test-postsubmit-`date '+%Y%m%d-%H%M%S'`
 55 | ks param set workflows prow_env REPO_OWNER=google,REPO_NAME=kubeflow,PULL_BASE_SHA=${COMMIT}
 56 | ks param set workflows commit ${COMMIT}
 57 | ks apply prow -c workflows
 58 | ```
 59 |   * You can set COMMIT to `master` to use HEAD
 60 | 
 61 | 
 62 | ## Setting up the Test Infrastructure
 63 | 
 64 | Our tests require a K8s cluster with Argo installed. This section provides the instructions 
 65 | for setting this.
 66 | 
 67 | Create a GKE cluster
 68 | 
 69 | ```
 70 | PROJECT=mlkube-testing
 71 | ZONE=us-east1-d
 72 | CLUSTER=kubeflow-testing
 73 | NAMESPACE=kubeflow-test-infra
 74 | 
 75 | gcloud --project=${PROJECT} container clusters create \
 76 | 	--zone=${ZONE} \
 77 | 	--machine-type=n1-standard-8 \
 78 | 	--cluster-version=1.8.4-gke.1 \
 79 | 	${CLUSTER}
 80 | ```
 81 | 
 82 | 
 83 | ### Create a GCP service account
 84 | 	
 85 | 	* The tests need a GCP service account to upload data to GCS for Gubernator
 86 | 
 87 | 	```
 88 | 	SERVICE_ACCOUNT=kubeflow-testing
 89 | 	gcloud iam service-accounts --project=mlkube-testing create ${SERVICE_ACCOUNT} --display-name "Kubeflow testing account"
 90 | 	gcloud projects add-iam-policy-binding ${PROJECT} \
 91 |     	--member serviceAccount:${SERVICE_ACCOUNT}@${PROJECT}.iam.gserviceaccount.com --role roles/container.developer
 92 | 	```
 93 | 		* The service account needs to be able to create K8s resources as part of the test.
 94 | 
 95 | 
 96 | 	Create a secret key for the service account
 97 | 
 98 | 	```
 99 | 	gcloud iam service-accounts keys create ~/tmp/key.json \
100 |     	--iam-account ${SERVICE_ACCOUNT}@${PROJECT}.iam.gserviceaccount.com
101 |     kubectl create secret generic kubeflow-testing-credentials \
102 |         --namespace=kubeflow-test-infra --from-file=`echo ~/tmp/key.json`
103 |     rm ~/tmp/key.json
104 | 	```
105 | 
106 | 	Make the service account a cluster admin
107 | 
108 | 	```
109 | 	kubectl create clusterrolebinding  ${SERVICE_ACCOUNT}-admin --clusterrole=cluster-admin  \
110 | 		--user=${SERVICE_ACCOUNT}@${PROJECT}.iam.gserviceaccount.com 
111 | 	```
112 | 		* The service account is used to deploye Kubeflow which entails creating various roles; so 
113 | 		  it needs sufficient RBAC permission to do so.
114 | 
115 | ### Create a GitHub Token
116 | 
117 | You need to use a GitHub token with ksonnet otherwise the test quickly runs into GitHub API limits.
118 | 
119 | TODO(jlewi): We should create a GitHub bot account to use with our tests and then create API tokens for that bot.
120 | 
121 | You can use the GitHub API to create a token
122 | 
123 |    * The token doesn't need any scopes because its only accessing public data and is just need for API metering.
124 | 
125 | To create the secret run
126 | 
127 | ```
128 | kubectl create secret generic github-token --namespace=kubeflow-test-infra --from-literal=github_token=${TOKEN}
129 | ```
130 | 
131 | ### Create a PD for NFS
132 | 
133 | Create a PD to act as the backing storage for the NFS filesystem that will be used to store data from
134 | the test runs.
135 | 
136 | ```
137 |   gcloud --project=${PROJECT} compute disks create  \
138 |   	--zone=${ZONE} kubeflow-testing --description="PD to back NFS storage for kubeflow testing." --size=1TB
139 | ```
140 | ### Create K8s Resources for Testing
141 | 
142 | The ksonnet app `test-infra` contains ksonnet configs to deploy the test infrastructure.
143 | 
144 | You can deploy argo as follows (you don't need to use argo's CLI)
145 | 
146 | ```
147 | ks apply prow -c argo
148 | ```  
149 | 
150 | Deploy NFS & Jupyter
151 | 
152 | ```
153 | ks apply prow -c nfs-jupyter
154 | ```
155 | 
156 | 	* This creates the NFS share
157 | 	* We use JupyterHub as a convenient way to access the NFS share for manual inspection of the file contents.
158 | 
159 | #### Troubleshooting
160 | 
161 | User or service account deploying the test infrastructure needs sufficient permissions to create the roles that are created as part deploying the test infrastructure. So you may need to run the following command before using ksonnet to deploy the test infrastructure.
162 | 
163 | ```
164 | kubectl create clusterrolebinding default-admin --clusterrole=cluster-admin --user=user@gmail.com
165 | ```
166 | 
167 | ##### Operator Logs
168 | 
169 | The following Stackdriver filter can be used to get the pod logs for the operator
170 | 
171 | ```
172 | resource.type="container"
173 | resource.labels.namespace_id="e2e-0117-1911-3a53"
174 | resource.labels.container_name="tf-job-operator"
175 | ```
176 | 
177 | ## Managing namespaces
178 | 
179 | All namespaces created for the tests should be labeled with `app=kubeflow-e2e-test`.
180 | 
181 | This can be used to manually delete old namespaces that weren't properly garbage collected.
182 | 


--------------------------------------------------------------------------------
/components/jupyterhub/README.md:
--------------------------------------------------------------------------------
  1 | # Jupyter and Jupyterhub
  2 | 
  3 | ## Background
  4 | 
  5 | ### Jupyter
  6 | Jupyter (formerly iPython Notebook) is a UI tool commonly used with Spark, Tensorflow and other big data processing frameworks. It is used
  7 | by data scientists and ML engineers across a variety of organizations for interactive tasks. It supports multiple languages through runners,
  8 | and allows users to run code, save code/results, and share “notebooks” with both code, documentation and output easily. 
  9 | 
 10 | ### JupyterHub
 11 | JupyterHub lets users manage authenticated access to multiple single-user Jupyter notebooks. JupyterHub delegates the launching of
 12 | single-user notebooks to pluggable components called “spawners”. JupyterHub has a sub-project named kubespawner, maintained by the
 13 | community, that enables users to provision single-user Jupyter notebooks backed by Kubernetes pods - the notebooks themselves are
 14 | Kubernetes pods.
 15 | 
 16 | ## Quick Start
 17 | 
 18 | Refer to the [user_guide](../../user_guide.md) for instructions on deploying JupyterHub via ksonnet.
 19 | 
 20 | Once that's completed, you will have a StatefulSet for Jupyterhub, a configmap for configuration, and a LoadBalancer type of service, in addition to the requisite RBAC roles.
 21 | If you are on Google Kubernetes Engine, the LoadBalancer type of service automatically creates an external IP address that can be 
 22 | used to access the notebook. Note that this is for illustration purposes only, and must be coupled with [SSL](http://jupyterhub.readthedocs.io/en/0.8.1/getting-started/security-basics.html?highlight=ssl#ssl-encryption) and configured to use an
 23 | [authentication plugin](https://github.com/willingc/jhubdocs/blob/master/jupyterhub/docs/source/authenticators.md) in production environments.
 24 | 
 25 | if you're testing and want to avoid exposing JupyterHub on an external IP address, you can use kubectl instead to gain access to the hub on your local machine.
 26 | 
 27 | ```commandline
 28 | kubectl port-forward <jupyterhub-pod-name> 8000:8000
 29 | ``` 
 30 | 
 31 | The above will expose JupyterHub on http://localhost:8000. The pod name can be obtained by running `kubectl get pods`, and will be `tf-hub-0` by default. 
 32 | 
 33 | ## Configuration
 34 | 
 35 | Configuration for JupyterHub is shipped separately and contained within the configmap defined by the [core componenent](https://github.com/google/kubeflow/tree/master/kubeflow). It is a Python file that is consumed by JupyterHub on starting up. The supplied configuration has reasonable defaults for the requisite fields and **no authenticator** configured by default. Furthermore, we provide a number of parameters that can be used to configure
 36 | the core component. To see a list of ksonnet parameters run
 37 | 
 38 | ```
 39 | ks prototype describe kubeflow-core
 40 | ```
 41 | 
 42 | If the provided parameters don't provide the flexibility you need, you can take advantage of ksonnet to customize the core component and use a config file fully specified by you.
 43 | 
 44 | Configuration includes sections for KubeSpawner and Authenticators. Spawner parameters include the form used when provisioning new 
 45 | Jupyter notebooks, and configuration defining how JupyterHub creates and interacts with Kubernetes pods for individual notebooks. 
 46 | Authenticator parameters correspond to the authentication mechanism used by JupyterHub. 
 47 | 
 48 |  
 49 | ## Usage
 50 | 
 51 | If you're using the quick-start, the external IP address of the JupyterHub instance can be obtained from `kubectl get svc`.
 52 | ```commandline
 53 |  kubectl get svc
 54 |  
 55 | NAME         TYPE           CLUSTER-IP      EXTERNAL-IP     PORT(S)        AGE
 56 | tf-hub-0       ClusterIP      None            <none>          <none>         1h
 57 | tf-hub-lb    LoadBalancer   10.43.246.148   xx.yy.zz.ww   80:32689/TCP   36m
 58 | ``` 
 59 | 
 60 | Now, you can access http://xx.yy.zz.ww with your browser. When trying to spawn a new image, a configuration page should pop up, allowing configuration of the notebook image, CPU, Memory, and additional resources. With the default `DummyAuthenticator`, it should allow any username/password to access the hub and create new notebooks. You can use an authenticator plugin if you want to secure your notebook server and use its administration functionality. 
 61 | 
 62 | ## Customization
 63 | 
 64 | ### Using your own hub image
 65 | 
 66 | An image with JupyterHub 0.8.1, kubespawner 0.7.1 and two simple authenticator plugins can be built from within the `docker/` directory using the Makefile provided. For example, if you're using Google Cloud Platform and have a project with ID `foo` configured to use gcr.io, you can do the following: 
 67 | 
 68 | ```commandline
 69 | make build PROJECT_ID=foo
 70 | make push PROJECT_ID=foo
 71 | ```
 72 | 
 73 | ### Notebook image
 74 | 
 75 | Images published under https://github.com/jupyter/docker-stacks should work directly with the Hub. The only requirement for the jupyter
 76 | notebook images that can be used in conjunction with this instance of Hub is that the same version of JupyterHub must be installed (0.8.1 by default), and that there must be a standard `start-singleuser.sh` accessible via the default PATH. 
 77 | 
 78 | ### GitHub OAuth Setup
 79 | 
 80 | After creating the initial Hub and exposing it on a public IP address, you can add GitHub based authentication. First, you'll need to create a [GitHub oauth application](https://github.com/settings/applications/new). The callback URL would be of the form `http://xx.yy.zz.ww/hub/oauth_callback`. 
 81 | 
 82 | Once the GitHub application is created, update the `manifest/config.yaml` with the `callback_url`, `client_id` and `client_secret` obtained from the GitHub UI. Ensure that the `DummyAuthenticator` is commented out and replaced by the `GitHubOAuthenticator` options. At the end, the authenticator configuration section might look like:
 83 | 
 84 | ```commandline
 85 | c.JupyterHub.authenticator_class = GitHubOAuthenticator
 86 | c.GitHubOAuthenticator.oauth_callback_url = 'http://xx.yy.zz.ww/hub/oauth_callback'
 87 | c.GitHubOAuthenticator.client_id = 'client_id_here'
 88 | c.GitHubOAuthenticator.client_secret = 'client_secret_here'
 89 | ```
 90 | 
 91 | Finally, you can update the configuration and ensure that the new configuration is picked up, by doing the following:
 92 | 
 93 | ```commandline
 94 | ks apply ${ENVIRONMENT} -c ${COMPONENT_NAME}
 95 | kubectl delete pod tf-hub-0
 96 | ```
 97 | 
 98 | The pod will come up with the new configuration and be configured to use the GitHub authenticator you specified in the previous step. You can additionally modify the configuration to add whitelists and admin users. For example, to limit it to only GitHub users user1 and user2, one might use the following configuration:
 99 | 
100 | ```
101 | c.Authenticator.whitelist = {'user1', 'user2'}
102 | ```
103 | 
104 | After changing the configuration and `kubectl apply -f config.yaml`, please note that the JupyterHub pod needs to be restarted before  the new configuration is reflected.


--------------------------------------------------------------------------------
/testing/test-infra/components/workflows.libsonnet:
--------------------------------------------------------------------------------
  1 | {
  2 |   // TODO(https://github.com/ksonnet/ksonnet/issues/222): Taking namespace as an argument is a work around for the fact that ksonnet
  3 |   // doesn't support automatically piping in the namespace from the environment to prototypes.
  4 | 
  5 |   // convert a list of two items into a map representing an environment variable
  6 |   listToMap:: function(v) 
  7 |       {
  8 |         "name": v[0],
  9 |         "value": v[1],
 10 |       },
 11 |   
 12 |   // Function to turn comma separated list of prow environment variables into a dictionary.
 13 |   parseEnv:: function(v) 
 14 |     local pieces = std.split(v, ",");
 15 |     if v != "" && std.length(pieces) > 0 then
 16 |     std.map(
 17 |       function(i) $.listToMap(std.split(i, "=")), 
 18 |       std.split(v, ","))
 19 |     else [],
 20 | 
 21 |   parts(namespace, name):: {          
 22 |     // Workflow to run the e2e test.
 23 |     e2e(prow_env, bucket): 
 24 |       // mountPath is the directory where the volume to store the test data
 25 |       // should be mounted.
 26 |       local mountPath = "/mnt/" + "test-data-volume";
 27 |       // testDir is the root directory for all data for a particular test run.
 28 |       local testDir = mountPath + "/" + name;
 29 |       // outputDir is the directory to sync to GCS to contain the output for this job.
 30 |       local outputDir = testDir + "/output";
 31 |       local artifactsDir = outputDir + "/artifacts";
 32 |       local srcDir = testDir + "/src";      
 33 |       local image = "gcr.io/mlkube-testing/kubeflow-testing";
 34 |       // The name of the NFS volume claim to use for test files.
 35 |       local nfsVolumeClaim = "kubeflow-testing";
 36 |       // The name to use for the volume to use to contain test data.
 37 |       local dataVolume = "kubeflow-test-volume";
 38 |     {
 39 |       // Build an Argo template to execute a particular command.
 40 |       // step_name: Name for the template
 41 |       // command: List to pass as the container command.
 42 |       buildTemplate(step_name, command):: {
 43 |             "name": step_name,
 44 |             "container": {              
 45 |               "command": command,
 46 |               "image": image,
 47 |               "env": [{
 48 |                 // Add the source directories to the python path.
 49 |                 "name": "PYTHONPATH",
 50 |                 "value": srcDir + ":" + srcDir + "/tensorflow_k8s",
 51 |               },
 52 |               {
 53 |                 "name": "GOOGLE_APPLICATION_CREDENTIALS",
 54 |                 "value": "/secret/gcp-credentials/key.json",
 55 |               },
 56 |               {
 57 |                   "name": "GIT_TOKEN",
 58 |                   "valueFrom": {
 59 |                     "secretKeyRef": {
 60 |                       name: "github-token",
 61 |                       key: "github_token", 
 62 |                     },
 63 |                   },
 64 |               },] + prow_env,
 65 |               "volumeMounts": [
 66 |                 {
 67 |                   "name": dataVolume,
 68 |                   "mountPath": mountPath,
 69 |                 },                
 70 |                 {
 71 |                   "name": "github-token",
 72 |                   "mountPath": "/secret/github-token",
 73 |                 },                
 74 |                 {
 75 |                   "name": "gcp-credentials",
 76 |                   "mountPath": "/secret/gcp-credentials",
 77 |                 },
 78 |               ],
 79 |             }, 
 80 |       }, // buildTemplate
 81 | 
 82 |       "apiVersion": "argoproj.io/v1alpha1", 
 83 |       "kind": "Workflow", 
 84 |       "metadata": {
 85 |         "name": name,
 86 |         "namespace": namespace,
 87 |       }, 
 88 |       // TODO(jlewi): Use OnExit to run cleanup steps.
 89 |       "spec": {
 90 |         "entrypoint": "e2e", 
 91 |         "volumes": [
 92 |           {
 93 |             "name": "github-token",
 94 |             "secret": {
 95 |               "secretName": "github-token",
 96 |             },
 97 |           },
 98 |           {
 99 |             "name": "gcp-credentials",
100 |             "secret": {
101 |               "secretName": "kubeflow-testing-credentials",
102 |             },
103 |           },
104 |           {
105 |             "name": dataVolume,
106 |             "persistentVolumeClaim": {
107 |               "claimName": nfsVolumeClaim,
108 |             },
109 |           },
110 |         ], // volumes
111 |         "templates": [
112 |           {
113 |              "name": "e2e",
114 |              "steps": [
115 |                [{
116 |                   "name": "checkout",
117 |                   "template": "checkout",
118 |                 },],
119 |                 [{
120 |                   "name": "test-deploy",
121 |                   "template": "test-deploy",
122 |                 },              
123 |                 {
124 |                   "name": "create-pr-symlink",
125 |                   "template": "create-pr-symlink",
126 |                 },],
127 |                 [{
128 |                   "name": "copy-artifacts",
129 |                   "template": "copy-artifacts",
130 |                 },],
131 |                ],
132 |           },
133 |           {
134 |             "name": "checkout",
135 |             "container": {              
136 |               "command": [
137 |                 "/usr/local/bin/checkout.sh"
138 |               ], 
139 |               "args": [
140 |                 srcDir,
141 |               ],
142 |               "env": prow_env,
143 |               "image": image,
144 |               "volumeMounts": [
145 |                 {
146 |                   "name": dataVolume,
147 |                   "mountPath": mountPath,
148 |                 },
149 |               ],
150 |             }, 
151 |           }, // checkout
152 |           $.parts(namespace, name).e2e(prow_env, bucket).buildTemplate("test-deploy", [
153 |             "python", 
154 |             "-m", 
155 |             "testing.test_deploy",
156 |             "--project=mlkube-testing", 
157 |             "--cluster=kubeflow-testing", 
158 |             "--zone=us-east1-d",
159 |             "--github_token=$(GIT_TOKEN)",
160 |             "--test_dir=" + testDir,
161 |             "--artifacts_dir=" + artifactsDir,
162 |           ]), // test-deploy
163 |           $.parts(namespace, name).e2e(prow_env, bucket).buildTemplate("create-pr-symlink", [
164 |             "python",
165 |             "-m",
166 |             "testing.prow_artifacts",
167 |             "--artifacts_dir=" + outputDir,
168 |             "create_pr_symlink",
169 |             "--bucket=" + bucket,
170 |           ]), // create-pr-symlink
171 |           $.parts(namespace, name).e2e(prow_env, bucket).buildTemplate("copy-artifacts", [
172 |             "python",
173 |             "-m",
174 |             "testing.prow_artifacts",
175 |             "--artifacts_dir=" + outputDir,
176 |             "copy_artifacts",
177 |             "--bucket=" + bucket,
178 |           ]), // copy-artifacts
179 |         ], // templates
180 |       }
181 |     },// e2e 
182 |   } // parts
183 | }


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Kubeflow
  2 | 
  3 | The Kubeflow project is dedicated to making Machine Learning on Kubernetes easy, portable and scalable. Our goal is **not** to recreate other services, but to provide a straightforward way for spinning up best-of-breed OSS solutions. Contained in this repository are manifests for creating:
  4 | 
  5 | * A JupyterHub to create & manage interactive Jupyter notebooks
  6 | * A Tensorflow Training Controller that can be configured to use CPUs or GPUs and adjusted to the size of a cluster with a single setting
  7 | * A TF Serving container
  8 | 
  9 | This document details the steps needed to run the Kubeflow project in any environment in which Kubernetes runs.
 10 | 
 11 | ## Quick Links
 12 | * [Prow test dashboard](https://k8s-testgrid.appspot.com/sig-big-data)
 13 | * [Prow jobs dashboard](https://prow.k8s.io/?repo=google%2Fkubeflow)
 14 | * [Argo UI for E2E tests](http://testing-argo.kubeflow.io)
 15 | 
 16 | ## The Kubeflow Mission
 17 | 
 18 | Our goal is to help folks use ML more easily, by letting Kubernetes to do what it's great at:
 19 | - Easy, repeatable, portable deployments on a diverse infrastructure (laptop <-> ML rig <-> training cluster <-> production cluster)
 20 | - Deploying and managing loosely-coupled microservices
 21 | - Scaling based on demand
 22 | 
 23 | Because ML practitioners use so many different types of tools, it's a key goal that you can customize the stack to whatever your requirements (within reason) and let the system take care of the "boring stuff." While we have started with a narrow set of technologies, we are working with many different projects to include additional tooling.
 24 | 
 25 | Ultimately, we want to have a set of simple manifests that give you an easy to use ML stack _anywhere_ Kubernetes is already running and can self configure based on the cluster it deploys into.
 26 | 
 27 | 
 28 | ## Who should consider using Kubeflow?
 29 | 
 30 | Based on the current functionality you should consider using Kubeflow if:
 31 | 
 32 |   * You want to train/serve TensorFlow models in different environments (e.g. local, on prem, and cloud)
 33 |   * You want to use Jupyter notebooks to manage TensorFlow training jobs
 34 |        * kubeflow is particularly helpful if you want to launch training jobs that use more resources (more nodes or more GPUs) than your notebook.
 35 |   * You want to combine TensorFlow with other processes
 36 |        * For example, you may want to use [tensorflow/agents](https://github.com/tensorflow/agents) to run simulations to generate data for training reinforcement learning models.
 37 | 
 38 | This list is based ONLY on current capabilities. We are investing significant resources to expand the
 39 | functionality and actively soliciting help from companies and inviduals interested in contributing (see [below](README.md#who-should-consider-contributing-to-kubeflow)).
 40 | 
 41 | ## Setup
 42 | 
 43 | This documentation assumes you have a Kubernetes cluster already available. 
 44 | 
 45 | If you need help setting up a Kubernetes cluster please refer to [Kubernetes Setup](https://kubernetes.io/docs/setup/).
 46 | 
 47 | If you want to use GPUs, be sure to follow the Kubernetes [instructions for enabling GPUs](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/).
 48 | 
 49 | ## Quick Start
 50 | 
 51 | ### Requirements
 52 | 
 53 |   * ksonnet version [0.8.0](https://ksonnet.io/#get-started) or later.
 54 |   * Kubernetes >= 1.8 [see here](https://github.com/tensorflow/k8s#requirements)
 55 | 
 56 | ### Steps
 57 | 
 58 | In order to quickly set up all components, execute the following commands:
 59 | 
 60 | ```commandline
 61 | # Initialize a ksonnet APP
 62 | APP_NAME=my-kubeflow
 63 | ks init ${APP_NAME}
 64 | cd ${APP_NAME}
 65 | 
 66 | # Install Kubeflow components
 67 | ks registry add kubeflow github.com/google/kubeflow/tree/master/kubeflow
 68 | ks pkg install kubeflow/core
 69 | ks pkg install kubeflow/tf-serving
 70 | ks pkg install kubeflow/tf-job
 71 | 
 72 | # Deploy Kubeflow
 73 | ks generate core kubeflow-core --name=kubeflow-core --namespace=${NAMESPACE}
 74 | ks apply default -c kubeflow-core
 75 | ```
 76 | 
 77 | 
 78 | The above command sets up JupyterHub and a custom resource for running TensorFlow training jobs. Furthermore, the ksonnet packages
 79 | provide prototypes that can be used to configure TensorFlow jobs and deploy TensorFlow models. 
 80 | Used together, these make it easy for a user go from training to serving using Tensorflow with minimal
 81 | effort in a portable fashion between different environments. 
 82 | 
 83 | For more detailed instructions about how to use Kubeflow, please refer to the [user guide](user_guide.md).
 84 | 
 85 | ## Troubleshooting
 86 | 
 87 | ### Minikube
 88 | 
 89 | On [Minikube](https://github.com/kubernetes/minikube) the Virtualbox/VMware drivers for Minikube are recommended as there is a known
 90 | issue between the KVM/KVM2 driver and TensorFlow Serving. The issue is tracked in [kubernetes/minikube#2377](https://github.com/kubernetes/minikube/issues/2377).
 91 | 
 92 | ### RBAC clusters
 93 | 
 94 | If you are running on a K8s cluster with [RBAC enabled](https://kubernetes.io/docs/admin/authorization/rbac/#command-line-utilities), you may get an error like the following when deploying Kubeflow: 
 95 | 
 96 | ```
 97 | ERROR Error updating roles kubeflow-test-infra.jupyter-role: roles.rbac.authorization.k8s.io "jupyter-role" is forbidden: attempt to grant extra privileges: [PolicyRule{Resources:["*"], APIGroups:["*"], Verbs:["*"]}] user=&{your-user@acme.com  [system:authenticated] map[]} ownerrules=[PolicyRule{Resources:["selfsubjectaccessreviews"], APIGroups:["authorization.k8s.io"], Verbs:["create"]} PolicyRule{NonResourceURLs:["/api" "/api/*" "/apis" "/apis/*" "/healthz" "/swagger-2.0.0.pb-v1" "/swagger.json" "/swaggerapi" "/swaggerapi/*" "/version"], Verbs:["get"]}] ruleResolutionErrors=[]
 98 | ```
 99 | 
100 | This error indicates you do not have sufficient permissions. In many cases you can resolve this just by creating an appropriate
101 | clusterrole binding like so and then redeploying kubeflow
102 | 
103 | ```commandline
104 | kubectl create clusterrolebinding default-admin --clusterrole=cluster-admin --user=your-user@acme.com
105 | ```
106 | 
107 |   * Replace `your-user@acme.com` with the user listed in the error message.
108 | 
109 | If you're using GKE, you may want to refer to [GKE's RBAC docs](https://cloud.google.com/kubernetes-engine/docs/how-to/role-based-access-control) to understand
110 | how RBAC interacts with IAM on GCP.
111 | 
112 | ## Resources
113 | 
114 | * [user guide](user_guide.md) provides in-depth instructions for using Kubeflow
115 | * Katacoda has produced a [self-paced scenario](https://www.katacoda.com/kubeflow) for learning and trying out Kubeflow
116 | 
117 | 
118 | ## Get involved
119 | 
120 | * [Slack Channel](https://join.slack.com/t/kubeflow/shared_invite/enQtMjgyMzMxNDgyMTQ5LWUwMTIxNmZlZTk2NGU0MmFiNDE4YWJiMzFiOGNkZGZjZmRlNTExNmUwMmQ2NzMwYzk5YzQxOWQyODBlZGY2OTg)
121 | * [Twitter](http://twitter.com/kubeflow)
122 | * [Mailing List](https://groups.google.com/forum/#!forum/kubeflow-discuss)
123 | 
124 | * Review and comment on the [proposal](https://docs.google.com/document/d/1dmErPUmqqKMOe4L0ZHQglSdgDguCM4SzlsEdYXRMIDA/edit#) to define the scope and future of Kubeflow
125 | 
126 | 
127 | ### Who should consider contributing to Kubeflow?
128 | 
129 | * Folks who want to add support for other ML frameworks (e.g. PyTorch, XGBoost, etc...)
130 | * Folks who want to bring more Kubernetes magic to ML (e.g. ISTIO integration for prediction)
131 | * Folks who want to make Kubeflow a richer ML platform (e.g. support for ML pipelines, hyperparameter tuning)
132 | * Folks who want to tune Kubeflow for their particular Kubernetes distribution or Cloud
133 | * Folks who want to write tutorials/blog posts showing how to use Kubeflow to solve ML problems
134 | 


--------------------------------------------------------------------------------
/testing/run_e2e_workflow.py:
--------------------------------------------------------------------------------
  1 | """Run the E2E workflow.
  2 | 
  3 | This script submits an Argo workflow to run the E2E tests and waits for
  4 | it to finish. It is intended to invoked by prow jobs.
  5 | """
  6 | 
  7 | import argparse
  8 | import logging
  9 | from kubernetes import client as k8s_client
 10 | import os
 11 | import tempfile
 12 | from testing import argo_client
 13 | from testing import prow_artifacts
 14 | import uuid
 15 | from google.cloud import storage  # pylint: disable=no-name-in-module
 16 | from py import util
 17 | import sys
 18 | 
 19 | # The namespace to launch the Argo workflow in.
 20 | NAMESPACE = "kubeflow-test-infra"
 21 | 
 22 | # The name of the ksonnet component for the workflow
 23 | COMPONENT = "workflows"
 24 | 
 25 | def _get_src_dir():
 26 |   return os.path.abspath(os.path.join(__file__, "..",))
 27 | 
 28 | def upload_to_gcs(contents, target):
 29 |   gcs_client = storage.Client()
 30 | 
 31 |   bucket_name, path = util.split_gcs_uri(target)
 32 | 
 33 |   bucket = gcs_client.get_bucket(bucket_name)
 34 |   logging.info("Writing %s", target)
 35 |   blob = bucket.blob(path)
 36 |   blob.upload_from_string(contents)
 37 | 
 38 | def upload_file_to_gcs(source, target):
 39 |   gcs_client = storage.Client()
 40 |   bucket_name, path = util.split_gcs_uri(target)
 41 | 
 42 |   bucket = gcs_client.get_bucket(bucket_name)
 43 | 
 44 |   logging.info("Uploading file %s to %s.", source, target)
 45 |   blob = bucket.blob(path)
 46 |   blob.upload_from_filename(source)
 47 | 
 48 | def create_started_file(bucket):
 49 |   """Create the started file in gcs for gubernator."""
 50 |   contents = prow_artifacts.create_started()
 51 | 
 52 |   target = os.path.join(prow_artifacts.get_gcs_dir(bucket), "started.json")
 53 |   upload_to_gcs(contents, target)
 54 | 
 55 | def create_finished_file(bucket, success):
 56 |   """Create the started file in gcs for gubernator."""
 57 |   contents = prow_artifacts.create_finished(success)
 58 | 
 59 |   target = os.path.join(prow_artifacts.get_gcs_dir(bucket), "finished.json")
 60 |   upload_to_gcs(contents, target)
 61 | 
 62 | def run(args, file_handler):
 63 |   src_dir = _get_src_dir()
 64 |   logging.info("Source directory: %s", src_dir)
 65 |   app_dir = os.path.join(src_dir, "test-infra")
 66 | 
 67 |   create_started_file(args.bucket)
 68 | 
 69 |   if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
 70 |     logging.info("GOOGLE_APPLICATION_CREDENTIALS is set; configuring gcloud "
 71 |                  "to use service account.")
 72 |     # Since a service account is set tell gcloud to use it.
 73 |     util.run(["gcloud", "auth", "activate-service-account", "--key-file=" +
 74 |               os.getenv("GOOGLE_APPLICATION_CREDENTIALS")])
 75 | 
 76 |   util.configure_kubectl(args.project, args.zone, args.cluster)
 77 |   util.load_kube_config()
 78 | 
 79 |   # Create the name for the workflow
 80 |   workflow_name = os.getenv("JOB_NAME")
 81 |   job_type = os.getenv("JOB_TYPE")
 82 |   if job_type == "presubmit":
 83 |     workflow_name += "-{0}".format(os.getenv("PULL_NUMBER"))
 84 |   elif job_type == "postsubmit":
 85 |     workflow_name += "-{0}".format(os.getenv("PULL_BASE_SHA"))
 86 | 
 87 |   workflow_name += "-{0}".format(os.getenv("BUILD_NUMBER"))
 88 | 
 89 |   # Add some salt. This is mostly a convenience for the case where you
 90 |   # are submitting jobs manually for testing/debugging. Since the prow should
 91 |   # vend unique build numbers for each job.
 92 |   workflow_name += "-{0}".format(uuid.uuid4().hex[0:4])
 93 | 
 94 |   util.run(["ks", "param", "set", "workflows", "name", workflow_name], cwd=app_dir)
 95 |   util.load_kube_config()
 96 | 
 97 |   api_client = k8s_client.ApiClient()
 98 | 
 99 |   # Set the prow environment variables.
100 |   prow_env = []
101 | 
102 |   names = ["JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER",
103 |            "PULL_BASE_SHA", "PULL_NUMBER", "PULL_PULL_SHA", "REPO_OWNER",
104 |            "REPO_NAME"]
105 |   names.sort()
106 |   for v in names:
107 |     if not os.getenv(v):
108 |       continue
109 |     prow_env.append("{0}={1}".format(v, os.getenv(v)))
110 | 
111 |   util.run(["ks", "param", "set", COMPONENT, "prow_env", ",".join(prow_env)], cwd=app_dir)
112 |   util.run(["ks", "param", "set", COMPONENT, "namespace", NAMESPACE], cwd=app_dir)
113 |   util.run(["ks", "param", "set", COMPONENT, "bucket", args.bucket], cwd=app_dir)
114 | 
115 |   # For debugging print out the manifest
116 |   util.run(["ks", "show", "prow", "-c", COMPONENT], cwd=app_dir)
117 |   util.run(["ks", "apply", "prow", "-c", COMPONENT], cwd=app_dir)
118 | 
119 |   success = False
120 |   try:
121 |     results = argo_client.wait_for_workflow(api_client, NAMESPACE, workflow_name,
122 |                                             status_callback=argo_client.log_status)
123 |     if results["status"]["phase"] == "Succeeded":
124 |       success = True
125 |     logging.info("Workflow %s/%s finished phase: %s", NAMESPACE, workflow_name,
126 |                  results["status"]["phase"] )
127 |   except util.TimeoutError:
128 |     success = False
129 |     logging.error("Time out waiting for Workflow %s/%s to finish", NAMESPACE, workflow_name)
130 |   finally:
131 |     create_finished_file(args.bucket, success)
132 | 
133 |     # Upload logs to GCS. No logs after this point will appear in the
134 |     # file in gcs
135 |     file_handler.flush()
136 |     upload_file_to_gcs(
137 |       file_handler.baseFilename,
138 |       os.path.join(prow_artifacts.get_gcs_dir(args.bucket), "build-log.txt"))
139 | 
140 |   return success
141 | 
142 | def main(unparsed_args=None):  # pylint: disable=too-many-locals
143 |   logging.getLogger().setLevel(logging.INFO) # pylint: disable=too-many-locals
144 |   # create the top-level parser
145 |   parser = argparse.ArgumentParser(
146 |     description="Submit an Argo workflow to run the E2E tests.")
147 | 
148 |   parser.add_argument(
149 |     "--project",
150 |     default="",
151 |     type=str,
152 |     help="The project containing the GKE cluster to use to run the workflow.")
153 | 
154 |   parser.add_argument(
155 |     "--zone",
156 |     default="",
157 |     type=str,
158 |     help="The zone containing the GKE cluster to use to run the workflow.")
159 | 
160 |   parser.add_argument(
161 |     "--cluster",
162 |     default="",
163 |     type=str,
164 |     help="The GKE cluster to use to run the workflow.")
165 | 
166 |   parser.add_argument(
167 |     "--bucket",
168 |     default="",
169 |     type=str,
170 |     help="The bucket to use for the Gubernator outputs.")
171 | 
172 |   #############################################################################
173 |   # Process the command line arguments.
174 | 
175 |   # Parse the args
176 |   args = parser.parse_args(args=unparsed_args)
177 | 
178 |   # Setup a logging file handler. This way we can upload the log outputs
179 |   # to gubernator.
180 |   root_logger = logging.getLogger()
181 | 
182 |   with tempfile.NamedTemporaryFile(prefix="tmpRunE2eWorkflow", suffix="log") as hf:
183 |     test_log = hf.name
184 | 
185 |   file_handler = logging.FileHandler(test_log)
186 |   root_logger.addHandler(file_handler)
187 |   # We need to explicitly set the formatter because it will not pick up
188 |   # the BasicConfig.
189 |   formatter = logging.Formatter(fmt=("%(levelname)s|%(asctime)s"
190 |                                      "|%(pathname)s|%(lineno)d| %(message)s"),
191 |                                 datefmt="%Y-%m-%dT%H:%M:%S")
192 |   file_handler.setFormatter(formatter)
193 |   logging.info("Logging to %s", test_log)
194 | 
195 |   return run(args, file_handler)
196 | 
197 | 
198 | if __name__ == "__main__":
199 |   logging.basicConfig(level=logging.INFO,
200 |                       format=('%(levelname)s|%(asctime)s'
201 |                               '|%(pathname)s|%(lineno)d| %(message)s'),
202 |                       datefmt='%Y-%m-%dT%H:%M:%S',
203 |                       )
204 |   logging.getLogger().setLevel(logging.INFO)
205 |   success = main()
206 |   if not success:
207 |     # Exit with a non-zero exit code by to signal failure to prow.
208 |     logging.error("One or more test steps failed exiting with non-zero exit "
209 |                   "code.")
210 |     sys.exit(1)
211 | 
212 | 


--------------------------------------------------------------------------------
/kubeflow/core/nfs.libsonnet:
--------------------------------------------------------------------------------
  1 | // A ksonnet prototype/component for using NFS.
  2 | 
  3 | {
  4 |   // TODO(https://github.com/ksonnet/ksonnet/issues/222): Taking namespace as an argument is a work around for the fact that ksonnet
  5 |   // doesn't support automatically piping in the namespace from the environment to prototypes.
  6 |   //
  7 |   // Create a provisioner with the specified name.
  8 |   // disks should be a list GCP persistent disk names; these disks should be in the
  9 |   // same zone as your cluster.
 10 |   // TODO(jlewi): 
 11 |   parts(namespace, name):: {
 12 | 
 13 | 	local serviceAccountName = name,
 14 | 	local serviceAccountRoleName = name,
 15 | 
 16 | 
 17 | 	// Create the resources for a specific disk.
 18 | 	// Each NFS Provisioner can only manage 1 PD so we need to create one for each disk.
 19 | 	diskResources(diskName): {
 20 | 
 21 | 	  	local storageClassName = diskName + "-nfs",  	
 22 | 	  	local provisionerName = diskName + "-provisioner",
 23 | 	  	local storageClassProvisioner = diskName + "/nfs",
 24 | 	  	local serviceName = diskName + "-service",	  	
 25 | 
 26 | 		volumeClaim: {
 27 | 		  "apiVersion": "v1", 
 28 | 		  "kind": "PersistentVolumeClaim", 
 29 | 		  "metadata": {
 30 | 		    "annotations": {
 31 | 		      "volume.beta.kubernetes.io/storage-class": storageClassName,
 32 | 		    }, 
 33 | 		    "name": diskName,
 34 | 		    "namespace": namespace,
 35 | 		  }, 
 36 | 		  "spec": {
 37 | 		    "accessModes": [
 38 | 		      "ReadWriteMany"
 39 | 		    ], 
 40 | 		    "resources": {
 41 | 		      "requests": {
 42 | 		        "storage": "1Mi"
 43 | 		      }
 44 | 		    }
 45 | 		  }
 46 | 		},
 47 | 
 48 | 		// TODO(jlewi): Is storageClass actually name space scoped? Seems to show up in default namespace as well.
 49 | 		// TODO(jlewi): Could we just use the default cluster storage class?
 50 | 	    storageClass: {
 51 | 		  "apiVersion": "storage.k8s.io/v1beta1", 
 52 | 		  "kind": "StorageClass", 
 53 | 		  "metadata": {
 54 | 		    "name": storageClassName,	    
 55 | 		    "namespace": namespace,
 56 | 		  }, 
 57 | 		  // This value must be the same as passed as argument --provisioner to the provisioner
 58 | 		  "provisioner": storageClassProvisioner,
 59 | 		},
 60 | 
 61 | 		service: {
 62 | 		  "apiVersion": "v1", 
 63 | 		  "kind": "Service", 
 64 | 		  "metadata": {
 65 | 		    "labels": {
 66 | 		      "app": provisionerName
 67 | 		    }, 
 68 | 		    "name": serviceName,
 69 | 		    "namespace": namespace,
 70 | 		  }, 
 71 | 		  "spec": {
 72 | 		    "ports": [
 73 | 		      {
 74 | 		        "name": "nfs", 
 75 | 		        "port": 2049
 76 | 		      }, 
 77 | 		      {
 78 | 		        "name": "mountd", 
 79 | 		        "port": 20048
 80 | 		      }, 
 81 | 		      {
 82 | 		        "name": "rpcbind", 
 83 | 		        "port": 111
 84 | 		      }, 
 85 | 		      {
 86 | 		        "name": "rpcbind-udp", 
 87 | 		        "port": 111, 
 88 | 		        "protocol": "UDP"
 89 | 		      }
 90 | 		    ], 
 91 | 		    "selector": {
 92 | 		      "app": provisionerName
 93 | 		    }
 94 | 		  }
 95 | 		},
 96 | 
 97 | 	    provisioner: {
 98 | 		  "apiVersion": "extensions/v1beta1", 
 99 | 		  "kind": "Deployment", 
100 | 		  "metadata": {
101 | 		    "name": provisionerName,	   
102 | 		    "namespace": namespace,
103 | 		  }, 
104 | 		  "spec": {
105 | 		    "replicas": 1, 
106 | 		    "strategy": {
107 | 		      "type": "Recreate"
108 | 		    }, 
109 | 		    "template": {
110 | 		      "metadata": {
111 | 		        "labels": {
112 | 		          "app": provisionerName
113 | 		        }
114 | 		      }, 
115 | 		      "spec": {
116 | 		        "containers": [
117 | 		          {
118 | 		            "args": [
119 | 		              "-provisioner=" + storageClassProvisioner,
120 | 		            ], 
121 | 		            "env": [
122 | 		              {
123 | 		                "name": "POD_IP", 
124 | 		                "valueFrom": {
125 | 		                  "fieldRef": {
126 | 		                    "fieldPath": "status.podIP"
127 | 		                  }
128 | 		                }
129 | 		              }, 
130 | 		              {
131 | 		                "name": "SERVICE_NAME", 
132 | 		                "value": serviceName,
133 | 		              }, 
134 | 		              {
135 | 		                "name": "POD_NAMESPACE", 
136 | 		                "valueFrom": {
137 | 		                  "fieldRef": {
138 | 		                    "fieldPath": "metadata.namespace"
139 | 		                  }
140 | 		                }
141 | 		              }
142 | 		            ], 
143 | 		            "image": "quay.io/kubernetes_incubator/nfs-provisioner:v1.0.8", 
144 | 		            "imagePullPolicy": "IfNotPresent", 
145 | 		            "name": "nfs-provisioner", 
146 | 		            "ports": [
147 | 		              {
148 | 		                "containerPort": 2049, 
149 | 		                "name": "nfs"
150 | 		              }, 
151 | 		              {
152 | 		                "containerPort": 20048, 
153 | 		                "name": "mountd"
154 | 		              }, 
155 | 		              {
156 | 		                "containerPort": 111, 
157 | 		                "name": "rpcbind"
158 | 		              }, 
159 | 		              {
160 | 		                "containerPort": 111, 
161 | 		                "name": "rpcbind-udp", 
162 | 		                "protocol": "UDP"
163 | 		              }
164 | 		            ], 
165 | 		            "securityContext": {
166 | 		              "capabilities": {
167 | 		                "add": [
168 | 		                  "DAC_READ_SEARCH"
169 | 		                ]
170 | 		              }
171 | 		            }, 
172 | 		            "volumeMounts": [{
173 | 		                // Needs to be mounted under /export because /export is what is exported for NFS.
174 | 		                //    https://github.com/kubernetes-incubator/external-storage/tree/master/nfs#quickstart
175 | 		                "mountPath": "/export", 
176 | 		                "name": diskName,
177 | 		             }],
178 | 		          }
179 | 		        ], 
180 | 		        "volumes": [{
181 | 		            "name": diskName,
182 | 		            "gcePersistentDisk": {
183 | 		              "pdName": diskName,
184 | 		            },},],
185 | 		        "serviceAccountName": serviceAccountName, 
186 | 		      },
187 | 		    },
188 | 		  },
189 | 		}, // provisioner
190 |     },
191 | 
192 |     serviceAccount: {
193 |       "apiVersion": "v1", 
194 |       "kind": "ServiceAccount", 
195 |       "metadata": {
196 |         "labels": {
197 |           "app": name + "nfs-provisioner"
198 |         }, 
199 |         "name": serviceAccountName,
200 |         "namespace": namespace,
201 |       }
202 |     },
203 | 
204 | 	role: {
205 | 	  "apiVersion": "rbac.authorization.k8s.io/v1beta1", 
206 | 	  "kind": "Role", 
207 | 	  "metadata": {
208 | 	    "name": serviceAccountRoleName, 
209 | 	    "namespace": namespace,
210 | 	  }, 
211 | 	  "rules": [
212 | 	    {
213 | 	      "apiGroups": [
214 | 	        "*"
215 | 	      ], 
216 | 	      // TODO(jlewi): This is very permissive so we may want to lock this down.
217 | 	      "resources": [
218 | 	        "*"
219 | 	      ], 
220 | 	      "verbs": [
221 | 	        "*"
222 | 	      ]
223 | 	    }
224 | 	  ]
225 | 	},
226 | 
227 |     roleBinding:  {
228 | 	  "apiVersion": "rbac.authorization.k8s.io/v1beta1", 
229 | 	  "kind": "RoleBinding", 
230 | 	  "metadata": {
231 | 	    "name": name + "-nfs-role", 
232 | 	    "namespace": namespace
233 | 	  }, 
234 | 	  "roleRef": {
235 | 	    "apiGroup": "rbac.authorization.k8s.io", 
236 | 	    "kind": "Role", 
237 | 	    "name": serviceAccountName,
238 | 	  }, 
239 | 	  "subjects": [
240 | 	    {
241 | 	      "kind": "ServiceAccount", 
242 | 	      "name": serviceAccountRoleName, 
243 | 	      "namespace": namespace,
244 | 	    }
245 | 	  ]
246 | 	},
247 | 
248 | 	// see https://github.com/kubernetes-incubator/external-storage/tree/master/docs#authorizing-provisioners-for-rbac-or-openshift
249 | 	clusterRoleBinding:  {
250 | 	  "apiVersion": "rbac.authorization.k8s.io/v1beta1", 
251 | 	  "kind": "ClusterRoleBinding", 
252 | 	  "metadata": {
253 | 	    "name": name + "-nfs-role", 
254 | 	    "namespace": namespace
255 | 	  }, 
256 | 	  "roleRef": {
257 | 	    "apiGroup": "rbac.authorization.k8s.io", 
258 | 	    "kind": "ClusterRole", 
259 | 	    "name": "system:persistent-volume-provisioner",
260 | 	  }, 
261 | 	  "subjects": [
262 | 	    {
263 | 	      "kind": "ServiceAccount", 
264 | 	      "name": serviceAccountRoleName, 
265 | 	      "namespace": namespace,
266 | 	    }
267 | 	  ]
268 | 	},
269 | 
270 |   }, // parts
271 | }


--------------------------------------------------------------------------------
/testing/prow_artifacts.py:
--------------------------------------------------------------------------------
  1 | """Script to create artifacts needed by Gubernator.
  2 | 
  3 | For reference see:
  4 | https://github.com/kubernetes/test-infra/tree/master/gubernator
  5 | """
  6 | import argparse
  7 | import logging
  8 | import json
  9 | import os
 10 | import time
 11 | from google.cloud import storage  # pylint: disable=no-name-in-module
 12 | from py import util
 13 | 
 14 | 
 15 | # TODO(jlewi): Replace create_finished in tensorflow/k8s/py/prow.py with this
 16 | # version. We should do that when we switch tensorflow/k8s to use Argo instead
 17 | # of Airflow.
 18 | def create_started():
 19 |   """Return a string containing the contents of started.json for gubernator.
 20 |   """
 21 |   # See:
 22 |   # https://github.com/kubernetes/test-infra/tree/master/gubernator#job-artifact-gcs-layout
 23 |   # For a list of fields expected by gubernator
 24 |   started = {
 25 |       "timestamp": int(time.time()),
 26 |       "repos": {
 27 |       },
 28 |   }
 29 | 
 30 |   repo_owner = os.getenv("REPO_OWNER", "")
 31 |   repo_name = os.getenv("REPO_NAME", "")
 32 | 
 33 |   if repo_owner:
 34 |     sha = os.getenv("PULL_PULL_SHA", "")
 35 |     if not sha:
 36 |       # Its a post submit job.
 37 |       sha = os.getenv("PULL_BASE_SHA", "")
 38 | 
 39 |     started["repos"][repo_owner + "/" + repo_name] = sha
 40 | 
 41 |   PULL_REFS = os.getenv("PULL_REFS", "")
 42 |   if PULL_REFS:
 43 |     started["pull"] = PULL_REFS
 44 | 
 45 |   return json.dumps(started)
 46 | 
 47 | # TODO(jlewi): Replace create_finished in tensorflow/k8s/py/prow.py with this
 48 | # version. We should do that when we switch tensorflow/k8s to use Argo instead
 49 | # of Airflow.
 50 | def create_finished(success):
 51 |   """Create a string containing the contents for finished.json.
 52 | 
 53 |   Args:
 54 |     success: Bool indicating whether the workflow succeeded or not.
 55 |   """
 56 |   if success:
 57 |     result = "SUCCESS"
 58 |   else:
 59 |     result = "FAILED"
 60 |   finished = {
 61 |       "timestamp": int(time.time()),
 62 |       "result": result,
 63 |       # Dictionary of extra key value pairs to display to the user.
 64 |       # TODO(jlewi): Perhaps we should add the GCR path of the Docker image
 65 |       # we are running in. We'd have to plumb this in from bootstrap.
 66 |       "metadata": {},
 67 |   }
 68 | 
 69 |   return json.dumps(finished)
 70 | 
 71 | def get_gcs_dir(bucket):
 72 |   """Return the GCS directory for this job."""
 73 |   pull_number = os.getenv("PULL_NUMBER")
 74 | 
 75 |   repo_owner = os.getenv("REPO_OWNER")
 76 |   repo_name = os.getenv("REPO_NAME")
 77 | 
 78 | 
 79 |   job_name = os.getenv("JOB_NAME")
 80 | 
 81 |   # GCS layout is defined here:
 82 |   # https://github.com/kubernetes/test-infra/tree/master/gubernator#job-artifact-gcs-layout
 83 |   pull_number = os.getenv("PULL_NUMBER")
 84 | 
 85 |   repo_owner = os.getenv("REPO_OWNER")
 86 |   repo_name = os.getenv("REPO_NAME")
 87 | 
 88 |   if pull_number:
 89 |     output = ("gs://{bucket}/pr-logs/pull/{owner}_{repo}/"
 90 |               "{pull_number}/{job}/{build}").format(
 91 |               bucket=bucket,
 92 |               owner=repo_owner, repo=repo_name,
 93 |               pull_number=pull_number,
 94 |               job=os.getenv("JOB_NAME"),
 95 |               build=os.getenv("BUILD_NUMBER"))
 96 | 
 97 |   elif repo_owner:
 98 |     # It is a postsubmit job
 99 |     output = ("gs://{bucket}/logs/{owner}_{repo}/"
100 |               "{job}/{build}").format(
101 |                   owner=repo_owner, repo=repo_name,
102 |                   job=job_name,
103 |                   build=os.getenv("BUILD_NUMBER"))
104 |   else:
105 |     # Its a periodic job
106 |     output = ("gs://{bucket}/logs/{job}/{build}").format(
107 |         bucket=bucket,
108 |         job=job_name,
109 |         build=os.getenv("BUILD_NUMBER"))
110 | 
111 |   return output
112 | 
113 | def copy_artifacts(args):
114 |   """Sync artifacts to GCS."""
115 |   job_name = os.getenv("JOB_NAME")
116 | 
117 |   # GCS layout is defined here:
118 |   # https://github.com/kubernetes/test-infra/tree/master/gubernator#job-artifact-gcs-layout
119 |   pull_number = os.getenv("PULL_NUMBER")
120 | 
121 |   repo_owner = os.getenv("REPO_OWNER")
122 |   repo_name = os.getenv("REPO_NAME")
123 | 
124 |   output = get_gcs_dir(args.bucket)
125 | 
126 |   if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
127 |     logging.info("GOOGLE_APPLICATION_CREDENTIALS is set; configuring gcloud "
128 |                  "to use service account.")
129 |     # Since a service account is set tell gcloud to use it.
130 |     util.run(["gcloud", "auth", "activate-service-account", "--key-file=" +
131 |               os.getenv("GOOGLE_APPLICATION_CREDENTIALS")])
132 | 
133 |   util.run(["gsutil", "-m", "rsync", "-r", args.artifacts_dir, output])
134 | 
135 | def create_pr_symlink(args):
136 |   """Create a 'symlink' in GCS pointing at the results for a PR.
137 | 
138 |   This is a null op if PROW environment variables indicate this is not a PR
139 |   job.
140 |   """
141 |   gcs_client = storage.Client()
142 |   # GCS layout is defined here:
143 |   # https://github.com/kubernetes/test-infra/tree/master/gubernator#job-artifact-gcs-layout
144 |   pull_number = os.getenv("PULL_NUMBER")
145 |   if not pull_number:
146 |     # Symlinks are only created for pull requests.
147 |     return ""
148 | 
149 |   path = "pr-logs/directory/{job}/{build}.txt".format(
150 |       job=os.getenv("JOB_NAME"), build=os.getenv("BUILD_NUMBER"))
151 | 
152 |   pull_number = os.getenv("PULL_NUMBER")
153 | 
154 |   repo_owner = os.getenv("REPO_OWNER")
155 |   repo_name = os.getenv("REPO_NAME")
156 | 
157 | 
158 |   build_dir = ("gs://{bucket}/pr-logs/pull/{owner}_{repo}/"
159 |                "{pull_number}/{job}/{build}").format(
160 |                 bucket=args.bucket,
161 |                 owner=repo_owner, repo=repo_name,
162 |                 pull_number=pull_number,
163 |                 job=os.getenv("JOB_NAME"),
164 |                 build=os.getenv("BUILD_NUMBER"))
165 |   source = util.to_gcs_uri(args.bucket, path)
166 |   target = get_gcs_dir(args.bucket)
167 |   logging.info("Creating symlink %s pointing to %s", source, target)
168 |   bucket = gcs_client.get_bucket(args.bucket)
169 |   blob = bucket.blob(path)
170 |   blob.upload_from_string(target)
171 | 
172 | def main(unparsed_args=None):  # pylint: disable=too-many-locals
173 |   logging.getLogger().setLevel(logging.INFO) # pylint: disable=too-many-locals
174 |   # create the top-level parser
175 |   parser = argparse.ArgumentParser(
176 |     description="Create prow artifacts.")
177 | 
178 |   parser.add_argument(
179 |     "--artifacts_dir",
180 |     default="",
181 |     type=str,
182 |     help="Directory to use for all the gubernator artifacts.")
183 | 
184 |   subparsers = parser.add_subparsers()
185 | 
186 |   #############################################################################
187 |   # Copy artifacts.
188 |   parser_copy = subparsers.add_parser(
189 |     "copy_artifacts", help="Copy the artifacts.")
190 | 
191 |   parser_copy.add_argument(
192 |     "--bucket",
193 |     default="",
194 |     type=str,
195 |     help="Bucket to copy the artifacts to.")
196 | 
197 |   parser_copy.set_defaults(func=copy_artifacts)
198 | 
199 |   #############################################################################
200 |   # Create the pr symlink.
201 |   parser_link = subparsers.add_parser(
202 |     "create_pr_symlink", help="Create a symlink pointing at PR output dir; null "
203 |                            "op if prow job is not a presubmit job.")
204 | 
205 |   parser_link.add_argument(
206 |     "--bucket",
207 |     default="",
208 |     type=str,
209 |     help="Bucket to copy the artifacts to.")
210 | 
211 |   parser_link.set_defaults(func=create_pr_symlink)
212 | 
213 |   #############################################################################
214 |   # Process the command line arguments.
215 | 
216 |   # Parse the args
217 |   args = parser.parse_args(args=unparsed_args)
218 | 
219 |   # Setup a logging file handler. This way we can upload the log outputs
220 |   # to gubernator.
221 |   root_logger = logging.getLogger()
222 | 
223 |   test_log = os.path.join(os.path.join(args.artifacts_dir, "artifacts"),
224 |                           "logs", "prow_artifacts." + args.func.__name__ +
225 |                           ".log")
226 |   if not os.path.exists(os.path.dirname(test_log)):
227 |     os.makedirs(os.path.dirname(test_log))
228 | 
229 |   file_handler = logging.FileHandler(test_log)
230 |   root_logger.addHandler(file_handler)
231 |   # We need to explicitly set the formatter because it will not pick up
232 |   # the BasicConfig.
233 |   formatter = logging.Formatter(fmt=("%(levelname)s|%(asctime)s"
234 |                                      "|%(pathname)s|%(lineno)d| %(message)s"),
235 |                                 datefmt="%Y-%m-%dT%H:%M:%S")
236 |   file_handler.setFormatter(formatter)
237 |   logging.info("Logging to %s", test_log)
238 | 
239 |   args.func(args)
240 | 
241 | if __name__ == "__main__":
242 |   logging.basicConfig(level=logging.INFO,
243 |                       format=('%(levelname)s|%(asctime)s'
244 |                               '|%(pathname)s|%(lineno)d| %(message)s'),
245 |                       datefmt='%Y-%m-%dT%H:%M:%S',
246 |                       )
247 |   logging.getLogger().setLevel(logging.INFO)
248 |   main()
249 | 


--------------------------------------------------------------------------------
/kubeflow/core/jupyterhub.libsonnet:
--------------------------------------------------------------------------------
  1 | 
  2 | // local nfs = import "nfs.libsonnet";
  3 | 
  4 | {  
  5 |   // TODO(https://github.com/ksonnet/ksonnet/issues/222): Taking namespace as an argument is a work around for the fact that ksonnet
  6 |   // doesn't support automatically piping in the namespace from the environment to prototypes.
  7 |   //
  8 |   // TODO(jlewi): We should refactor this to have multiple prototypes; having 1 without any extra volumes and than 
  9 |   // a with volumes option.
 10 |   parts(namespace):: {
 11 | 
 12 |     // TODO(jlewi): We should make the default Docker image configurable
 13 |     // TODO(jlewi): We should make whether we use PVC configurable.
 14 |   	local baseKubeConfigSpawner = @"import json
 15 | import os
 16 | from kubespawner.spawner import KubeSpawner
 17 | from oauthenticator.github import GitHubOAuthenticator
 18 | 
 19 | class KubeFormSpawner(KubeSpawner):
 20 |   def _options_form_default(self):
 21 |     return '''
 22 |     <label for='image'>Image</label>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
 23 |     <input name='image' placeholder='repo/image:tag'></input>
 24 |     <br/><br/>
 25 | 
 26 |     <label for='cpu_guarantee'>CPU</label>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
 27 |     <input name='cpu_guarantee' placeholder='200m, 1.0, 2.5, etc'></input>
 28 |     <br/><br/>
 29 | 
 30 |     <label for='mem_guarantee'>Memory</label>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
 31 |     <input name='mem_guarantee' placeholder='100Mi, 1.5Gi'></input>
 32 |     <br/><br/>
 33 | 
 34 |     <label for='extra_resource_limits'>Extra Resource Limits</label>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
 35 |     <input name='extra_resource_limits' placeholder='{'nvidia.com/gpu': '3'}'></input>
 36 |     <br/><br/>
 37 |     '''
 38 | 
 39 |   def options_from_form(self, formdata):
 40 |     options = {}
 41 |     options['image'] = formdata.get('image', [''])[0].strip()
 42 |     options['cpu_guarantee'] = formdata.get('cpu_guarantee', [''])[0].strip()
 43 |     options['mem_guarantee'] = formdata.get('mem_guarantee', [''])[0].strip()
 44 |     options['extra_resource_limits'] = formdata.get('extra_resource_limits', [''])[0].strip()
 45 |     return options
 46 | 
 47 |   @property
 48 |   def singleuser_image_spec(self):
 49 |     image = 'gcr.io/kubeflow/tensorflow-notebook-cpu'
 50 |     if self.user_options.get('image'):
 51 |       image = self.user_options['image']
 52 |     return image
 53 | 
 54 |   @property
 55 |   def cpu_guarantee(self):
 56 |     cpu = '500m'
 57 |     if self.user_options.get('cpu_guarantee'):
 58 |       cpu = self.user_options['cpu_guarantee']
 59 |     return cpu
 60 | 
 61 |   @property
 62 |   def mem_guarantee(self):
 63 |     mem = '1Gi'
 64 |     if self.user_options.get('mem_guarantee'):
 65 |       mem = self.user_options['mem_guarantee']
 66 |     return mem
 67 | 
 68 |   @property
 69 |   def extra_resource_limits(self):
 70 |     extra = ''
 71 |     if self.user_options.get('extra_resource_limits'):
 72 |       extra = json.loads(self.user_options['extra_resource_limits'])
 73 |     return extra
 74 | 
 75 | ###################################################
 76 | ### JupyterHub Options
 77 | ###################################################
 78 | c.JupyterHub.ip = '0.0.0.0'
 79 | c.JupyterHub.hub_ip = '0.0.0.0'
 80 | # Don't try to cleanup servers on exit - since in general for k8s, we want
 81 | # the hub to be able to restart without losing user containers
 82 | c.JupyterHub.cleanup_servers = False
 83 | ###################################################
 84 | 
 85 | ###################################################
 86 | ### Spawner Options
 87 | ###################################################
 88 | c.JupyterHub.spawner_class = KubeFormSpawner
 89 | c.KubeSpawner.singleuser_image_spec = 'gcr.io/kubeflow/tensorflow-notebook'
 90 | c.KubeSpawner.cmd = 'start-singleuser.sh'
 91 | c.KubeSpawner.args = ['--allow-root']
 92 | # First pulls can be really slow, so let's give it a big timeout
 93 | c.KubeSpawner.start_timeout = 60 * 10
 94 | ###################################################
 95 | 
 96 | 
 97 | ###################################################
 98 | ### Authenticator Options
 99 | ###################################################
100 | c.JupyterHub.authenticator_class = 'dummyauthenticator.DummyAuthenticator'
101 | # c.JupyterHub.authenticator_class = GitHubOAuthenticator
102 | # c.GitHubOAuthenticator.oauth_callback_url = '<placeholder>'
103 | # c.GitHubOAuthenticator.client_id = '<placeholder>'
104 | # c.GitHubOAuthenticator.client_secret = '<placeholder>'
105 | 
106 | 
107 | ###################################################
108 | ### Persistent volume options
109 | ###################################################
110 | # Using persistent storage requires a default storage class.
111 | # TODO(jlewi): Verify this works on minikube.
112 | # TODO(jlewi): Should we set c.KubeSpawner.singleuser_fs_gid = 1000
113 | # see https://github.com/google/kubeflow/pull/22#issuecomment-350500944
114 | c.KubeSpawner.user_storage_pvc_ensure = True
115 | # How much disk space do we want?
116 | c.KubeSpawner.user_storage_capacity = '10Gi'
117 | c.KubeSpawner.pvc_name_template = 'claim-{username}{servername}'
118 | ",
119 | 
120 |    local baseJupyterHubConfigMap = {
121 | 	  "apiVersion": "v1", 	  
122 | 	  "kind": "ConfigMap", 
123 | 	  "metadata": {
124 | 	    "name": "jupyterhub-config",
125 | 	    namespace: namespace,
126 | 	  },
127 |    },
128 | 
129 | 
130 |    jupyterHubConfigMap: baseJupyterHubConfigMap + {
131 |    	  "data": {	    
132 | 	    "jupyterhub_config.py": baseKubeConfigSpawner,
133 | 	  }, 
134 | 	},
135 | 
136 |    jupyterHubConfigMapWithVolumes(volumeClaims): {
137 | 
138 | 
139 | 	local volumes = std.map(function(v) 
140 | 		{
141 |             'name': v,
142 |             'persistentVolumeClaim': {
143 |                 'claimName': v,
144 |             },
145 |         }, volumeClaims),
146 | 
147 | 
148 | 	local volumeMounts = std.map( function(v)
149 |         {
150 |             'mountPath': '/mnt/' + v,
151 |             'name': v,
152 |         },  volumeClaims),
153 | 
154 | 	local extendedBaseKubeConfigSpawner = baseKubeConfigSpawner    
155 |     	+ "\nc.KubeSpawner.volumes = " + std.manifestPython(volumes)
156 |      	+ "\nc.KubeSpawner.volume_mounts = " + std.manifestPython(volumeMounts),
157 | 
158 |      config: baseJupyterHubConfigMap + {   	 
159 | 		 "data": {
160 | 		 	"jupyterhub_config.py": extendedBaseKubeConfigSpawner,
161 | 		 },	 
162 | 	   },
163 | 	 }.config,
164 | 
165 |    jupyterHubService: {
166 | 	  "apiVersion": "v1", 
167 | 	  "kind": "Service", 
168 | 	  "metadata": {
169 | 	    "labels": {
170 | 	      "app": "tf-hub"
171 | 	    }, 
172 | 	    "name": "tf-hub-0",
173 | 	    namespace: namespace,
174 | 	  }, 
175 | 	  "spec": {
176 | 	    "clusterIP": "None", 
177 | 	    "ports": [
178 | 	      {
179 | 	        "name": "hub", 
180 | 	        "port": 8000
181 | 	      }
182 | 	    ], 
183 | 	    "selector": {
184 | 	      "app": "tf-hub"
185 | 	    }
186 | 	  }
187 |    },
188 | 
189 |    jupyterHubLoadBalancer(serviceType): {
190 | 	  "apiVersion": "v1", 
191 | 	  "kind": "Service", 
192 | 	  "metadata": {
193 | 	    "labels": {
194 | 	      "app": "tf-hub"
195 | 	    }, 
196 | 	    "name": "tf-hub-lb",
197 | 	    "namespace": namespace,
198 | 	  }, 
199 | 	  "spec": {
200 | 	    "ports": [
201 | 	      {
202 | 	        "name": "http", 
203 | 	        "port": 80, 
204 | 	        "targetPort": 8000
205 | 	      }
206 | 	    ], 
207 | 	    "selector": {
208 | 	      "app": "tf-hub"
209 | 	    }, 
210 | 	    "type": serviceType
211 | 	  }
212 | 	},
213 | 
214 | 	jupyterHub(image): {
215 | 	  "apiVersion": "apps/v1beta1", 
216 | 	  "kind": "StatefulSet", 
217 | 	  "metadata": {
218 | 	    "name": "tf-hub",
219 | 	    "namespace": namespace,
220 | 	  }, 
221 | 	  "spec": {
222 | 	    "replicas": 1, 
223 | 	    "serviceName": "", 
224 | 	    "template": {
225 | 	      "metadata": {
226 | 	        "labels": {
227 | 	          "app": "tf-hub"
228 | 	        }
229 | 	      }, 
230 | 	      "spec": {
231 | 	        "containers": [
232 | 	          {
233 | 	            "command": [
234 | 	              "jupyterhub", 
235 | 	              "-f", 
236 | 	              "/etc/config/jupyterhub_config.py"
237 | 	            ], 
238 | 	            "image": image, 
239 | 	            "name": "tf-hub", 
240 | 	            "volumeMounts": [
241 | 	              {
242 | 	                "mountPath": "/etc/config", 
243 | 	                "name": "config-volume"
244 | 	              }
245 | 	            ]
246 | 	          }
247 | 	        ], 
248 | 	        "serviceAccountName": "jupyter-hub", 
249 | 	        "volumes": [
250 | 	          {
251 | 	            "configMap": {
252 | 	              "name": "jupyterhub-config"
253 | 	            }, 
254 | 	            "name": "config-volume"
255 | 	          }
256 | 	        ]
257 | 	      }
258 | 	    }, 
259 | 	    "updateStrategy": {
260 | 	      "type": "RollingUpdate"
261 | 	    }
262 | 	  }
263 | 	},
264 | 
265 |    jupyterHubRole: {
266 | 	  "apiVersion": "rbac.authorization.k8s.io/v1beta1", 
267 | 	  "kind": "Role", 
268 | 	  "metadata": {
269 | 	    "name": "jupyter-role", 
270 | 	    "namespace": namespace,
271 | 	  }, 
272 | 	  "rules": [
273 | 	    {
274 | 	      "apiGroups": [
275 | 	        "*"
276 | 	      ], 
277 | 	      // TODO(jlewi): This is very permissive so we may want to lock this down.
278 | 	      "resources": [
279 | 	        "*"
280 | 	      ], 
281 | 	      "verbs": [
282 | 	        "*"
283 | 	      ]
284 | 	    }
285 | 	  ]
286 | 	},
287 |     
288 |    jupyterHubServiceAccount: {
289 |       "apiVersion": "v1", 
290 |       "kind": "ServiceAccount", 
291 |       "metadata": {
292 |         "labels": {
293 |           "app": "jupyter-hub"
294 |         }, 
295 |         "name": "jupyter-hub",
296 |         "namespace": namespace,
297 |       }
298 |     },
299 | 
300 | 	jupyterHubRoleBinding: {
301 | 	  "apiVersion": "rbac.authorization.k8s.io/v1beta1", 
302 | 	  "kind": "RoleBinding", 
303 | 	  "metadata": {
304 | 	    "name": "jupyter-role", 
305 | 	    "namespace": namespace,
306 | 	  }, 
307 | 	  "roleRef": {
308 | 	    "apiGroup": "rbac.authorization.k8s.io", 
309 | 	    "kind": "Role", 
310 | 	    "name": "jupyter-role"
311 | 	  }, 
312 | 	  "subjects": [
313 | 	    {
314 | 	      "kind": "ServiceAccount", 
315 | 	      "name": "jupyter-hub", 
316 | 	      "namespace": namespace,
317 | 	    }
318 | 	  ]
319 | 	},
320 |   }, // parts
321 | }
322 | 


--------------------------------------------------------------------------------
/testing/test_deploy.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | """Test deploying Kubeflow.
  3 | 
  4 | Requirements:
  5 |   This project assumes the py directory in github.com/tensorflow/k8s corresponds
  6 |   to a top level Python package on the Python path.
  7 | 
  8 |   TODO(jlewi): Come up with a better story for how we reuse the py package
  9 |   in tensorflow/k8s. We should probably turn that into a legit Python pip
 10 |   package that is built and released as part of the tensorflow/k8s project.
 11 | """
 12 | 
 13 | import argparse
 14 | import datetime
 15 | import json
 16 | import logging
 17 | import os
 18 | import shutil
 19 | import tempfile
 20 | import uuid
 21 | 
 22 | from kubernetes import client as k8s_client
 23 | from kubernetes.client import rest
 24 | from kubernetes.config import incluster_config
 25 | 
 26 | from py import test_util
 27 | from py import util
 28 | 
 29 | def _setup_test(api_client, run_label):
 30 |   """Create the namespace for the test.
 31 | 
 32 |   Returns:
 33 |     test_dir: The local test directory.
 34 |   """
 35 | 
 36 |   api = k8s_client.CoreV1Api(api_client)
 37 |   namespace = k8s_client.V1Namespace()
 38 |   namespace.api_version = "v1"
 39 |   namespace.kind = "Namespace"
 40 |   namespace.metadata = k8s_client.V1ObjectMeta(name=run_label, labels={
 41 |     "app": "kubeflow-e2e-test",
 42 |     }
 43 |   )
 44 | 
 45 |   try:
 46 |     logging.info("Creating namespace %s", namespace.metadata.name)
 47 |     namespace = api.create_namespace(namespace)
 48 |     logging.info("Namespace %s created.", namespace.metadata.name)
 49 |   except rest.ApiException as e:
 50 |     if e.status == 409:
 51 |       logging.info("Namespace %s already exists.", namespace.metadata.name)
 52 |     else:
 53 |       raise
 54 | 
 55 |   return namespace
 56 | 
 57 | def setup(args):
 58 |   """Test deploying Kubeflow."""
 59 |   if args.cluster:
 60 |     project = args.project
 61 |     cluster_name = args.cluster
 62 |     zone = args.zone
 63 |     logging.info("Using cluster: %s in project: %s in zone: %s",
 64 |                  cluster_name, project, zone)
 65 |     # Print out config to help debug issues with accounts and
 66 |     # credentials.
 67 |     util.run(["gcloud", "config", "list"])
 68 |     util.configure_kubectl(project, zone, cluster_name)
 69 |     util.load_kube_config()
 70 |   else:
 71 |     # TODO(jlewi): This is sufficient for API access but it doesn't create
 72 |     # a kubeconfig file which ksonnet needs for ks init.
 73 |     logging.info("Running inside cluster.")
 74 |     incluster_config.load_incluster_config()
 75 | 
 76 |   # Create an API client object to talk to the K8s master.
 77 |   api_client = k8s_client.ApiClient()
 78 | 
 79 |   now = datetime.datetime.now()
 80 |   run_label = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]
 81 | 
 82 |   if not os.path.exists(args.test_dir):
 83 |     os.makedirs(args.test_dir)
 84 | 
 85 |   logging.info("Using test directory: %s", args.test_dir)
 86 | 
 87 |   namespace_name = run_label
 88 |   def run():
 89 |     namespace = _setup_test(api_client, namespace_name)
 90 |     logging.info("Using namespace: %s", namespace)
 91 |     # Set a GITHUB_TOKEN so that we don't rate limited by GitHub;
 92 |     # see: https://github.com/ksonnet/ksonnet/issues/233
 93 |     os.environ["GITHUB_TOKEN"] = args.github_token
 94 | 
 95 |     # Initialize a ksonnet app.
 96 |     app_name = "kubeflow-test"
 97 |     util.run(["ks", "init", app_name,], cwd=args.test_dir, use_print=True)
 98 | 
 99 |     app_dir = os.path.join(args.test_dir, app_name)
100 | 
101 |     kubeflow_registry = "github.com/google/kubeflow/tree/master/kubeflow"
102 |     util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry], cwd=app_dir)
103 | 
104 |     # Install required packages
105 |     packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]
106 | 
107 |     for p in packages:
108 |       util.run(["ks", "pkg", "install", p], cwd=app_dir)
109 | 
110 |     # Delete the vendor directory and replace with a symlink to the src
111 |     # so that we use the code at the desired commit.
112 |     target_dir = os.path.join(app_dir, "vendor", "kubeflow")
113 | 
114 |     logging.info("Deleting %s", target_dir)
115 |     shutil.rmtree(target_dir)
116 | 
117 |     source = os.path.join(args.test_dir, "src", "kubeflow")
118 |     logging.info("Creating link %s -> %s", target_dir, source)
119 |     os.symlink(source, target_dir)
120 | 
121 |     # Deploy Kubeflow
122 |     util.run(["ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
123 |               "--namespace=" + namespace.metadata.name], cwd=app_dir)
124 | 
125 |     # TODO(jlewi): For reasons I don't understand even though we ran
126 |     # configure_kubectl above, if we don't rerun it we get rbac errors
127 |     # when we do ks apply; I think because we aren't using the proper service
128 |     # account. This might have something to do with the way ksonnet gets
129 |     # its credentials; maybe we need to configure credentials after calling
130 |     # ks init?
131 |     if args.cluster:
132 |       util.configure_kubectl(args.project, args.zone, args.cluster)
133 | 
134 |     apply_command = ["ks", "apply", "default", "-c", "kubeflow-core",]
135 | 
136 |     util.run(apply_command, cwd=app_dir)
137 | 
138 |     # Verify that the TfJob operator is actually deployed.
139 |     tf_job_deployment_name = "tf-job-operator"
140 |     logging.info("Verifying TfJob controller started.")
141 |     util.wait_for_deployment(api_client, namespace.metadata.name,
142 |                              tf_job_deployment_name)
143 | 
144 |     # Verify that JupyterHub is actually deployed.
145 |     jupyter_name = "tf-hub"
146 |     logging.info("Verifying TfHub started.")
147 |     util.wait_for_statefulset(api_client, namespace.metadata.name, jupyter_name)
148 | 
149 |   main_case = test_util.TestCase()
150 |   main_case.class_name = "KubeFlow"
151 |   main_case.name = "deploy-kubeflow"
152 |   try:
153 |     test_util.wrap_test(run, main_case)
154 |   finally:
155 |     # Delete the namespace
156 |     logging.info("Deleting namespace %s", namespace_name)
157 | 
158 |     # We report teardown as a separate test case because this will help
159 |     # us track down issues with garbage collecting namespaces.
160 |     teardown = test_util.TestCase(main_case.class_name, "teardown")
161 |     def run_teardown():
162 |       core_api = k8s_client.CoreV1Api(api_client)
163 |       core_api.delete_namespace(namespace_name, {})
164 | 
165 |     try:
166 |       test_util.wrap_test(run_teardown, teardown)
167 |     except Exception as e:  # pylint: disable-msg=broad-except
168 |       logging.error("There was a problem deleting namespace: %s; %s",
169 |                     namespace_name, e.message)
170 |     junit_path = os.path.join(args.artifacts_dir, "junit_kubeflow-deploy.xml")
171 |     logging.info("Writing test results to %s", junit_path)
172 |     test_util.create_junit_xml_file([main_case, teardown], junit_path)
173 | 
174 | def main():  # pylint: disable=too-many-locals
175 |   logging.getLogger().setLevel(logging.INFO) # pylint: disable=too-many-locals
176 |   # create the top-level parser
177 |   parser = argparse.ArgumentParser(
178 |     description="Test Kubeflow E2E.")
179 | 
180 |   parser.add_argument(
181 |     "--test_dir",
182 |     default="",
183 |     type=str,
184 |     help="Directory to use for all the test files. If not set a temporary "
185 |          "directory is created.")
186 | 
187 |   parser.add_argument(
188 |     "--artifacts_dir",
189 |     default="",
190 |     type=str,
191 |     help="Directory to use for artifacts that should be preserved after "
192 |          "the test runs. Defaults to test_dir if not set.")
193 | 
194 |   parser.add_argument(
195 |     "--project",
196 |     default=None,
197 |     type=str,
198 |     help="The project to use.")
199 | 
200 |   parser.add_argument(
201 |     "--cluster",
202 |     default=None,
203 |     type=str,
204 |     help=("The name of the cluster. If not set assumes the "
205 |           "script is running in a cluster and uses that cluster."))
206 | 
207 |   parser.add_argument(
208 |     "--zone",
209 |     default="us-east1-d",
210 |     type=str,
211 |     help="The zone for the cluster.")
212 | 
213 |   parser.add_argument(
214 |     "--github_token",
215 |     default=None,
216 |     type=str,
217 |     help=("The GitHub API token to use. This is needed since ksonnet uses the "
218 |           "GitHub API and without it we get rate limited. For more info see: "
219 |           "https://github.com/ksonnet/ksonnet/blob/master/docs"
220 |           "/troubleshooting.md"))
221 | 
222 |   args = parser.parse_args()
223 | 
224 |   if not args.test_dir:
225 |     logging.info("--test_dir not set; using a temporary directory.")
226 | 
227 |     now = datetime.datetime.now()
228 |     label = "test_deploy-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]
229 | 
230 |     # Create a temporary directory for this test run
231 |     args.test_dir = os.path.join(tempfile.gettempdir(), label)
232 | 
233 |   if not args.artifacts_dir:
234 |     args.artifacts_dir = args.test_dir
235 |   # Setup a logging file handler. This way we can upload the log outputs
236 |   # to gubernator.
237 |   root_logger = logging.getLogger()
238 | 
239 |   test_log = os.path.join(args.artifacts_dir, "logs", "test_deploy.log.txt")
240 |   if not os.path.exists(os.path.dirname(test_log)):
241 |     os.makedirs(os.path.dirname(test_log))
242 | 
243 |   file_handler = logging.FileHandler(test_log)
244 |   root_logger.addHandler(file_handler)
245 |   # We need to explicitly set the formatter because it will not pick up
246 |   # the BasicConfig.
247 |   formatter = logging.Formatter(fmt=("%(levelname)s|%(asctime)s"
248 |                                      "|%(pathname)s|%(lineno)d| %(message)s"),
249 |                                 datefmt="%Y-%m-%dT%H:%M:%S")
250 |   file_handler.setFormatter(formatter)
251 |   logging.info("Logging to %s", test_log)
252 | 
253 |   if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
254 |     logging.info("GOOGLE_APPLICATION_CREDENTIALS is set; configuring gcloud "
255 |                  "to use service account.")
256 |     # Since a service account is set tell gcloud to use it.
257 |     util.run(["gcloud", "auth", "activate-service-account", "--key-file=" +
258 |               os.getenv("GOOGLE_APPLICATION_CREDENTIALS")])
259 |   setup(args)
260 | 
261 | if __name__ == "__main__":
262 |   logging.basicConfig(level=logging.INFO,
263 |                       format=('%(levelname)s|%(asctime)s'
264 |                               '|%(pathname)s|%(lineno)d| %(message)s'),
265 |                       datefmt='%Y-%m-%dT%H:%M:%S',
266 |                       )
267 |   logging.getLogger().setLevel(logging.INFO)
268 |   main()
269 | 


--------------------------------------------------------------------------------
/testing/test-infra/components/argo.libsonnet:
--------------------------------------------------------------------------------
  1 |   {
  2 |     // TODO(https://github.com/ksonnet/ksonnet/issues/222): Taking namespace as an argument is a work around for the fact that ksonnet
  3 |     // doesn't support automatically piping in the namespace from the environment to prototypes.
  4 | 
  5 |     // TODO(jlewi): Do we need to add parts corresponding to a service account and cluster binding role?
  6 |     // see https://github.com/argoproj/argo/blob/master/cmd/argo/commands/install.go
  7 | 
  8 |     parts(namespace):: {  
  9 |       // CRD's are not namespace scoped; see
 10 |       // https://kubernetes.io/docs/tasks/access-kubernetes-api/extend-api-custom-resource-definitions/
 11 |       crd: {
 12 |         "apiVersion": "apiextensions.k8s.io/v1beta1",
 13 |         "kind": "CustomResourceDefinition",
 14 |         "metadata": {
 15 |             "name": "workflows.argoproj.io",
 16 |         },
 17 |         "spec": {
 18 |             "group": "argoproj.io",
 19 |             "names": {
 20 |                 "kind": "Workflow",
 21 |                 "listKind": "WorkflowList",
 22 |                 "plural": "workflows",
 23 |                 "shortNames": [
 24 |                     "wf"
 25 |                 ],
 26 |                 "singular": "workflow"
 27 |             },
 28 |             "scope": "Namespaced",
 29 |             "version": "v1alpha1"
 30 |         },       
 31 |       }, // crd
 32 | 
 33 |       // Deploy the controller
 34 |       deploy: {
 35 |         "apiVersion": "extensions/v1beta1",
 36 |         "kind": "Deployment",        
 37 |             "labels": {
 38 |                 "app": "workflow-controller"
 39 |             },
 40 |         "metadata": {
 41 |             "name": "workflow-controller",
 42 |             "namespace": namespace,            
 43 |         },
 44 |         "spec": {
 45 |             "progressDeadlineSeconds": 600,
 46 |             "replicas": 1,
 47 |             "revisionHistoryLimit": 10,
 48 |             "selector": {
 49 |                 "matchLabels": {
 50 |                     "app": "workflow-controller"
 51 |                 }
 52 |             },
 53 |             "strategy": {
 54 |                 "rollingUpdate": {
 55 |                     "maxSurge": "25%",
 56 |                     "maxUnavailable": "25%"
 57 |                 },
 58 |                 "type": "RollingUpdate"
 59 |             },
 60 |             "template": {
 61 |                 "metadata": {
 62 |                     "creationTimestamp": null,
 63 |                     "labels": {
 64 |                         "app": "workflow-controller"
 65 |                     }
 66 |                 },
 67 |                 "spec": {
 68 |                     "containers": [
 69 |                         {
 70 |                             "args": [
 71 |                                 "--configmap",
 72 |                                 "workflow-controller-configmap"
 73 |                             ],
 74 |                             "command": [
 75 |                                 "workflow-controller"
 76 |                             ],
 77 |                             "env": [
 78 |                                 {
 79 |                                     "name": "ARGO_NAMESPACE",
 80 |                                     "valueFrom": {
 81 |                                         "fieldRef": {
 82 |                                             "apiVersion": "v1",
 83 |                                             "fieldPath": "metadata.namespace"
 84 |                                         }
 85 |                                     }
 86 |                                 }
 87 |                             ],
 88 |                             "image": "argoproj/workflow-controller:v2.0.0-alpha3",
 89 |                             "imagePullPolicy": "IfNotPresent",
 90 |                             "name": "workflow-controller",
 91 |                             "resources": {},
 92 |                             "terminationMessagePath": "/dev/termination-log",
 93 |                             "terminationMessagePolicy": "File"
 94 |                         }
 95 |                     ],
 96 |                     "dnsPolicy": "ClusterFirst",
 97 |                     "restartPolicy": "Always",
 98 |                     "schedulerName": "default-scheduler",
 99 |                     "securityContext": {},
100 |                     "serviceAccount": "argo",
101 |                     "serviceAccountName": "argo",
102 |                     "terminationGracePeriodSeconds": 30
103 |                 }
104 |             }
105 |         },
106 |       }, // deploy
107 | 
108 | 
109 |       deployUi: {
110 |         "apiVersion": "extensions/v1beta1",
111 |         "kind": "Deployment",
112 |         "metadata": {         
113 |             "labels": {
114 |                 "app": "argo-ui"
115 |             },
116 |             "name": "argo-ui",
117 |             "namespace": namespace,            
118 |         },
119 |         "spec": {
120 |             "progressDeadlineSeconds": 600,
121 |             "replicas": 1,
122 |             "revisionHistoryLimit": 10,
123 |             "selector": {
124 |                 "matchLabels": {
125 |                     "app": "argo-ui"
126 |                 }
127 |             },
128 |             "strategy": {
129 |                 "rollingUpdate": {
130 |                     "maxSurge": "25%",
131 |                     "maxUnavailable": "25%"
132 |                 },
133 |                 "type": "RollingUpdate"
134 |             },
135 |             "template": {
136 |                 "metadata": {
137 |                     "creationTimestamp": null,
138 |                     "labels": {
139 |                         "app": "argo-ui"
140 |                     }
141 |                 },
142 |                 "spec": {
143 |                     "containers": [
144 |                         {
145 |                             "env": [
146 |                                 {
147 |                                     "name": "ARGO_NAMESPACE",
148 |                                     "valueFrom": {
149 |                                         "fieldRef": {
150 |                                             "apiVersion": "v1",
151 |                                             "fieldPath": "metadata.namespace"
152 |                                         }
153 |                                     }
154 |                                 },
155 |                                 {
156 |                                     "name": "IN_CLUSTER",
157 |                                     "value": "true"
158 |                                 }
159 |                             ],
160 |                             "image": "argoproj/argoui:v2.0.0-alpha3",
161 |                             "imagePullPolicy": "IfNotPresent",
162 |                             "name": "argo-ui",
163 |                             "resources": {},
164 |                             "terminationMessagePath": "/dev/termination-log",
165 |                             "terminationMessagePolicy": "File"
166 |                         }
167 |                     ],
168 |                     "dnsPolicy": "ClusterFirst",
169 |                     "restartPolicy": "Always",
170 |                     "schedulerName": "default-scheduler",
171 |                     "securityContext": {},
172 |                     "serviceAccount": "argo",
173 |                     "serviceAccountName": "argo",
174 |                     "terminationGracePeriodSeconds": 30,
175 |                     "readinessProbe": {
176 |                       "httpGet": {
177 |                         "path": "/", 
178 |                         "port": 8001,
179 |                       }
180 |                     },
181 |                 }
182 |             }
183 |         },
184 |       }, // deployUi
185 | 
186 |       uiIngress:: {
187 |           "apiVersion": "extensions/v1beta1", 
188 |           "kind": "Ingress", 
189 |           "metadata": {
190 |             "name": "argo-ui",
191 |             "namespace": namespace,
192 |           }, 
193 |           "annotations": {
194 |             "kubernetes.io/ingress.global-static-ip-name": "argo-ui",
195 |           },
196 |           "spec": {
197 |             "rules": [
198 |               {
199 |                 "http": {
200 |                   "paths": [
201 |                      {
202 |                       "backend": {
203 |                         "serviceName": "argo-ui",
204 |                         "servicePort": 80,
205 |                       }, 
206 |                       "path": "/*"
207 |                     },
208 |                   ]
209 |                 }
210 |               }
211 |             ],
212 |           }
213 |       }, // ingress
214 |       
215 |       uiService: {
216 |         "apiVersion": "v1", 
217 |         "kind": "Service", 
218 |         "metadata": {
219 |           "labels": {
220 |             "app": "argo-ui"
221 |           }, 
222 |           "name": "argo-ui",
223 |           "namespace": namespace,
224 |         }, 
225 |         "spec": {
226 |           "ports": [
227 |             {
228 |               "port": 80, 
229 |               "targetPort": 8001
230 |             }
231 |           ], 
232 |           "selector": {
233 |             "app": "argo-ui"
234 |           }, 
235 |           "sessionAffinity": "None", 
236 |           "type": "NodePort",
237 |         }
238 |       },
239 | 
240 |       config: {
241 |         "apiVersion": "v1", 
242 |         "data": {
243 |           "config": @"executorImage: argoproj/argoexec:v2.0.0-alpha2"
244 |         }, 
245 |         "kind": "ConfigMap", 
246 |         "metadata": {
247 |           "name": "workflow-controller-configmap",
248 |           "namespace": namespace,
249 |         }
250 |       },
251 | 
252 |       serviceAccount: {
253 |           "apiVersion": "v1",
254 |           "kind": "ServiceAccount",
255 |           "metadata": {
256 |               "name": "argo",
257 |               "namespace": namespace,              
258 |           },
259 |       }, // service account
260 | 
261 |       // TODO(jlewi): Do we really need cluster admin privileges? Why?
262 |       // is this just because workflow controller is trying to create the CRD?
263 |       roleBinding: {
264 |           "apiVersion": "rbac.authorization.k8s.io/v1",
265 |           "kind": "ClusterRoleBinding",
266 |           "metadata": {
267 |               "name": "argo-cluster-role",
268 |               "namespace": namespace,
269 |           },
270 |           "roleRef": {
271 |               "apiGroup": "rbac.authorization.k8s.io",
272 |               "kind": "ClusterRole",
273 |               "name": "cluster-admin"
274 |           },
275 |           "subjects": [
276 |               {
277 |                   "kind": "ServiceAccount",
278 |                   "name": "argo",
279 |                   "namespace": namespace,
280 |               }
281 |           ]
282 |       },  // role binding 
283 | 
284 |       // The steps in the workflow use the default service account.
285 |       // The default service account needs sufficient permission in order
286 |       // to create namespaces and other objects used in the test.
287 |       defaultRoleBinding: {
288 |           "apiVersion": "rbac.authorization.k8s.io/v1",
289 |           "kind": "ClusterRoleBinding",
290 |           "metadata": {
291 |               "name": "default-role",
292 |               "namespace": namespace,
293 |           },
294 |           "roleRef": {
295 |               "apiGroup": "rbac.authorization.k8s.io",
296 |               "kind": "ClusterRole",
297 |               "name": "cluster-admin"
298 |           },
299 |           "subjects": [
300 |               {
301 |                   "kind": "ServiceAccount",
302 |                   "name": "default",
303 |                   "namespace": namespace,
304 |               }
305 |           ]
306 |       },  // default role binding 
307 |   } // parts
308 | }


--------------------------------------------------------------------------------
/kubeflow/core/tf-job.libsonnet:
--------------------------------------------------------------------------------
  1 | 
  2 | {  
  3 |   // TODO(https://github.com/ksonnet/ksonnet/issues/222): Taking namespace as an argument is a work around for the fact that ksonnet
  4 |   // doesn't support automatically piping in the namespace from the environment to prototypes.
  5 |   parts(namespace):: {    
  6 |     tfJobDeploy(image): {
  7 |       "apiVersion": "extensions/v1beta1", 
  8 |       "kind": "Deployment", 
  9 |       "metadata": {
 10 |         "name": "tf-job-operator",
 11 |         "namespace": namespace,
 12 |       }, 
 13 |       "spec": {
 14 |         "replicas": 1, 
 15 |         "template": {
 16 |           "metadata": {
 17 |             "labels": {
 18 |               "name": "tf-job-operator"
 19 |             }
 20 |           }, 
 21 |           "spec": {
 22 |             "containers": [
 23 |               {
 24 |                 "command": [
 25 |                   "/opt/mlkube/tf_operator", 
 26 |                   "--controller-config-file=/etc/config/controller_config_file.yaml",
 27 |                   "--alsologtostderr",
 28 |                   "-v=1",
 29 |                 ], 
 30 |                 "env": [
 31 |                   {
 32 |                     "name": "MY_POD_NAMESPACE", 
 33 |                     "valueFrom": {
 34 |                       "fieldRef": {
 35 |                         "fieldPath": "metadata.namespace"
 36 |                       }
 37 |                     }
 38 |                   }, 
 39 |                   {
 40 |                     "name": "MY_POD_NAME", 
 41 |                     "valueFrom": {
 42 |                       "fieldRef": {
 43 |                         "fieldPath": "metadata.name"
 44 |                       }
 45 |                     }
 46 |                   }
 47 |                 ], 
 48 |                 "image": image, 
 49 |                 "name": "tf-job-operator", 
 50 |                 "volumeMounts": [
 51 |                   {
 52 |                     "mountPath": "/etc/config", 
 53 |                     "name": "config-volume"
 54 |                   }
 55 |                 ]
 56 |               }
 57 |             ], 
 58 |             "serviceAccountName": "tf-job-operator", 
 59 |             "volumes": [
 60 |               {
 61 |                 "configMap": {
 62 |                   "name": "tf-job-operator-config"
 63 |                 }, 
 64 |                 "name": "config-volume"
 65 |               }
 66 |             ]
 67 |           }
 68 |         }
 69 |       }      
 70 |     },  // tfJobDeploy
 71 |   
 72 |     // Default value for 
 73 |     defaultControllerConfig(tfDefaultImage):: {
 74 |       grpcServerFilePath: "/opt/mlkube/grpc_tensorflow_server/grpc_tensorflow_server.py",      
 75 |     }
 76 |     +   if tfDefaultImage != "" && tfDefaultImage != "null" then
 77 |         {
 78 |           tfImage: tfDefaultImage,
 79 |         }
 80 |         else
 81 |         {},
 82 | 
 83 |     azureAccelerators:: {
 84 |       accelerators: {
 85 |         "alpha.kubernetes.io/nvidia-gpu": {
 86 |           volumes: [
 87 |             { name: "lib",
 88 |               mountPath: "/usr/local/nvidia/lib64",
 89 |               hostPath:  "/usr/lib/nvidia-384",
 90 |             },
 91 |             {
 92 |               name: "bin",
 93 |               mountPath: "/usr/local/nvidia/bin",
 94 |               hostPath: "/usr/lib/nvidia-384/bin",
 95 |             },
 96 |             { name: "libcuda",
 97 |               mountPath: "/usr/lib/x86_64-linux-gnu/libcuda.so.1",
 98 |               hostPath: "/usr/lib/x86_64-linux-gnu/libcuda.so.1",
 99 |             },
100 |           ]        
101 |         }  
102 |       }
103 |     },
104 | 
105 |     configData(cloud, tfDefaultImage):: self.defaultControllerConfig(tfDefaultImage) + 
106 |       if cloud == "azure" then
107 |         self.azureAccelerators
108 |       else
109 |         {},
110 | 
111 |     configMap(cloud, tfDefaultImage): {      
112 |       "apiVersion": "v1", 
113 |       "data": {        
114 |         "controller_config_file.yaml": std.manifestJson($.parts(namespace).configData(cloud, tfDefaultImage)),
115 |       }, 
116 |       "kind": "ConfigMap", 
117 |       "metadata": {
118 |         "name": "tf-job-operator-config",
119 |         "namespace": namespace,
120 |       }
121 |     },
122 | 
123 |     serviceAccount: {
124 |       "apiVersion": "v1", 
125 |       "kind": "ServiceAccount", 
126 |       "metadata": {
127 |         "labels": {
128 |           "app": "tf-job-operator"
129 |         }, 
130 |         "name": "tf-job-operator",
131 |         "namespace": namespace,
132 |       }
133 |     },
134 | 
135 |     operatorRole: {
136 |       "apiVersion": "rbac.authorization.k8s.io/v1beta1", 
137 |       "kind": "ClusterRole", 
138 |       "metadata": {
139 |         "labels": {
140 |           "app": "tf-job-operator"
141 |         }, 
142 |         "name": "tf-job-operator"
143 |       }, 
144 |       "rules": [
145 |         {
146 |           "apiGroups": [
147 |             "tensorflow.org"
148 |           ], 
149 |           "resources": [
150 |             "tfjobs"
151 |           ], 
152 |           "verbs": [
153 |             "*"
154 |           ]
155 |         }, 
156 |         {
157 |           "apiGroups": [
158 |             "apiextensions.k8s.io"
159 |           ], 
160 |           "resources": [
161 |             "customresourcedefinitions"
162 |           ], 
163 |           "verbs": [
164 |             "*"
165 |           ]
166 |         }, 
167 |         {
168 |           "apiGroups": [
169 |             "storage.k8s.io"
170 |           ], 
171 |           "resources": [
172 |             "storageclasses"
173 |           ], 
174 |           "verbs": [
175 |             "*"
176 |           ]
177 |         }, 
178 |         {
179 |           "apiGroups": [
180 |             "batch"
181 |           ], 
182 |           "resources": [
183 |             "jobs"
184 |           ], 
185 |           "verbs": [
186 |             "*"
187 |           ]
188 |         }, 
189 |         {
190 |           "apiGroups": [
191 |             ""
192 |           ], 
193 |           "resources": [
194 |             "configmaps", 
195 |             "pods", 
196 |             "services", 
197 |             "endpoints", 
198 |             "persistentvolumeclaims", 
199 |             "events"
200 |           ], 
201 |           "verbs": [
202 |             "*"
203 |           ]
204 |         }, 
205 |         {
206 |           "apiGroups": [
207 |             "apps", 
208 |             "extensions"
209 |           ], 
210 |           "resources": [
211 |             "deployments"
212 |           ], 
213 |           "verbs": [
214 |             "*"
215 |           ]
216 |         }
217 |       ]
218 |     }, // operator-role
219 | 
220 |     operatorRoleBinding:: {
221 |       "apiVersion": "rbac.authorization.k8s.io/v1beta1", 
222 |       "kind": "ClusterRoleBinding", 
223 |       "metadata": {
224 |         "labels": {
225 |           "app": "tf-job-operator"
226 |         }, 
227 |         "name": "tf-job-operator"
228 |       }, 
229 |       "roleRef": {
230 |         "apiGroup": "rbac.authorization.k8s.io", 
231 |         "kind": "ClusterRole", 
232 |         "name": "tf-job-operator"
233 |       }, 
234 |       "subjects": [
235 |         {
236 |           "kind": "ServiceAccount", 
237 |           "name": "tf-job-operator", 
238 |           "namespace": namespace,
239 |         }
240 |       ]
241 |     }, // operator-role binding
242 | 
243 |     uiService(serviceType):: {
244 |       "apiVersion": "v1", 
245 |       "kind": "Service", 
246 |       "metadata": {
247 |         "name": "tf-job-dashboard",
248 |         "namespace": namespace,
249 |       }, 
250 |       "spec": {
251 |         "ports": [
252 |           {
253 |             "port": 80, 
254 |             "targetPort": 8080
255 |           }
256 |         ], 
257 |         "selector": {
258 |           "name": "tf-job-dashboard"
259 |         }, 
260 |         "type": serviceType,
261 |       }
262 |     }, // uiService
263 | 
264 |     uiServiceAccount: {
265 |       "apiVersion": "v1", 
266 |       "kind": "ServiceAccount", 
267 |       "metadata": {
268 |         "labels": {
269 |           "app": "tf-job-dashboard"
270 |         }, 
271 |         "name": "tf-job-dashboard",
272 |         "namespace": namespace,
273 |       }
274 |     }, // uiServiceAccount
275 | 
276 |     ui(image):: {
277 |       "apiVersion": "extensions/v1beta1", 
278 |       "kind": "Deployment", 
279 |       "metadata": {
280 |         "name": "tf-job-dashboard",
281 |         "namespace": namespace,
282 |       }, 
283 |       "spec": {
284 |         "template": {
285 |           "metadata": {
286 |             "labels": {
287 |               "name": "tf-job-dashboard"
288 |             }
289 |           }, 
290 |           "spec": {
291 |             "containers": [
292 |               {
293 |                 "command": [
294 |                   "/opt/tensorflow_k8s/dashboard/backend"
295 |                 ], 
296 |                 "image": image,
297 |                 "name": "tf-job-dashboard", 
298 |                 "ports": [
299 |                   {
300 |                     "containerPort": 8080
301 |                   }
302 |                 ]
303 |               }
304 |             ],
305 |             "serviceAccountName": "tf-job-dashboard", 
306 |           }
307 |         }
308 |       },
309 |     }, // ui
310 | 
311 |     uiRole:: {
312 |       "apiVersion": "rbac.authorization.k8s.io/v1beta1", 
313 |       "kind": "ClusterRole", 
314 |       "metadata": {
315 |         "labels": {
316 |           "app": "tf-job-dashboard"
317 |         }, 
318 |         "name": "tf-job-dashboard"
319 |       }, 
320 |       "rules": [
321 |         {
322 |           "apiGroups": [
323 |             "tensorflow.org"
324 |           ], 
325 |           "resources": [
326 |             "tfjobs"
327 |           ], 
328 |           "verbs": [
329 |             "*"
330 |           ]
331 |         }, 
332 |         {
333 |           "apiGroups": [
334 |             "apiextensions.k8s.io"
335 |           ], 
336 |           "resources": [
337 |             "customresourcedefinitions"
338 |           ], 
339 |           "verbs": [
340 |             "*"
341 |           ]
342 |         }, 
343 |         {
344 |           "apiGroups": [
345 |             "storage.k8s.io"
346 |           ], 
347 |           "resources": [
348 |             "storageclasses"
349 |           ], 
350 |           "verbs": [
351 |             "*"
352 |           ]
353 |         }, 
354 |         {
355 |           "apiGroups": [
356 |             "batch"
357 |           ], 
358 |           "resources": [
359 |             "jobs"
360 |           ], 
361 |           "verbs": [
362 |             "*"
363 |           ]
364 |         }, 
365 |         {
366 |           "apiGroups": [
367 |             ""
368 |           ], 
369 |           "resources": [
370 |             "configmaps", 
371 |             "pods", 
372 |             "services", 
373 |             "endpoints", 
374 |             "persistentvolumeclaims", 
375 |             "events"
376 |           ], 
377 |           "verbs": [
378 |             "*"
379 |           ]
380 |         }, 
381 |         {
382 |           "apiGroups": [
383 |             "apps", 
384 |             "extensions"
385 |           ], 
386 |           "resources": [
387 |             "deployments"
388 |           ], 
389 |           "verbs": [
390 |             "*"
391 |           ]
392 |         }
393 |       ]
394 |     }, // uiRole 
395 | 
396 |     uiRoleBinding:: {
397 |       "apiVersion": "rbac.authorization.k8s.io/v1beta1", 
398 |       "kind": "ClusterRoleBinding", 
399 |       "metadata": {
400 |         "labels": {
401 |           "app": "tf-job-dashboard"
402 |         }, 
403 |         "name": "tf-job-dashboard"
404 |       }, 
405 |       "roleRef": {
406 |         "apiGroup": "rbac.authorization.k8s.io", 
407 |         "kind": "ClusterRole", 
408 |         "name": "tf-job-dashboard"
409 |       }, 
410 |       "subjects": [
411 |         {
412 |           "kind": "ServiceAccount", 
413 |           "name": "tf-job-dashboard", 
414 |           "namespace": namespace,
415 |         }
416 |       ]
417 |     }, // uiRoleBinding
418 |   },
419 | }
420 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         http://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS
178 | 
179 |    APPENDIX: How to apply the Apache License to your work.
180 | 
181 |       To apply the Apache License to your work, attach the following
182 |       boilerplate notice, with the fields enclosed by brackets "[]"
183 |       replaced with your own identifying information. (Don't include
184 |       the brackets!)  The text should be enclosed in the appropriate
185 |       comment syntax for the file format. We also recommend that a
186 |       file or class name and description of purpose be included on the
187 |       same "printed page" as the copyright notice for easier
188 |       identification within third-party archives.
189 | 
190 |    Copyright [yyyy] [name of copyright owner]
191 | 
192 |    Licensed under the Apache License, Version 2.0 (the "License");
193 |    you may not use this file except in compliance with the License.
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.
203 | 


--------------------------------------------------------------------------------