├── testing ├── __init__.py ├── version.json ├── test-infra │ ├── environments │ │ ├── prow │ │ │ ├── spec.json │ │ │ ├── params.libsonnet │ │ │ ├── main.jsonnet │ │ │ └── .metadata │ │ │ │ └── k.libsonnet │ │ └── base.libsonnet │ ├── .ksonnet │ │ └── registries │ │ │ ├── kubeflow │ │ │ └── 5c35580d76092788b089cb447be3f3097cffe60b.yaml │ │ │ └── incubator │ │ │ └── ea3408d44c2d8ea4d321364e5533d5c60e74bce0.yaml │ ├── components │ │ ├── workflows.jsonnet │ │ ├── argo.jsonnet │ │ ├── params.libsonnet │ │ ├── nfs-jupyter.jsonnet │ │ ├── workflows.libsonnet │ │ └── argo.libsonnet │ ├── app.yaml │ └── debug_pod.yaml ├── argo_client_test.py ├── bootstrap.sh ├── Makefile ├── argo_client.py ├── checkout.sh ├── run_e2e_workflow_test.py ├── Dockerfile ├── prow_artifacts_test.py ├── README.md ├── run_e2e_workflow.py ├── prow_artifacts.py └── test_deploy.py ├── .gitmodules ├── .travis.yml ├── components ├── tf-controller │ ├── README.md │ ├── Makefile │ └── deploy_crd.yaml ├── k8s-model-server │ ├── inception-client │ │ ├── images │ │ │ └── sleeping-pepper.jpg │ │ ├── requirements.txt │ │ ├── run.sh │ │ ├── Dockerfile │ │ └── label.py │ └── docker │ │ ├── Makefile │ │ └── Dockerfile └── jupyterhub │ ├── docker │ ├── Makefile │ └── Dockerfile │ └── README.md ├── kubeflow ├── registry.yaml ├── generate_docs.py ├── README.md ├── core │ ├── parts.yaml │ ├── README.md │ ├── prototypes │ │ └── all.jsonnet │ ├── nfs.libsonnet │ ├── jupyterhub.libsonnet │ └── tf-job.libsonnet ├── tf-job │ ├── parts.yaml │ ├── tf-job.libsonnet │ ├── prototypes │ │ ├── tf-job.jsonnet │ │ └── tf-cnn-benchmarks.jsonnet │ └── README.md └── tf-serving │ ├── parts.yaml │ ├── prototypes │ └── tf-serving-all-features.jsonnet │ ├── README.md │ └── tf-serving.libsonnet ├── tf-controller-examples └── tf-cnn │ ├── README.md │ ├── Dockerfile.template │ ├── Dockerfile.cpu │ ├── Dockerfile.gpu │ ├── tf_job_gpu.yaml │ ├── tf_job_gpu_distributed.yaml │ ├── tf_job_cpu.yaml │ ├── tf_job_cpu_distributed.yaml │ ├── Makefile 
│ ├── launcher.py │ └── create_job_specs.py ├── .gitignore ├── Makefile ├── CONTRIBUTING.md ├── README.md └── LICENSE /testing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /testing/version.json: -------------------------------------------------------------------------------- 1 | {"image": "gcr.io/mlkube-testing/kubeflow-testing:v20180104-ce39a55-e3b0c4"} 2 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "tensorflow_k8s"] 2 | path = tensorflow_k8s 3 | url = https://github.com/tensorflow/k8s.git 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | script: 2 | - make 3 | 4 | notifications: 5 | email: 6 | on_success: never 7 | on_failure: never 8 | -------------------------------------------------------------------------------- /testing/test-infra/environments/prow/spec.json: -------------------------------------------------------------------------------- 1 | { 2 | "server": "https://35.196.185.88", 3 | "namespace": "kubeflow-testing" 4 | } -------------------------------------------------------------------------------- /components/tf-controller/README.md: -------------------------------------------------------------------------------- 1 | Deployment manifests for Tensorflow Kubernetes controller hosted at https://github.com/tensorflow/k8s -------------------------------------------------------------------------------- /testing/test-infra/environments/base.libsonnet: -------------------------------------------------------------------------------- 1 | local components = std.extVar("__ksonnet/components"); 2 | components + { 3 | // Insert user-specified overrides here. 
4 | } 5 | -------------------------------------------------------------------------------- /components/k8s-model-server/inception-client/images/sleeping-pepper.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dynamicwebpaige/kubeflow/master/components/k8s-model-server/inception-client/images/sleeping-pepper.jpg -------------------------------------------------------------------------------- /kubeflow/registry.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: '0.1' 2 | kind: ksonnet.io/registry 3 | libraries: 4 | core: 5 | version: master 6 | path: core 7 | tf-job: 8 | version: master 9 | path: tf-job 10 | tf-serving: 11 | version: master 12 | path: tf-serving -------------------------------------------------------------------------------- /testing/test-infra/environments/prow/params.libsonnet: -------------------------------------------------------------------------------- 1 | local params = import "../../components/params.libsonnet"; 2 | params + { 3 | components +: { 4 | // Insert component parameter overrides here. Ex: 5 | // guestbook +: { 6 | // name: "guestbook-dev", 7 | // replicas: params.global.replicas, 8 | // }, 9 | }, 10 | } 11 | -------------------------------------------------------------------------------- /testing/test-infra/environments/prow/main.jsonnet: -------------------------------------------------------------------------------- 1 | local base = import "../base.libsonnet"; 2 | local k = import "k.libsonnet"; 3 | 4 | base + { 5 | // Insert user-specified overrides here. 
For example if a component is named "nginx-deployment", you might have something like: 6 | // "nginx-deployment"+: k.deployment.mixin.metadata.labels({foo: "bar"}) 7 | } 8 | -------------------------------------------------------------------------------- /testing/test-infra/.ksonnet/registries/kubeflow/5c35580d76092788b089cb447be3f3097cffe60b.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: "0.1" 2 | gitVersion: 3 | commitSha: 5c35580d76092788b089cb447be3f3097cffe60b 4 | refSpec: master 5 | kind: ksonnet.io/registry 6 | libraries: 7 | core: 8 | path: core 9 | version: master 10 | tf-serving: 11 | path: tf-serving 12 | version: master 13 | -------------------------------------------------------------------------------- /components/k8s-model-server/inception-client/requirements.txt: -------------------------------------------------------------------------------- 1 | backports.weakref==1.0.post1 2 | bleach==1.5.0 3 | enum34==1.1.6 4 | funcsigs==1.0.2 5 | futures==3.2.0 6 | grpcio==1.8.3 7 | html5lib==0.9999999 8 | Markdown==2.6.11 9 | mock==2.0.0 10 | numpy==1.13.3 11 | pbr==3.1.1 12 | protobuf==3.5.1 13 | six==1.11.0 14 | tensorflow==1.4.1 15 | tensorflow-serving-api==1.4.0 16 | tensorflow-tensorboard==0.4.0rc3 17 | Werkzeug==0.14.1 18 | -------------------------------------------------------------------------------- /tf-controller-examples/tf-cnn/README.md: -------------------------------------------------------------------------------- 1 | # Training TF CNN models 2 | 3 | This directory contains code to train convolutional 4 | neural networks using [tf_cnn_benchmarks](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks) 5 | which is optimized for performance. 6 | 7 | 8 | The jobs can be run on a cluster just by running kubectl 9 | 10 | e.g. 
11 | 12 | ``` 13 | kubectl create -f tf_job_gpu.yaml 14 | ``` 15 | 16 | By default the examples run using synthetic data and save the trained model 17 | inside the container. -------------------------------------------------------------------------------- /testing/test-infra/components/workflows.jsonnet: -------------------------------------------------------------------------------- 1 | local params = std.extVar("__ksonnet/params").components["workflows"]; 2 | 3 | local k = import 'k.libsonnet'; 4 | local workflows = import 'workflows.libsonnet'; 5 | local namespace = params.namespace; 6 | 7 | // TODO(jlewi): Can we make name default so some random unique value? 8 | // I didn't see any routines in the standard library for datetime or random. 9 | local name = params.name; 10 | 11 | local prowEnv = workflows.parseEnv(params.prow_env); 12 | local bucket = params.bucket; 13 | std.prune(k.core.v1.list.new([workflows.parts(namespace, name).e2e(prowEnv, bucket),])) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # pkg and bin directories currently contain build artifacts 2 | # only so we exclude them. 3 | bin/ 4 | vendor/ 5 | 6 | .vscode/ 7 | 8 | # Compiled python files. 9 | *.pyc 10 | 11 | # Emacs temporary files 12 | *~ 13 | 14 | # Other temporary files 15 | .DS_Store 16 | 17 | # Files created by Gogland IDE 18 | .idea/ 19 | 20 | # Exclude wheel files for now. 21 | # The only wheel file is the TF wheel one which is quite large. 22 | # We don't want to check that into source control because it could be 23 | # quite large. 
24 | *.whl 25 | 26 | # Bazel files 27 | **/bazel-* 28 | # Examples egg 29 | examples/tf_sample/tf_sample.egg-info/ 30 | examples/.ipynb_checkpoints/ 31 | 32 | **/.ipynb_checkpoints 33 | -------------------------------------------------------------------------------- /testing/test-infra/app.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 0.0.1 2 | kind: ksonnet.io/app 3 | libraries: 4 | core: 5 | gitVersion: 6 | commitSha: 5c35580d76092788b089cb447be3f3097cffe60b 7 | refSpec: master 8 | name: core 9 | registry: kubeflow 10 | name: test-infra 11 | registries: 12 | incubator: 13 | gitVersion: 14 | commitSha: ea3408d44c2d8ea4d321364e5533d5c60e74bce0 15 | refSpec: master 16 | protocol: github 17 | uri: github.com/ksonnet/parts/tree/master/incubator 18 | kubeflow: 19 | gitVersion: 20 | commitSha: 5c35580d76092788b089cb447be3f3097cffe60b 21 | refSpec: master 22 | protocol: github 23 | uri: github.com/google/kubeflow/tree/master/kubeflow 24 | version: 0.0.1 25 | -------------------------------------------------------------------------------- /testing/test-infra/components/argo.jsonnet: -------------------------------------------------------------------------------- 1 | local params = std.extVar("__ksonnet/params").components["argo"]; 2 | 3 | local k = import 'k.libsonnet'; 4 | local argo = import 'argo.libsonnet'; 5 | local namespace = params.namespace; 6 | 7 | std.prune(k.core.v1.list.new([argo.parts(namespace).crd, 8 | argo.parts(namespace).config, 9 | argo.parts(namespace).deploy, 10 | argo.parts(namespace).deployUi, 11 | argo.parts(namespace).uiService, 12 | argo.parts(namespace).uiIngress, 13 | argo.parts(namespace).serviceAccount, 14 | argo.parts(namespace).roleBinding, 15 | argo.parts(namespace).defaultRoleBinding,])) -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Copyright 2015 
Google Inc. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | 14 | all: presubmit 15 | 16 | presubmit: 17 | @echo ">> checking file boilerplate" 18 | @./build/check_boilerplate.sh 19 | 20 | TAG?=$(shell git rev-parse HEAD) 21 | .PHONY: all presubmit 22 | -------------------------------------------------------------------------------- /kubeflow/generate_docs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # This script assumes you ran go install github.com/ksonnet/parts/doc-gen 4 | 5 | import glob 6 | import os 7 | import subprocess 8 | 9 | if __name__ == "__main__": 10 | this_dir = os.path.dirname(__file__) 11 | 12 | GOPATH = os.getenv("GOPATH") 13 | doc_gen = os.path.join(GOPATH, "bin/doc-gen") 14 | for f in os.listdir(this_dir): 15 | full_dir = os.path.join(this_dir, f) 16 | if not os.path.isdir(f): 17 | continue 18 | prototypes = glob.glob(os.path.join(full_dir, "prototypes/*.jsonnet")) 19 | 20 | 21 | command = [doc_gen, os.path.join(full_dir, "parts.yaml")] 22 | command.extend(prototypes) 23 | with open(os.path.join(full_dir, "README.md"), "w") as hout: 24 | subprocess.check_call(command, stdout=hout) 25 | -------------------------------------------------------------------------------- /testing/argo_client_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import unittest 4 | 5 | from 
testing import argo_client 6 | from kubernetes import client as k8s_client 7 | import mock 8 | import os 9 | import yaml 10 | from py import util 11 | 12 | class ArgoClientTest(unittest.TestCase): 13 | def setUp(self): 14 | self.test_dir = os.path.join(os.path.dirname(__file__), "test-data") 15 | 16 | def test_wait_for_workflow(self): 17 | api_client = mock.MagicMock(spec=k8s_client.ApiClient) 18 | 19 | with open(os.path.join(self.test_dir, "successful_workflow.yaml")) as hf: 20 | response = yaml.load(hf) 21 | 22 | api_client.call_api.return_value = response 23 | result = argo_client.wait_for_workflow(api_client, "some-namespace", "some-set") 24 | self.assertIsNotNone(result) 25 | 26 | if __name__ == "__main__": 27 | unittest.main() -------------------------------------------------------------------------------- /kubeflow/README.md: -------------------------------------------------------------------------------- 1 | # Kubeflow Ksonnet Registry 2 | 3 | ## Overview 4 | 5 | This directory contains the Kubeflow ksonnet [registry][2]. If you are unfamiliar with ksonnet, we recommend browsing [the official site][1] to gain more context. 6 | 7 | 8 | ## Usage 9 | 10 | Please refer to the [Kubeflow user guide](https://github.com/google/kubeflow/blob/master/README.ksonnet.md) 11 | 12 | ## Library-specific Documentation 13 | 14 | Each of the libraries in this directory has its own README.md. These are autogenerated from the metadata in their `parts.yaml` file, using the [`doc-gen` script][4]. 15 | 16 | Note that you can use the `ks` commands in your terminal to access this same documentation. 
17 | 18 | [1]: https://ksonnet.io 19 | [2]: https://ksonnet.io/docs/concepts#registry 20 | [3]: https://ksonnet.io/#get-started 21 | [4]: https://github.com/ksonnet/parts/blob/master/doc-gen/main.go 22 | -------------------------------------------------------------------------------- /testing/test-infra/.ksonnet/registries/incubator/ea3408d44c2d8ea4d321364e5533d5c60e74bce0.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: "0.1" 2 | gitVersion: 3 | commitSha: ea3408d44c2d8ea4d321364e5533d5c60e74bce0 4 | refSpec: master 5 | kind: ksonnet.io/registry 6 | libraries: 7 | apache: 8 | path: apache 9 | version: master 10 | efk: 11 | path: efk 12 | version: master 13 | mariadb: 14 | path: mariadb 15 | version: master 16 | memcached: 17 | path: memcached 18 | version: master 19 | mongodb: 20 | path: mongodb 21 | version: master 22 | mysql: 23 | path: mysql 24 | version: master 25 | nginx: 26 | path: nginx 27 | version: master 28 | node: 29 | path: node 30 | version: master 31 | postgres: 32 | path: postgres 33 | version: master 34 | redis: 35 | path: redis 36 | version: master 37 | tomcat: 38 | path: tomcat 39 | version: master 40 | -------------------------------------------------------------------------------- /components/k8s-model-server/inception-client/run.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # NOTE(review): the shebang below is ignored unless it is the first line of the file — confirm intended interpreter. 15 | #!/bin/bash 16 | 17 | SERVER=$1 18 | PORT=$2 19 | 20 | if [ -z "$SERVER" ] ; then 21 | SERVER=$INCEPTION_SERVICE_HOST 22 | fi 23 | 24 | if [ -z "$PORT" ] ; then 25 | PORT=$INCEPTION_SERVICE_PORT 26 | fi 27 | 28 | python label.py -s "$SERVER" -p "$PORT" /data/*.jpg 29 | -------------------------------------------------------------------------------- /components/jupyterhub/docker/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | VERSION=1.0 16 | PROJECT_ID=kubeflow 17 | PROJECT=gcr.io/${PROJECT_ID} 18 | 19 | all: build 20 | 21 | build: 22 | docker build --pull -t ${PROJECT}/model-server:${VERSION} . 23 | 24 | push: build 25 | gcloud docker -- push ${PROJECT}/model-server:${VERSION} 26 | 27 | .PHONY: all build push 28 | -------------------------------------------------------------------------------- /kubeflow/core/parts.yaml: -------------------------------------------------------------------------------- 1 | { 2 | "name": "core", 3 | "apiVersion": "0.0.1", 4 | "kind": "ksonnet.io/parts", 5 | "description": "Core components of Kubeflow.\n", 6 | "author": "kubeflow team ", 7 | "contributors": [ 8 | { 9 | "name": "Jeremy Lewi", 10 | "email": "jlewi@google.com" 11 | } 12 | ], 13 | "repository": { 14 | "type": "git", 15 | "url": "https://github.com/google/kubeflow" 16 | }, 17 | "bugs": { 18 | "url": "https://github.com/google/kubeflow/issues" 19 | }, 20 | "keywords": [ 21 | "kubeflow", 22 | "tensorflow" 23 | ], 24 | "quickStart": { 25 | "prototype": "io.ksonnet.pkg.kubeflow", 26 | "componentName": "core", 27 | "flags": { 28 | "name": "core", 29 | "namespace": "default", 30 | "disks": "" 31 | }, 32 | "comment": "Core Kubeflow components." 
33 | }, 34 | "license": "Apache 2.0" 35 | } 36 | -------------------------------------------------------------------------------- /kubeflow/tf-job/parts.yaml: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tf-job", 3 | "apiVersion": "0.0.1", 4 | "kind": "ksonnet.io/parts", 5 | "description": "Prototypes for running TensorFlow jobs.\n", 6 | "author": "kubeflow team ", 7 | "contributors": [ 8 | { 9 | "name": "Jeremy Lewi", 10 | "email": "jlewi@google.com" 11 | } 12 | ], 13 | "repository": { 14 | "type": "git", 15 | "url": "https://github.com/google/kubeflow" 16 | }, 17 | "bugs": { 18 | "url": "https://github.com/google/kubeflow/issues" 19 | }, 20 | "keywords": [ 21 | "kubeflow", 22 | "tensorflow", 23 | "database" 24 | ], 25 | "quickStart": { 26 | "prototype": "io.ksonnet.pkg.tf-job", 27 | "componentName": "tf-job", 28 | "flags": { 29 | "name": "tf-job", 30 | "namespace": "default" 31 | }, 32 | "comment": "Run TensorFlow Job" 33 | }, 34 | "license": "Apache 2.0" 35 | } 36 | -------------------------------------------------------------------------------- /components/k8s-model-server/inception-client/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | FROM python:2.7.14 16 | 17 | RUN pip install --no-cache-dir grpcio tensorflow tensorflow-serving-api 18 | 19 | RUN mkdir -p /opt/label /data 20 | 21 | WORKDIR /opt/label 22 | 23 | COPY label.py ./ 24 | COPY run.sh ./ 25 | 26 | ARG IMAGES_DIR=images/ 27 | 28 | ADD $IMAGES_DIR /data/ 29 | 30 | ENTRYPOINT ["bash", "run.sh"] 31 | CMD [] 32 | -------------------------------------------------------------------------------- /kubeflow/tf-serving/parts.yaml: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tf-serving", 3 | "apiVersion": "0.0.1", 4 | "kind": "ksonnet.io/parts", 5 | "description": "TensorFlow serving is a server for TensorFlow models.\n", 6 | "author": "kubeflow team ", 7 | "contributors": [ 8 | { 9 | "name": "Jeremy Lewi", 10 | "email": "jlewi@google.com" 11 | } 12 | ], 13 | "repository": { 14 | "type": "git", 15 | "url": "https://github.com/google/kubeflow" 16 | }, 17 | "bugs": { 18 | "url": "https://github.com/google/kubeflow/issues" 19 | }, 20 | "keywords": [ 21 | "kubeflow", 22 | "tensorflow", 23 | "database" 24 | ], 25 | "quickStart": { 26 | "prototype": "io.ksonnet.pkg.tf-serving", 27 | "componentName": "tf-serving", 28 | "flags": { 29 | "name": "tf-serving", 30 | "namespace": "default" 31 | }, 32 | "comment": "Run TensorFlow Serving" 33 | }, 34 | "license": "Apache 2.0" 35 | } 36 | -------------------------------------------------------------------------------- /testing/test-infra/components/params.libsonnet: -------------------------------------------------------------------------------- 1 | { 2 | global: { 3 | // User-defined global parameters; accessible to all component and environments, Ex: 4 | // replicas: 4, 5 | }, 6 | components: { 7 | // Component-level parameters, defined initially from 'ks prototype use ...' 
8 | // Each object below should correspond to a component in the components/ directory 9 | "argo": { 10 | namespace: "kubeflow-test-infra", 11 | }, 12 | "workflows": { 13 | bucket: "mlkube-testing_temp", 14 | name: "kubeflow-presubmit-81-2-39b6", 15 | namespace: "kubeflow-test-infra", 16 | prow_env: "BUILD_NUMBER=2,JOB_NAME=kubeflow-presubmit,JOB_TYPE=presubmit,PULL_NUMBER=81,REPO_NAME=kubeflow,REPO_OWNER=google", 17 | }, 18 | "nfs-jupyter": { 19 | cloud: "", 20 | disks: "kubeflow-testing", 21 | name: "nfs-jupyter", 22 | namespace: "kubeflow-test-infra", 23 | tfJobImage: "gcr.io/tf-on-k8s-dogfood/tf_operator:v20171214-0bd02ac", 24 | }, 25 | }, 26 | } 27 | -------------------------------------------------------------------------------- /kubeflow/tf-serving/prototypes/tf-serving-all-features.jsonnet: -------------------------------------------------------------------------------- 1 | // @apiVersion 0.1 2 | // @name io.ksonnet.pkg.tf-serving 3 | // @description TensorFlow serving 4 | // @shortDescription A TensorFlow serving deployment 5 | // @param name string Name to give to each of the components 6 | // @optionalParam namespace string default Namespace 7 | // @param model_path string Path to the model. This can be a GCS path. 8 | 9 | // TODO(https://github.com/ksonnet/ksonnet/issues/222): We have to add namespace as an explicit parameter 10 | // because ksonnet doesn't support inheriting it from the environment yet. 
11 | 12 | local k = import 'k.libsonnet'; 13 | local tfServing = import 'kubeflow/tf-serving/tf-serving.libsonnet'; 14 | 15 | local name = import 'param://name'; 16 | local namespace = import 'param://namespace'; 17 | local modelPath = import 'param://model_path'; 18 | 19 | std.prune(k.core.v1.list.new([ 20 | tfServing.parts.deployment.modelServer(name, namespace, modelPath), 21 | tfServing.parts.deployment.modelService(name, namespace), 22 | ])) 23 | -------------------------------------------------------------------------------- /testing/test-infra/debug_pod.yaml: -------------------------------------------------------------------------------- 1 | # This pod is useful for starting a shell that you can use to interactively debug our tests 2 | apiVersion: batch/v1 3 | kind: Job 4 | metadata: 5 | name: test-job 6 | namespace: kubeflow-test-infra 7 | spec: 8 | template: 9 | spec: 10 | containers: 11 | - name: test-container 12 | image: gcr.io/mlkube-testing/kubeflow-testing:latest 13 | command: ["tail", "-f", "/dev/null"] 14 | volumeMounts: 15 | - mountPath: /mnt/test-data-volume 16 | name: kubeflow-test-volume 17 | - mountPath: /secret/gcp-credentials 18 | name: gcp-credentials 19 | env: 20 | - name: GOOGLE_APPLICATION_CREDENTIALS 21 | value: /secret/gcp-credentials/key.json 22 | restartPolicy: Never 23 | volumes: 24 | - name: kubeflow-test-volume 25 | persistentVolumeClaim: 26 | claimName: kubeflow-testing 27 | - name: gcp-credentials 28 | secret: 29 | secretName: kubeflow-testing-credentials 30 | 31 | backoffLimit: 4 -------------------------------------------------------------------------------- /components/jupyterhub/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | FROM python:3.6 16 | 17 | RUN apt-get update && \ 18 | apt-get install -y npm nodejs-legacy 19 | 20 | RUN npm install -g configurable-http-proxy && \ 21 | pip3 install --no-cache-dir \ 22 | notebook \ 23 | jupyterhub==0.8.1 \ 24 | jupyterhub-kubespawner==0.7.1 \ 25 | jupyterhub-dummyauthenticator \ 26 | oauthenticator 27 | 28 | ENTRYPOINT jupyterhub 29 | -------------------------------------------------------------------------------- /tf-controller-examples/tf-cnn/Dockerfile.template: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Docker image for running examples in Tensorflow models. 
16 | # base_image depends on whether we are running on GPUs or non-GPUs 17 | FROM {{base_image}} 18 | 19 | RUN apt-get update && apt-get install -y --no-install-recommends \ 20 | ca-certificates \ 21 | build-essential \ 22 | git 23 | 24 | RUN mkdir -p /opt 25 | 26 | RUN git clone https://github.com/tensorflow/benchmarks.git /opt/tf-benchmarks 27 | 28 | COPY launcher.py /opt 29 | RUN chmod u+x /opt/* 30 | ENTRYPOINT ["/opt/launcher.py"] 31 | -------------------------------------------------------------------------------- /tf-controller-examples/tf-cnn/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Docker image for running examples in Tensorflow models. 
16 | # base_image depends on whether we are running on GPUs or non-GPUs 17 | FROM tensorflow/tensorflow@sha256:5edc0446cc989ad75bc30631f89f20694fe5bf5226f665d47e5c7f35a3b18484 18 | 19 | RUN apt-get update && apt-get install -y --no-install-recommends \ 20 | ca-certificates \ 21 | build-essential \ 22 | git 23 | 24 | RUN mkdir -p /opt 25 | 26 | RUN git clone https://github.com/tensorflow/benchmarks.git /opt/tf-benchmarks 27 | 28 | COPY launcher.py /opt 29 | RUN chmod u+x /opt/* 30 | ENTRYPOINT ["/opt/launcher.py"] 31 | -------------------------------------------------------------------------------- /tf-controller-examples/tf-cnn/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Docker image for running examples in Tensorflow models. 
16 | # base_image depends on whether we are running on GPUs or non-GPUs 17 | FROM tensorflow/tensorflow@sha256:bfadad8f2c80424d8d6059d3b8cd6947bf23111dc786fc33db72b56b632a1f28 18 | 19 | RUN apt-get update && apt-get install -y --no-install-recommends \ 20 | ca-certificates \ 21 | build-essential \ 22 | git 23 | 24 | RUN mkdir -p /opt 25 | 26 | RUN git clone https://github.com/tensorflow/benchmarks.git /opt/tf-benchmarks 27 | 28 | COPY launcher.py /opt 29 | RUN chmod u+x /opt/* 30 | ENTRYPOINT ["/opt/launcher.py"] 31 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution, 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to to see 12 | your current agreements on file or to sign a new one. 13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Code reviews 19 | 20 | All submissions, including submissions by project members, require review. We 21 | use GitHub pull requests for this purpose. Consult 22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 23 | information on using pull requests. 
24 | 25 | ## Get involved 26 | 27 | * [Slack](http://kubeflow.slack.com/) 28 | * [Twitter](http://twitter.com/kubeflow) 29 | * [Mailing List](https://groups.google.com/forum/#!forum/kubeflow-discuss) 30 | -------------------------------------------------------------------------------- /testing/bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # This script is used to bootstrap our prow jobs. 4 | # The point of this script is to check out the google/kubeflow repo 5 | # at the commit corresponding to the Prow job. We can then 6 | # invoke the launcher script at that commit to submit and 7 | # monitor an Argo workflow 8 | set -xe 9 | 10 | mkdir -p /src 11 | git clone https://github.com/google/kubeflow.git /src/google_kubeflow 12 | 13 | cd /src/google_kubeflow 14 | 15 | echo Job Name = ${JOB_NAME} 16 | 17 | # See https://github.com/kubernetes/test-infra/tree/master/prow#job-evironment-variables 18 | if [ ! -z ${PULL_NUMBER} ]; then 19 | git fetch origin pull/${PULL_NUMBER}/head:pr 20 | git checkout ${PULL_PULL_SHA} 21 | else 22 | if [ ! -z ${PULL_BASE_SHA} ]; then 23 | # Its a post submit; checkout the commit to test. 24 | git checkout ${PULL_BASE_SHA} 25 | fi 26 | fi 27 | 28 | # Update submodules. 29 | git submodule init 30 | git submodule update 31 | 32 | # Print out the commit so we can tell from logs what we checked out. 
33 | echo Repo is at `git describe --tags --always --dirty` 34 | git submodule 35 | git status 36 | 37 | export PYTHONPATH=$PYTHONPATH:/src/google_kubeflow/tensorflow_k8s 38 | cd /src/google_kubeflow 39 | # Invoke the script to run the workflow 40 | python -m testing.run_e2e_workflow \ 41 | --project=mlkube-testing \ 42 | --zone=us-east1-d \ 43 | --cluster=kubeflow-testing \ 44 | --bucket=kubernetes-jenkins -------------------------------------------------------------------------------- /testing/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Requirements: 16 | # https://github.com/mattrobenolt/jinja2-cli 17 | # pip install jinja2-cli 18 | IMG = gcr.io/mlkube-testing/kubeflow-testing 19 | TAG := $(shell date +v%Y%m%d)-$(shell git describe --tags --always --dirty)-$(shell git diff | sha256sum | cut -c -6) 20 | DIR := ${CURDIR} 21 | 22 | all: build 23 | 24 | # To build without the cache set the environment variable 25 | # export DOCKER_BUILD_OPTS=--no-cache 26 | build: 27 | @echo {\"image\": \"$(IMG):$(TAG)\"} > version.json 28 | docker build ${DOCKER_BUILD_OPTS} -t $(IMG):$(TAG) . 
29 | docker tag $(IMG):$(TAG) $(IMG):latest 30 | @echo Built $(IMG):$(TAG) and tagged with latest 31 | 32 | push: build 33 | gcloud docker -- push $(IMG):$(TAG) 34 | gcloud docker -- push $(IMG):latest 35 | @echo Pushed $(IMG) with :latest and :$(TAG) tags 36 | -------------------------------------------------------------------------------- /kubeflow/tf-job/tf-job.libsonnet: -------------------------------------------------------------------------------- 1 | local k = import 'k.libsonnet'; 2 | 3 | { 4 | parts:: { 5 | tfJobReplica(replicaType, number, args, image, numGpus=0):: 6 | local baseContainer = { 7 | "image": image, 8 | "name": "tensorflow", 9 | }; 10 | local containerArgs = if std.length(args) > 0 then 11 | { 12 | args: args, 13 | } 14 | else {}; 15 | local resources = if numGpus > 0 then { 16 | resources: { 17 | limits: { 18 | "nvidia.com/gpu": numGpus, 19 | } 20 | } 21 | } else {}; 22 | if number > 0 then 23 | { 24 | "replicas": number, 25 | "template": { 26 | "spec": { 27 | "containers": [ 28 | baseContainer + containerArgs + resources, 29 | ], 30 | "restartPolicy": "OnFailure" 31 | } 32 | }, 33 | "tfReplicaType": replicaType, 34 | } 35 | else {}, 36 | 37 | tfJob(name, namespace, replicas):: { 38 | "apiVersion": "tensorflow.org/v1alpha1", 39 | "kind": "TfJob", 40 | "metadata": { 41 | "name": name, 42 | "namespace": namespace, 43 | }, 44 | "spec": { 45 | "replicaSpecs": replicas, 46 | } 47 | }, 48 | }, 49 | } 50 | -------------------------------------------------------------------------------- /components/tf-controller/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Create a yaml template to deploy the CRD 16 | # Requires the template plugin 17 | # https://github.com/technosophos/helm-template 18 | CHART := https://storage.googleapis.com/tf-on-k8s-dogfood-releases/latest/tf-job-operator-chart-latest.tgz 19 | deploy_config: 20 | rm -rf /tmp/tfjob_config_builder 21 | mkdir -p /tmp/tfjob_config_builder 22 | wget -O /tmp/tfjob_config_builder/tf-job-operator-chart-latest.tgz https://storage.googleapis.com/tf-on-k8s-dogfood-releases/latest/tf-job-operator-chart-latest.tgz 23 | tar -C /tmp/tfjob_config_builder -xvf /tmp/tfjob_config_builder/tf-job-operator-chart-latest.tgz 24 | # We set the templates to render because we don't want to render the tests. 25 | helm template /tmp/tfjob_config_builder/tf-job-operator-chart --set cloud=gke,rbac.install=true \ 26 | -x ./templates/config.yaml -x ./templates/deployment.yaml -x ./templates/rbac.yaml -x ./templates/service-account.yaml > deploy_crd.yaml 27 | -------------------------------------------------------------------------------- /components/k8s-model-server/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | FROM ubuntu:16.04 16 | 17 | MAINTAINER Kenneth Owens 18 | 19 | ENV MS_USER=model-server 20 | 21 | RUN apt-get update && apt-get install -y \ 22 | build-essential \ 23 | curl \ 24 | libcurl3-dev \ 25 | git \ 26 | libfreetype6-dev \ 27 | libpng12-dev \ 28 | libzmq3-dev \ 29 | pkg-config \ 30 | python-dev \ 31 | python-numpy \ 32 | python-pip \ 33 | software-properties-common \ 34 | swig \ 35 | zip \ 36 | zlib1g-dev && \ 37 | apt-get clean && \ 38 | rm -rf /var/lib/apt/lists/* 39 | 40 | RUN echo "deb [arch=amd64] http://storage.googleapis.com/tensorflow-serving-apt stable \ 41 | tensorflow-model-server \ 42 | tensorflow-model-server-universal" \ 43 | | tee /etc/apt/sources.list.d/tensorflow-serving.list && \ 44 | curl https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg \ 45 | | apt-key add - 46 | 47 | RUN apt-get update && apt-get install -y \ 48 | tensorflow-model-server && \ 49 | apt-get clean && \ 50 | rm -rf /var/lib/apt/lists/* 51 | 52 | RUN set -x \ 53 | && useradd $MS_USER \ 54 | && [ `id -u $MS_USER` -eq 1000 ] \ 55 | && [ `id -g $MS_USER` -eq 1000 ] 56 | 57 | CMD ["/bin/bash"] 58 | -------------------------------------------------------------------------------- /testing/argo_client.py: -------------------------------------------------------------------------------- 1 | """Some utility functions for working with TfJobs.""" 2 | 3 | import datetime 4 | import json 5 | import logging 6 | import time 7 | 8 | from kubernetes import client as k8s_client 9 | from 
kubernetes.client.rest import ApiException 10 | 11 | from py import util 12 | 13 | GROUP = "argoproj.io" 14 | VERSION = "v1alpha1" 15 | PLURAL = "workflows" 16 | KIND = "Workflow" 17 | 18 | def log_status(workflow): 19 | """A callback to use with wait_for_workflow.""" 20 | logging.info("Workflow %s in namespace %s; phase=%s", 21 | workflow["metadata"]["name"], 22 | workflow["metadata"]["namespace"], 23 | workflow["status"]["phase"]) 24 | 25 | def wait_for_workflow(client, namespace, name, 26 | timeout=datetime.timedelta(minutes=5), 27 | polling_interval=datetime.timedelta(seconds=30), 28 | status_callback=None): 29 | """Wait for the specified workflow to finish. 30 | 31 | Args: 32 | client: K8s api client. 33 | namespace: namespace for the workflow. 34 | name: Name of the workflow. 35 | timeout: How long to wait for the workflow. 36 | polling_interval: How often to poll for the status of the workflow. 37 | status_callback: (Optional): Callable. If supplied this callable is 38 | invoked after we poll the job. Callable takes a single argument which 39 | is the job. 40 | 41 | Raises: 42 | TimeoutError: If timeout waiting for the job to finish. 
43 | """ 44 | crd_api = k8s_client.CustomObjectsApi(client) 45 | end_time = datetime.datetime.now() + timeout 46 | while True: 47 | results = crd_api.get_namespaced_custom_object( 48 | GROUP, VERSION, namespace, PLURAL, name) 49 | 50 | if status_callback: 51 | status_callback(results) 52 | 53 | if results["status"]["phase"] in ["Failed", "Succeeded"]: 54 | return results 55 | 56 | if datetime.datetime.now() + polling_interval > end_time: 57 | raise util.TimeoutError( 58 | "Timeout waiting for workflow {0} in namespace {1} to finish.".format( 59 | name, namespace)) 60 | 61 | time.sleep(polling_interval.seconds) 62 | -------------------------------------------------------------------------------- /kubeflow/tf-job/prototypes/tf-job.jsonnet: -------------------------------------------------------------------------------- 1 | // @apiVersion 0.1 2 | // @name io.ksonnet.pkg.tf-job 3 | // @description A TensorFlow job (could be training or evaluation). 4 | // @shortDescription A TensorFlow jjob. 5 | // @param name string Name to give to each of the components 6 | // @optionalParam namespace string default Namespace 7 | // @optionalParam args string null Comma separated list of arguments to pass to the job 8 | // @optionalParam image string null The docker image to use for the job. 9 | // @optionalParam image_gpu string null The docker image to use when using GPUs. 10 | // @optionalParam num_masters number 1 The number of masters to use 11 | // @optionalParam num_ps number 0 The number of ps to use 12 | // @optionalParam num_workers number 0 The number of workers to use 13 | // @optionalParam num_gpus number 0 The number of GPUs to attach to workers. 14 | 15 | // TODO(https://github.com/ksonnet/ksonnet/issues/235): ks param set args won't work if the arg starts with "--". 16 | 17 | // TODO(https://github.com/ksonnet/ksonnet/issues/222): We have to add namespace as an explicit parameter 18 | // because ksonnet doesn't support inheriting it from the environment yet. 
19 | 20 | local k = import 'k.libsonnet'; 21 | local tfJob = import 'kubeflow/tf-job/tf-job.libsonnet'; 22 | 23 | local name = import 'param://name'; 24 | local namespace = import 'param://namespace'; 25 | 26 | local argsParam = import 'param://args'; 27 | local args = 28 | if argsParam == "null" then 29 | [] 30 | else 31 | std.split(argsParam, ','); 32 | 33 | local image = import 'param://image'; 34 | local imageGpu = import 'param://image_gpu'; 35 | local numMasters = import 'param://num_masters'; 36 | local numPs = import 'param://num_ps'; 37 | local numWorkers = import 'param://num_workers'; 38 | local numGpus = import 'param://num_gpus'; 39 | 40 | local workerSpec = if numGpus > 0 then 41 | tfJob.parts.tfJobReplica("WORKER", numWorkers, args, imageGpu, numGpus) 42 | else 43 | tfJob.parts.tfJobReplica("WORKER", numWorkers, args, image); 44 | 45 | std.prune(k.core.v1.list.new([ 46 | tfJob.parts.tfJob(name, namespace, [ 47 | tfJob.parts.tfJobReplica("MASTER", numMasters, args, image), 48 | workerSpec, 49 | tfJob.parts.tfJobReplica("PS", numPs, args, image) 50 | ]), 51 | ])) 52 | -------------------------------------------------------------------------------- /tf-controller-examples/tf-cnn/tf_job_gpu.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: tensorflow.org/v1alpha1 16 | kind: TfJob 17 | metadata: 18 | name: inception-171202-163257-gpu-1 19 | namespace: default 20 | spec: 21 | replicaSpecs: 22 | - replicas: 1 23 | template: 24 | spec: 25 | containers: 26 | - args: 27 | - python 28 | - tf_cnn_benchmarks.py 29 | - --batch_size=32 30 | - --model=resnet50 31 | - --variable_update=parameter_server 32 | - --flush_stdout=true 33 | - --num_gpus=1 34 | image: gcr.io/kubeflow/tf-benchmarks-gpu:v20171202-bdab599-dirty-284af3 35 | name: tensorflow 36 | resources: 37 | limits: 38 | nvidia.com/gpu: 1 39 | workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks 40 | restartPolicy: OnFailure 41 | tfReplicaType: WORKER 42 | - replicas: 1 43 | template: 44 | spec: 45 | containers: 46 | - args: 47 | - python 48 | - tf_cnn_benchmarks.py 49 | - --batch_size=32 50 | - --model=resnet50 51 | - --variable_update=parameter_server 52 | - --flush_stdout=true 53 | - --num_gpus=1 54 | image: gcr.io/kubeflow/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3 55 | name: tensorflow 56 | workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks 57 | restartPolicy: OnFailure 58 | tfReplicaType: PS 59 | terminationPolicy: 60 | chief: 61 | replicaName: WORKER 62 | replicaIndex: 0 63 | tfImage: gcr.io/kubeflow/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3 64 | -------------------------------------------------------------------------------- /tf-controller-examples/tf-cnn/tf_job_gpu_distributed.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: tensorflow.org/v1alpha1 16 | kind: TfJob 17 | metadata: 18 | name: inception-171202-163257-gpu-3 19 | namespace: default 20 | spec: 21 | replicaSpecs: 22 | - replicas: 3 23 | template: 24 | spec: 25 | containers: 26 | - args: 27 | - python 28 | - tf_cnn_benchmarks.py 29 | - --batch_size=32 30 | - --model=resnet50 31 | - --variable_update=parameter_server 32 | - --flush_stdout=true 33 | - --num_gpus=1 34 | image: gcr.io/kubeflow/tf-benchmarks-gpu:v20171202-bdab599-dirty-284af3 35 | name: tensorflow 36 | resources: 37 | limits: 38 | nvidia.com/gpu: 1 39 | workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks 40 | restartPolicy: OnFailure 41 | tfReplicaType: WORKER 42 | - replicas: 1 43 | template: 44 | spec: 45 | containers: 46 | - args: 47 | - python 48 | - tf_cnn_benchmarks.py 49 | - --batch_size=32 50 | - --model=resnet50 51 | - --variable_update=parameter_server 52 | - --flush_stdout=true 53 | - --num_gpus=1 54 | image: gcr.io/kubeflow/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3 55 | name: tensorflow 56 | workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks 57 | restartPolicy: OnFailure 58 | tfReplicaType: PS 59 | terminationPolicy: 60 | chief: 61 | replicaName: WORKER 62 | replicaIndex: 0 63 | tfImage: gcr.io/kubeflow/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3 64 | -------------------------------------------------------------------------------- /kubeflow/core/README.md: -------------------------------------------------------------------------------- 1 | # core 2 | 3 | > Core 
components of Kubeflow. 4 | 5 | 6 | * [Quickstart](#quickstart) 7 | * [Using Prototypes](#using-prototypes) 8 | * [io.ksonnet.pkg.kubeflow-core](#io.ksonnet.pkg.kubeflow-core) 9 | 10 | ## Quickstart 11 | 12 | *The following commands use the `io.ksonnet.pkg.kubeflow` prototype to generate Kubernetes YAML for core, and then deploys it to your Kubernetes cluster.* 13 | 14 | First, create a cluster and install the ksonnet CLI (see root-level [README.md](rootReadme)). 15 | 16 | If you haven't yet created a [ksonnet application](linkToSomewhere), do so using `ks init `. 17 | 18 | Finally, in the ksonnet application directory, run the following: 19 | 20 | ```shell 21 | # Expand prototype as a Jsonnet file, place in a file in the 22 | # `components/` directory. (YAML and JSON are also available.) 23 | $ ks prototype use io.ksonnet.pkg.kubeflow-core \ 24 | --name core \ 25 | --namespace default \ 26 | --disks 27 | 28 | # Apply to server. 29 | $ ks apply -f core.jsonnet 30 | ``` 31 | 32 | ## Using the library 33 | 34 | The library files for core define a set of relevant *parts* (_e.g._, deployments, services, secrets, and so on) that can be combined to configure core for a wide variety of scenarios. For example, a database like Redis may need a secret to hold the user password, or it may have no password if it's acting as a cache. 35 | 36 | This library provides a set of pre-fabricated "flavors" (or "distributions") of core, each of which is configured for a different use case. These are captured as ksonnet *prototypes*, which allow users to interactively customize these distributions for their specific needs. 37 | 38 | These prototypes, as well as how to use them, are enumerated below. 39 | 40 | ### io.ksonnet.pkg.kubeflow-core 41 | 42 | Kubeflow core components 43 | #### Example 44 | 45 | ```shell 46 | # Expand prototype as a Jsonnet file, place in a file in the 47 | # `components/` directory. (YAML and JSON are also available.) 
48 | $ ks prototype use io.ksonnet.pkg.kubeflow-core core \ 49 | --name YOUR_NAME_HERE 50 | ``` 51 | 52 | #### Parameters 53 | 54 | The available options to pass prototype are: 55 | 56 | * `--name=`: Name to give to each of the components [string] 57 | 58 | 59 | [rootReadme]: https://github.com/ksonnet/mixins 60 | -------------------------------------------------------------------------------- /tf-controller-examples/tf-cnn/tf_job_cpu.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: tensorflow.org/v1alpha1 16 | kind: TfJob 17 | metadata: 18 | name: inception-171202-163257-cpu-1 19 | namespace: default 20 | spec: 21 | replicaSpecs: 22 | - replicas: 1 23 | template: 24 | spec: 25 | containers: 26 | - args: 27 | - python 28 | - tf_cnn_benchmarks.py 29 | - --batch_size=32 30 | - --model=resnet50 31 | - --variable_update=parameter_server 32 | - --flush_stdout=true 33 | - --num_gpus=1 34 | - --local_parameter_device=cpu 35 | - --device=cpu 36 | - --data_format=NHWC 37 | image: gcr.io/kubeflow/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3 38 | name: tensorflow 39 | workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks 40 | restartPolicy: OnFailure 41 | tfReplicaType: WORKER 42 | - replicas: 1 43 | template: 44 | spec: 45 | containers: 46 | - args: 47 | - python 48 | - tf_cnn_benchmarks.py 49 | - --batch_size=32 50 | - --model=resnet50 51 | - --variable_update=parameter_server 52 | - --flush_stdout=true 53 | - --num_gpus=1 54 | - --local_parameter_device=cpu 55 | - --device=cpu 56 | - --data_format=NHWC 57 | image: gcr.io/kubeflow/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3 58 | name: tensorflow 59 | workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks 60 | restartPolicy: OnFailure 61 | tfReplicaType: PS 62 | terminationPolicy: 63 | chief: 64 | replicaName: WORKER 65 | replicaIndex: 0 66 | tfImage: gcr.io/kubeflow/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3 67 | -------------------------------------------------------------------------------- /tf-controller-examples/tf-cnn/tf_job_cpu_distributed.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: tensorflow.org/v1alpha1 16 | kind: TfJob 17 | metadata: 18 | name: inception-171202-163257-cpu-3 19 | namespace: default 20 | spec: 21 | replicaSpecs: 22 | - replicas: 3 23 | template: 24 | spec: 25 | containers: 26 | - args: 27 | - python 28 | - tf_cnn_benchmarks.py 29 | - --batch_size=32 30 | - --model=resnet50 31 | - --variable_update=parameter_server 32 | - --flush_stdout=true 33 | - --num_gpus=1 34 | - --local_parameter_device=cpu 35 | - --device=cpu 36 | - --data_format=NHWC 37 | image: gcr.io/kubeflow/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3 38 | name: tensorflow 39 | workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks 40 | restartPolicy: OnFailure 41 | tfReplicaType: WORKER 42 | - replicas: 1 43 | template: 44 | spec: 45 | containers: 46 | - args: 47 | - python 48 | - tf_cnn_benchmarks.py 49 | - --batch_size=32 50 | - --model=resnet50 51 | - --variable_update=parameter_server 52 | - --flush_stdout=true 53 | - --num_gpus=1 54 | - --local_parameter_device=cpu 55 | - --device=cpu 56 | - --data_format=NHWC 57 | image: gcr.io/kubeflow/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3 58 | name: tensorflow 59 | workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks 60 | restartPolicy: OnFailure 61 | tfReplicaType: PS 62 | terminationPolicy: 63 | chief: 64 | replicaName: WORKER 65 | replicaIndex: 0 66 | tfImage: gcr.io/kubeflow/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3 67 | -------------------------------------------------------------------------------- 
/testing/test-infra/environments/prow/.metadata/k.libsonnet: -------------------------------------------------------------------------------- 1 | local k8s = import "k8s.libsonnet"; 2 | 3 | local apps = k8s.apps; 4 | local core = k8s.core; 5 | local extensions = k8s.extensions; 6 | 7 | local hidden = { 8 | mapContainers(f):: { 9 | local podContainers = super.spec.template.spec.containers, 10 | spec+: { 11 | template+: { 12 | spec+: { 13 | // IMPORTANT: This overwrites the 'containers' field 14 | // for this deployment. 15 | containers: std.map(f, podContainers), 16 | }, 17 | }, 18 | }, 19 | }, 20 | 21 | mapContainersWithName(names, f) :: 22 | local nameSet = 23 | if std.type(names) == "array" 24 | then std.set(names) 25 | else std.set([names]); 26 | local inNameSet(name) = std.length(std.setInter(nameSet, std.set([name]))) > 0; 27 | self.mapContainers( 28 | function(c) 29 | if std.objectHas(c, "name") && inNameSet(c.name) 30 | then f(c) 31 | else c 32 | ), 33 | }; 34 | 35 | k8s + { 36 | apps:: apps + { 37 | v1beta1:: apps.v1beta1 + { 38 | local v1beta1 = apps.v1beta1, 39 | 40 | daemonSet:: v1beta1.daemonSet + { 41 | mapContainers(f):: hidden.mapContainers(f), 42 | mapContainersWithName(names, f):: hidden.mapContainersWithName(names, f), 43 | }, 44 | 45 | deployment:: v1beta1.deployment + { 46 | mapContainers(f):: hidden.mapContainers(f), 47 | mapContainersWithName(names, f):: hidden.mapContainersWithName(names, f), 48 | }, 49 | }, 50 | }, 51 | 52 | core:: core + { 53 | v1:: core.v1 + { 54 | list:: { 55 | new(items):: 56 | {apiVersion: "v1"} + 57 | {kind: "List"} + 58 | self.items(items), 59 | 60 | items(items):: if std.type(items) == "array" then {items+: items} else {items+: [items]}, 61 | }, 62 | }, 63 | }, 64 | 65 | extensions:: extensions + { 66 | v1beta1:: extensions.v1beta1 + { 67 | local v1beta1 = extensions.v1beta1, 68 | 69 | daemonSet:: v1beta1.daemonSet + { 70 | mapContainers(f):: hidden.mapContainers(f), 71 | mapContainersWithName(names, f):: 
hidden.mapContainersWithName(names, f), 72 | }, 73 | 74 | deployment:: v1beta1.deployment + { 75 | mapContainers(f):: hidden.mapContainers(f), 76 | mapContainersWithName(names, f):: hidden.mapContainersWithName(names, f), 77 | }, 78 | }, 79 | }, 80 | } 81 | -------------------------------------------------------------------------------- /kubeflow/tf-serving/README.md: -------------------------------------------------------------------------------- 1 | # tf-serving 2 | 3 | > TensorFlow serving is a server for TensorFlow models. 4 | 5 | 6 | * [Quickstart](#quickstart) 7 | * [Using Prototypes](#using-prototypes) 8 | * [io.ksonnet.pkg.tf-serving](#io.ksonnet.pkg.tf-serving) 9 | 10 | ## Quickstart 11 | 12 | *The following commands use the `io.ksonnet.pkg.tf-serving` prototype to generate Kubernetes YAML for tf-serving, and then deploys it to your Kubernetes cluster.* 13 | 14 | First, create a cluster and install the ksonnet CLI (see root-level [README.md](rootReadme)). 15 | 16 | If you haven't yet created a [ksonnet application](linkToSomewhere), do so using `ks init `. 17 | 18 | Finally, in the ksonnet application directory, run the following: 19 | 20 | ```shell 21 | # Expand prototype as a Jsonnet file, place in a file in the 22 | # `components/` directory. (YAML and JSON are also available.) 23 | $ ks prototype use io.ksonnet.pkg.tf-serving tf-serving \ 24 | --name tf-serving \ 25 | --namespace default 26 | 27 | # Apply to server. 28 | $ ks apply -f tf-serving.jsonnet 29 | ``` 30 | 31 | ## Using the library 32 | 33 | The library files for tf-serving define a set of relevant *parts* (_e.g._, deployments, services, secrets, and so on) that can be combined to configure tf-serving for a wide variety of scenarios. For example, a database like Redis may need a secret to hold the user password, or it may have no password if it's acting as a cache. 
34 | 35 | This library provides a set of pre-fabricated "flavors" (or "distributions") of tf-serving, each of which is configured for a different use case. These are captured as ksonnet *prototypes*, which allow users to interactively customize these distributions for their specific needs. 36 | 37 | These prototypes, as well as how to use them, are enumerated below. 38 | 39 | ### io.ksonnet.pkg.tf-serving 40 | 41 | TensorFlow serving 42 | #### Example 43 | 44 | ```shell 45 | # Expand prototype as a Jsonnet file, place in a file in the 46 | # `components/` directory. (YAML and JSON are also available.) 47 | $ ks prototype use io.ksonnet.pkg.tf-serving tf-serving \ 48 | --name YOUR_NAME_HERE \ 49 | --model_path YOUR_MODEL_PATH_HERE 50 | ``` 51 | 52 | #### Parameters 53 | 54 | The available options to pass prototype are: 55 | 56 | * `--name=`: Name to give to each of the components [string] 57 | * `--model_path=`: Path to the model. This can be a GCS path. [string] 58 | 59 | 60 | [rootReadme]: https://github.com/ksonnet/mixins 61 | -------------------------------------------------------------------------------- /testing/checkout.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # This script is used as the first step in our Argo workflows to check out the code 4 | # corresponding the prow job. 5 | # 6 | # TODO(jlewi): Eliminate code duplication with bootstraph.sh my moving shared code into 7 | # a bash script that can be sourced from multiple scripts. 8 | #!/bin/bash 9 | set -xe 10 | SRC_DIR=$1 11 | 12 | # Print out env for debugging. 13 | env | sort 14 | 15 | git clone https://github.com/${REPO_OWNER}/${REPO_NAME}.git /tmp/src 16 | 17 | # Some git operations are really slow when using NFS. 18 | # We observed clone times increasing from O(30) seconds to O(4 minutes) 19 | # when we switched to NFS. 20 | # As a workaround we clone into a local directory and then move the files onto 21 | # NFS. 
Copying to NFS is still a bottleneck and increases the run time to O(1.5 minutes).
# clone --recurse-submodules https://github.com/google/kubeflow.git /tmp/src",
cd /tmp/src

# We need to set the preloadindex option; to try to speedup git ops like describe
# and status when using an NFS filesystem.
# See: https://stackoverflow.com/questions/4994772/ways-to-improve-git-status-performance
# unfortunately this doesn't seem to help with sub modules.
git config core.preloadindex true

# See https://github.com/kubernetes/test-infra/tree/master/prow#job-evironment-variables
if [ ! -z ${PULL_NUMBER} ]; then
  git fetch origin  pull/${PULL_NUMBER}/head:pr
  if [ ! -z ${PULL_PULL_SHA} ]; then
    git checkout ${PULL_PULL_SHA}
  else
    # Checkout the latest commit for this PR since no commit specified.
    git checkout pr
  fi
else
  if [ ! -z ${PULL_BASE_SHA} ]; then
   # Its a post submit; checkout the commit to test.
   git checkout ${PULL_BASE_SHA}
  fi
fi

# Update submodules.
git submodule init
git submodule update

# TODO(jlewi): As noted above the git operations below are really
# slow when using NFS.
# Print out the git version in the logs
git describe --tags --always --dirty
git status

# Move it to NFS
# NOTE: the original had "mkdir -p + ${SRC_DIR}", which created a stray
# directory literally named "+"; the "+" was a typo.
mkdir -p ${SRC_DIR}

# The period is needed because we want to copy the contents of the src directory
# into srcDir not srcDir/src/.
cp -r /tmp/src/. ${SRC_DIR}

# Make the files world readable/writable.
# This is a hack to make it easy to modify the files from jupyterhub which is using
# a different user/group id.
67 | chmod -R a+rwx ${SRC_DIR} -------------------------------------------------------------------------------- /testing/test-infra/components/nfs-jupyter.jsonnet: -------------------------------------------------------------------------------- 1 | local params = std.extVar("__ksonnet/params").components["nfs-jupyter"]; 2 | // TODO(https://github.com/ksonnet/ksonnet/issues/222): We have to add namespace as an explicit parameter 3 | // because ksonnet doesn't support inheriting it from the environment yet. 4 | 5 | local k = import 'k.libsonnet'; 6 | local jupyter = import "kubeflow/core/jupyterhub.libsonnet"; 7 | local tfjob = import "kubeflow/core/tf-job.libsonnet"; 8 | local nfs = import "kubeflow/core/nfs.libsonnet"; 9 | 10 | local name = params.name; 11 | local namespace = params.namespace; 12 | 13 | // TODO(jlewi): Make this a parameter 14 | local jupyterHubImage = 'gcr.io/kubeflow/jupyterhub:1.0'; 15 | local diskParam = params.disks; 16 | 17 | local diskNames = if diskParam != "null" && std.length(diskParam) > 0 then 18 | std.split(diskParam, ',') 19 | else []; 20 | 21 | local jupyterConfigMap = if std.length(diskNames) == 0 then 22 | jupyter.parts(namespace).jupyterHubConfigMap 23 | else jupyter.parts(namespace).jupyterHubConfigMapWithVolumes(diskNames); 24 | 25 | local tfJobImage = params.tfJobImage; 26 | 27 | // Create a list of the resources needed for a particular disk 28 | local diskToList = function(diskName) [ 29 | nfs.parts(namespace, name,).diskResources(diskName).storageClass, 30 | nfs.parts(namespace, name,).diskResources(diskName).volumeClaim, 31 | nfs.parts(namespace, name,).diskResources(diskName).service, 32 | nfs.parts(namespace, name,).diskResources(diskName).provisioner]; 33 | 34 | local allDisks = std.flattenArrays(std.map(diskToList, diskNames)); 35 | 36 | local nfsComponents = 37 | if std.length(allDisks) > 0 then 38 | [nfs.parts(namespace, name).serviceAccount, 39 | nfs.parts(namespace, name).role, 40 | nfs.parts(namespace, 
name).roleBinding, 41 | nfs.parts(namespace, name).clusterRoleBinding,] + allDisks 42 | else 43 | []; 44 | 45 | // TODO(jlewi): Maybe we should split this into separate components 46 | // for Jupyter and NFS. We always need NFS because its used by our 47 | // Argo workflows. But Jupyter could be optional. 48 | std.prune(k.core.v1.list.new([ 49 | // jupyterHub components 50 | jupyterConfigMap, 51 | jupyter.parts(namespace).jupyterHubService, 52 | jupyter.parts(namespace).jupyterHubLoadBalancer, 53 | jupyter.parts(namespace).jupyterHub(jupyterHubImage), 54 | jupyter.parts(namespace).jupyterHubRole, 55 | jupyter.parts(namespace).jupyterHubServiceAccount, 56 | jupyter.parts(namespace).jupyterHubRoleBinding, 57 | ] + nfsComponents)) 58 | 59 | -------------------------------------------------------------------------------- /tf-controller-examples/tf-cnn/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | REGISTRY := gcr.io/kubeflow 16 | TAG := $(shell date +v%Y%m%d)-$(shell git describe --tags --always --dirty)-$(shell git diff | sha256sum | cut -c -6) 17 | DIR := ${CURDIR} 18 | 19 | # 1.4 isn't new enough for the tf-benchmarks code 20 | # so we pin to a particular nightly build image. 
21 | # CPU_BASE = tensorflow/tensorflow:nightly 22 | CPU_BASE = tensorflow/tensorflow@sha256:5edc0446cc989ad75bc30631f89f20694fe5bf5226f665d47e5c7f35a3b18484 23 | # GPU_BASE = tensorflow/tensorflow:nightly-gpu 24 | GPU_BASE = tensorflow/tensorflow@sha256:bfadad8f2c80424d8d6059d3b8cd6947bf23111dc786fc33db72b56b632a1f28 25 | 26 | BENCH_MARKS_IMAGE := $(REGISTRY)/tf-benchmarks 27 | 28 | # The published versions of the example code to use. 29 | PUBLISHED_CPU := gcr.io/kubeflow/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3 30 | PUBLISHED_GPU := gcr.io/kubeflow/tf-benchmarks-gpu:v20171202-bdab599-dirty-284af3 31 | 32 | # Build the cpu image 33 | build-cpu: 34 | jinja2 Dockerfile.template --format=yaml -D base_image=$(CPU_BASE) > Dockerfile.cpu 35 | docker build -t $(BENCH_MARKS_IMAGE)-cpu:$(TAG) -f Dockerfile.cpu ./ 36 | gcloud docker -- push $(BENCH_MARKS_IMAGE)-cpu:$(TAG) 37 | 38 | build-gpu: 39 | jinja2 Dockerfile.template --format=yaml -D base_image=$(GPU_BASE) > Dockerfile.gpu 40 | docker build -t $(BENCH_MARKS_IMAGE)-gpu:$(TAG) -f Dockerfile.gpu ./ 41 | gcloud docker -- push $(BENCH_MARKS_IMAGE)-gpu:$(TAG) 42 | 43 | build-images: build-cpu build-gpu 44 | 45 | # Create the templates 46 | build-templates: 47 | python create_job_specs.py --cpu_image=$(PUBLISHED_CPU) --gpu_image=$(PUBLISHED_GPU) \ 48 | --gpu --num_workers=1 --output=tf_job_gpu.yaml 49 | python create_job_specs.py --cpu_image=$(PUBLISHED_CPU) --gpu_image=$(PUBLISHED_GPU) \ 50 | --gpu --num_workers=3 --output=tf_job_gpu_distributed.yaml 51 | python create_job_specs.py --cpu_image=$(PUBLISHED_CPU) --gpu_image=$(PUBLISHED_GPU) \ 52 | --no-gpu --num_workers=1 --output=tf_job_cpu.yaml 53 | python create_job_specs.py --cpu_image=$(PUBLISHED_CPU) --gpu_image=$(PUBLISHED_GPU) \ 54 | --no-gpu --num_workers=3 --output=tf_job_cpu_distributed.yaml 55 | -------------------------------------------------------------------------------- /testing/run_e2e_workflow_test.py: 
import json
import os
import unittest
import mock
from testing import run_e2e_workflow
import tempfile

from google.cloud import storage  # pylint: disable=no-name-in-module

# NOTE(review): json, tempfile and storage appear unused in this module —
# candidates for cleanup; confirm nothing imports them transitively.

class TestRunE2eWorkflow(unittest.TestCase):
  """Tests for run_e2e_workflow.main.

  All external side effects (subprocess calls via util.run, kubectl
  configuration, Argo polling, GCS uploads) are mocked out; the test then
  inspects the sequence of `ks` commands that main() issues.
  """

  # mock.patch decorators are applied bottom-up, so the positional mock
  # arguments map to the decorators closest to the function first:
  # mock_run -> util.run, mock_configure -> util.configure_kubectl,
  # mock_wait -> argo_client.wait_for_workflow. The remaining patched
  # callables are absorbed by *unused_mocks.
  @mock.patch("testing.run_e2e_workflow.upload_file_to_gcs")
  @mock.patch("testing.run_e2e_workflow.upload_to_gcs")
  @mock.patch("testing.run_e2e_workflow.util.load_kube_config")
  @mock.patch("testing.run_e2e_workflow.argo_client.wait_for_workflow")
  @mock.patch("testing.run_e2e_workflow.util.configure_kubectl")
  @mock.patch("testing.run_e2e_workflow.util.run")
  def testMainPresubmit(self, mock_run, mock_configure, mock_wait, *unused_mocks):  # pylint: disable=no-self-use
    """Test create started for presubmit job."""

    # Simulate the environment variables prow sets for a presubmit job; see
    # https://github.com/kubernetes/test-infra/tree/master/prow#job-evironment-variables
    os.environ["REPO_OWNER"] = "fake_org"
    os.environ["REPO_NAME"] = "fake_name"
    os.environ["PULL_NUMBER"] = "77"
    os.environ["PULL_PULL_SHA"] = "123abc"
    os.environ["JOB_NAME"] = "kubeflow-presubmit"
    os.environ["JOB_TYPE"] = "presubmit"
    os.environ["BUILD_NUMBER"] = "1234"

    args = ["--project=some-project", "--cluster=some-cluster",
            "--zone=us-east1-d", "--bucket=some-bucket"]
    run_e2e_workflow.main(args)

    mock_configure.assert_called_once_with("some-project", "us-east1-d",
                                           "some-cluster",)
    # call_args_list records each util.run invocation in order;
    # call_args_list[i][0][0] is the command list of the i-th call.
    self.assertItemsEqual(
        ["ks", "param", "set", "workflows", "name"],
        mock_run.call_args_list[0][0][0][:-1])
    # Workflow name will have some random salt at the end.
    self.assertRegexpMatches(mock_run.call_args_list[0][0][0][-1],
                             "kubeflow-presubmit-77-[0-9a-z]{4}")

    self.assertItemsEqual(
        ["ks", "param", "set", "workflows", "prow_env",
         "BUILD_NUMBER=1234,JOB_NAME=kubeflow-presubmit,JOB_TYPE=presubmit"
         ",PULL_NUMBER=77,PULL_PULL_SHA=123abc,REPO_NAME=fake_name"
         ",REPO_OWNER=fake_org"],
        mock_run.call_args_list[1][0][0])

    self.assertItemsEqual(
        ["ks", "param", "set", "workflows", "namespace",
         "kubeflow-test-infra"],
        mock_run.call_args_list[2][0][0])

    self.assertItemsEqual(
        ["ks", "param", "set", "workflows", "bucket", "some-bucket"],
        mock_run.call_args_list[3][0][0])

    self.assertItemsEqual(
        ["ks", "show", "prow", "-c", "workflows"],
        mock_run.call_args_list[4][0][0])

    self.assertItemsEqual(
        ["ks", "apply", "prow", "-c", "workflows"],
        mock_run.call_args_list[5][0][0])


if __name__ == "__main__":
  unittest.main()
18 | 19 | Finally, in the ksonnet application directory, run the following: 20 | 21 | ```shell 22 | # Expand prototype as a Jsonnet file, place in a file in the 23 | # `components/` directory. (YAML and JSON are also available.) 24 | $ ks prototype use io.ksonnet.pkg.tf-job tf-job \ 25 | --namespace default \ 26 | --name tf-job 27 | 28 | # Apply to server. 29 | $ ks apply -f tf-job.jsonnet 30 | ``` 31 | 32 | ## Using the library 33 | 34 | The library files for tf-job define a set of relevant *parts* (_e.g._, deployments, services, secrets, and so on) that can be combined to configure tf-job for a wide variety of scenarios. For example, a database like Redis may need a secret to hold the user password, or it may have no password if it's acting as a cache. 35 | 36 | This library provides a set of pre-fabricated "flavors" (or "distributions") of tf-job, each of which is configured for a different use case. These are captured as ksonnet *prototypes*, which allow users to interactively customize these distributions for their specific needs. 37 | 38 | These prototypes, as well as how to use them, are enumerated below. 39 | 40 | ### io.ksonnet.pkg.tf-job 41 | 42 | A TensorFlow job (could be training or evaluation). 43 | #### Example 44 | 45 | ```shell 46 | # Expand prototype as a Jsonnet file, place in a file in the 47 | # `components/` directory. (YAML and JSON are also available.) 48 | $ ks prototype use io.ksonnet.pkg.tf-job tf-job \ 49 | --name YOUR_NAME_HERE 50 | ``` 51 | 52 | #### Parameters 53 | 54 | The available options to pass prototype are: 55 | 56 | * `--name=`: Name to give to each of the components [string] 57 | ### io.ksonnet.pkg.tf-cnn 58 | 59 | A TensorFlow CNN Benchmarking job 60 | #### Example 61 | 62 | ```shell 63 | # Expand prototype as a Jsonnet file, place in a file in the 64 | # `components/` directory. (YAML and JSON are also available.) 
65 | $ ks prototype use io.ksonnet.pkg.tf-cnn tf-job \ 66 | --name YOUR_NAME_HERE 67 | ``` 68 | 69 | #### Parameters 70 | 71 | The available options to pass prototype are: 72 | 73 | * `--name=`: Name for the job. [string] 74 | 75 | 76 | [rootReadme]: https://github.com/ksonnet/mixins 77 | -------------------------------------------------------------------------------- /components/k8s-model-server/inception-client/label.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | #!/usr/bin/env python2.7 17 | 18 | """ 19 | Runs the Inception model being served on the kubeflow model server on an image 20 | that you specify. 21 | 22 | Note: This file is a modification of the inception client available on the 23 | TensorFlow Serving GitHub repository: 24 | https://github.com/tensorflow/serving/blob/master/tensorflow_serving/example/inception_client.py 25 | """ 26 | 27 | from __future__ import print_function 28 | 29 | # This is a placeholder for a Google-internal import. 
30 | 31 | import argparse 32 | 33 | from grpc.beta import implementations 34 | import tensorflow as tf 35 | 36 | from tensorflow_serving.apis import predict_pb2 37 | from tensorflow_serving.apis import prediction_service_pb2 38 | 39 | 40 | def main(image_paths, server, port): 41 | channel = implementations.insecure_channel(server, port) 42 | stub = prediction_service_pb2.beta_create_PredictionService_stub(channel) 43 | 44 | raw_images = [] 45 | for path in image_paths: 46 | with tf.gfile.Open(path) as img: 47 | raw_images.append(img.read()) 48 | 49 | # Send request 50 | # See prediction_service.proto for gRPC request/response details. 51 | request = predict_pb2.PredictRequest() 52 | request.model_spec.name = 'inception' 53 | request.model_spec.signature_name = 'predict_images' 54 | request.inputs['images'].CopyFrom( 55 | tf.make_tensor_proto(raw_images, shape=[len(raw_images)])) 56 | result = stub.Predict(request, 10.0) # 10 secs timeout 57 | print(result) 58 | 59 | 60 | if __name__ == '__main__': 61 | parser = argparse.ArgumentParser('Label an image using Inception') 62 | parser.add_argument( 63 | '-s', 64 | '--server', 65 | help='URL of host serving the Inception model' 66 | ) 67 | parser.add_argument( 68 | '-p', 69 | '--port', 70 | type=int, 71 | default=9000, 72 | help='Port at which Inception model is being served' 73 | ) 74 | parser.add_argument( 75 | 'images', 76 | nargs='+', 77 | help='Paths (local or GCS) to images you would like to label' 78 | ) 79 | 80 | args = parser.parse_args() 81 | 82 | main(args.images, args.server, args.port) 83 | -------------------------------------------------------------------------------- /testing/Dockerfile: -------------------------------------------------------------------------------- 1 | # Docker image for running E2E tests using Argo. 
2 | 3 | FROM python:2.7-slim 4 | MAINTAINER Jeremy Lewi 5 | 6 | # Never prompt the user for choices on installation/configuration of packages 7 | ENV DEBIAN_FRONTEND noninteractive 8 | ENV TERM linux 9 | 10 | # Define en_US. 11 | ENV LANGUAGE=en_US.UTF-8 \ 12 | LANG=en_US.UTF-8 \ 13 | LC_ALL=en_US.UTF-8 \ 14 | LC_CTYPE=en_US.UTF-8 \ 15 | LC_MESSAGES=en_US.UTF-8 \ 16 | LC_ALL=en_US.UTF-8 17 | 18 | 19 | # buildDeps should be packages needed only to build some other packages as 20 | # these packages are purged in a later step. 21 | # 22 | # gcc & python-dev are needed so we can install crcmod for gsutil 23 | RUN set -ex \ 24 | && apt-get update -yqq \ 25 | && apt-get install -yqq --no-install-recommends \ 26 | curl \ 27 | locales \ 28 | wget \ 29 | ca-certificates \ 30 | git \ 31 | zip \ 32 | unzip \ 33 | gcc python-dev \ 34 | python-setuptools \ 35 | && apt-get clean \ 36 | && rm -rf \ 37 | /var/lib/apt/lists/* \ 38 | /tmp/* \ 39 | /var/tmp/* \ 40 | /usr/share/man \ 41 | /usr/share/doc \ 42 | /usr/share/doc-base 43 | 44 | # Set the locale 45 | RUN sed -i 's/^# en_US.UTF-8 UTF-8$/en_US.UTF-8 UTF-8/g' /etc/locale.gen \ 46 | && locale-gen \ 47 | && update-locale LANG=en_US.UTF-8 LC_ALL=en_US.UTF-8 48 | 49 | # Install go 50 | RUN cd /tmp && \ 51 | wget -O /tmp/go.tar.gz https://redirector.gvt1.com/edgedl/go/go1.9.2.linux-amd64.tar.gz && \ 52 | tar -C /usr/local -xzf go.tar.gz 53 | 54 | # Install gcloud 55 | ENV PATH=/google-cloud-sdk/bin:/workspace:${PATH} \ 56 | CLOUDSDK_CORE_DISABLE_PROMPTS=1 57 | 58 | RUN wget -q https://dl.google.com/dl/cloudsdk/channels/rapid/google-cloud-sdk.tar.gz && \ 59 | tar xzf google-cloud-sdk.tar.gz -C / && \ 60 | rm google-cloud-sdk.tar.gz && \ 61 | /google-cloud-sdk/install.sh \ 62 | --disable-installation-options \ 63 | --bash-completion=false \ 64 | --path-update=false \ 65 | --usage-reporting=false && \ 66 | gcloud components install alpha beta kubectl 67 | 68 | # Install CRCMOD for gsutil 69 | RUN easy_install -U pip && \ 70 | pip 
install -U crcmod 71 | 72 | # Install Helm 73 | RUN wget -O /tmp/get_helm.sh \ 74 | https://raw.githubusercontent.com/kubernetes/helm/master/scripts/get && \ 75 | chmod 700 /tmp/get_helm.sh && \ 76 | /tmp/get_helm.sh && \ 77 | rm /tmp/get_helm.sh 78 | 79 | # Initialize helm 80 | RUN helm init --client-only 81 | 82 | # Install ksonnet 83 | RUN curl -o /usr/local/bin/ks -L \ 84 | https://github.com/ksonnet/ksonnet/releases/download/v0.8.0/ks-linux-amd64 && \ 85 | chmod a+x /usr/local/bin/ks 86 | 87 | # Install various python libraries. 88 | RUN pip install --upgrade six pyyaml google-api-python-client \ 89 | google-cloud-storage google-auth-httplib2 pylint kubernetes==4.0.0 mock retrying 90 | 91 | COPY bootstrap.sh /usr/local/bin 92 | RUN chmod a+x /usr/local/bin/bootstrap.sh 93 | 94 | COPY checkout.sh /usr/local/bin 95 | RUN chmod a+x /usr/local/bin/checkout.sh 96 | 97 | ENTRYPOINT ["/usr/local/bin/bootstrap.sh"] -------------------------------------------------------------------------------- /tf-controller-examples/tf-cnn/launcher.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2017 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """A launcher suitable for invoking tf_cnn_benchmarks using TfJob. 
def run_and_stream(cmd):
  """Run a command, streaming its combined stdout/stderr to the log.

  Blocks until the command exits.

  Args:
    cmd: The command to run, as a list of arguments.

  Raises:
    ValueError: If the command exits with a non-zero return code.
  """
  logging.info("Running %s", " ".join(cmd))
  # stderr is merged into stdout so there is a single stream to drain.
  # universal_newlines=True makes the stream yield text on both Python 2 and
  # Python 3, so the '' EOF sentinel below works on either interpreter.
  # (The previous implementation also polled the process while flushing
  # process.stderr, but stderr is always None when redirected to STDOUT,
  # and flushing a read-side pipe is a no-op.)
  process = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             universal_newlines=True)

  # Reading until EOF (rather than polling) drains every line, including any
  # output produced between the last poll and process exit.
  for line in iter(process.stdout.readline, ''):
    logging.info(line.strip())

  # The pipe is at EOF; wait() reaps the child and sets returncode.
  process.wait()

  if process.returncode != 0:
    raise ValueError("cmd: {0} exited with code {1}".format(
        " ".join(cmd), process.returncode))
import json
import os
import unittest
import mock
from testing import prow_artifacts
import tempfile

from google.cloud import storage  # pylint: disable=no-name-in-module

class TestProw(unittest.TestCase):
  """Tests for prow_artifacts.

  time.time, util.run and the GCS client are mocked so the tests can assert
  on the exact metadata JSON and gsutil/GCS calls without touching the
  network.
  """

  @mock.patch("testing.prow_artifacts.time.time")
  def testCreateStartedPresubmit(self, mock_time):  # pylint: disable=no-self-use
    """Test create started for presubmit job."""
    # Pin the clock so the timestamp in the output is deterministic.
    mock_time.return_value = 1000

    os.environ["REPO_OWNER"] = "fake_org"
    os.environ["REPO_NAME"] = "fake_name"
    os.environ["PULL_PULL_SHA"] = "123abc"
    expected = {
        "timestamp": 1000,
        "repos": {
            "fake_org/fake_name": "123abc",
        },
    }

    actual = prow_artifacts.create_started()

    self.assertEquals(expected, json.loads(actual))

  @mock.patch("testing.prow_artifacts.time.time")
  def testCreateFinished(self, mock_time):  # pylint: disable=no-self-use
    """Test create finished job."""
    mock_time.return_value = 1000

    expected = {
        "timestamp": 1000,
        "result": "FAILED",
        "metadata": {},
    }

    # create_finished(False) means the run did not succeed.
    actual = prow_artifacts.create_finished(False)

    self.assertEquals(expected, json.loads(actual))

  @mock.patch("testing.prow_artifacts.util.run")
  def testCopyArtifactsPresubmit(self, mock_run):  # pylint: disable=no-self-use
    """Test copy artifacts to GCS."""

    os.environ["REPO_OWNER"] = "fake_org"
    os.environ["REPO_NAME"] = "fake_name"
    os.environ["PULL_NUMBER"] = "72"
    os.environ["BUILD_NUMBER"] = "100"
    os.environ["PULL_PULL_SHA"] = "123abc"
    os.environ["JOB_NAME"] = "kubeflow-presubmit"

    # NOTE(review): temp_dir is created but never used below — confirm
    # whether it was intended as the artifacts_dir before removing it.
    temp_dir = tempfile.mkdtemp(prefix="tmpTestProwTestCreateFinished.")
    args = ["--artifacts_dir=/tmp/some/dir", "copy_artifacts",
            "--bucket=some_bucket"]
    prow_artifacts.main(args)

    # main() should shell out to gsutil to rsync the artifacts into the
    # prow GCS layout for this PR/job/build.
    mock_run.assert_called_once_with(
        ["gsutil", "-m", "rsync", "-r", "/tmp/some/dir",
         "gs://some_bucket/pr-logs/pull/fake_org_fake_name/72/kubeflow-presubmit"
         "/100"],
    )

  def testCreateSymlink(self):
    # Build a fake GCS client graph: client -> bucket -> blob.
    gcs_client = mock.MagicMock(spec=storage.Client)
    mock_bucket = mock.MagicMock(spec=storage.Bucket)
    gcs_client.get_bucket.return_value = mock_bucket
    mock_blob = mock.MagicMock(spec=storage.Blob)
    mock_bucket.blob.return_value = mock_blob
    # We can't add the decorator to the instance method because that would
    # interfere with creating gcs_client since storage.Client would then
    # point to the mock and not the actual class.
    with mock.patch("testing.prow_artifacts.storage.Client") as mock_client:
      mock_client.return_value = gcs_client

      os.environ["REPO_OWNER"] = "fake_org"
      os.environ["REPO_NAME"] = "fake_name"
      os.environ["PULL_NUMBER"] = "72"
      os.environ["BUILD_NUMBER"] = "100"
      os.environ["PULL_PULL_SHA"] = "123abc"
      os.environ["JOB_NAME"] = "kubeflow-presubmit"

      args = ["--artifacts_dir=/tmp/some/dir", "create_pr_symlink",
              "--bucket=some-bucket"]
      prow_artifacts.main(args)

      # The symlink blob's content is the GCS path of the build directory.
      mock_blob.upload_from_string.assert_called_once_with(
          "gs://some-bucket/pr-logs/pull/fake_org_fake_name/72"
          "/kubeflow-presubmit/100")

if __name__ == "__main__":
  unittest.main()
}, 46 | 47 | modelServer(name, namespace, modelPath, labels={app:name},): 48 | // TODO(jlewi): Allow the model to be served from a PVC. 49 | local volume = { 50 | name: "redis-data", 51 | namespace: namespace, 52 | emptyDir: {} 53 | }; 54 | base(name, namespace, modelPath, labels), 55 | 56 | local base(name, namespace, modelPath, labels) = 57 | { 58 | apiVersion: "extensions/v1beta1", 59 | kind: "Deployment", 60 | metadata: { 61 | name: name, 62 | namespace: namespace, 63 | labels: labels, 64 | }, 65 | spec: { 66 | template: { 67 | metadata: { 68 | labels: labels 69 | }, 70 | spec: { 71 | containers: [ 72 | { 73 | name: name, 74 | image: defaults.image, 75 | imagePullPolicy: defaults.imagePullPolicy, 76 | // TODO(jlewi): Talk to owensk to figure out why we wrap in a shell. 77 | command: [ 78 | "/bin/sh", 79 | "-c" 80 | ], 81 | args: [ 82 | "/usr/bin/tensorflow_model_server --port=9000 --model_name=" + name + " --model_base_path=" + modelPath, 83 | ], 84 | env: [], 85 | ports: [ 86 | { 87 | containerPort: 9000, 88 | }, 89 | ], 90 | // TODO(jlewi): We should add readiness and liveness probes. I think the blocker is that 91 | // model-server doesn't have something we can use out of the box. 92 | resources: defaults.resources, 93 | }, 94 | ], 95 | // See: https://github.com/google/kubeflow/tree/master/components/k8s-model-server#set-the-user-optional 96 | // The is user and group should be defined in the Docker image. 97 | // Per best practices we don't run as the root user. 
98 | securityContext: { 99 | runAsUser: 1000, 100 | fsGroup: 1000, 101 | }, 102 | }, 103 | }, 104 | }, 105 | }, 106 | }, 107 | }, 108 | } 109 | -------------------------------------------------------------------------------- /kubeflow/core/prototypes/all.jsonnet: -------------------------------------------------------------------------------- 1 | // @apiVersion 0.1 2 | // @name io.ksonnet.pkg.kubeflow-core 3 | // @description Kubeflow core components 4 | // @shortDescription Kubeflow core components. This currently includes JupyterHub and the TfJob controller. 5 | // @param name string Name to give to each of the components 6 | // @optionalParam namespace string default Namespace 7 | // @optionalParam disks string null Comma separated list of Google persistent disks to attach to jupyter environments. 8 | // @optionalParam cloud string null String identifying the cloud to customize the deployment for. 9 | // @optionalParam tfJobImage string gcr.io/tf-on-k8s-dogfood/tf_operator:v20180117-04425d9-dirty-e3b0c44 The image for the TfJob controller. 10 | // @optionalParam tfDefaultImage string null The default image to use for TensorFlow. 11 | // @optionalParam tfJobUiServiceType string ClusterIP The service type for the UI. 12 | // @optionalParam jupyterHubServiceType string ClusterIP The service type for Jupyterhub. 13 | 14 | // TODO(https://github.com/ksonnet/ksonnet/issues/222): We have to add namespace as an explicit parameter 15 | // because ksonnet doesn't support inheriting it from the environment yet. 
16 | 17 | local k = import 'k.libsonnet'; 18 | local jupyter = import "kubeflow/core/jupyterhub.libsonnet"; 19 | local tfjob = import "kubeflow/core/tf-job.libsonnet"; 20 | local nfs = import "kubeflow/core/nfs.libsonnet"; 21 | 22 | local name = import 'param://name'; 23 | local namespace = import 'param://namespace'; 24 | 25 | local cloud = import 'param://cloud'; 26 | 27 | // TODO(jlewi): Make this a parameter 28 | local jupyterHubImage = 'gcr.io/kubeflow/jupyterhub:1.0'; 29 | local diskParam = import 'param://disks'; 30 | 31 | local diskNames = if diskParam != "null" && std.length(diskParam) > 0 then 32 | std.split(diskParam, ',') 33 | else []; 34 | 35 | local jupyterConfigMap = if std.length(diskNames) == 0 then 36 | jupyter.parts(namespace).jupyterHubConfigMap 37 | else jupyter.parts(namespace).jupyterHubConfigMapWithVolumes(diskNames); 38 | 39 | local tfJobImage = import 'param://tfJobImage'; 40 | local tfDefaultImage = import 'param://tfDefaultImage'; 41 | local tfJobUiServiceType = import 'param://tfJobUiServiceType'; 42 | local jupyterHubServiceType = import 'param://jupyterHubServiceType'; 43 | 44 | // Create a list of the resources needed for a particular disk 45 | local diskToList = function(diskName) [ 46 | nfs.parts(namespace, name,).diskResources(diskName).storageClass, 47 | nfs.parts(namespace, name,).diskResources(diskName).volumeClaim, 48 | nfs.parts(namespace, name,).diskResources(diskName).service, 49 | nfs.parts(namespace, name,).diskResources(diskName).provisioner]; 50 | 51 | local allDisks = std.flattenArrays(std.map(diskToList, diskNames)); 52 | 53 | local nfsComponents = 54 | if std.length(allDisks) > 0 then 55 | [nfs.parts(namespace, name).serviceAccount, 56 | nfs.parts(namespace, name).role, 57 | nfs.parts(namespace, name).roleBinding, 58 | nfs.parts(namespace, name).clusterRoleBinding,] + allDisks 59 | else 60 | []; 61 | 62 | std.prune(k.core.v1.list.new([ 63 | // jupyterHub components 64 | jupyterConfigMap, 65 | 
jupyter.parts(namespace).jupyterHubService, 66 | jupyter.parts(namespace).jupyterHubLoadBalancer(jupyterHubServiceType), 67 | jupyter.parts(namespace).jupyterHub(jupyterHubImage), 68 | jupyter.parts(namespace).jupyterHubRole, 69 | jupyter.parts(namespace).jupyterHubServiceAccount, 70 | jupyter.parts(namespace).jupyterHubRoleBinding, 71 | 72 | // TfJob controller 73 | tfjob.parts(namespace).tfJobDeploy(tfJobImage), 74 | tfjob.parts(namespace).configMap(cloud, tfDefaultImage), 75 | tfjob.parts(namespace).serviceAccount, 76 | tfjob.parts(namespace).operatorRole, 77 | tfjob.parts(namespace).operatorRoleBinding, 78 | 79 | // TfJob controll ui 80 | tfjob.parts(namespace).ui(tfJobImage), 81 | tfjob.parts(namespace).uiService(tfJobUiServiceType), 82 | tfjob.parts(namespace).uiServiceAccount, 83 | tfjob.parts(namespace).uiRole, 84 | tfjob.parts(namespace).uiRoleBinding, 85 | ] + nfsComponents)) 86 | 87 | -------------------------------------------------------------------------------- /kubeflow/tf-job/prototypes/tf-cnn-benchmarks.jsonnet: -------------------------------------------------------------------------------- 1 | // @apiVersion 0.1 2 | // @name io.ksonnet.pkg.tf-cnn 3 | // @description A TensorFlow CNN Benchmarking job 4 | // @shortDescription Run the TensorFlow CNN benchmarking job. 5 | // @param name string Name for the job. 6 | // @optionalParam namespace string default Namespace 7 | // @optionalParam batch_size number 32 The batch size 8 | // @optionalParam model string resnet50 Which model to use 9 | // @optionalParam num_gpus number 0 The number of GPUs to attach to workers. 10 | // @optionalParam image string gcr.io/kubeflow/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3 The docker image to use for the job. 11 | // @optionalParam image_gpu string gcr.io/kubeflow/tf-benchmarks-gpu:v20171202-bdab599-dirty-284af3 The docker image to use when using GPUs. 
12 | // @optionalParam num_ps number 1 The number of ps to use 13 | // @optionalParam num_workers number 1 The number of workers to use 14 | 15 | // We need at least 1 parameter server. 16 | 17 | // TODO(jlewi): Should we move this into an examples package? 18 | 19 | // TODO(https://github.com/ksonnet/ksonnet/issues/222): We have to add namespace as an explicit parameter 20 | // because ksonnet doesn't support inheriting it from the environment yet. 21 | 22 | local k = import 'k.libsonnet'; 23 | local deployment = k.extensions.v1beta1.deployment; 24 | local container = deployment.mixin.spec.template.spec.containersType; 25 | local podTemplate = k.extensions.v1beta1.podTemplate; 26 | 27 | local tfJob = import 'kubeflow/tf-job/tf-job.libsonnet'; 28 | 29 | local name = import 'param://name'; 30 | local namespace = import 'param://namespace'; 31 | 32 | local numGpus = import 'param://num_gpus'; 33 | local batchSize = import 'param://batch_size'; 34 | local model = import 'param://model'; 35 | 36 | local args = [ 37 | "python", 38 | "tf_cnn_benchmarks.py", 39 | "--batch_size=" + batchSize, 40 | "--model=" + model, 41 | "--variable_update=parameter_server", 42 | "--flush_stdout=true", 43 | ] + 44 | if numGpus == 0 then 45 | # We need to set num_gpus=1 even if not using GPUs because otherwise the device list 46 | # is empty because of this code 47 | # https://github.com/tensorflow/benchmarks/blob/master/scripts/tf_cnn_benchmarks/benchmark_cnn.py#L775 48 | # We won't actually use GPUs because based on other flags no ops will be assigned to GPUs.
49 | ["--num_gpus=1", 50 | "--local_parameter_device=cpu", 51 | "--device=cpu", 52 | "--data_format=NHWC",] 53 | else 54 | ["--num_gpus=" + numGpus, 55 | ] 56 | ; 57 | 58 | local image = import 'param://image'; 59 | local imageGpu = import 'param://image_gpu'; 60 | local numPs = import 'param://num_ps'; 61 | local numWorkers = import 'param://num_workers'; 62 | local numGpus = import 'param://num_gpus'; 63 | 64 | local workerSpec = if numGpus > 0 then 65 | tfJob.parts.tfJobReplica("WORKER", numWorkers, args, imageGpu, numGpus) 66 | else 67 | tfJob.parts.tfJobReplica("WORKER", numWorkers, args, image); 68 | 69 | // TODO(jlewi): Look at how the redis prototype modifies a container by 70 | // using mapContainersWithName. Can we do something similar? 71 | // https://github.com/ksonnet/parts/blob/9d78d6bb445d530d5b927656d2293d4f12654608/incubator/redis/redis.libsonnet 72 | local replicas = std.map(function(s) 73 | s + { 74 | template+: { 75 | spec+: { 76 | // TODO(jlewi): Does this overwrite containers? 77 | containers: [ 78 | s.template.spec.containers[0] + { 79 | workingDir: "/opt/tf-benchmarks/scripts/tf_cnn_benchmarks", 80 | }, 81 | ] 82 | } 83 | }, 84 | }, 85 | std.prune([workerSpec, tfJob.parts.tfJobReplica("PS", numPs, args, image)])); 86 | 87 | local job = 88 | if numWorkers < 1 then 89 | error "num_workers must be >= 1" 90 | else 91 | if numPs < 1 then 92 | error "num_ps must be >= 1" 93 | else 94 | tfJob.parts.tfJob(name, namespace, replicas) + { 95 | spec+: { 96 | tfImage: image, 97 | terminationPolicy: {chief:{replicaName: "WORKER", replicaIndex: 0 }} 98 | }}; 99 | 100 | std.prune(k.core.v1.list.new([job])) 101 | -------------------------------------------------------------------------------- /components/tf-controller/deploy_crd.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | --- 16 | # Source: tf-job-operator-chart/templates/config.yaml 17 | 18 | apiVersion: v1 19 | kind: ConfigMap 20 | metadata: 21 | name: tf-job-operator-config 22 | data: 23 | controller_config_file.yaml: | 24 | grpcServerFilePath: /opt/mlkube/grpc_tensorflow_server/grpc_tensorflow_server.py 25 | accelerators: 26 | alpha.kubernetes.io/nvidia-gpu: 27 | volumes: 28 | - name: nvidia-libraries 29 | mountPath: /usr/local/nvidia/lib64 # This path is special; it is expected to be present in `/etc/ld.so.conf` inside the container image. 
30 | hostPath: /home/kubernetes/bin/nvidia/lib 31 | - name: nvidia-debug-tools # optional 32 | mountPath: /usr/local/bin/nvidia 33 | hostPath: /home/kubernetes/bin/nvidia/bin 34 | 35 | --- 36 | # Source: tf-job-operator-chart/templates/service-account.yaml 37 | 38 | apiVersion: v1 39 | kind: ServiceAccount 40 | metadata: 41 | name: tf-job-operator 42 | labels: 43 | app: tf-job-operator 44 | 45 | --- 46 | # Source: tf-job-operator-chart/templates/rbac.yaml 47 | 48 | apiVersion: rbac.authorization.k8s.io/v1beta1 49 | kind: ClusterRole 50 | metadata: 51 | name: tf-job-operator 52 | labels: 53 | app: tf-job-operator 54 | rules: 55 | - apiGroups: 56 | - tensorflow.org 57 | resources: 58 | - tfjobs 59 | verbs: 60 | - "*" 61 | - apiGroups: 62 | - apiextensions.k8s.io 63 | resources: 64 | - customresourcedefinitions 65 | verbs: 66 | - "*" 67 | - apiGroups: 68 | - storage.k8s.io 69 | resources: 70 | - storageclasses 71 | verbs: 72 | - "*" 73 | - apiGroups: 74 | - batch 75 | resources: 76 | - jobs 77 | verbs: 78 | - "*" 79 | - apiGroups: 80 | - "" 81 | resources: 82 | - configmaps 83 | - pods 84 | - services 85 | - endpoints 86 | - persistentvolumeclaims 87 | - events 88 | verbs: 89 | - "*" 90 | - apiGroups: 91 | - apps 92 | - extensions 93 | resources: 94 | - deployments 95 | verbs: 96 | - "*" 97 | --- 98 | kind: ClusterRoleBinding 99 | apiVersion: rbac.authorization.k8s.io/v1beta1 100 | metadata: 101 | name: tf-job-operator 102 | labels: 103 | app: tf-job-operator 104 | subjects: 105 | - kind: ServiceAccount 106 | name: tf-job-operator 107 | namespace: default 108 | roleRef: 109 | apiGroup: rbac.authorization.k8s.io 110 | kind: ClusterRole 111 | name: tf-job-operator 112 | 113 | 114 | --- 115 | # Source: tf-job-operator-chart/templates/deployment.yaml 116 | apiVersion: extensions/v1beta1 117 | kind: Deployment 118 | metadata: 119 | name: tf-job-operator 120 | spec: 121 | replicas: 1 122 | template: 123 | metadata: 124 | labels: 125 | name: tf-job-operator 126 | spec: 127 | 
serviceAccountName: tf-job-operator 128 | containers: 129 | - name: tf-job-operator 130 | image: gcr.io/tf-on-k8s-dogfood/tf_operator:v20171129-f8ec762 131 | command: 132 | - /opt/mlkube/tf_operator 133 | - --controller_config_file=/etc/config/controller_config_file.yaml 134 | - -alsologtostderr 135 | - -v=1 136 | env: 137 | - name: MY_POD_NAMESPACE 138 | valueFrom: 139 | fieldRef: 140 | fieldPath: metadata.namespace 141 | - name: MY_POD_NAME 142 | valueFrom: 143 | fieldRef: 144 | fieldPath: metadata.name 145 | 146 | volumeMounts: 147 | - name: config-volume 148 | mountPath: /etc/config 149 | volumes: 150 | - name: config-volume 151 | configMap: 152 | name: tf-job-operator-config 153 | -------------------------------------------------------------------------------- /tf-controller-examples/tf-cnn/create_job_specs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2017 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | """A simple script to generate TfJob templates based on various parameters.""" 18 | 19 | import argparse 20 | import datetime 21 | import logging 22 | import yaml 23 | 24 | TF_JOB_GROUP = "tensorflow.org" 25 | TF_JOB_VERSION = "v1alpha1" 26 | TF_JOB_PLURAL = "tfjobs" 27 | TF_JOB_KIND = "TfJob" 28 | 29 | # See https://stackoverflow.com/questions/21016220/is-it-possible-to-emit-valid-yaml-with-anchors-references-disabled-using-ruby 30 | class ExplicitDumper(yaml.SafeDumper): 31 | """A dumper that will never emit aliases.""" 32 | 33 | def ignore_aliases(self, data): 34 | return True 35 | 36 | if __name__ == "__main__": 37 | logging.getLogger().setLevel(logging.INFO) # pylint: disable=too-many-locals 38 | parser = argparse.ArgumentParser(description="Create TfJob specs.") 39 | 40 | parser.add_argument( 41 | "--cpu_image", 42 | type=str, 43 | required=True, 44 | help="The docker image for CPU jobs.") 45 | 46 | parser.add_argument( 47 | "--gpu_image", 48 | type=str, 49 | required=True, 50 | help="The docker image for GPU jobs.") 51 | 52 | parser.add_argument( 53 | "--num_workers", 54 | type=int, 55 | default=1, 56 | help="The number of workers to use.") 57 | 58 | parser.add_argument( 59 | "--output", 60 | type=str, 61 | help="(Optional) the file to write the template to.") 62 | 63 | parser.add_argument("--gpu", dest="use_gpu", action="store_true", 64 | help="Use gpus.") 65 | parser.add_argument("--no-gpu", dest="use_gpu", action="store_false", 66 | help="Do not use gpus.") 67 | 68 | parser.set_defaults(use_gpu=True) 69 | 70 | args = parser.parse_args() 71 | 72 | namespace = "default" 73 | job_name = "inception-" + datetime.datetime.now().strftime("%y%m%d-%H%M%S") 74 | if args.use_gpu: 75 | job_name += "-gpu" 76 | else: 77 | job_name += "-cpu" 78 | 79 | job_name += "-{0}".format(args.num_workers) 80 | 81 | body = {} 82 | body['apiVersion'] = TF_JOB_GROUP + "/" + TF_JOB_VERSION 83 | body['kind'] = TF_JOB_KIND 84 | body['metadata'] = {} 85 | 
body['metadata']['name'] = job_name 86 | body['metadata']['namespace'] = namespace 87 | 88 | clone_on_cpu = not args.use_gpu 89 | 90 | body["spec"] = {} 91 | body["spec"]["replicaSpecs"] = [] 92 | 93 | working_dir = "/opt/tf-benchmarks/scripts/tf_cnn_benchmarks" 94 | 95 | num_workers = args.num_workers 96 | num_ps = 1 97 | 98 | command = [ 99 | "python", 100 | "tf_cnn_benchmarks.py", 101 | "--batch_size=32", 102 | "--model=resnet50", 103 | "--variable_update=parameter_server", 104 | # tf_cnn_benchmarks uses print for logging and if we 105 | # don't set flush_stdout the buffer isn't outputted 106 | # until the program ends. 107 | "--flush_stdout=true", 108 | ] 109 | 110 | if args.use_gpu: 111 | command.append("--num_gpus=1") 112 | else: 113 | # We need to set num_gpus=1 even if not using GPUs because otherwise the device list 114 | # is empty because of this code 115 | # https://github.com/tensorflow/benchmarks/blob/master/scripts/tf_cnn_benchmarks/benchmark_cnn.py#L775 116 | command.append("--num_gpus=1") 117 | command.append("--local_parameter_device=cpu") 118 | command.append("--device=cpu") 119 | command.append("--data_format=NHWC") 120 | 121 | # Add the master spec. 122 | # The master only acts as the chief and doesn't do any training so it can always use the CPU image.
123 | master_spec = { 124 | "replicas": 1, 125 | "tfReplicaType": "MASTER", 126 | "template": { 127 | "spec": { 128 | "containers": [ 129 | { 130 | "image": args.cpu_image, 131 | "name": "tensorflow", 132 | "workingDir": working_dir, 133 | "args": command, 134 | } 135 | ], 136 | "restartPolicy": "OnFailure", 137 | } 138 | } 139 | } 140 | 141 | body["spec"]["replicaSpecs"].append(master_spec) 142 | 143 | worker_image = args.cpu_image 144 | if args.use_gpu: 145 | worker_image = args.gpu_image 146 | 147 | worker_spec = { 148 | "replicas": num_workers, 149 | "tfReplicaType": "WORKER", 150 | "template": { 151 | "spec": { 152 | "containers": [ 153 | { 154 | "image": worker_image, 155 | "name": "tensorflow", 156 | "workingDir": working_dir, 157 | "args": command, 158 | } 159 | ], 160 | "restartPolicy": "OnFailure", 161 | } 162 | } 163 | } 164 | 165 | if args.use_gpu: 166 | worker_spec["template"]["spec"]["containers"][0]["resources"] = { 167 | "limits": { 168 | "nvidia.com/gpu": 1, 169 | } 170 | } 171 | 172 | body["spec"]["replicaSpecs"].append(worker_spec) 173 | 174 | ps_spec = { 175 | "replicas": num_ps, 176 | "tfReplicaType": "PS", 177 | "template": { 178 | "spec": { 179 | "containers": [ 180 | { 181 | "image": args.cpu_image, 182 | "name": "tensorflow", 183 | "workingDir": working_dir, 184 | "args": command, 185 | } 186 | ], 187 | "restartPolicy": "OnFailure", 188 | } 189 | } 190 | } 191 | 192 | body["spec"]["replicaSpecs"].append(ps_spec) 193 | 194 | body["spec"]["tfImage"] = args.cpu_image 195 | 196 | # Tensorboard is crashing with TF 1.5 197 | # body["spec"]["tensorBoard"] = { 198 | # "logDir": job_dir 199 | # } 200 | 201 | spec = yaml.dump(body, Dumper=ExplicitDumper, default_flow_style=False) 202 | 203 | if args.output: 204 | logging.info("Writing to %s", args.output) 205 | with open(args.output, "w") as hf: 206 | hf.write(spec) 207 | else: 208 | print(spec) 209 | -------------------------------------------------------------------------------- /testing/README.md: 
-------------------------------------------------------------------------------- 1 | # Test Infrastructure 2 | 3 | This directory contains the Kubeflow test Infrastructure. 4 | 5 | This is a work in progress see [google/kubeflow#38](https://github.com/google/kubeflow/issues/38) 6 | 7 | The current thinking is this will work as follows 8 | 9 | * Prow will be used to trigger E2E tests 10 | * The E2E test will launch an Argo workflow that describes the tests to run 11 | * Each step in the Argo workflow will be a binary invoked inside a container 12 | * The Argo workflow will use an NFS volume to attach a shared POSIX compliant filesystem to each step in the 13 | workflow. 14 | * Each step in the pipeline can write outputs and junit.xml files to a test directory in the volume 15 | * A final step in the Argo pipeline will upload the outputs to GCS so they are available in gubernator 16 | 17 | ## Accessing Argo UI 18 | 19 | You can access the Argo UI over the API Server proxy. 20 | 21 | We currently use the cluster 22 | 23 | ``` 24 | PROJECT=mlkube-testing 25 | ZONE=us-east1-d 26 | CLUSTER=kubeflow-testing 27 | NAMESPACE=kubeflow-test-infra 28 | ``` 29 | 30 | After starting `kubectl proxy` on `127.0.0.1:8001`, you can connect to the argo UI via the local proxy at 31 | 32 | ``` 33 | http://127.0.0.1:8001/api/v1/proxy/namespaces/kubeflow-test-infra/services/argo-ui:80/ 34 | ``` 35 | 36 | TODO(jlewi): We can probably make the UI publicly available since I don't think it offers any ability to launch workflows. 37 | 38 | 39 | ## Running the tests 40 | 41 | ### Run a presubmit 42 | 43 | ``` 44 | ks param set workflows name e2e-test-pr-`date '+%Y%m%d-%H%M%S'` 45 | ks param set workflows prow_env REPO_OWNER=google,REPO_NAME=kubeflow,PULL_NUMBER=${PULL_NUMBER},PULL_PULL_SHA=${COMMIT} 46 | ks param set workflows commit ${COMMIT} 47 | ks apply prow -c workflows 48 | ``` 49 | * You can set COMMIT to `pr` to checkout the latest change on the PR. 
50 | 51 | ### Run a postsubmit 52 | 53 | ``` 54 | ks param set workflows name e2e-test-postsubmit-`date '+%Y%m%d-%H%M%S'` 55 | ks param set workflows prow_env REPO_OWNER=google,REPO_NAME=kubeflow,PULL_BASE_SHA=${COMMIT} 56 | ks param set workflows commit ${COMMIT} 57 | ks apply prow -c workflows 58 | ``` 59 | * You can set COMMIT to `master` to use HEAD 60 | 61 | 62 | ## Setting up the Test Infrastructure 63 | 64 | Our tests require a K8s cluster with Argo installed. This section provides the instructions 65 | for setting this. 66 | 67 | Create a GKE cluster 68 | 69 | ``` 70 | PROJECT=mlkube-testing 71 | ZONE=us-east1-d 72 | CLUSTER=kubeflow-testing 73 | NAMESPACE=kubeflow-test-infra 74 | 75 | gcloud --project=${PROJECT} container clusters create \ 76 | --zone=${ZONE} \ 77 | --machine-type=n1-standard-8 \ 78 | --cluster-version=1.8.4-gke.1 \ 79 | ${CLUSTER} 80 | ``` 81 | 82 | 83 | ### Create a GCP service account 84 | 85 | * The tests need a GCP service account to upload data to GCS for Gubernator 86 | 87 | ``` 88 | SERVICE_ACCOUNT=kubeflow-testing 89 | gcloud iam service-accounts --project=mlkube-testing create ${SERVICE_ACCOUNT} --display-name "Kubeflow testing account" 90 | gcloud projects add-iam-policy-binding ${PROJECT} \ 91 | --member serviceAccount:${SERVICE_ACCOUNT}@${PROJECT}.iam.gserviceaccount.com --role roles/container.developer 92 | ``` 93 | * The service account needs to be able to create K8s resources as part of the test. 
94 | 95 | 96 | Create a secret key for the service account 97 | 98 | ``` 99 | gcloud iam service-accounts keys create ~/tmp/key.json \ 100 | --iam-account ${SERVICE_ACCOUNT}@${PROJECT}.iam.gserviceaccount.com 101 | kubectl create secret generic kubeflow-testing-credentials \ 102 | --namespace=kubeflow-test-infra --from-file=`echo ~/tmp/key.json` 103 | rm ~/tmp/key.json 104 | ``` 105 | 106 | Make the service account a cluster admin 107 | 108 | ``` 109 | kubectl create clusterrolebinding ${SERVICE_ACCOUNT}-admin --clusterrole=cluster-admin \ 110 | --user=${SERVICE_ACCOUNT}@${PROJECT}.iam.gserviceaccount.com 111 | ``` 112 | * The service account is used to deploy Kubeflow which entails creating various roles; so 113 | it needs sufficient RBAC permission to do so. 114 | 115 | ### Create a GitHub Token 116 | 117 | You need to use a GitHub token with ksonnet otherwise the test quickly runs into GitHub API limits. 118 | 119 | TODO(jlewi): We should create a GitHub bot account to use with our tests and then create API tokens for that bot. 120 | 121 | You can use the GitHub API to create a token 122 | 123 | * The token doesn't need any scopes because it's only accessing public data and is just needed for API metering. 124 | 125 | To create the secret run 126 | 127 | ``` 128 | kubectl create secret generic github-token --namespace=kubeflow-test-infra --from-literal=github_token=${TOKEN} 129 | ``` 130 | 131 | ### Create a PD for NFS 132 | 133 | Create a PD to act as the backing storage for the NFS filesystem that will be used to store data from 134 | the test runs. 135 | 136 | ``` 137 | gcloud --project=${PROJECT} compute disks create \ 138 | --zone=${ZONE} kubeflow-testing --description="PD to back NFS storage for kubeflow testing." --size=1TB 139 | ``` 140 | ### Create K8s Resources for Testing 141 | 142 | The ksonnet app `test-infra` contains ksonnet configs to deploy the test infrastructure.
143 | 144 | You can deploy argo as follows (you don't need to use argo's CLI) 145 | 146 | ``` 147 | ks apply prow -c argo 148 | ``` 149 | 150 | Deploy NFS & Jupyter 151 | 152 | ``` 153 | ks apply prow -c nfs-jupyter 154 | ``` 155 | 156 | * This creates the NFS share 157 | * We use JupyterHub as a convenient way to access the NFS share for manual inspection of the file contents. 158 | 159 | #### Troubleshooting 160 | 161 | User or service account deploying the test infrastructure needs sufficient permissions to create the roles that are created as part deploying the test infrastructure. So you may need to run the following command before using ksonnet to deploy the test infrastructure. 162 | 163 | ``` 164 | kubectl create clusterrolebinding default-admin --clusterrole=cluster-admin --user=user@gmail.com 165 | ``` 166 | 167 | ##### Operator Logs 168 | 169 | The following Stackdriver filter can be used to get the pod logs for the operator 170 | 171 | ``` 172 | resource.type="container" 173 | resource.labels.namespace_id="e2e-0117-1911-3a53" 174 | resource.labels.container_name="tf-job-operator" 175 | ``` 176 | 177 | ## Managing namespaces 178 | 179 | All namespaces created for the tests should be labeled with `app=kubeflow-e2e-test`. 180 | 181 | This can be used to manually delete old namespaces that weren't properly garbage collected. 182 | -------------------------------------------------------------------------------- /components/jupyterhub/README.md: -------------------------------------------------------------------------------- 1 | # Jupyter and Jupyterhub 2 | 3 | ## Background 4 | 5 | ### Jupyter 6 | Jupyter (formerly iPython Notebook) is a UI tool commonly used with Spark, Tensorflow and other big data processing frameworks. It is used 7 | by data scientists and ML engineers across a variety of organizations for interactive tasks. 
It supports multiple languages through runners, 8 | and allows users to run code, save code/results, and share “notebooks” with both code, documentation and output easily. 9 | 10 | ### JupyterHub 11 | JupyterHub lets users manage authenticated access to multiple single-user Jupyter notebooks. JupyterHub delegates the launching of 12 | single-user notebooks to pluggable components called “spawners”. JupyterHub has a sub-project named kubespawner, maintained by the 13 | community, that enables users to provision single-user Jupyter notebooks backed by Kubernetes pods - the notebooks themselves are 14 | Kubernetes pods. 15 | 16 | ## Quick Start 17 | 18 | Refer to the [user_guide](../../user_guide.md) for instructions on deploying JupyterHub via ksonnet. 19 | 20 | Once that's completed, you will have a StatefulSet for Jupyterhub, a configmap for configuration, and a LoadBalancer type of service, in addition to the requisite RBAC roles. 21 | If you are on Google Kubernetes Engine, the LoadBalancer type of service automatically creates an external IP address that can be 22 | used to access the notebook. Note that this is for illustration purposes only, and must be coupled with [SSL](http://jupyterhub.readthedocs.io/en/0.8.1/getting-started/security-basics.html?highlight=ssl#ssl-encryption) and configured to use an 23 | [authentication plugin](https://github.com/willingc/jhubdocs/blob/master/jupyterhub/docs/source/authenticators.md) in production environments. 24 | 25 | If you're testing and want to avoid exposing JupyterHub on an external IP address, you can use kubectl instead to gain access to the hub on your local machine. 26 | 27 | ```commandline 28 | kubectl port-forward tf-hub-0 8000:8000 29 | ``` 30 | 31 | The above will expose JupyterHub on http://localhost:8000. The pod name can be obtained by running `kubectl get pods`, and will be `tf-hub-0` by default.
32 | 33 | ## Configuration 34 | 35 | Configuration for JupyterHub is shipped separately and contained within the configmap defined by the [core componenent](https://github.com/google/kubeflow/tree/master/kubeflow). It is a Python file that is consumed by JupyterHub on starting up. The supplied configuration has reasonable defaults for the requisite fields and **no authenticator** configured by default. Furthermore, we provide a number of parameters that can be used to configure 36 | the core component. To see a list of ksonnet parameters run 37 | 38 | ``` 39 | ks prototype describe kubeflow-core 40 | ``` 41 | 42 | If the provided parameters don't provide the flexibility you need, you can take advantage of ksonnet to customize the core component and use a config file fully specified by you. 43 | 44 | Configuration includes sections for KubeSpawner and Authenticators. Spawner parameters include the form used when provisioning new 45 | Jupyter notebooks, and configuration defining how JupyterHub creates and interacts with Kubernetes pods for individual notebooks. 46 | Authenticator parameters correspond to the authentication mechanism used by JupyterHub. 47 | 48 | 49 | ## Usage 50 | 51 | If you're using the quick-start, the external IP address of the JupyterHub instance can be obtained from `kubectl get svc`. 52 | ```commandline 53 | kubectl get svc 54 | 55 | NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE 56 | tf-hub-0 ClusterIP None 1h 57 | tf-hub-lb LoadBalancer 10.43.246.148 xx.yy.zz.ww 80:32689/TCP 36m 58 | ``` 59 | 60 | Now, you can access http://xx.yy.zz.ww with your browser. When trying to spawn a new image, a configuration page should pop up, allowing configuration of the notebook image, CPU, Memory, and additional resources. With the default `DummyAuthenticator`, it should allow any username/password to access the hub and create new notebooks. 
You can use an authenticator plugin if you want to secure your notebook server and use its administration functionality. 61 | 62 | ## Customization 63 | 64 | ### Using your own hub image 65 | 66 | An image with JupyterHub 0.8.1, kubespawner 0.7.1 and two simple authenticator plugins can be built from within the `docker/` directory using the Makefile provided. For example, if you're using Google Cloud Platform and have a project with ID `foo` configured to use gcr.io, you can do the following: 67 | 68 | ```commandline 69 | make build PROJECT_ID=foo 70 | make push PROJECT_ID=foo 71 | ``` 72 | 73 | ### Notebook image 74 | 75 | Images published under https://github.com/jupyter/docker-stacks should work directly with the Hub. The only requirement for the jupyter 76 | notebook images that can be used in conjunction with this instance of Hub is that the same version of JupyterHub must be installed (0.8.1 by default), and that there must be a standard `start-singleuser.sh` accessible via the default PATH. 77 | 78 | ### GitHub OAuth Setup 79 | 80 | After creating the initial Hub and exposing it on a public IP address, you can add GitHub based authentication. First, you'll need to create a [GitHub oauth application](https://github.com/settings/applications/new). The callback URL would be of the form `http://xx.yy.zz.ww/hub/oauth_callback`. 81 | 82 | Once the GitHub application is created, update the `manifest/config.yaml` with the `callback_url`, `client_id` and `client_secret` obtained from the GitHub UI. Ensure that the `DummyAuthenticator` is commented out and replaced by the `GitHubOAuthenticator` options. 
At the end, the authenticator configuration section might look like: 83 | 84 | ```commandline 85 | c.JupyterHub.authenticator_class = GitHubOAuthenticator 86 | c.GitHubOAuthenticator.oauth_callback_url = 'http://xx.yy.zz.ww/hub/oauth_callback' 87 | c.GitHubOAuthenticator.client_id = 'client_id_here' 88 | c.GitHubOAuthenticator.client_secret = 'client_secret_here' 89 | ``` 90 | 91 | Finally, you can update the configuration and ensure that the new configuration is picked up, by doing the following: 92 | 93 | ```commandline 94 | ks apply ${ENVIRONMENT} -c ${COMPONENT_NAME} 95 | kubectl delete pod tf-hub-0 96 | ``` 97 | 98 | The pod will come up with the new configuration and be configured to use the GitHub authenticator you specified in the previous step. You can additionally modify the configuration to add whitelists and admin users. For example, to limit it to only GitHub users user1 and user2, one might use the following configuration: 99 | 100 | ``` 101 | c.Authenticator.whitelist = {'user1', 'user2'} 102 | ``` 103 | 104 | After changing the configuration and `kubectl apply -f config.yaml`, please note that the JupyterHub pod needs to be restarted before the new configuration is reflected. -------------------------------------------------------------------------------- /testing/test-infra/components/workflows.libsonnet: -------------------------------------------------------------------------------- 1 | { 2 | // TODO(https://github.com/ksonnet/ksonnet/issues/222): Taking namespace as an argument is a work around for the fact that ksonnet 3 | // doesn't support automatically piping in the namespace from the environment to prototypes. 4 | 5 | // convert a list of two items into a map representing an environment variable 6 | listToMap:: function(v) 7 | { 8 | "name": v[0], 9 | "value": v[1], 10 | }, 11 | 12 | // Function to turn comma separated list of prow environment variables into a dictionary. 
13 | parseEnv:: function(v) 14 | local pieces = std.split(v, ","); 15 | if v != "" && std.length(pieces) > 0 then 16 | std.map( 17 | function(i) $.listToMap(std.split(i, "=")), 18 | std.split(v, ",")) 19 | else [], 20 | 21 | parts(namespace, name):: { 22 | // Workflow to run the e2e test. 23 | e2e(prow_env, bucket): 24 | // mountPath is the directory where the volume to store the test data 25 | // should be mounted. 26 | local mountPath = "/mnt/" + "test-data-volume"; 27 | // testDir is the root directory for all data for a particular test run. 28 | local testDir = mountPath + "/" + name; 29 | // outputDir is the directory to sync to GCS to contain the output for this job. 30 | local outputDir = testDir + "/output"; 31 | local artifactsDir = outputDir + "/artifacts"; 32 | local srcDir = testDir + "/src"; 33 | local image = "gcr.io/mlkube-testing/kubeflow-testing"; 34 | // The name of the NFS volume claim to use for test files. 35 | local nfsVolumeClaim = "kubeflow-testing"; 36 | // The name to use for the volume to use to contain test data. 37 | local dataVolume = "kubeflow-test-volume"; 38 | { 39 | // Build an Argo template to execute a particular command. 40 | // step_name: Name for the template 41 | // command: List to pass as the container command. 42 | buildTemplate(step_name, command):: { 43 | "name": step_name, 44 | "container": { 45 | "command": command, 46 | "image": image, 47 | "env": [{ 48 | // Add the source directories to the python path. 
49 | "name": "PYTHONPATH", 50 | "value": srcDir + ":" + srcDir + "/tensorflow_k8s", 51 | }, 52 | { 53 | "name": "GOOGLE_APPLICATION_CREDENTIALS", 54 | "value": "/secret/gcp-credentials/key.json", 55 | }, 56 | { 57 | "name": "GIT_TOKEN", 58 | "valueFrom": { 59 | "secretKeyRef": { 60 | name: "github-token", 61 | key: "github_token", 62 | }, 63 | }, 64 | },] + prow_env, 65 | "volumeMounts": [ 66 | { 67 | "name": dataVolume, 68 | "mountPath": mountPath, 69 | }, 70 | { 71 | "name": "github-token", 72 | "mountPath": "/secret/github-token", 73 | }, 74 | { 75 | "name": "gcp-credentials", 76 | "mountPath": "/secret/gcp-credentials", 77 | }, 78 | ], 79 | }, 80 | }, // buildTemplate 81 | 82 | "apiVersion": "argoproj.io/v1alpha1", 83 | "kind": "Workflow", 84 | "metadata": { 85 | "name": name, 86 | "namespace": namespace, 87 | }, 88 | // TODO(jlewi): Use OnExit to run cleanup steps. 89 | "spec": { 90 | "entrypoint": "e2e", 91 | "volumes": [ 92 | { 93 | "name": "github-token", 94 | "secret": { 95 | "secretName": "github-token", 96 | }, 97 | }, 98 | { 99 | "name": "gcp-credentials", 100 | "secret": { 101 | "secretName": "kubeflow-testing-credentials", 102 | }, 103 | }, 104 | { 105 | "name": dataVolume, 106 | "persistentVolumeClaim": { 107 | "claimName": nfsVolumeClaim, 108 | }, 109 | }, 110 | ], // volumes 111 | "templates": [ 112 | { 113 | "name": "e2e", 114 | "steps": [ 115 | [{ 116 | "name": "checkout", 117 | "template": "checkout", 118 | },], 119 | [{ 120 | "name": "test-deploy", 121 | "template": "test-deploy", 122 | }, 123 | { 124 | "name": "create-pr-symlink", 125 | "template": "create-pr-symlink", 126 | },], 127 | [{ 128 | "name": "copy-artifacts", 129 | "template": "copy-artifacts", 130 | },], 131 | ], 132 | }, 133 | { 134 | "name": "checkout", 135 | "container": { 136 | "command": [ 137 | "/usr/local/bin/checkout.sh" 138 | ], 139 | "args": [ 140 | srcDir, 141 | ], 142 | "env": prow_env, 143 | "image": image, 144 | "volumeMounts": [ 145 | { 146 | "name": dataVolume, 147 
| "mountPath": mountPath, 148 | }, 149 | ], 150 | }, 151 | }, // checkout 152 | $.parts(namespace, name).e2e(prow_env, bucket).buildTemplate("test-deploy", [ 153 | "python", 154 | "-m", 155 | "testing.test_deploy", 156 | "--project=mlkube-testing", 157 | "--cluster=kubeflow-testing", 158 | "--zone=us-east1-d", 159 | "--github_token=$(GIT_TOKEN)", 160 | "--test_dir=" + testDir, 161 | "--artifacts_dir=" + artifactsDir, 162 | ]), // test-deploy 163 | $.parts(namespace, name).e2e(prow_env, bucket).buildTemplate("create-pr-symlink", [ 164 | "python", 165 | "-m", 166 | "testing.prow_artifacts", 167 | "--artifacts_dir=" + outputDir, 168 | "create_pr_symlink", 169 | "--bucket=" + bucket, 170 | ]), // create-pr-symlink 171 | $.parts(namespace, name).e2e(prow_env, bucket).buildTemplate("copy-artifacts", [ 172 | "python", 173 | "-m", 174 | "testing.prow_artifacts", 175 | "--artifacts_dir=" + outputDir, 176 | "copy_artifacts", 177 | "--bucket=" + bucket, 178 | ]), // copy-artifacts 179 | ], // templates 180 | } 181 | },// e2e 182 | } // parts 183 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Kubeflow 2 | 3 | The Kubeflow project is dedicated to making Machine Learning on Kubernetes easy, portable and scalable. Our goal is **not** to recreate other services, but to provide a straightforward way for spinning up best-of-breed OSS solutions. Contained in this repository are manifests for creating: 4 | 5 | * A JupyterHub to create & manage interactive Jupyter notebooks 6 | * A Tensorflow Training Controller that can be configured to use CPUs or GPUs and adjusted to the size of a cluster with a single setting 7 | * A TF Serving container 8 | 9 | This document details the steps needed to run the Kubeflow project in any environment in which Kubernetes runs. 
10 | 11 | ## Quick Links 12 | * [Prow test dashboard](https://k8s-testgrid.appspot.com/sig-big-data) 13 | * [Prow jobs dashboard](https://prow.k8s.io/?repo=google%2Fkubeflow) 14 | * [Argo UI for E2E tests](http://testing-argo.kubeflow.io) 15 | 16 | ## The Kubeflow Mission 17 | 18 | Our goal is to help folks use ML more easily, by letting Kubernetes to do what it's great at: 19 | - Easy, repeatable, portable deployments on a diverse infrastructure (laptop <-> ML rig <-> training cluster <-> production cluster) 20 | - Deploying and managing loosely-coupled microservices 21 | - Scaling based on demand 22 | 23 | Because ML practitioners use so many different types of tools, it's a key goal that you can customize the stack to whatever your requirements (within reason) and let the system take care of the "boring stuff." While we have started with a narrow set of technologies, we are working with many different projects to include additional tooling. 24 | 25 | Ultimately, we want to have a set of simple manifests that give you an easy to use ML stack _anywhere_ Kubernetes is already running and can self configure based on the cluster it deploys into. 26 | 27 | 28 | ## Who should consider using Kubeflow? 29 | 30 | Based on the current functionality you should consider using Kubeflow if: 31 | 32 | * You want to train/serve TensorFlow models in different environments (e.g. local, on prem, and cloud) 33 | * You want to use Jupyter notebooks to manage TensorFlow training jobs 34 | * kubeflow is particularly helpful if you want to launch training jobs that use more resources (more nodes or more GPUs) than your notebook. 35 | * You want to combine TensorFlow with other processes 36 | * For example, you may want to use [tensorflow/agents](https://github.com/tensorflow/agents) to run simulations to generate data for training reinforcement learning models. 37 | 38 | This list is based ONLY on current capabilities. 
We are investing significant resources to expand the 39 | functionality and actively soliciting help from companies and inviduals interested in contributing (see [below](README.md#who-should-consider-contributing-to-kubeflow)). 40 | 41 | ## Setup 42 | 43 | This documentation assumes you have a Kubernetes cluster already available. 44 | 45 | If you need help setting up a Kubernetes cluster please refer to [Kubernetes Setup](https://kubernetes.io/docs/setup/). 46 | 47 | If you want to use GPUs, be sure to follow the Kubernetes [instructions for enabling GPUs](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/). 48 | 49 | ## Quick Start 50 | 51 | ### Requirements 52 | 53 | * ksonnet version [0.8.0](https://ksonnet.io/#get-started) or later. 54 | * Kubernetes >= 1.8 [see here](https://github.com/tensorflow/k8s#requirements) 55 | 56 | ### Steps 57 | 58 | In order to quickly set up all components, execute the following commands: 59 | 60 | ```commandline 61 | # Initialize a ksonnet APP 62 | APP_NAME=my-kubeflow 63 | ks init ${APP_NAME} 64 | cd ${APP_NAME} 65 | 66 | # Install Kubeflow components 67 | ks registry add kubeflow github.com/google/kubeflow/tree/master/kubeflow 68 | ks pkg install kubeflow/core 69 | ks pkg install kubeflow/tf-serving 70 | ks pkg install kubeflow/tf-job 71 | 72 | # Deploy Kubeflow 73 | ks generate core kubeflow-core --name=kubeflow-core --namespace=${NAMESPACE} 74 | ks apply default -c kubeflow-core 75 | ``` 76 | 77 | 78 | The above command sets up JupyterHub and a custom resource for running TensorFlow training jobs. Furthermore, the ksonnet packages 79 | provide prototypes that can be used to configure TensorFlow jobs and deploy TensorFlow models. 80 | Used together, these make it easy for a user go from training to serving using Tensorflow with minimal 81 | effort in a portable fashion between different environments. 82 | 83 | For more detailed instructions about how to use Kubeflow, please refer to the [user guide](user_guide.md). 
84 | 85 | ## Troubleshooting 86 | 87 | ### Minikube 88 | 89 | On [Minikube](https://github.com/kubernetes/minikube) the Virtualbox/VMware drivers for Minikube are recommended as there is a known 90 | issue between the KVM/KVM2 driver and TensorFlow Serving. The issue is tracked in [kubernetes/minikube#2377](https://github.com/kubernetes/minikube/issues/2377). 91 | 92 | ### RBAC clusters 93 | 94 | If you are running on a K8s cluster with [RBAC enabled](https://kubernetes.io/docs/admin/authorization/rbac/#command-line-utilities), you may get an error like the following when deploying Kubeflow: 95 | 96 | ``` 97 | ERROR Error updating roles kubeflow-test-infra.jupyter-role: roles.rbac.authorization.k8s.io "jupyter-role" is forbidden: attempt to grant extra privileges: [PolicyRule{Resources:["*"], APIGroups:["*"], Verbs:["*"]}] user=&{your-user@acme.com [system:authenticated] map[]} ownerrules=[PolicyRule{Resources:["selfsubjectaccessreviews"], APIGroups:["authorization.k8s.io"], Verbs:["create"]} PolicyRule{NonResourceURLs:["/api" "/api/*" "/apis" "/apis/*" "/healthz" "/swagger-2.0.0.pb-v1" "/swagger.json" "/swaggerapi" "/swaggerapi/*" "/version"], Verbs:["get"]}] ruleResolutionErrors=[] 98 | ``` 99 | 100 | This error indicates you do not have sufficient permissions. In many cases you can resolve this just by creating an appropriate 101 | clusterrole binding like so and then redeploying kubeflow 102 | 103 | ```commandline 104 | kubectl create clusterrolebinding default-admin --clusterrole=cluster-admin --user=your-user@acme.com 105 | ``` 106 | 107 | * Replace `your-user@acme.com` with the user listed in the error message. 108 | 109 | If you're using GKE, you may want to refer to [GKE's RBAC docs](https://cloud.google.com/kubernetes-engine/docs/how-to/role-based-access-control) to understand 110 | how RBAC interacts with IAM on GCP. 
111 | 112 | ## Resources 113 | 114 | * [user guide](user_guide.md) provides in-depth instructions for using Kubeflow 115 | * Katacoda has produced a [self-paced scenario](https://www.katacoda.com/kubeflow) for learning and trying out Kubeflow 116 | 117 | 118 | ## Get involved 119 | 120 | * [Slack Channel](https://join.slack.com/t/kubeflow/shared_invite/enQtMjgyMzMxNDgyMTQ5LWUwMTIxNmZlZTk2NGU0MmFiNDE4YWJiMzFiOGNkZGZjZmRlNTExNmUwMmQ2NzMwYzk5YzQxOWQyODBlZGY2OTg) 121 | * [Twitter](http://twitter.com/kubeflow) 122 | * [Mailing List](https://groups.google.com/forum/#!forum/kubeflow-discuss) 123 | 124 | * Review and comment on the [proposal](https://docs.google.com/document/d/1dmErPUmqqKMOe4L0ZHQglSdgDguCM4SzlsEdYXRMIDA/edit#) to define the scope and future of Kubeflow 125 | 126 | 127 | ### Who should consider contributing to Kubeflow? 128 | 129 | * Folks who want to add support for other ML frameworks (e.g. PyTorch, XGBoost, etc...) 130 | * Folks who want to bring more Kubernetes magic to ML (e.g. ISTIO integration for prediction) 131 | * Folks who want to make Kubeflow a richer ML platform (e.g. support for ML pipelines, hyperparameter tuning) 132 | * Folks who want to tune Kubeflow for their particular Kubernetes distribution or Cloud 133 | * Folks who want to write tutorials/blog posts showing how to use Kubeflow to solve ML problems 134 | -------------------------------------------------------------------------------- /testing/run_e2e_workflow.py: -------------------------------------------------------------------------------- 1 | """Run the E2E workflow. 2 | 3 | This script submits an Argo workflow to run the E2E tests and waits for 4 | it to finish. It is intended to invoked by prow jobs. 
"""Run the E2E workflow.

This script submits an Argo workflow to run the E2E tests and waits for
it to finish. It is intended to be invoked by prow jobs.
"""

import argparse
import logging
import os
import sys
import tempfile
import uuid

from google.cloud import storage  # pylint: disable=no-name-in-module
from kubernetes import client as k8s_client

from py import util
from testing import argo_client
from testing import prow_artifacts

# The namespace to launch the Argo workflow in.
NAMESPACE = "kubeflow-test-infra"

# The name of the ksonnet component for the workflow.
COMPONENT = "workflows"


def _get_src_dir():
  """Return the absolute path of the directory containing this script."""
  return os.path.abspath(os.path.join(__file__, ".."))


def upload_to_gcs(contents, target):
  """Upload a string to GCS.

  Args:
    contents: The string to write.
    target: A gs:// URI naming the destination object.
  """
  gcs_client = storage.Client()
  bucket_name, path = util.split_gcs_uri(target)
  bucket = gcs_client.get_bucket(bucket_name)
  logging.info("Writing %s", target)
  blob = bucket.blob(path)
  blob.upload_from_string(contents)


def upload_file_to_gcs(source, target):
  """Upload a local file to GCS.

  Args:
    source: Path of the local file to upload.
    target: A gs:// URI naming the destination object.
  """
  gcs_client = storage.Client()
  bucket_name, path = util.split_gcs_uri(target)
  bucket = gcs_client.get_bucket(bucket_name)
  logging.info("Uploading file %s to %s.", source, target)
  blob = bucket.blob(path)
  blob.upload_from_filename(source)


def create_started_file(bucket):
  """Create the started file in GCS for gubernator."""
  contents = prow_artifacts.create_started()
  target = os.path.join(prow_artifacts.get_gcs_dir(bucket), "started.json")
  upload_to_gcs(contents, target)


def create_finished_file(bucket, success):
  """Create the finished file in GCS for gubernator.

  Args:
    bucket: The bucket holding the Gubernator outputs.
    success: Bool indicating whether the workflow succeeded.
  """
  # Bug fix: the docstring previously said "started" file (copy-paste error).
  contents = prow_artifacts.create_finished(success)
  target = os.path.join(prow_artifacts.get_gcs_dir(bucket), "finished.json")
  upload_to_gcs(contents, target)


def run(args, file_handler):
  """Submit the Argo workflow, wait for it, and upload results to GCS.

  Args:
    args: Parsed command line arguments; see main().
    file_handler: logging.FileHandler whose log file is uploaded to GCS as
      build-log.txt after the workflow finishes.

  Returns:
    success: Bool indicating whether the workflow succeeded.
  """
  src_dir = _get_src_dir()
  logging.info("Source directory: %s", src_dir)
  app_dir = os.path.join(src_dir, "test-infra")

  create_started_file(args.bucket)

  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    logging.info("GOOGLE_APPLICATION_CREDENTIALS is set; configuring gcloud "
                 "to use service account.")
    # Since a service account is set tell gcloud to use it.
    util.run(["gcloud", "auth", "activate-service-account", "--key-file=" +
              os.getenv("GOOGLE_APPLICATION_CREDENTIALS")])

  util.configure_kubectl(args.project, args.zone, args.cluster)
  # Fix: the original called load_kube_config() a second time later in this
  # function; a single call after configure_kubectl suffices.
  util.load_kube_config()

  # Create the name for the workflow. Default to "" rather than None so we
  # fail with a clear ks error rather than a TypeError if JOB_NAME is unset.
  workflow_name = os.getenv("JOB_NAME", "")
  job_type = os.getenv("JOB_TYPE")
  if job_type == "presubmit":
    workflow_name += "-{0}".format(os.getenv("PULL_NUMBER"))
  elif job_type == "postsubmit":
    workflow_name += "-{0}".format(os.getenv("PULL_BASE_SHA"))

  workflow_name += "-{0}".format(os.getenv("BUILD_NUMBER"))

  # Add some salt. This is mostly a convenience for the case where you
  # are submitting jobs manually for testing/debugging, since prow should
  # vend unique build numbers for each job.
  workflow_name += "-{0}".format(uuid.uuid4().hex[0:4])

  # Fix: use the COMPONENT constant consistently (the original used the
  # literal "workflows" here but COMPONENT everywhere else).
  util.run(["ks", "param", "set", COMPONENT, "name", workflow_name],
           cwd=app_dir)

  api_client = k8s_client.ApiClient()

  # Set the prow environment variables.
  prow_env = []
  names = ["JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER",
           "PULL_BASE_SHA", "PULL_NUMBER", "PULL_PULL_SHA", "REPO_OWNER",
           "REPO_NAME"]
  names.sort()
  for v in names:
    if not os.getenv(v):
      continue
    prow_env.append("{0}={1}".format(v, os.getenv(v)))

  util.run(["ks", "param", "set", COMPONENT, "prow_env", ",".join(prow_env)],
           cwd=app_dir)
  util.run(["ks", "param", "set", COMPONENT, "namespace", NAMESPACE],
           cwd=app_dir)
  util.run(["ks", "param", "set", COMPONENT, "bucket", args.bucket],
           cwd=app_dir)

  # For debugging print out the manifest.
  util.run(["ks", "show", "prow", "-c", COMPONENT], cwd=app_dir)
  util.run(["ks", "apply", "prow", "-c", COMPONENT], cwd=app_dir)

  success = False
  try:
    results = argo_client.wait_for_workflow(
        api_client, NAMESPACE, workflow_name,
        status_callback=argo_client.log_status)
    if results["status"]["phase"] == "Succeeded":
      success = True
    logging.info("Workflow %s/%s finished phase: %s", NAMESPACE,
                 workflow_name, results["status"]["phase"])
  except util.TimeoutError:
    success = False
    logging.error("Time out waiting for Workflow %s/%s to finish",
                  NAMESPACE, workflow_name)
  finally:
    create_finished_file(args.bucket, success)

  # Upload logs to GCS. No logs after this point will appear in the
  # file in GCS.
  file_handler.flush()
  upload_file_to_gcs(
      file_handler.baseFilename,
      os.path.join(prow_artifacts.get_gcs_dir(args.bucket), "build-log.txt"))

  return success


def main(unparsed_args=None):
  """Parse arguments, set up file logging, and run the E2E workflow.

  Args:
    unparsed_args: Optional list of command line arguments; defaults to
      sys.argv when None.

  Returns:
    success: Bool indicating whether the workflow succeeded.
  """
  logging.getLogger().setLevel(logging.INFO)
  parser = argparse.ArgumentParser(
      description="Submit an Argo workflow to run the E2E tests.")

  parser.add_argument(
      "--project",
      default="",
      type=str,
      help="The project containing the GKE cluster to use to run the workflow.")

  parser.add_argument(
      "--zone",
      default="",
      type=str,
      help="The zone containing the GKE cluster to use to run the workflow.")

  parser.add_argument(
      "--cluster",
      default="",
      type=str,
      help="The GKE cluster to use to run the workflow.")

  parser.add_argument(
      "--bucket",
      default="",
      type=str,
      help="The bucket to use for the Gubernator outputs.")

  args = parser.parse_args(args=unparsed_args)

  # Setup a logging file handler. This way we can upload the log outputs
  # to gubernator.
  root_logger = logging.getLogger()

  # NamedTemporaryFile is used only to obtain a unique file name; the
  # FileHandler below (re)creates the file at that path.
  with tempfile.NamedTemporaryFile(prefix="tmpRunE2eWorkflow",
                                   suffix="log") as hf:
    test_log = hf.name

  file_handler = logging.FileHandler(test_log)
  root_logger.addHandler(file_handler)
  # We need to explicitly set the formatter because it will not pick up
  # the BasicConfig.
  formatter = logging.Formatter(fmt=("%(levelname)s|%(asctime)s"
                                     "|%(pathname)s|%(lineno)d| %(message)s"),
                                datefmt="%Y-%m-%dT%H:%M:%S")
  file_handler.setFormatter(formatter)
  logging.info("Logging to %s", test_log)

  return run(args, file_handler)


if __name__ == "__main__":
  logging.basicConfig(
      level=logging.INFO,
      format=("%(levelname)s|%(asctime)s"
              "|%(pathname)s|%(lineno)d| %(message)s"),
      datefmt="%Y-%m-%dT%H:%M:%S")
  logging.getLogger().setLevel(logging.INFO)
  if not main():
    # Exit with a non-zero exit code to signal failure to prow.
    logging.error("One or more test steps failed exiting with non-zero exit "
                  "code.")
    sys.exit(1)
19 | diskResources(diskName): { 20 | 21 | local storageClassName = diskName + "-nfs", 22 | local provisionerName = diskName + "-provisioner", 23 | local storageClassProvisioner = diskName + "/nfs", 24 | local serviceName = diskName + "-service", 25 | 26 | volumeClaim: { 27 | "apiVersion": "v1", 28 | "kind": "PersistentVolumeClaim", 29 | "metadata": { 30 | "annotations": { 31 | "volume.beta.kubernetes.io/storage-class": storageClassName, 32 | }, 33 | "name": diskName, 34 | "namespace": namespace, 35 | }, 36 | "spec": { 37 | "accessModes": [ 38 | "ReadWriteMany" 39 | ], 40 | "resources": { 41 | "requests": { 42 | "storage": "1Mi" 43 | } 44 | } 45 | } 46 | }, 47 | 48 | // TODO(jlewi): Is storageClass actually name space scoped? Seems to show up in default namespace as well. 49 | // TODO(jlewi): Could we just use the default cluster storage class? 50 | storageClass: { 51 | "apiVersion": "storage.k8s.io/v1beta1", 52 | "kind": "StorageClass", 53 | "metadata": { 54 | "name": storageClassName, 55 | "namespace": namespace, 56 | }, 57 | // This value must be the same as passed as argument --provisioner to the provisioner 58 | "provisioner": storageClassProvisioner, 59 | }, 60 | 61 | service: { 62 | "apiVersion": "v1", 63 | "kind": "Service", 64 | "metadata": { 65 | "labels": { 66 | "app": provisionerName 67 | }, 68 | "name": serviceName, 69 | "namespace": namespace, 70 | }, 71 | "spec": { 72 | "ports": [ 73 | { 74 | "name": "nfs", 75 | "port": 2049 76 | }, 77 | { 78 | "name": "mountd", 79 | "port": 20048 80 | }, 81 | { 82 | "name": "rpcbind", 83 | "port": 111 84 | }, 85 | { 86 | "name": "rpcbind-udp", 87 | "port": 111, 88 | "protocol": "UDP" 89 | } 90 | ], 91 | "selector": { 92 | "app": provisionerName 93 | } 94 | } 95 | }, 96 | 97 | provisioner: { 98 | "apiVersion": "extensions/v1beta1", 99 | "kind": "Deployment", 100 | "metadata": { 101 | "name": provisionerName, 102 | "namespace": namespace, 103 | }, 104 | "spec": { 105 | "replicas": 1, 106 | "strategy": { 107 | "type": 
"Recreate" 108 | }, 109 | "template": { 110 | "metadata": { 111 | "labels": { 112 | "app": provisionerName 113 | } 114 | }, 115 | "spec": { 116 | "containers": [ 117 | { 118 | "args": [ 119 | "-provisioner=" + storageClassProvisioner, 120 | ], 121 | "env": [ 122 | { 123 | "name": "POD_IP", 124 | "valueFrom": { 125 | "fieldRef": { 126 | "fieldPath": "status.podIP" 127 | } 128 | } 129 | }, 130 | { 131 | "name": "SERVICE_NAME", 132 | "value": serviceName, 133 | }, 134 | { 135 | "name": "POD_NAMESPACE", 136 | "valueFrom": { 137 | "fieldRef": { 138 | "fieldPath": "metadata.namespace" 139 | } 140 | } 141 | } 142 | ], 143 | "image": "quay.io/kubernetes_incubator/nfs-provisioner:v1.0.8", 144 | "imagePullPolicy": "IfNotPresent", 145 | "name": "nfs-provisioner", 146 | "ports": [ 147 | { 148 | "containerPort": 2049, 149 | "name": "nfs" 150 | }, 151 | { 152 | "containerPort": 20048, 153 | "name": "mountd" 154 | }, 155 | { 156 | "containerPort": 111, 157 | "name": "rpcbind" 158 | }, 159 | { 160 | "containerPort": 111, 161 | "name": "rpcbind-udp", 162 | "protocol": "UDP" 163 | } 164 | ], 165 | "securityContext": { 166 | "capabilities": { 167 | "add": [ 168 | "DAC_READ_SEARCH" 169 | ] 170 | } 171 | }, 172 | "volumeMounts": [{ 173 | // Needs to be mounted under /export because /export is what is exported for NFS. 
174 | // https://github.com/kubernetes-incubator/external-storage/tree/master/nfs#quickstart 175 | "mountPath": "/export", 176 | "name": diskName, 177 | }], 178 | } 179 | ], 180 | "volumes": [{ 181 | "name": diskName, 182 | "gcePersistentDisk": { 183 | "pdName": diskName, 184 | },},], 185 | "serviceAccountName": serviceAccountName, 186 | }, 187 | }, 188 | }, 189 | }, // provisioner 190 | }, 191 | 192 | serviceAccount: { 193 | "apiVersion": "v1", 194 | "kind": "ServiceAccount", 195 | "metadata": { 196 | "labels": { 197 | "app": name + "nfs-provisioner" 198 | }, 199 | "name": serviceAccountName, 200 | "namespace": namespace, 201 | } 202 | }, 203 | 204 | role: { 205 | "apiVersion": "rbac.authorization.k8s.io/v1beta1", 206 | "kind": "Role", 207 | "metadata": { 208 | "name": serviceAccountRoleName, 209 | "namespace": namespace, 210 | }, 211 | "rules": [ 212 | { 213 | "apiGroups": [ 214 | "*" 215 | ], 216 | // TODO(jlewi): This is very permissive so we may want to lock this down. 217 | "resources": [ 218 | "*" 219 | ], 220 | "verbs": [ 221 | "*" 222 | ] 223 | } 224 | ] 225 | }, 226 | 227 | roleBinding: { 228 | "apiVersion": "rbac.authorization.k8s.io/v1beta1", 229 | "kind": "RoleBinding", 230 | "metadata": { 231 | "name": name + "-nfs-role", 232 | "namespace": namespace 233 | }, 234 | "roleRef": { 235 | "apiGroup": "rbac.authorization.k8s.io", 236 | "kind": "Role", 237 | "name": serviceAccountName, 238 | }, 239 | "subjects": [ 240 | { 241 | "kind": "ServiceAccount", 242 | "name": serviceAccountRoleName, 243 | "namespace": namespace, 244 | } 245 | ] 246 | }, 247 | 248 | // see https://github.com/kubernetes-incubator/external-storage/tree/master/docs#authorizing-provisioners-for-rbac-or-openshift 249 | clusterRoleBinding: { 250 | "apiVersion": "rbac.authorization.k8s.io/v1beta1", 251 | "kind": "ClusterRoleBinding", 252 | "metadata": { 253 | "name": name + "-nfs-role", 254 | "namespace": namespace 255 | }, 256 | "roleRef": { 257 | "apiGroup": "rbac.authorization.k8s.io", 
258 | "kind": "ClusterRole", 259 | "name": "system:persistent-volume-provisioner", 260 | }, 261 | "subjects": [ 262 | { 263 | "kind": "ServiceAccount", 264 | "name": serviceAccountRoleName, 265 | "namespace": namespace, 266 | } 267 | ] 268 | }, 269 | 270 | }, // parts 271 | } -------------------------------------------------------------------------------- /testing/prow_artifacts.py: -------------------------------------------------------------------------------- 1 | """Script to create artifacts needed by Gubernator. 2 | 3 | For reference see: 4 | https://github.com/kubernetes/test-infra/tree/master/gubernator 5 | """ 6 | import argparse 7 | import logging 8 | import json 9 | import os 10 | import time 11 | from google.cloud import storage # pylint: disable=no-name-in-module 12 | from py import util 13 | 14 | 15 | # TODO(jlewi): Replace create_finished in tensorflow/k8s/py/prow.py with this 16 | # version. We should do that when we switch tensorflow/k8s to use Argo instead 17 | # of Airflow. 18 | def create_started(): 19 | """Return a string containing the contents of started.json for gubernator. 20 | """ 21 | # See: 22 | # https://github.com/kubernetes/test-infra/tree/master/gubernator#job-artifact-gcs-layout 23 | # For a list of fields expected by gubernator 24 | started = { 25 | "timestamp": int(time.time()), 26 | "repos": { 27 | }, 28 | } 29 | 30 | repo_owner = os.getenv("REPO_OWNER", "") 31 | repo_name = os.getenv("REPO_NAME", "") 32 | 33 | if repo_owner: 34 | sha = os.getenv("PULL_PULL_SHA", "") 35 | if not sha: 36 | # Its a post submit job. 37 | sha = os.getenv("PULL_BASE_SHA", "") 38 | 39 | started["repos"][repo_owner + "/" + repo_name] = sha 40 | 41 | PULL_REFS = os.getenv("PULL_REFS", "") 42 | if PULL_REFS: 43 | started["pull"] = PULL_REFS 44 | 45 | return json.dumps(started) 46 | 47 | # TODO(jlewi): Replace create_finished in tensorflow/k8s/py/prow.py with this 48 | # version. 
# TODO(jlewi): Replace create_finished in tensorflow/k8s/py/prow.py with this
# version. We should do that when we switch tensorflow/k8s to use Argo instead
# of Airflow.
def create_finished(success):
  """Return a string containing the contents of finished.json for gubernator.

  Args:
    success: Bool indicating whether the workflow succeeded or not.

  Returns:
    A JSON string suitable for writing to finished.json.
  """
  result = "SUCCESS" if success else "FAILED"
  finished = {
      "timestamp": int(time.time()),
      "result": result,
      # Dictionary of extra key value pairs to display to the user.
      # TODO(jlewi): Perhaps we should add the GCR path of the Docker image
      # we are running in. We'd have to plumb this in from bootstrap.
      "metadata": {},
  }
  return json.dumps(finished)


def get_gcs_dir(bucket):
  """Return the GCS directory for this job.

  The layout is defined here:
  https://github.com/kubernetes/test-infra/tree/master/gubernator#job-artifact-gcs-layout

  Args:
    bucket: Name of the GCS bucket for the Gubernator outputs.

  Returns:
    A gs:// URI for this job's output directory.
  """
  # Read the prow environment once (the original duplicated these reads).
  pull_number = os.getenv("PULL_NUMBER")
  repo_owner = os.getenv("REPO_OWNER")
  repo_name = os.getenv("REPO_NAME")
  job_name = os.getenv("JOB_NAME")
  build_number = os.getenv("BUILD_NUMBER")

  if pull_number:
    # It is a presubmit job.
    output = ("gs://{bucket}/pr-logs/pull/{owner}_{repo}/"
              "{pull_number}/{job}/{build}").format(
                  bucket=bucket,
                  owner=repo_owner, repo=repo_name,
                  pull_number=pull_number,
                  job=job_name,
                  build=build_number)
  elif repo_owner:
    # It is a postsubmit job.
    # Bug fix: the original omitted bucket= from this format() call, which
    # raised KeyError: 'bucket' at runtime for postsubmit jobs.
    output = ("gs://{bucket}/logs/{owner}_{repo}/"
              "{job}/{build}").format(
                  bucket=bucket,
                  owner=repo_owner, repo=repo_name,
                  job=job_name,
                  build=build_number)
  else:
    # It is a periodic job.
    output = "gs://{bucket}/logs/{job}/{build}".format(
        bucket=bucket,
        job=job_name,
        build=build_number)

  return output


def copy_artifacts(args):
  """Sync artifacts to this job's GCS directory.

  Args:
    args: Parsed arguments; uses args.bucket and args.artifacts_dir.
  """
  # The unused job_name/pull_number/repo_* locals from the original were
  # removed; get_gcs_dir derives the layout itself.
  output = get_gcs_dir(args.bucket)

  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    logging.info("GOOGLE_APPLICATION_CREDENTIALS is set; configuring gcloud "
                 "to use service account.")
    # Since a service account is set tell gcloud to use it.
    util.run(["gcloud", "auth", "activate-service-account", "--key-file=" +
              os.getenv("GOOGLE_APPLICATION_CREDENTIALS")])

  util.run(["gsutil", "-m", "rsync", "-r", args.artifacts_dir, output])


def create_pr_symlink(args):
  """Create a 'symlink' in GCS pointing at the results for a PR.

  This is a null op if PROW environment variables indicate this is not a PR
  job.

  Args:
    args: Parsed arguments; uses args.bucket.
  """
  gcs_client = storage.Client()
  # GCS layout is defined here:
  # https://github.com/kubernetes/test-infra/tree/master/gubernator#job-artifact-gcs-layout
  pull_number = os.getenv("PULL_NUMBER")
  if not pull_number:
    # Symlinks are only created for pull requests.
    return ""

  path = "pr-logs/directory/{job}/{build}.txt".format(
      job=os.getenv("JOB_NAME"), build=os.getenv("BUILD_NUMBER"))

  # The unused build_dir computation and duplicated getenv calls from the
  # original were removed; the symlink content comes from get_gcs_dir.
  source = util.to_gcs_uri(args.bucket, path)
  target = get_gcs_dir(args.bucket)
  logging.info("Creating symlink %s pointing to %s", source, target)
  bucket = gcs_client.get_bucket(args.bucket)
  blob = bucket.blob(path)
  blob.upload_from_string(target)


def main(unparsed_args=None):
  """Parse command line arguments and dispatch to the chosen subcommand.

  Args:
    unparsed_args: Optional list of command line arguments; defaults to
      sys.argv when None.
  """
  logging.getLogger().setLevel(logging.INFO)
  # create the top-level parser
  parser = argparse.ArgumentParser(
      description="Create prow artifacts.")

  parser.add_argument(
      "--artifacts_dir",
      default="",
      type=str,
      help="Directory to use for all the gubernator artifacts.")

  subparsers = parser.add_subparsers()

  #############################################################################
  # Copy artifacts.
  parser_copy = subparsers.add_parser(
      "copy_artifacts", help="Copy the artifacts.")

  parser_copy.add_argument(
      "--bucket",
      default="",
      type=str,
      help="Bucket to copy the artifacts to.")

  parser_copy.set_defaults(func=copy_artifacts)

  #############################################################################
  # Create the pr symlink.
  parser_link = subparsers.add_parser(
      "create_pr_symlink", help="Create a symlink pointing at PR output dir; null "
      "op if prow job is not a presubmit job.")

  parser_link.add_argument(
      "--bucket",
      default="",
      type=str,
      help="Bucket to copy the artifacts to.")

  parser_link.set_defaults(func=create_pr_symlink)

  #############################################################################
  # Process the command line arguments.

  # Parse the args
  args = parser.parse_args(args=unparsed_args)

  # Setup a logging file handler. This way we can upload the log outputs
  # to gubernator.
  root_logger = logging.getLogger()

  test_log = os.path.join(args.artifacts_dir, "artifacts", "logs",
                          "prow_artifacts." + args.func.__name__ + ".log")
  if not os.path.exists(os.path.dirname(test_log)):
    os.makedirs(os.path.dirname(test_log))

  file_handler = logging.FileHandler(test_log)
  root_logger.addHandler(file_handler)
  # We need to explicitly set the formatter because it will not pick up
  # the BasicConfig.
  formatter = logging.Formatter(fmt=("%(levelname)s|%(asctime)s"
                                     "|%(pathname)s|%(lineno)d| %(message)s"),
                                datefmt="%Y-%m-%dT%H:%M:%S")
  file_handler.setFormatter(formatter)
  logging.info("Logging to %s", test_log)

  args.func(args)


if __name__ == "__main__":
  logging.basicConfig(
      level=logging.INFO,
      format=("%(levelname)s|%(asctime)s"
              "|%(pathname)s|%(lineno)d| %(message)s"),
      datefmt="%Y-%m-%dT%H:%M:%S")
  logging.getLogger().setLevel(logging.INFO)
  main()

25 | 26 |       27 | 28 |

29 | 30 |       31 | 32 |

33 | 34 |       35 | 36 |

37 | ''' 38 | 39 | def options_from_form(self, formdata): 40 | options = {} 41 | options['image'] = formdata.get('image', [''])[0].strip() 42 | options['cpu_guarantee'] = formdata.get('cpu_guarantee', [''])[0].strip() 43 | options['mem_guarantee'] = formdata.get('mem_guarantee', [''])[0].strip() 44 | options['extra_resource_limits'] = formdata.get('extra_resource_limits', [''])[0].strip() 45 | return options 46 | 47 | @property 48 | def singleuser_image_spec(self): 49 | image = 'gcr.io/kubeflow/tensorflow-notebook-cpu' 50 | if self.user_options.get('image'): 51 | image = self.user_options['image'] 52 | return image 53 | 54 | @property 55 | def cpu_guarantee(self): 56 | cpu = '500m' 57 | if self.user_options.get('cpu_guarantee'): 58 | cpu = self.user_options['cpu_guarantee'] 59 | return cpu 60 | 61 | @property 62 | def mem_guarantee(self): 63 | mem = '1Gi' 64 | if self.user_options.get('mem_guarantee'): 65 | mem = self.user_options['mem_guarantee'] 66 | return mem 67 | 68 | @property 69 | def extra_resource_limits(self): 70 | extra = '' 71 | if self.user_options.get('extra_resource_limits'): 72 | extra = json.loads(self.user_options['extra_resource_limits']) 73 | return extra 74 | 75 | ################################################### 76 | ### JupyterHub Options 77 | ################################################### 78 | c.JupyterHub.ip = '0.0.0.0' 79 | c.JupyterHub.hub_ip = '0.0.0.0' 80 | # Don't try to cleanup servers on exit - since in general for k8s, we want 81 | # the hub to be able to restart without losing user containers 82 | c.JupyterHub.cleanup_servers = False 83 | ################################################### 84 | 85 | ################################################### 86 | ### Spawner Options 87 | ################################################### 88 | c.JupyterHub.spawner_class = KubeFormSpawner 89 | c.KubeSpawner.singleuser_image_spec = 'gcr.io/kubeflow/tensorflow-notebook' 90 | c.KubeSpawner.cmd = 'start-singleuser.sh' 91 | 
c.KubeSpawner.args = ['--allow-root'] 92 | # First pulls can be really slow, so let's give it a big timeout 93 | c.KubeSpawner.start_timeout = 60 * 10 94 | ################################################### 95 | 96 | 97 | ################################################### 98 | ### Authenticator Options 99 | ################################################### 100 | c.JupyterHub.authenticator_class = 'dummyauthenticator.DummyAuthenticator' 101 | # c.JupyterHub.authenticator_class = GitHubOAuthenticator 102 | # c.GitHubOAuthenticator.oauth_callback_url = '' 103 | # c.GitHubOAuthenticator.client_id = '' 104 | # c.GitHubOAuthenticator.client_secret = '' 105 | 106 | 107 | ################################################### 108 | ### Persistent volume options 109 | ################################################### 110 | # Using persistent storage requires a default storage class. 111 | # TODO(jlewi): Verify this works on minikube. 112 | # TODO(jlewi): Should we set c.KubeSpawner.singleuser_fs_gid = 1000 113 | # see https://github.com/google/kubeflow/pull/22#issuecomment-350500944 114 | c.KubeSpawner.user_storage_pvc_ensure = True 115 | # How much disk space do we want? 
116 | c.KubeSpawner.user_storage_capacity = '10Gi' 117 | c.KubeSpawner.pvc_name_template = 'claim-{username}{servername}' 118 | ", 119 | 120 | local baseJupyterHubConfigMap = { 121 | "apiVersion": "v1", 122 | "kind": "ConfigMap", 123 | "metadata": { 124 | "name": "jupyterhub-config", 125 | namespace: namespace, 126 | }, 127 | }, 128 | 129 | 130 | jupyterHubConfigMap: baseJupyterHubConfigMap + { 131 | "data": { 132 | "jupyterhub_config.py": baseKubeConfigSpawner, 133 | }, 134 | }, 135 | 136 | jupyterHubConfigMapWithVolumes(volumeClaims): { 137 | 138 | 139 | local volumes = std.map(function(v) 140 | { 141 | 'name': v, 142 | 'persistentVolumeClaim': { 143 | 'claimName': v, 144 | }, 145 | }, volumeClaims), 146 | 147 | 148 | local volumeMounts = std.map( function(v) 149 | { 150 | 'mountPath': '/mnt/' + v, 151 | 'name': v, 152 | }, volumeClaims), 153 | 154 | local extendedBaseKubeConfigSpawner = baseKubeConfigSpawner 155 | + "\nc.KubeSpawner.volumes = " + std.manifestPython(volumes) 156 | + "\nc.KubeSpawner.volume_mounts = " + std.manifestPython(volumeMounts), 157 | 158 | config: baseJupyterHubConfigMap + { 159 | "data": { 160 | "jupyterhub_config.py": extendedBaseKubeConfigSpawner, 161 | }, 162 | }, 163 | }.config, 164 | 165 | jupyterHubService: { 166 | "apiVersion": "v1", 167 | "kind": "Service", 168 | "metadata": { 169 | "labels": { 170 | "app": "tf-hub" 171 | }, 172 | "name": "tf-hub-0", 173 | namespace: namespace, 174 | }, 175 | "spec": { 176 | "clusterIP": "None", 177 | "ports": [ 178 | { 179 | "name": "hub", 180 | "port": 8000 181 | } 182 | ], 183 | "selector": { 184 | "app": "tf-hub" 185 | } 186 | } 187 | }, 188 | 189 | jupyterHubLoadBalancer(serviceType): { 190 | "apiVersion": "v1", 191 | "kind": "Service", 192 | "metadata": { 193 | "labels": { 194 | "app": "tf-hub" 195 | }, 196 | "name": "tf-hub-lb", 197 | "namespace": namespace, 198 | }, 199 | "spec": { 200 | "ports": [ 201 | { 202 | "name": "http", 203 | "port": 80, 204 | "targetPort": 8000 205 | } 206 | ], 
207 | "selector": { 208 | "app": "tf-hub" 209 | }, 210 | "type": serviceType 211 | } 212 | }, 213 | 214 | jupyterHub(image): { 215 | "apiVersion": "apps/v1beta1", 216 | "kind": "StatefulSet", 217 | "metadata": { 218 | "name": "tf-hub", 219 | "namespace": namespace, 220 | }, 221 | "spec": { 222 | "replicas": 1, 223 | "serviceName": "", 224 | "template": { 225 | "metadata": { 226 | "labels": { 227 | "app": "tf-hub" 228 | } 229 | }, 230 | "spec": { 231 | "containers": [ 232 | { 233 | "command": [ 234 | "jupyterhub", 235 | "-f", 236 | "/etc/config/jupyterhub_config.py" 237 | ], 238 | "image": image, 239 | "name": "tf-hub", 240 | "volumeMounts": [ 241 | { 242 | "mountPath": "/etc/config", 243 | "name": "config-volume" 244 | } 245 | ] 246 | } 247 | ], 248 | "serviceAccountName": "jupyter-hub", 249 | "volumes": [ 250 | { 251 | "configMap": { 252 | "name": "jupyterhub-config" 253 | }, 254 | "name": "config-volume" 255 | } 256 | ] 257 | } 258 | }, 259 | "updateStrategy": { 260 | "type": "RollingUpdate" 261 | } 262 | } 263 | }, 264 | 265 | jupyterHubRole: { 266 | "apiVersion": "rbac.authorization.k8s.io/v1beta1", 267 | "kind": "Role", 268 | "metadata": { 269 | "name": "jupyter-role", 270 | "namespace": namespace, 271 | }, 272 | "rules": [ 273 | { 274 | "apiGroups": [ 275 | "*" 276 | ], 277 | // TODO(jlewi): This is very permissive so we may want to lock this down. 
278 | "resources": [ 279 | "*" 280 | ], 281 | "verbs": [ 282 | "*" 283 | ] 284 | } 285 | ] 286 | }, 287 | 288 | jupyterHubServiceAccount: { 289 | "apiVersion": "v1", 290 | "kind": "ServiceAccount", 291 | "metadata": { 292 | "labels": { 293 | "app": "jupyter-hub" 294 | }, 295 | "name": "jupyter-hub", 296 | "namespace": namespace, 297 | } 298 | }, 299 | 300 | jupyterHubRoleBinding: { 301 | "apiVersion": "rbac.authorization.k8s.io/v1beta1", 302 | "kind": "RoleBinding", 303 | "metadata": { 304 | "name": "jupyter-role", 305 | "namespace": namespace, 306 | }, 307 | "roleRef": { 308 | "apiGroup": "rbac.authorization.k8s.io", 309 | "kind": "Role", 310 | "name": "jupyter-role" 311 | }, 312 | "subjects": [ 313 | { 314 | "kind": "ServiceAccount", 315 | "name": "jupyter-hub", 316 | "namespace": namespace, 317 | } 318 | ] 319 | }, 320 | }, // parts 321 | } 322 | -------------------------------------------------------------------------------- /testing/test_deploy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | """Test deploying Kubeflow. 3 | 4 | Requirements: 5 | This project assumes the py directory in github.com/tensorflow/k8s corresponds 6 | to a top level Python package on the Python path. 7 | 8 | TODO(jlewi): Come up with a better story for how we reuse the py package 9 | in tensorflow/k8s. We should probably turn that into a legit Python pip 10 | package that is built and released as part of the tensorflow/k8s project. 11 | """ 12 | 13 | import argparse 14 | import datetime 15 | import json 16 | import logging 17 | import os 18 | import shutil 19 | import tempfile 20 | import uuid 21 | 22 | from kubernetes import client as k8s_client 23 | from kubernetes.client import rest 24 | from kubernetes.config import incluster_config 25 | 26 | from py import test_util 27 | from py import util 28 | 29 | def _setup_test(api_client, run_label): 30 | """Create the namespace for the test. 
31 | 32 | Returns: 33 | test_dir: The local test directory. 34 | """ 35 | 36 | api = k8s_client.CoreV1Api(api_client) 37 | namespace = k8s_client.V1Namespace() 38 | namespace.api_version = "v1" 39 | namespace.kind = "Namespace" 40 | namespace.metadata = k8s_client.V1ObjectMeta(name=run_label, labels={ 41 | "app": "kubeflow-e2e-test", 42 | } 43 | ) 44 | 45 | try: 46 | logging.info("Creating namespace %s", namespace.metadata.name) 47 | namespace = api.create_namespace(namespace) 48 | logging.info("Namespace %s created.", namespace.metadata.name) 49 | except rest.ApiException as e: 50 | if e.status == 409: 51 | logging.info("Namespace %s already exists.", namespace.metadata.name) 52 | else: 53 | raise 54 | 55 | return namespace 56 | 57 | def setup(args): 58 | """Test deploying Kubeflow.""" 59 | if args.cluster: 60 | project = args.project 61 | cluster_name = args.cluster 62 | zone = args.zone 63 | logging.info("Using cluster: %s in project: %s in zone: %s", 64 | cluster_name, project, zone) 65 | # Print out config to help debug issues with accounts and 66 | # credentials. 67 | util.run(["gcloud", "config", "list"]) 68 | util.configure_kubectl(project, zone, cluster_name) 69 | util.load_kube_config() 70 | else: 71 | # TODO(jlewi): This is sufficient for API access but it doesn't create 72 | # a kubeconfig file which ksonnet needs for ks init. 73 | logging.info("Running inside cluster.") 74 | incluster_config.load_incluster_config() 75 | 76 | # Create an API client object to talk to the K8s master. 
77 | api_client = k8s_client.ApiClient() 78 | 79 | now = datetime.datetime.now() 80 | run_label = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4] 81 | 82 | if not os.path.exists(args.test_dir): 83 | os.makedirs(args.test_dir) 84 | 85 | logging.info("Using test directory: %s", args.test_dir) 86 | 87 | namespace_name = run_label 88 | def run(): 89 | namespace = _setup_test(api_client, namespace_name) 90 | logging.info("Using namespace: %s", namespace) 91 | # Set a GITHUB_TOKEN so that we don't rate limited by GitHub; 92 | # see: https://github.com/ksonnet/ksonnet/issues/233 93 | os.environ["GITHUB_TOKEN"] = args.github_token 94 | 95 | # Initialize a ksonnet app. 96 | app_name = "kubeflow-test" 97 | util.run(["ks", "init", app_name,], cwd=args.test_dir, use_print=True) 98 | 99 | app_dir = os.path.join(args.test_dir, app_name) 100 | 101 | kubeflow_registry = "github.com/google/kubeflow/tree/master/kubeflow" 102 | util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry], cwd=app_dir) 103 | 104 | # Install required packages 105 | packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"] 106 | 107 | for p in packages: 108 | util.run(["ks", "pkg", "install", p], cwd=app_dir) 109 | 110 | # Delete the vendor directory and replace with a symlink to the src 111 | # so that we use the code at the desired commit. 
112 | target_dir = os.path.join(app_dir, "vendor", "kubeflow") 113 | 114 | logging.info("Deleting %s", target_dir) 115 | shutil.rmtree(target_dir) 116 | 117 | source = os.path.join(args.test_dir, "src", "kubeflow") 118 | logging.info("Creating link %s -> %s", target_dir, source) 119 | os.symlink(source, target_dir) 120 | 121 | # Deploy Kubeflow 122 | util.run(["ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core", 123 | "--namespace=" + namespace.metadata.name], cwd=app_dir) 124 | 125 | # TODO(jlewi): For reasons I don't understand even though we ran 126 | # configure_kubectl above, if we don't rerun it we get rbac errors 127 | # when we do ks apply; I think because we aren't using the proper service 128 | # account. This might have something to do with the way ksonnet gets 129 | # its credentials; maybe we need to configure credentials after calling 130 | # ks init? 131 | if args.cluster: 132 | util.configure_kubectl(args.project, args.zone, args.cluster) 133 | 134 | apply_command = ["ks", "apply", "default", "-c", "kubeflow-core",] 135 | 136 | util.run(apply_command, cwd=app_dir) 137 | 138 | # Verify that the TfJob operator is actually deployed. 139 | tf_job_deployment_name = "tf-job-operator" 140 | logging.info("Verifying TfJob controller started.") 141 | util.wait_for_deployment(api_client, namespace.metadata.name, 142 | tf_job_deployment_name) 143 | 144 | # Verify that JupyterHub is actually deployed. 
145 | jupyter_name = "tf-hub" 146 | logging.info("Verifying TfHub started.") 147 | util.wait_for_statefulset(api_client, namespace.metadata.name, jupyter_name) 148 | 149 | main_case = test_util.TestCase() 150 | main_case.class_name = "KubeFlow" 151 | main_case.name = "deploy-kubeflow" 152 | try: 153 | test_util.wrap_test(run, main_case) 154 | finally: 155 | # Delete the namespace 156 | logging.info("Deleting namespace %s", namespace_name) 157 | 158 | # We report teardown as a separate test case because this will help 159 | # us track down issues with garbage collecting namespaces. 160 | teardown = test_util.TestCase(main_case.class_name, "teardown") 161 | def run_teardown(): 162 | core_api = k8s_client.CoreV1Api(api_client) 163 | core_api.delete_namespace(namespace_name, {}) 164 | 165 | try: 166 | test_util.wrap_test(run_teardown, teardown) 167 | except Exception as e: # pylint: disable-msg=broad-except 168 | logging.error("There was a problem deleting namespace: %s; %s", 169 | namespace_name, e.message) 170 | junit_path = os.path.join(args.artifacts_dir, "junit_kubeflow-deploy.xml") 171 | logging.info("Writing test results to %s", junit_path) 172 | test_util.create_junit_xml_file([main_case, teardown], junit_path) 173 | 174 | def main(): # pylint: disable=too-many-locals 175 | logging.getLogger().setLevel(logging.INFO) # pylint: disable=too-many-locals 176 | # create the top-level parser 177 | parser = argparse.ArgumentParser( 178 | description="Test Kubeflow E2E.") 179 | 180 | parser.add_argument( 181 | "--test_dir", 182 | default="", 183 | type=str, 184 | help="Directory to use for all the test files. If not set a temporary " 185 | "directory is created.") 186 | 187 | parser.add_argument( 188 | "--artifacts_dir", 189 | default="", 190 | type=str, 191 | help="Directory to use for artifacts that should be preserved after " 192 | "the test runs. 
Defaults to test_dir if not set.") 193 | 194 | parser.add_argument( 195 | "--project", 196 | default=None, 197 | type=str, 198 | help="The project to use.") 199 | 200 | parser.add_argument( 201 | "--cluster", 202 | default=None, 203 | type=str, 204 | help=("The name of the cluster. If not set assumes the " 205 | "script is running in a cluster and uses that cluster.")) 206 | 207 | parser.add_argument( 208 | "--zone", 209 | default="us-east1-d", 210 | type=str, 211 | help="The zone for the cluster.") 212 | 213 | parser.add_argument( 214 | "--github_token", 215 | default=None, 216 | type=str, 217 | help=("The GitHub API token to use. This is needed since ksonnet uses the " 218 | "GitHub API and without it we get rate limited. For more info see: " 219 | "https://github.com/ksonnet/ksonnet/blob/master/docs" 220 | "/troubleshooting.md")) 221 | 222 | args = parser.parse_args() 223 | 224 | if not args.test_dir: 225 | logging.info("--test_dir not set; using a temporary directory.") 226 | 227 | now = datetime.datetime.now() 228 | label = "test_deploy-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4] 229 | 230 | # Create a temporary directory for this test run 231 | args.test_dir = os.path.join(tempfile.gettempdir(), label) 232 | 233 | if not args.artifacts_dir: 234 | args.artifacts_dir = args.test_dir 235 | # Setup a logging file handler. This way we can upload the log outputs 236 | # to gubernator. 237 | root_logger = logging.getLogger() 238 | 239 | test_log = os.path.join(args.artifacts_dir, "logs", "test_deploy.log.txt") 240 | if not os.path.exists(os.path.dirname(test_log)): 241 | os.makedirs(os.path.dirname(test_log)) 242 | 243 | file_handler = logging.FileHandler(test_log) 244 | root_logger.addHandler(file_handler) 245 | # We need to explicitly set the formatter because it will not pick up 246 | # the BasicConfig. 
247 | formatter = logging.Formatter(fmt=("%(levelname)s|%(asctime)s" 248 | "|%(pathname)s|%(lineno)d| %(message)s"), 249 | datefmt="%Y-%m-%dT%H:%M:%S") 250 | file_handler.setFormatter(formatter) 251 | logging.info("Logging to %s", test_log) 252 | 253 | if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"): 254 | logging.info("GOOGLE_APPLICATION_CREDENTIALS is set; configuring gcloud " 255 | "to use service account.") 256 | # Since a service account is set tell gcloud to use it. 257 | util.run(["gcloud", "auth", "activate-service-account", "--key-file=" + 258 | os.getenv("GOOGLE_APPLICATION_CREDENTIALS")]) 259 | setup(args) 260 | 261 | if __name__ == "__main__": 262 | logging.basicConfig(level=logging.INFO, 263 | format=('%(levelname)s|%(asctime)s' 264 | '|%(pathname)s|%(lineno)d| %(message)s'), 265 | datefmt='%Y-%m-%dT%H:%M:%S', 266 | ) 267 | logging.getLogger().setLevel(logging.INFO) 268 | main() 269 | -------------------------------------------------------------------------------- /testing/test-infra/components/argo.libsonnet: -------------------------------------------------------------------------------- 1 | { 2 | // TODO(https://github.com/ksonnet/ksonnet/issues/222): Taking namespace as an argument is a work around for the fact that ksonnet 3 | // doesn't support automatically piping in the namespace from the environment to prototypes. 4 | 5 | // TODO(jlewi): Do we need to add parts corresponding to a service account and cluster binding role? 
6 | // see https://github.com/argoproj/argo/blob/master/cmd/argo/commands/install.go 7 | 8 | parts(namespace):: { 9 | // CRD's are not namespace scoped; see 10 | // https://kubernetes.io/docs/tasks/access-kubernetes-api/extend-api-custom-resource-definitions/ 11 | crd: { 12 | "apiVersion": "apiextensions.k8s.io/v1beta1", 13 | "kind": "CustomResourceDefinition", 14 | "metadata": { 15 | "name": "workflows.argoproj.io", 16 | }, 17 | "spec": { 18 | "group": "argoproj.io", 19 | "names": { 20 | "kind": "Workflow", 21 | "listKind": "WorkflowList", 22 | "plural": "workflows", 23 | "shortNames": [ 24 | "wf" 25 | ], 26 | "singular": "workflow" 27 | }, 28 | "scope": "Namespaced", 29 | "version": "v1alpha1" 30 | }, 31 | }, // crd 32 | 33 | // Deploy the controller 34 | deploy: { 35 | "apiVersion": "extensions/v1beta1", 36 | "kind": "Deployment", 37 | "labels": { 38 | "app": "workflow-controller" 39 | }, 40 | "metadata": { 41 | "name": "workflow-controller", 42 | "namespace": namespace, 43 | }, 44 | "spec": { 45 | "progressDeadlineSeconds": 600, 46 | "replicas": 1, 47 | "revisionHistoryLimit": 10, 48 | "selector": { 49 | "matchLabels": { 50 | "app": "workflow-controller" 51 | } 52 | }, 53 | "strategy": { 54 | "rollingUpdate": { 55 | "maxSurge": "25%", 56 | "maxUnavailable": "25%" 57 | }, 58 | "type": "RollingUpdate" 59 | }, 60 | "template": { 61 | "metadata": { 62 | "creationTimestamp": null, 63 | "labels": { 64 | "app": "workflow-controller" 65 | } 66 | }, 67 | "spec": { 68 | "containers": [ 69 | { 70 | "args": [ 71 | "--configmap", 72 | "workflow-controller-configmap" 73 | ], 74 | "command": [ 75 | "workflow-controller" 76 | ], 77 | "env": [ 78 | { 79 | "name": "ARGO_NAMESPACE", 80 | "valueFrom": { 81 | "fieldRef": { 82 | "apiVersion": "v1", 83 | "fieldPath": "metadata.namespace" 84 | } 85 | } 86 | } 87 | ], 88 | "image": "argoproj/workflow-controller:v2.0.0-alpha3", 89 | "imagePullPolicy": "IfNotPresent", 90 | "name": "workflow-controller", 91 | "resources": {}, 92 | 
"terminationMessagePath": "/dev/termination-log", 93 | "terminationMessagePolicy": "File" 94 | } 95 | ], 96 | "dnsPolicy": "ClusterFirst", 97 | "restartPolicy": "Always", 98 | "schedulerName": "default-scheduler", 99 | "securityContext": {}, 100 | "serviceAccount": "argo", 101 | "serviceAccountName": "argo", 102 | "terminationGracePeriodSeconds": 30 103 | } 104 | } 105 | }, 106 | }, // deploy 107 | 108 | 109 | deployUi: { 110 | "apiVersion": "extensions/v1beta1", 111 | "kind": "Deployment", 112 | "metadata": { 113 | "labels": { 114 | "app": "argo-ui" 115 | }, 116 | "name": "argo-ui", 117 | "namespace": namespace, 118 | }, 119 | "spec": { 120 | "progressDeadlineSeconds": 600, 121 | "replicas": 1, 122 | "revisionHistoryLimit": 10, 123 | "selector": { 124 | "matchLabels": { 125 | "app": "argo-ui" 126 | } 127 | }, 128 | "strategy": { 129 | "rollingUpdate": { 130 | "maxSurge": "25%", 131 | "maxUnavailable": "25%" 132 | }, 133 | "type": "RollingUpdate" 134 | }, 135 | "template": { 136 | "metadata": { 137 | "creationTimestamp": null, 138 | "labels": { 139 | "app": "argo-ui" 140 | } 141 | }, 142 | "spec": { 143 | "containers": [ 144 | { 145 | "env": [ 146 | { 147 | "name": "ARGO_NAMESPACE", 148 | "valueFrom": { 149 | "fieldRef": { 150 | "apiVersion": "v1", 151 | "fieldPath": "metadata.namespace" 152 | } 153 | } 154 | }, 155 | { 156 | "name": "IN_CLUSTER", 157 | "value": "true" 158 | } 159 | ], 160 | "image": "argoproj/argoui:v2.0.0-alpha3", 161 | "imagePullPolicy": "IfNotPresent", 162 | "name": "argo-ui", 163 | "resources": {}, 164 | "terminationMessagePath": "/dev/termination-log", 165 | "terminationMessagePolicy": "File" 166 | } 167 | ], 168 | "dnsPolicy": "ClusterFirst", 169 | "restartPolicy": "Always", 170 | "schedulerName": "default-scheduler", 171 | "securityContext": {}, 172 | "serviceAccount": "argo", 173 | "serviceAccountName": "argo", 174 | "terminationGracePeriodSeconds": 30, 175 | "readinessProbe": { 176 | "httpGet": { 177 | "path": "/", 178 | "port": 8001, 179 
| } 180 | }, 181 | } 182 | } 183 | }, 184 | }, // deployUi 185 | 186 | uiIngress:: { 187 | "apiVersion": "extensions/v1beta1", 188 | "kind": "Ingress", 189 | "metadata": { 190 | "name": "argo-ui", 191 | "namespace": namespace, 192 | }, 193 | "annotations": { 194 | "kubernetes.io/ingress.global-static-ip-name": "argo-ui", 195 | }, 196 | "spec": { 197 | "rules": [ 198 | { 199 | "http": { 200 | "paths": [ 201 | { 202 | "backend": { 203 | "serviceName": "argo-ui", 204 | "servicePort": 80, 205 | }, 206 | "path": "/*" 207 | }, 208 | ] 209 | } 210 | } 211 | ], 212 | } 213 | }, // ingress 214 | 215 | uiService: { 216 | "apiVersion": "v1", 217 | "kind": "Service", 218 | "metadata": { 219 | "labels": { 220 | "app": "argo-ui" 221 | }, 222 | "name": "argo-ui", 223 | "namespace": namespace, 224 | }, 225 | "spec": { 226 | "ports": [ 227 | { 228 | "port": 80, 229 | "targetPort": 8001 230 | } 231 | ], 232 | "selector": { 233 | "app": "argo-ui" 234 | }, 235 | "sessionAffinity": "None", 236 | "type": "NodePort", 237 | } 238 | }, 239 | 240 | config: { 241 | "apiVersion": "v1", 242 | "data": { 243 | "config": @"executorImage: argoproj/argoexec:v2.0.0-alpha2" 244 | }, 245 | "kind": "ConfigMap", 246 | "metadata": { 247 | "name": "workflow-controller-configmap", 248 | "namespace": namespace, 249 | } 250 | }, 251 | 252 | serviceAccount: { 253 | "apiVersion": "v1", 254 | "kind": "ServiceAccount", 255 | "metadata": { 256 | "name": "argo", 257 | "namespace": namespace, 258 | }, 259 | }, // service account 260 | 261 | // TODO(jlewi): Do we really need cluster admin privileges? Why? 262 | // is this just because workflow controller is trying to create the CRD? 
263 | roleBinding: { 264 | "apiVersion": "rbac.authorization.k8s.io/v1", 265 | "kind": "ClusterRoleBinding", 266 | "metadata": { 267 | "name": "argo-cluster-role", 268 | "namespace": namespace, 269 | }, 270 | "roleRef": { 271 | "apiGroup": "rbac.authorization.k8s.io", 272 | "kind": "ClusterRole", 273 | "name": "cluster-admin" 274 | }, 275 | "subjects": [ 276 | { 277 | "kind": "ServiceAccount", 278 | "name": "argo", 279 | "namespace": namespace, 280 | } 281 | ] 282 | }, // role binding 283 | 284 | // The steps in the workflow use the default service account. 285 | // The default service account needs sufficient permission in order 286 | // to create namespaces and other objects used in the test. 287 | defaultRoleBinding: { 288 | "apiVersion": "rbac.authorization.k8s.io/v1", 289 | "kind": "ClusterRoleBinding", 290 | "metadata": { 291 | "name": "default-role", 292 | "namespace": namespace, 293 | }, 294 | "roleRef": { 295 | "apiGroup": "rbac.authorization.k8s.io", 296 | "kind": "ClusterRole", 297 | "name": "cluster-admin" 298 | }, 299 | "subjects": [ 300 | { 301 | "kind": "ServiceAccount", 302 | "name": "default", 303 | "namespace": namespace, 304 | } 305 | ] 306 | }, // default role binding 307 | } // parts 308 | } -------------------------------------------------------------------------------- /kubeflow/core/tf-job.libsonnet: -------------------------------------------------------------------------------- 1 | 2 | { 3 | // TODO(https://github.com/ksonnet/ksonnet/issues/222): Taking namespace as an argument is a work around for the fact that ksonnet 4 | // doesn't support automatically piping in the namespace from the environment to prototypes. 
  // parts produces the K8s objects for the TfJob operator and its
  // dashboard (deployments, config, service accounts and RBAC) in the
  // given namespace. Fields declared with "::" are hidden and must be
  // selected explicitly by a consuming prototype.
  parts(namespace):: {
    // Deployment running the TfJob operator binary. The operator reads
    // its controller config from the ConfigMap mounted at /etc/config.
    tfJobDeploy(image): {
      "apiVersion": "extensions/v1beta1",
      "kind": "Deployment",
      "metadata": {
        "name": "tf-job-operator",
        "namespace": namespace,
      },
      "spec": {
        "replicas": 1,
        "template": {
          "metadata": {
            "labels": {
              "name": "tf-job-operator"
            }
          },
          "spec": {
            "containers": [
              {
                "command": [
                  "/opt/mlkube/tf_operator",
                  "--controller-config-file=/etc/config/controller_config_file.yaml",
                  "--alsologtostderr",
                  "-v=1",
                ],
                "env": [
                  // Downward API: the operator needs to know its own
                  // namespace and pod name.
                  {
                    "name": "MY_POD_NAMESPACE",
                    "valueFrom": {
                      "fieldRef": {
                        "fieldPath": "metadata.namespace"
                      }
                    }
                  },
                  {
                    "name": "MY_POD_NAME",
                    "valueFrom": {
                      "fieldRef": {
                        "fieldPath": "metadata.name"
                      }
                    }
                  }
                ],
                "image": image,
                "name": "tf-job-operator",
                "volumeMounts": [
                  {
                    "mountPath": "/etc/config",
                    "name": "config-volume"
                  }
                ]
              }
            ],
            "serviceAccountName": "tf-job-operator",
            "volumes": [
              {
                "configMap": {
                  "name": "tf-job-operator-config"
                },
                "name": "config-volume"
              }
            ]
          }
        }
      }
    }, // tfJobDeploy

    // Default value for the operator's controller config; tfImage is only
    // included when a non-empty default image was supplied.
    defaultControllerConfig(tfDefaultImage):: {
      grpcServerFilePath: "/opt/mlkube/grpc_tensorflow_server/grpc_tensorflow_server.py",
    }
    + if tfDefaultImage != "" && tfDefaultImage != "null" then
      {
        tfImage: tfDefaultImage,
      }
    else
      {},

    // Host-path volume mappings exposing the NVIDIA driver/CUDA libraries
    // on Azure nodes (driver version 384 — confirm against the node image).
    azureAccelerators:: {
      accelerators: {
        "alpha.kubernetes.io/nvidia-gpu": {
          volumes: [
            {
              name: "lib",
              mountPath: "/usr/local/nvidia/lib64",
              hostPath: "/usr/lib/nvidia-384",
            },
            {
              name: "bin",
              mountPath: "/usr/local/nvidia/bin",
              hostPath: "/usr/lib/nvidia-384/bin",
            },
            {
              name: "libcuda",
              mountPath: "/usr/lib/x86_64-linux-gnu/libcuda.so.1",
              hostPath: "/usr/lib/x86_64-linux-gnu/libcuda.so.1",
            },
          ]
        }
      }
    },

    // Controller config = defaults plus cloud-specific accelerator settings.
    configData(cloud, tfDefaultImage):: self.defaultControllerConfig(tfDefaultImage) +
      if cloud == "azure" then
        self.azureAccelerators
      else
        {},

    // ConfigMap holding the serialized controller config consumed by
    // tfJobDeploy above.
    configMap(cloud, tfDefaultImage): {
      "apiVersion": "v1",
      "data": {
        "controller_config_file.yaml": std.manifestJson($.parts(namespace).configData(cloud, tfDefaultImage)),
      },
      "kind": "ConfigMap",
      "metadata": {
        "name": "tf-job-operator-config",
        "namespace": namespace,
      }
    },

    serviceAccount: {
      "apiVersion": "v1",
      "kind": "ServiceAccount",
      "metadata": {
        "labels": {
          "app": "tf-job-operator"
        },
        "name": "tf-job-operator",
        "namespace": namespace,
      }
    },

    // ClusterRole for the operator; broad (*) verbs on the resources it
    // manages.
    operatorRole: {
      "apiVersion": "rbac.authorization.k8s.io/v1beta1",
      "kind": "ClusterRole",
      "metadata": {
        "labels": {
          "app": "tf-job-operator"
        },
        "name": "tf-job-operator"
      },
      "rules": [
        {
          "apiGroups": [
            "tensorflow.org"
          ],
          "resources": [
            "tfjobs"
          ],
          "verbs": [
            "*"
          ]
        },
        {
          "apiGroups": [
            "apiextensions.k8s.io"
          ],
          "resources": [
            "customresourcedefinitions"
          ],
          "verbs": [
            "*"
          ]
        },
        {
          "apiGroups": [
            "storage.k8s.io"
          ],
          "resources": [
            "storageclasses"
          ],
          "verbs": [
            "*"
          ]
        },
        {
          "apiGroups": [
            "batch"
          ],
          "resources": [
            "jobs"
          ],
          "verbs": [
            "*"
          ]
        },
        {
          "apiGroups": [
            ""
          ],
          "resources": [
            "configmaps",
            "pods",
            "services",
            "endpoints",
            "persistentvolumeclaims",
            "events"
          ],
          "verbs": [
            "*"
          ]
        },
        {
          "apiGroups": [
            "apps",
            "extensions"
          ],
          "resources": [
            "deployments"
          ],
          "verbs": [
            "*"
          ]
        }
      ]
    }, // operator-role

    // NOTE(review): declared hidden (::) unlike operatorRole above —
    // presumably selected explicitly by a prototype; confirm it is
    // actually applied, otherwise the operator's role is never bound.
    operatorRoleBinding:: {
      "apiVersion": "rbac.authorization.k8s.io/v1beta1",
      "kind": "ClusterRoleBinding",
      "metadata": {
        "labels": {
          "app": "tf-job-operator"
        },
        "name": "tf-job-operator"
      },
      "roleRef": {
        "apiGroup": "rbac.authorization.k8s.io",
        "kind": "ClusterRole",
        "name": "tf-job-operator"
      },
      "subjects": [
        {
          "kind": "ServiceAccount",
          "name": "tf-job-operator",
          "namespace": namespace,
        }
      ]
    }, // operator-role binding

    // Service fronting the dashboard; serviceType e.g. ClusterIP/LoadBalancer.
    uiService(serviceType):: {
      "apiVersion": "v1",
      "kind": "Service",
      "metadata": {
        "name": "tf-job-dashboard",
        "namespace": namespace,
      },
      "spec": {
        "ports": [
          {
            "port": 80,
            "targetPort": 8080
          }
        ],
        "selector": {
          "name": "tf-job-dashboard"
        },
        "type": serviceType,
      }
    }, // uiService

    uiServiceAccount: {
      "apiVersion": "v1",
      "kind": "ServiceAccount",
      "metadata": {
        "labels": {
          "app": "tf-job-dashboard"
        },
        "name": "tf-job-dashboard",
        "namespace": namespace,
      }
    }, // uiServiceAccount

    // Deployment for the dashboard backend, listening on 8080.
    ui(image):: {
      "apiVersion": "extensions/v1beta1",
      "kind": "Deployment",
      "metadata": {
        "name": "tf-job-dashboard",
        "namespace": namespace,
      },
      "spec": {
        "template": {
          "metadata": {
            "labels": {
              "name": "tf-job-dashboard"
            }
          },
          "spec": {
            "containers": [
              {
                "command": [
                  "/opt/tensorflow_k8s/dashboard/backend"
                ],
                "image": image,
                "name": "tf-job-dashboard",
                "ports": [
                  {
                    "containerPort": 8080
                  }
                ]
              }
            ],
            "serviceAccountName": "tf-job-dashboard",
          }
        }
      },
    }, // ui

    // ClusterRole for the dashboard; mirrors operatorRole's rules.
    uiRole:: {
      "apiVersion": "rbac.authorization.k8s.io/v1beta1",
      "kind": "ClusterRole",
      "metadata": {
        "labels": {
          "app": "tf-job-dashboard"
        },
        "name": "tf-job-dashboard"
      },
      "rules": [
        {
          "apiGroups": [
            "tensorflow.org"
          ],
          "resources": [
            "tfjobs"
          ],
          "verbs": [
            "*"
          ]
        },
        {
          "apiGroups": [
            "apiextensions.k8s.io"
          ],
          "resources": [
            "customresourcedefinitions"
          ],
          "verbs": [
            "*"
          ]
        },
        {
          "apiGroups": [
            "storage.k8s.io"
          ],
          "resources": [
            "storageclasses"
          ],
          "verbs": [
            "*"
          ]
        },
        {
          "apiGroups": [
            "batch"
          ],
          "resources": [
            "jobs"
          ],
          "verbs": [
            "*"
          ]
        },
        {
          "apiGroups": [
            ""
          ],
          "resources": [
            "configmaps",
            "pods",
            "services",
            "endpoints",
            "persistentvolumeclaims",
            "events"
          ],
          "verbs": [
            "*"
          ]
        },
        {
          "apiGroups": [
            "apps",
            "extensions"
          ],
          "resources": [
            "deployments"
          ],
          "verbs": [
            "*"
          ]
        }
      ]
    }, // uiRole

    uiRoleBinding:: {
      "apiVersion": "rbac.authorization.k8s.io/v1beta1",
      "kind": "ClusterRoleBinding",
      "metadata": {
        "labels": {
          "app": "tf-job-dashboard"
        },
        "name": "tf-job-dashboard"
      },
      "roleRef": {
        "apiGroup": "rbac.authorization.k8s.io",
        "kind": "ClusterRole",
        "name": "tf-job-dashboard"
      },
      "subjects": [
        {
          "kind": "ServiceAccount",
          "name": "tf-job-dashboard",
          "namespace": namespace,
        }
      ]
    }, // uiRoleBinding
  },
}
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | --------------------------------------------------------------------------------