├── .dockerignore ├── test └── workflows │ ├── environments │ ├── test │ │ ├── globals.libsonnet │ │ ├── main.jsonnet │ │ └── params.libsonnet │ └── base.libsonnet │ ├── .gitignore │ ├── components │ ├── params.libsonnet │ ├── util.libsonnet │ └── e2e.jsonnet │ ├── app.yaml │ ├── .ksonnet │ └── registries │ │ └── incubator │ │ └── 40285d8a14f1ac5787e405e1023cf0c07f6aa28c.yaml │ └── lib │ └── v1.10.1 │ └── k.libsonnet ├── OWNERS ├── deploy ├── 1-namespace.yaml ├── 0-crd.yaml ├── 3-chainer-operator.yaml └── 2-rbac.yaml ├── examples ├── docker │ ├── chainer │ │ ├── build-and-publish.sh │ │ └── Dockerfile │ └── chainermn │ │ ├── build-and-publish.sh │ │ └── Dockerfile ├── chainerjob.yaml ├── chainerjob-mn.yaml └── chainerjob-reference.yaml ├── .travis.yml ├── Gopkg.toml ├── .gitignore ├── hack ├── boilerplate │ └── boilerplate.go.txt ├── update-codegen.sh └── verify-codegen.sh ├── prow_config.yaml ├── pkg ├── util │ └── signals │ │ ├── signal_windows.go │ │ ├── signal_posix.go │ │ └── signal.go ├── client │ ├── clientset │ │ └── versioned │ │ │ ├── doc.go │ │ │ ├── fake │ │ │ ├── doc.go │ │ │ ├── register.go │ │ │ └── clientset_generated.go │ │ │ ├── typed │ │ │ └── chainer │ │ │ │ └── v1alpha1 │ │ │ │ ├── fake │ │ │ │ ├── doc.go │ │ │ │ ├── fake_chainer_client.go │ │ │ │ └── fake_chainerjob.go │ │ │ │ ├── generated_expansion.go │ │ │ │ ├── doc.go │ │ │ │ ├── chainer_client.go │ │ │ │ └── chainerjob.go │ │ │ ├── scheme │ │ │ ├── doc.go │ │ │ └── register.go │ │ │ └── clientset.go │ ├── listers │ │ └── chainer │ │ │ └── v1alpha1 │ │ │ ├── expansion_generated.go │ │ │ └── chainerjob.go │ └── informers │ │ └── externalversions │ │ ├── internalinterfaces │ │ └── factory_interfaces.go │ │ ├── chainer │ │ ├── v1alpha1 │ │ │ ├── interface.go │ │ │ └── chainerjob.go │ │ └── interface.go │ │ ├── generic.go │ │ └── factory.go ├── apis │ └── chainer │ │ └── v1alpha1 │ │ ├── doc.go │ │ ├── constants.go │ │ ├── zz_generated.defaults.go │ │ ├── types.go │ │ ├── register.go │ │ ├── 
defaults.go │ │ ├── zz_generated.deepcopy.go │ │ └── defaults_test.go └── controllers │ └── backends │ ├── types.go │ ├── constants.go │ ├── mpi │ ├── assets.go │ └── mpi_backend.go │ ├── none │ ├── none_backend.go │ └── none_backend_test.go │ └── util.go ├── scripts ├── gcloud-util.sh ├── build.sh ├── create-cluster.sh ├── delete-cluster.sh ├── kfctl-util.sh └── run-test.sh ├── Dockerfile ├── cmd └── chainer-operator │ └── main.go ├── README.md ├── LICENSE └── Gopkg.lock /.dockerignore: -------------------------------------------------------------------------------- 1 | .gitignore 2 | vendor 3 | -------------------------------------------------------------------------------- /test/workflows/environments/test/globals.libsonnet: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /test/workflows/.gitignore: -------------------------------------------------------------------------------- 1 | /lib 2 | /.ksonnet/registries 3 | /app.override.yaml 4 | /.ks_environment 5 | -------------------------------------------------------------------------------- /OWNERS: -------------------------------------------------------------------------------- 1 | approvers: 2 | - everpeace 3 | - disktnk 4 | - jlewi 5 | reviewers: 6 | - gaocegege 7 | 8 | -------------------------------------------------------------------------------- /deploy/1-namespace.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: chainer-operator 5 | -------------------------------------------------------------------------------- /test/workflows/environments/base.libsonnet: -------------------------------------------------------------------------------- 1 | local components = std.extVar("__ksonnet/components"); 2 | components + { 3 | // Insert user-specified overrides here. 
4 | } 5 | -------------------------------------------------------------------------------- /examples/docker/chainer/build-and-publish.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | 3 | IMAGE_REPO=$1 # your_org/your_image 4 | IMAGE_TAG=${2:-latest} 5 | 6 | docker build -t $IMAGE_REPO:$IMAGE_TAG . 7 | docker push $IMAGE_REPO:$IMAGE_TAG 8 | -------------------------------------------------------------------------------- /examples/docker/chainermn/build-and-publish.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | 3 | IMAGE_REPO=$1 # your_org/your_image 4 | IMAGE_TAG=${2:-latest} 5 | 6 | docker build -t $IMAGE_REPO:$IMAGE_TAG . 7 | docker push $IMAGE_REPO:$IMAGE_TAG 8 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | go_import_path: github.com/kubeflow/chainer-operator 3 | 4 | go: 5 | - 1.10.2 6 | 7 | before_install: 8 | - go get -u github.com/golang/dep/cmd/dep 9 | - dep ensure 10 | 11 | install: 12 | - go get -t -d ./... 13 | 14 | script: 15 | - ./hack/verify-codegen.sh && go test ./... 
16 | -------------------------------------------------------------------------------- /Gopkg.toml: -------------------------------------------------------------------------------- 1 | required = ["k8s.io/code-generator/cmd/client-gen"] 2 | 3 | [[override]] 4 | name = "k8s.io/api" 5 | version = "kubernetes-1.10.3" 6 | 7 | [[override]] 8 | name = "k8s.io/apimachinery" 9 | version = "kubernetes-1.10.3" 10 | 11 | [[override]] 12 | name = "k8s.io/client-go" 13 | version = "kubernetes-1.10.3" 14 | -------------------------------------------------------------------------------- /examples/docker/chainer/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_CHAINER_IMAGE_TAG="v4.1.0-python3" 2 | FROM chainer/chainer:$BASE_CHAINER_IMAGE_TAG 3 | 4 | RUN apt-get update && apt-get install -y wget && \ 5 | wget https://raw.githubusercontent.com/chainer/chainer/v4.1.0/examples/mnist/train_mnist.py \ 6 | -O /train_mnist.py && \ 7 | python3 -c "import chainer; chainer.datasets.get_mnist()" 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.gitignore.io/api/go 2 | 3 | ### Go ### 4 | # Binaries for programs and plugins 5 | *.exe 6 | *.exe~ 7 | *.dll 8 | *.so 9 | *.dylib 10 | 11 | # Test binary, build with `go test -c` 12 | *.test 13 | 14 | # Output of the go coverage tool, specifically when used with LiteIDE 15 | *.out 16 | 17 | 18 | # End of https://www.gitignore.io/api/go 19 | 20 | /vendor/ -------------------------------------------------------------------------------- /examples/chainerjob.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v1alpha1 2 | kind: ChainerJob 3 | metadata: 4 | name: example-job 5 | spec: 6 | master: 7 | template: 8 | spec: 9 | containers: 10 | - image: everpeace/chainer:4.1.0 11 | 
command: 12 | - sh 13 | - -c 14 | - | 15 | python3 /train_mnist.py -e 2 -b 1000 -u 100 --noplot 16 | -------------------------------------------------------------------------------- /test/workflows/components/params.libsonnet: -------------------------------------------------------------------------------- 1 | { 2 | global: { 3 | // User-defined global parameters; accessible to all components and environments. Ex: 4 | // replicas: 4, 5 | }, 6 | components: { 7 | // Component-level parameters, defined initially from 'ks prototype use ...' 8 | // Each object below should correspond to a component in the components/ directory 9 | }, 10 | } 11 | -------------------------------------------------------------------------------- /test/workflows/environments/test/main.jsonnet: -------------------------------------------------------------------------------- 1 | local base = import "base.libsonnet"; 2 | // uncomment if you reference ksonnet-lib 3 | // local k = import "k.libsonnet"; 4 | 5 | base + { 6 | // Insert user-specified overrides here. 
For example, if a component is named "nginx-deployment", you might have something like: 7 | // "nginx-deployment"+: k.deployment.mixin.metadata.labels({foo: "bar"}) 8 | } 9 | -------------------------------------------------------------------------------- /deploy/0-crd.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apiextensions.k8s.io/v1beta1 2 | kind: CustomResourceDefinition 3 | metadata: 4 | name: chainerjobs.kubeflow.org 5 | spec: 6 | group: kubeflow.org 7 | version: v1alpha1 8 | scope: Namespaced 9 | names: 10 | plural: chainerjobs 11 | singular: chainerjob 12 | kind: ChainerJob 13 | shortNames: 14 | - chj 15 | - chjs 16 | - chjob 17 | - chjobs 18 | categories: 19 | - all 20 | -------------------------------------------------------------------------------- /test/workflows/app.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 0.1.0 2 | environments: 3 | test: 4 | destination: 5 | namespace: kubeflow-releasing 6 | server: https://35.226.49.107 7 | k8sVersion: v1.7.0 8 | path: test 9 | kind: ksonnet.io/app 10 | name: workflows 11 | registries: 12 | incubator: 13 | gitVersion: 14 | commitSha: 40285d8a14f1ac5787e405e1023cf0c07f6aa28c 15 | refSpec: master 16 | protocol: github 17 | uri: github.com/ksonnet/parts/tree/master/incubator 18 | version: 0.0.1 19 | -------------------------------------------------------------------------------- /test/workflows/environments/test/params.libsonnet: -------------------------------------------------------------------------------- 1 | local params = std.extVar("__ksonnet/params"); 2 | local globals = import "globals.libsonnet"; 3 | local envParams = params + { 4 | components +: { 5 | // Insert component parameter overrides here. 
Ex: 6 | // guestbook +: { 7 | // name: "guestbook-dev", 8 | // replicas: params.global.replicas, 9 | // }, 10 | }, 11 | }; 12 | 13 | { 14 | components: { 15 | [x]: envParams.components[x] + globals, for x in std.objectFields(envParams.components) 16 | }, 17 | } 18 | -------------------------------------------------------------------------------- /test/workflows/components/util.libsonnet: -------------------------------------------------------------------------------- 1 | { 2 | // convert a list of two items into a map representing an environment variable 3 | listToMap:: function(v) 4 | { 5 | name: v[0], 6 | value: v[1], 7 | }, 8 | 9 | // Function to turn comma separated list of prow environment variables into a dictionary. 10 | parseEnv:: function(v) 11 | local pieces = std.split(v, ","); 12 | if v != "" && std.length(pieces) > 0 then 13 | std.map( 14 | function(i) $.listToMap(std.split(i, "=")), 15 | std.split(v, ",") 16 | ) 17 | else [], 18 | } -------------------------------------------------------------------------------- /hack/boilerplate/boilerplate.go.txt: -------------------------------------------------------------------------------- 1 | // Copyright YEAR The Kubeflow Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | -------------------------------------------------------------------------------- /deploy/3-chainer-operator.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: chainer-operator 5 | namespace: chainer-operator 6 | labels: 7 | app: chainer-operator 8 | spec: 9 | replicas: 1 10 | selector: 11 | matchLabels: 12 | app: chainer-operator 13 | template: 14 | metadata: 15 | labels: 16 | app: chainer-operator 17 | spec: 18 | serviceAccountName: chainer-operator 19 | containers: 20 | - name: chainer-operator 21 | # TODO: change registry 22 | image: everpeace/chainer-operator:latest 23 | imagePullPolicy: Always 24 | args: [ 25 | "-v", "4", 26 | "-stderrthreshold", "INFO" 27 | ] 28 | -------------------------------------------------------------------------------- /prow_config.yaml: -------------------------------------------------------------------------------- 1 | # This file configures the workflows to trigger in our Prow jobs. 2 | # see kubeflow/testing/py/run_e2e_workflow.py 3 | workflows: 4 | - app_dir: kubeflow/chainer-operator/test/workflows 5 | # these super-short names are required so that identity lengths will be shorter than 64 6 | component: e2e 7 | name: chr 8 | job_types: 9 | - presubmit 10 | params: 11 | registry: "gcr.io/kubeflow-ci" 12 | - app_dir: kubeflow/chainer-operator/test/workflows 13 | component: e2e 14 | # these super-short names are required so that identity lengths will be shorter than 64 15 | name: chr 16 | job_types: 17 | - postsubmit 18 | params: 19 | registry: "gcr.io/kubeflow-images-public" 20 | -------------------------------------------------------------------------------- /pkg/util/signals/signal_windows.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2017 The Kubernetes Authors. 
3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | http://www.apache.org/licenses/LICENSE-2.0 7 | Unless required by applicable law or agreed to in writing, software 8 | distributed under the License is distributed on an "AS IS" BASIS, 9 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | See the License for the specific language governing permissions and 11 | limitations under the License. 12 | */ 13 | 14 | package signals 15 | 16 | import ( 17 | "os" 18 | ) 19 | 20 | var shutdownSignals = []os.Signal{os.Interrupt} 21 | -------------------------------------------------------------------------------- /pkg/util/signals/signal_posix.go: -------------------------------------------------------------------------------- 1 | // +build !windows 2 | 3 | /* 4 | Copyright 2017 The Kubernetes Authors. 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | */ 15 | 16 | package signals 17 | 18 | import ( 19 | "os" 20 | "syscall" 21 | ) 22 | 23 | var shutdownSignals = []os.Signal{os.Interrupt, syscall.SIGTERM} 24 | -------------------------------------------------------------------------------- /pkg/client/clientset/versioned/doc.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by client-gen. DO NOT EDIT. 16 | 17 | // This package has the automatically generated clientset. 18 | package versioned 19 | -------------------------------------------------------------------------------- /pkg/client/clientset/versioned/fake/doc.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by client-gen. DO NOT EDIT. 16 | 17 | // This package has the automatically generated fake clientset. 
18 | package fake 19 | -------------------------------------------------------------------------------- /pkg/client/clientset/versioned/typed/chainer/v1alpha1/fake/doc.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by client-gen. DO NOT EDIT. 16 | 17 | // Package fake has the automatically generated clients. 18 | package fake 19 | -------------------------------------------------------------------------------- /pkg/client/clientset/versioned/typed/chainer/v1alpha1/generated_expansion.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by client-gen. DO NOT EDIT. 
16 | 17 | package v1alpha1 18 | 19 | type ChainerJobExpansion interface{} 20 | -------------------------------------------------------------------------------- /pkg/client/clientset/versioned/scheme/doc.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by client-gen. DO NOT EDIT. 16 | 17 | // This package contains the scheme of the automatically generated clientset. 18 | package scheme 19 | -------------------------------------------------------------------------------- /pkg/client/clientset/versioned/typed/chainer/v1alpha1/doc.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | // Code generated by client-gen. DO NOT EDIT. 16 | 17 | // This package has the automatically generated typed clients. 18 | package v1alpha1 19 | -------------------------------------------------------------------------------- /pkg/apis/chainer/v1alpha1/doc.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // +k8s:deepcopy-gen=package 16 | // +k8s:defaulter-gen=TypeMeta 17 | 18 | // Package v1alpha1 is the v1alpha1 version of the API. 
19 | // +groupName=kubeflow.org 20 | package v1alpha1 21 | -------------------------------------------------------------------------------- /examples/chainerjob-mn.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v1alpha1 2 | kind: ChainerJob 3 | metadata: 4 | name: example-job-mn 5 | spec: 6 | backend: mpi 7 | master: 8 | template: 9 | spec: 10 | containers: 11 | - name: chainer 12 | image: everpeace/chainermn:1.3.0 13 | command: 14 | - sh 15 | - -c 16 | - | 17 | mpiexec -n 3 -N 1 --allow-run-as-root --display-map --mca mpi_cuda_support 0 \ 18 | python3 /train_mnist.py -e 2 -b 1000 -u 100 19 | workerSets: 20 | ws0: 21 | replicas: 2 22 | template: 23 | spec: 24 | containers: 25 | - name: chainer 26 | image: everpeace/chainermn:1.3.0 27 | command: 28 | - sh 29 | - -c 30 | - | 31 | while true; do sleep 1 & wait; done 32 | -------------------------------------------------------------------------------- /scripts/gcloud-util.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2018 The Kubernetes Authors. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | function gcloud::auth_activate(){ 18 | echo "Activating service-account" 19 | gcloud auth activate-service-account --key-file=${GOOGLE_APPLICATION_CREDENTIALS} 20 | gcloud version 21 | } 22 | -------------------------------------------------------------------------------- /test/workflows/.ksonnet/registries/incubator/40285d8a14f1ac5787e405e1023cf0c07f6aa28c.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 0.1.0 2 | gitVersion: 3 | commitSha: 40285d8a14f1ac5787e405e1023cf0c07f6aa28c 4 | refSpec: master 5 | kind: ksonnet.io/registry 6 | libraries: 7 | apache: 8 | path: apache 9 | version: master 10 | efk: 11 | path: efk 12 | version: master 13 | mariadb: 14 | path: mariadb 15 | version: master 16 | memcached: 17 | path: memcached 18 | version: master 19 | mongodb: 20 | path: mongodb 21 | version: master 22 | mysql: 23 | path: mysql 24 | version: master 25 | nginx: 26 | path: nginx 27 | version: master 28 | node: 29 | path: node 30 | version: master 31 | postgres: 32 | path: postgres 33 | version: master 34 | redis: 35 | path: redis 36 | version: master 37 | tomcat: 38 | path: tomcat 39 | version: master 40 | -------------------------------------------------------------------------------- /pkg/apis/chainer/v1alpha1/constants.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package v1alpha1 16 | 17 | const ( 18 | // DefaultContainerName is the default container name 19 | DefaultContainerName = "chainer" 20 | // DefaultSlots is the default slot 21 | DefaultSlots = 1 22 | 23 | // DefaultRestartPolicy is the default restart policy 24 | DefaultRestartPolicy = "Never" 25 | ) 26 | -------------------------------------------------------------------------------- /pkg/client/listers/chainer/v1alpha1/expansion_generated.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by lister-gen. DO NOT EDIT. 16 | 17 | package v1alpha1 18 | 19 | // ChainerJobListerExpansion allows custom methods to be added to 20 | // ChainerJobLister. 21 | type ChainerJobListerExpansion interface{} 22 | 23 | // ChainerJobNamespaceListerExpansion allows custom methods to be added to 24 | // ChainerJobNamespaceLister. 25 | type ChainerJobNamespaceListerExpansion interface{} 26 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.10.2-alpine3.7 AS build 2 | 3 | # Install tools required to build the project. 
4 | # We need to run `docker build --no-cache .` to update those dependencies. 5 | RUN apk add --no-cache git bash 6 | RUN go get github.com/golang/dep/cmd/dep 7 | 8 | # Gopkg.toml and Gopkg.lock list project dependencies. 9 | # These layers are only re-built when Gopkg files are updated. 10 | COPY Gopkg.lock Gopkg.toml /go/src/github.com/kubeflow/chainer-operator/ 11 | WORKDIR /go/src/github.com/kubeflow/chainer-operator/ 12 | 13 | # Install library dependencies. 14 | RUN dep ensure -vendor-only 15 | 16 | # Copy the entire project and build it. 17 | # This layer is rebuilt whenever a file changes in the project directory. 18 | COPY . /go/src/github.com/kubeflow/chainer-operator/ 19 | RUN ./hack/verify-codegen.sh && go test github.com/kubeflow/chainer-operator/... 20 | RUN go build -o /bin/chainer-operator github.com/kubeflow/chainer-operator/cmd/chainer-operator 21 | 22 | FROM alpine:3.7 23 | COPY --from=build /bin/chainer-operator /bin/chainer-operator 24 | ENTRYPOINT ["/bin/chainer-operator"] 25 | CMD ["--help"] 26 | -------------------------------------------------------------------------------- /pkg/controllers/backends/types.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package backends 16 | 17 | import ( 18 | apisv1alpha1 "github.com/kubeflow/chainer-operator/pkg/apis/chainer/v1alpha1" 19 | ) 20 | 21 | const ( 22 | ErrResourceExists = "ErrResourceExists" 23 | 24 | MessageResourceExists = "Resource %q already exists and is not managed by ChainerJob" 25 | ) 26 | 27 | // Backend is interface for various backends 28 | type Backend interface { 29 | SyncChainerJob(chjob *apisv1alpha1.ChainerJob) error 30 | } 31 | -------------------------------------------------------------------------------- /scripts/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2018 The Kubeflow Authors. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # This shell script is used to build an image from our argo workflow 18 | 19 | set -o errexit 20 | set -o nounset 21 | set -o pipefail 22 | 23 | export PATH=${GOPATH}/bin:/usr/local/go/bin:${PATH} 24 | REGISTRY="${GCP_REGISTRY}" 25 | PROJECT="${GCP_PROJECT}" 26 | VERSION=$(git describe --tags --always --dirty) 27 | source `dirname $0`/gcloud-util.sh 28 | 29 | gcloud::auth_activate 30 | 31 | # build chainer operator image 32 | echo "building chainer-operator image in gcloud" 33 | gcloud container builds submit . 
--tag=${REGISTRY}/${REPO_NAME}:${VERSION} --project=${PROJECT} 34 | -------------------------------------------------------------------------------- /pkg/controllers/backends/constants.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package backends 16 | 17 | import ( 18 | appsv1 "k8s.io/api/apps/v1" 19 | ) 20 | 21 | const ( 22 | MasterSuffix = "-master" 23 | WorkerSetSuffix = "-workerset" 24 | ServiceAccountSuffix = "-launcher" 25 | RoleSuffix = ServiceAccountSuffix + "-role" 26 | RolebindingSuffix = ServiceAccountSuffix + "-rolebinding" 27 | 28 | JobLabelKey = "chainerjob.kubeflow.org/name" 29 | RoleLabelKey = "chainerjob.kubeflow.org/role" 30 | WorkersetLabelKey = "chainerjob.kubeflow.org/workersetname" 31 | RoleMaster = "master" 32 | RoleWorkerSet = "workerset" 33 | 34 | PodManagementPolicy = appsv1.ParallelPodManagement 35 | ) 36 | -------------------------------------------------------------------------------- /pkg/util/signals/signal.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2017 The Kubernetes Authors. 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | http://www.apache.org/licenses/LICENSE-2.0 7 | Unless required by applicable law or agreed to in writing, software 8 | distributed under the License is distributed on an "AS IS" BASIS, 9 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | See the License for the specific language governing permissions and 11 | limitations under the License. 12 | */ 13 | 14 | package signals 15 | 16 | import ( 17 | "os" 18 | "os/signal" 19 | ) 20 | 21 | var onlyOneSignalHandler = make(chan struct{}) 22 | 23 | // SetupSignalHandler registered for SIGTERM and SIGINT. A stop channel is returned 24 | // which is closed on one of these signals. If a second signal is caught, the program 25 | // is terminated with exit code 1. 26 | func SetupSignalHandler() (stopCh <-chan struct{}) { 27 | close(onlyOneSignalHandler) // panics when called twice 28 | 29 | stop := make(chan struct{}) 30 | c := make(chan os.Signal, 2) 31 | signal.Notify(c, shutdownSignals...) 32 | go func() { 33 | <-c 34 | close(stop) 35 | <-c 36 | os.Exit(1) // second signal. Exit directly. 
37 | }() 38 | 39 | return stop 40 | } 41 | -------------------------------------------------------------------------------- /examples/docker/chainermn/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM chainer/chainer:v4.1.0-python3 2 | 3 | ARG OPENMPI_VERSION="2.1.3" 4 | ARG CHAINER_MN_VERSION="1.3.0" 5 | 6 | # Install basic dependencies and locales 7 | RUN apt-get update && apt-get install -yq --no-install-recommends \ 8 | locales wget sudo ca-certificates ssh build-essential && \ 9 | rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* && \ 10 | echo "en_US.UTF-8 UTF-8" > /etc/locale.gen && locale-gen 11 | 12 | # Install OpenMPI with cuda 13 | RUN cd /tmp && \ 14 | wget -q https://www.open-mpi.org/software/ompi/v${OPENMPI_VERSION%\.*}/downloads/openmpi-$OPENMPI_VERSION.tar.bz2 && \ 15 | tar -xjf openmpi-$OPENMPI_VERSION.tar.bz2 && \ 16 | cd /tmp/openmpi-$OPENMPI_VERSION && \ 17 | ./configure --prefix=/usr --with-cuda && make -j2 && make install && rm -r /tmp/openmpi-$OPENMPI_VERSION* && \ 18 | ompi_info --parsable --all | grep -q "mpi_built_with_cuda_support:value:true" 19 | 20 | # Install ChainerMN 21 | RUN pip3 install chainermn==$CHAINER_MN_VERSION 22 | 23 | # Download train_mnist.py example of ChainerMN and download mnist data 24 | # In practice, you would download your codes here. 25 | RUN mkdir -p /chainermn-examples/mnist && \ 26 | cd /chainermn-examples/mnist && \ 27 | wget https://raw.githubusercontent.com/chainer/chainermn/v${CHAINER_MN_VERSION}/examples/mnist/train_mnist.py -O /train_mnist.py && \ 28 | python3 -c "import chainer; chainer.datasets.get_mnist()" 29 | -------------------------------------------------------------------------------- /scripts/create-cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2018 The Kubernetes Authors. 
4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # This shell script is used to build a cluster and create a namespace from our 18 | # argo workflow 19 | 20 | 21 | set -xe 22 | set -o pipefail 23 | 24 | export CLUSTER_NAME="${CLUSTER_NAME}" 25 | export ZONE="${GCP_ZONE}" 26 | export PROJECT="${GCP_PROJECT}" 27 | export K8S_NAMESPACE="${DEPLOY_NAMESPACE}" 28 | KFCTL_DIR=${KFCTL_DIR} 29 | WORK_DIR=$(mktemp -d) 30 | source `dirname $0`/kfctl-util.sh 31 | source `dirname $0`/gcloud-util.sh 32 | 33 | gcloud::auth_activate 34 | 35 | cd ${WORK_DIR} 36 | 37 | kfctl::init ${KFCTL_DIR} ${CLUSTER_NAME} ${PROJECT} 38 | 39 | cd ${CLUSTER_NAME} 40 | cat env.sh # for debugging 41 | 42 | export CLIENT_ID=dummy 43 | export CLIENT_SECRET=dummy 44 | kfctl::generate ${KFCTL_DIR} platform 45 | kfctl::apply ${KFCTL_DIR} platform 46 | -------------------------------------------------------------------------------- /hack/update-codegen.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2018 The Kubeflow Authors. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | set -o errexit 18 | set -o nounset 19 | set -o pipefail 20 | 21 | SCRIPT_ROOT=$(dirname ${BASH_SOURCE})/.. 22 | 23 | vendor/k8s.io/code-generator/generate-groups.sh \ 24 | "deepcopy,client,informer,lister" \ 25 | github.com/kubeflow/chainer-operator/pkg/client \ 26 | github.com/kubeflow/chainer-operator/pkg/apis \ 27 | chainer:v1alpha1 \ 28 | --go-header-file ${SCRIPT_ROOT}/hack/boilerplate/boilerplate.go.txt 29 | 30 | # Notice: The code in code-generator does not generate defaulter by default. 31 | echo "Generating defaulters for v1alpha1" 32 | ${GOPATH}/bin/defaulter-gen \ 33 | --input-dirs github.com/kubeflow/chainer-operator/pkg/apis/chainer/v1alpha1 \ 34 | -O zz_generated.defaults \ 35 | --go-header-file hack/boilerplate/boilerplate.go.txt "$@" 36 | -------------------------------------------------------------------------------- /pkg/client/clientset/versioned/typed/chainer/v1alpha1/fake/fake_chainer_client.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by client-gen. DO NOT EDIT. 16 | 17 | package fake 18 | 19 | import ( 20 | v1alpha1 "github.com/kubeflow/chainer-operator/pkg/client/clientset/versioned/typed/chainer/v1alpha1" 21 | rest "k8s.io/client-go/rest" 22 | testing "k8s.io/client-go/testing" 23 | ) 24 | 25 | type FakeKubeflowV1alpha1 struct { 26 | *testing.Fake 27 | } 28 | 29 | func (c *FakeKubeflowV1alpha1) ChainerJobs(namespace string) v1alpha1.ChainerJobInterface { 30 | return &FakeChainerJobs{c, namespace} 31 | } 32 | 33 | // RESTClient returns a RESTClient that is used to communicate 34 | // with API server by this client implementation. 35 | func (c *FakeKubeflowV1alpha1) RESTClient() rest.Interface { 36 | var ret *rest.RESTClient 37 | return ret 38 | } 39 | -------------------------------------------------------------------------------- /hack/verify-codegen.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2018 The Kubeflow Authors. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | set -o errexit 18 | set -o nounset 19 | set -o pipefail 20 | 21 | SCRIPT_ROOT=$(dirname "${BASH_SOURCE}")/.. 22 | 23 | DIFFROOT="${SCRIPT_ROOT}/pkg" 24 | TMP_DIFFROOT="${SCRIPT_ROOT}/_tmp/pkg" 25 | _tmp="${SCRIPT_ROOT}/_tmp" 26 | 27 | cleanup() { 28 | rm -rf "${_tmp}" 29 | } 30 | trap "cleanup" EXIT SIGINT 31 | 32 | cleanup 33 | 34 | mkdir -p "${TMP_DIFFROOT}" 35 | cp -a "${DIFFROOT}"/* "${TMP_DIFFROOT}" 36 | 37 | "${SCRIPT_ROOT}/hack/update-codegen.sh" 38 | echo "diffing ${DIFFROOT} against freshly generated codegen" 39 | ret=0 40 | diff -Naupr "${DIFFROOT}" "${TMP_DIFFROOT}" || ret=$? 41 | cp -a "${TMP_DIFFROOT}"/* "${DIFFROOT}" 42 | if [[ $ret -eq 0 ]] 43 | then 44 | echo "${DIFFROOT} up to date." 45 | else 46 | echo "${DIFFROOT} is out of date. Please run hack/update-codegen.sh" 47 | exit 1 48 | fi -------------------------------------------------------------------------------- /pkg/client/informers/externalversions/internalinterfaces/factory_interfaces.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by informer-gen. DO NOT EDIT. 
16 | 17 | package internalinterfaces 18 | 19 | import ( 20 | time "time" 21 | 22 | versioned "github.com/kubeflow/chainer-operator/pkg/client/clientset/versioned" 23 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 24 | runtime "k8s.io/apimachinery/pkg/runtime" 25 | cache "k8s.io/client-go/tools/cache" 26 | ) 27 | 28 | type NewInformerFunc func(versioned.Interface, time.Duration) cache.SharedIndexInformer 29 | 30 | // SharedInformerFactory a small interface to allow for adding an informer without an import cycle 31 | type SharedInformerFactory interface { 32 | Start(stopCh <-chan struct{}) 33 | InformerFor(obj runtime.Object, newFunc NewInformerFunc) cache.SharedIndexInformer 34 | } 35 | 36 | type TweakListOptionsFunc func(*v1.ListOptions) 37 | -------------------------------------------------------------------------------- /scripts/delete-cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2018 The Kubernetes Authors. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | # This shell script is used to delete the cluster (kfctl delete platform) from our 18 | # argo workflow 19 | 20 | 21 | set -xe 22 | set -o pipefail 23 | 24 | export CLUSTER_NAME="${CLUSTER_NAME}" 25 | export ZONE="${GCP_ZONE}" 26 | export PROJECT="${GCP_PROJECT}" 27 | export K8S_NAMESPACE="${DEPLOY_NAMESPACE}" 28 | KFCTL_DIR="${KFCTL_DIR}" 29 | WORK_DIR=$(mktemp -d) 30 | source `dirname $0`/kfctl-util.sh 31 | source `dirname $0`/gcloud-util.sh 32 | 33 | gcloud::auth_activate 34 | echo "Configuring kubectl" 35 | gcloud --project ${PROJECT} container clusters get-credentials ${CLUSTER_NAME} \ 36 | --zone ${ZONE} 37 | 38 | cd ${WORK_DIR} 39 | 40 | kfctl::init ${KFCTL_DIR} ${CLUSTER_NAME} ${PROJECT} 41 | 42 | cd ${CLUSTER_NAME} 43 | cat env.sh # for debugging 44 | 45 | export CLIENT_ID=dummy 46 | export CLIENT_SECRET=dummy 47 | kfctl::generate ${KFCTL_DIR} all 48 | kfctl::delete ${KFCTL_DIR} platform 49 | -------------------------------------------------------------------------------- /scripts/kfctl-util.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2018 The Kubernetes Authors. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
16 | 17 | function kfctl::_simple_exec(){ 18 | local kfctl_dir=$1 19 | local command=$2 20 | local what=$3 21 | 22 | echo "executing kfctl ${command} ${what}" 23 | ${kfctl_dir}/scripts/kfctl.sh ${command} ${what} 24 | } 25 | 26 | function kfctl::init(){ 27 | local kfctl_dir=$1 28 | local deploy_name=$2 29 | local gcp_project=$3 30 | 31 | echo "Initializing kfctl" 32 | ${kfctl_dir}/scripts/kfctl.sh init ${deploy_name} \ 33 | --platform gcp \ 34 | --project ${gcp_project} \ 35 | --skipInitProject ${gcp_project} 36 | } 37 | 38 | function kfctl::generate(){ 39 | local kfctl_dir=$1 40 | local what=$2 41 | kfctl::_simple_exec ${kfctl_dir} generate ${what} 42 | } 43 | 44 | function kfctl::apply(){ 45 | local kfctl_dir=$1 46 | local what=$2 47 | kfctl::_simple_exec ${kfctl_dir} apply ${what} 48 | } 49 | 50 | function kfctl::delete(){ 51 | local kfctl_dir=$1 52 | local what=$2 53 | kfctl::_simple_exec ${kfctl_dir} delete ${what} 54 | } 55 | -------------------------------------------------------------------------------- /pkg/apis/chainer/v1alpha1/zz_generated.defaults.go: -------------------------------------------------------------------------------- 1 | // +build !ignore_autogenerated 2 | 3 | // Copyright 2018 The Kubeflow Authors. 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | // Code generated by defaulter-gen. DO NOT EDIT. 
18 | 19 | package v1alpha1 20 | 21 | import ( 22 | runtime "k8s.io/apimachinery/pkg/runtime" 23 | ) 24 | 25 | // RegisterDefaults adds defaulters functions to the given scheme. 26 | // Public to allow building arbitrary schemes. 27 | // All generated defaulters are covering - they call all nested defaulters. 28 | func RegisterDefaults(scheme *runtime.Scheme) error { 29 | scheme.AddTypeDefaultingFunc(&ChainerJob{}, func(obj interface{}) { SetObjectDefaults_ChainerJob(obj.(*ChainerJob)) }) 30 | scheme.AddTypeDefaultingFunc(&ChainerJobList{}, func(obj interface{}) { SetObjectDefaults_ChainerJobList(obj.(*ChainerJobList)) }) 31 | return nil 32 | } 33 | 34 | func SetObjectDefaults_ChainerJob(in *ChainerJob) { 35 | SetDefaults_ChainerJob(in) 36 | } 37 | 38 | func SetObjectDefaults_ChainerJobList(in *ChainerJobList) { 39 | for i := range in.Items { 40 | a := &in.Items[i] 41 | SetObjectDefaults_ChainerJob(a) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /pkg/client/informers/externalversions/chainer/v1alpha1/interface.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by informer-gen. DO NOT EDIT. 
16 | 17 | package v1alpha1 18 | 19 | import ( 20 | internalinterfaces "github.com/kubeflow/chainer-operator/pkg/client/informers/externalversions/internalinterfaces" 21 | ) 22 | 23 | // Interface provides access to all the informers in this group version. 24 | type Interface interface { 25 | // ChainerJobs returns a ChainerJobInformer. 26 | ChainerJobs() ChainerJobInformer 27 | } 28 | 29 | type version struct { 30 | factory internalinterfaces.SharedInformerFactory 31 | namespace string 32 | tweakListOptions internalinterfaces.TweakListOptionsFunc 33 | } 34 | 35 | // New returns a new Interface. 36 | func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface { 37 | return &version{factory: f, namespace: namespace, tweakListOptions: tweakListOptions} 38 | } 39 | 40 | // ChainerJobs returns a ChainerJobInformer. 41 | func (v *version) ChainerJobs() ChainerJobInformer { 42 | return &chainerJobInformer{factory: v.factory, namespace: v.namespace, tweakListOptions: v.tweakListOptions} 43 | } 44 | -------------------------------------------------------------------------------- /deploy/2-rbac.yaml: -------------------------------------------------------------------------------- 1 | kind: ClusterRole 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | metadata: 4 | name: chainer-operator 5 | rules: 6 | - apiGroups: 7 | - "" 8 | resources: 9 | - configmaps 10 | - serviceaccounts 11 | verbs: 12 | - create 13 | - update 14 | - list 15 | - watch 16 | # This is needed for the launcher Role. 17 | - apiGroups: 18 | - "" 19 | resources: 20 | - pods 21 | verbs: 22 | - get 23 | - list 24 | # This is needed for the launcher Role. 
25 | - apiGroups: 26 | - "" 27 | resources: 28 | - pods/exec 29 | verbs: 30 | - create 31 | - apiGroups: 32 | - "" 33 | resources: 34 | - events 35 | verbs: 36 | - create 37 | - patch 38 | - apiGroups: 39 | - rbac.authorization.k8s.io 40 | resources: 41 | - roles 42 | - rolebindings 43 | verbs: 44 | - create 45 | - update 46 | - list 47 | - watch 48 | - apiGroups: 49 | - apps 50 | resources: 51 | - statefulsets 52 | verbs: 53 | - get 54 | - create 55 | - list 56 | - update 57 | - watch 58 | - apiGroups: 59 | - batch 60 | resources: 61 | - jobs 62 | verbs: 63 | - create 64 | - list 65 | - watch 66 | - apiGroups: 67 | - apiextensions.k8s.io 68 | resources: 69 | - customresourcedefinitions 70 | verbs: 71 | - create 72 | - get 73 | - apiGroups: 74 | - kubeflow.org 75 | resources: 76 | - chainerjobs 77 | verbs: 78 | - "*" 79 | --- 80 | apiVersion: v1 81 | kind: ServiceAccount 82 | metadata: 83 | name: chainer-operator 84 | namespace: chainer-operator 85 | --- 86 | kind: ClusterRoleBinding 87 | apiVersion: rbac.authorization.k8s.io/v1 88 | metadata: 89 | name: chainer-operator 90 | namespace: chainer-operator 91 | roleRef: 92 | apiGroup: rbac.authorization.k8s.io 93 | kind: ClusterRole 94 | name: chainer-operator 95 | subjects: 96 | - kind: ServiceAccount 97 | name: chainer-operator 98 | namespace: chainer-operator -------------------------------------------------------------------------------- /pkg/client/informers/externalversions/chainer/interface.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by informer-gen. DO NOT EDIT. 16 | 17 | package kubeflow 18 | 19 | import ( 20 | v1alpha1 "github.com/kubeflow/chainer-operator/pkg/client/informers/externalversions/chainer/v1alpha1" 21 | internalinterfaces "github.com/kubeflow/chainer-operator/pkg/client/informers/externalversions/internalinterfaces" 22 | ) 23 | 24 | // Interface provides access to each of this group's versions. 25 | type Interface interface { 26 | // V1alpha1 provides access to shared informers for resources in V1alpha1. 27 | V1alpha1() v1alpha1.Interface 28 | } 29 | 30 | type group struct { 31 | factory internalinterfaces.SharedInformerFactory 32 | namespace string 33 | tweakListOptions internalinterfaces.TweakListOptionsFunc 34 | } 35 | 36 | // New returns a new Interface. 37 | func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface { 38 | return &group{factory: f, namespace: namespace, tweakListOptions: tweakListOptions} 39 | } 40 | 41 | // V1alpha1 returns a new v1alpha1.Interface. 42 | func (g *group) V1alpha1() v1alpha1.Interface { 43 | return v1alpha1.New(g.factory, g.namespace, g.tweakListOptions) 44 | } 45 | -------------------------------------------------------------------------------- /pkg/client/clientset/versioned/fake/register.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by client-gen. DO NOT EDIT. 16 | 17 | package fake 18 | 19 | import ( 20 | kubeflowv1alpha1 "github.com/kubeflow/chainer-operator/pkg/apis/chainer/v1alpha1" 21 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 22 | runtime "k8s.io/apimachinery/pkg/runtime" 23 | schema "k8s.io/apimachinery/pkg/runtime/schema" 24 | serializer "k8s.io/apimachinery/pkg/runtime/serializer" 25 | ) 26 | 27 | var scheme = runtime.NewScheme() 28 | var codecs = serializer.NewCodecFactory(scheme) 29 | var parameterCodec = runtime.NewParameterCodec(scheme) 30 | 31 | func init() { 32 | v1.AddToGroupVersion(scheme, schema.GroupVersion{Version: "v1"}) 33 | AddToScheme(scheme) 34 | } 35 | 36 | // AddToScheme adds all types of this clientset into the given scheme. This allows composition 37 | // of clientsets, like in: 38 | // 39 | // import ( 40 | // "k8s.io/client-go/kubernetes" 41 | // clientsetscheme "k8s.io/client-go/kubernetes/scheme" 42 | // aggregatorclientsetscheme "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset/scheme" 43 | // ) 44 | // 45 | // kclientset, _ := kubernetes.NewForConfig(c) 46 | // aggregatorclientsetscheme.AddToScheme(clientsetscheme.Scheme) 47 | // 48 | // After this, RawExtensions in Kubernetes types will serialize kube-aggregator types 49 | // correctly. 
50 | func AddToScheme(scheme *runtime.Scheme) { 51 | kubeflowv1alpha1.AddToScheme(scheme) 52 | } 53 | -------------------------------------------------------------------------------- /pkg/client/clientset/versioned/scheme/register.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by client-gen. DO NOT EDIT. 16 | 17 | package scheme 18 | 19 | import ( 20 | kubeflowv1alpha1 "github.com/kubeflow/chainer-operator/pkg/apis/chainer/v1alpha1" 21 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 22 | runtime "k8s.io/apimachinery/pkg/runtime" 23 | schema "k8s.io/apimachinery/pkg/runtime/schema" 24 | serializer "k8s.io/apimachinery/pkg/runtime/serializer" 25 | ) 26 | 27 | var Scheme = runtime.NewScheme() 28 | var Codecs = serializer.NewCodecFactory(Scheme) 29 | var ParameterCodec = runtime.NewParameterCodec(Scheme) 30 | 31 | func init() { 32 | v1.AddToGroupVersion(Scheme, schema.GroupVersion{Version: "v1"}) 33 | AddToScheme(Scheme) 34 | } 35 | 36 | // AddToScheme adds all types of this clientset into the given scheme. 
This allows composition 37 | // of clientsets, like in: 38 | // 39 | // import ( 40 | // "k8s.io/client-go/kubernetes" 41 | // clientsetscheme "k8s.io/client-go/kubernetes/scheme" 42 | // aggregatorclientsetscheme "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset/scheme" 43 | // ) 44 | // 45 | // kclientset, _ := kubernetes.NewForConfig(c) 46 | // aggregatorclientsetscheme.AddToScheme(clientsetscheme.Scheme) 47 | // 48 | // After this, RawExtensions in Kubernetes types will serialize kube-aggregator types 49 | // correctly. 50 | func AddToScheme(scheme *runtime.Scheme) { 51 | kubeflowv1alpha1.AddToScheme(scheme) 52 | } 53 | -------------------------------------------------------------------------------- /pkg/client/informers/externalversions/generic.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by informer-gen. DO NOT EDIT. 
16 | 17 | package externalversions 18 | 19 | import ( 20 | "fmt" 21 | 22 | v1alpha1 "github.com/kubeflow/chainer-operator/pkg/apis/chainer/v1alpha1" 23 | schema "k8s.io/apimachinery/pkg/runtime/schema" 24 | cache "k8s.io/client-go/tools/cache" 25 | ) 26 | 27 | // GenericInformer is type of SharedIndexInformer which will locate and delegate to other 28 | // sharedInformers based on type 29 | type GenericInformer interface { 30 | Informer() cache.SharedIndexInformer 31 | Lister() cache.GenericLister 32 | } 33 | 34 | type genericInformer struct { 35 | informer cache.SharedIndexInformer 36 | resource schema.GroupResource 37 | } 38 | 39 | // Informer returns the SharedIndexInformer. 40 | func (f *genericInformer) Informer() cache.SharedIndexInformer { 41 | return f.informer 42 | } 43 | 44 | // Lister returns the GenericLister. 45 | func (f *genericInformer) Lister() cache.GenericLister { 46 | return cache.NewGenericLister(f.Informer().GetIndexer(), f.resource) 47 | } 48 | 49 | // ForResource gives generic access to a shared informer of the matching type 50 | // TODO extend this to unknown resources with a client pool 51 | func (f *sharedInformerFactory) ForResource(resource schema.GroupVersionResource) (GenericInformer, error) { 52 | switch resource { 53 | // Group=kubeflow.org, Version=v1alpha1 54 | case v1alpha1.SchemeGroupVersion.WithResource("chainerjobs"): 55 | return &genericInformer{resource: resource.GroupResource(), informer: f.Kubeflow().V1alpha1().ChainerJobs().Informer()}, nil 56 | 57 | } 58 | 59 | return nil, fmt.Errorf("no informer found for %v", resource) 60 | } 61 | -------------------------------------------------------------------------------- /pkg/apis/chainer/v1alpha1/types.go: -------------------------------------------------------------------------------- 1 | package v1alpha1 2 | 3 | import ( 4 | batchv1 "k8s.io/api/batch/v1" 5 | "k8s.io/api/core/v1" 6 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 7 | ) 8 | 9 | // +genclient 10 | // 
+genclient:noStatus 11 | // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object 12 | 13 | // ChainerJob describes a chainer job. 14 | type ChainerJob struct { 15 | metav1.TypeMeta `json:",inline"` 16 | metav1.ObjectMeta `json:"metadata,omitempty"` 17 | Spec ChainerJobSpec `json:"spec"` 18 | Status batchv1.JobStatus `json:"status"` 19 | } 20 | 21 | // IsDistributed reports whether the chainerjob is in distributed mode, i.e. it is non-nil and defines at least one WorkerSet. 22 | func IsDistributed(chjob *ChainerJob) bool { 23 | return chjob != nil && len(chjob.Spec.WorkerSets) > 0 24 | } 25 | 26 | // BackendType is the type of backend 27 | type BackendType string 28 | 29 | const ( 30 | // BackendTypeMPI is the backend type for MPI ("mpi"). 31 | BackendTypeMPI BackendType = "mpi" 32 | ) 33 | 34 | // ChainerJobSpec defines a spec of ChainerJob 35 | type ChainerJobSpec struct { 36 | // Backend is a type of backend for how to communicate processes. 37 | // This is valid only when WorkerSets is present. 38 | Backend BackendType `json:"backend,omitempty"` 39 | 40 | // Master is a master of the job 41 | Master MasterSpec `json:"master"` 42 | 43 | // WorkerSets is a map whose key is workerset name and value is WorkerSetSpec 44 | // Users can define heterogeneous WorkerSets 45 | WorkerSets map[string]*WorkerSetSpec `json:"workerSets,omitempty"` 46 | } 47 | 48 | // MasterSpec defines a spec of master of mpi cluster 49 | type MasterSpec struct { 50 | ActiveDeadlineSeconds *int64 `json:"activeDeadlineSeconds,omitempty"` 51 | BackoffLimit *int32 `json:"backoffLimit,omitempty"` 52 | MPIConfig *MPIConfig `json:"mpiConfig,omitempty"` 53 | Template v1.PodTemplateSpec `json:"template"` 54 | } 55 | 56 | // WorkerSetSpec defines spec of workers of mpi cluster 57 | type WorkerSetSpec struct { 58 | Replicas *int32 `json:"replicas"` 59 | MPIConfig *MPIConfig `json:"mpiConfig"` 60 | Template v1.PodTemplateSpec `json:"template"` 61 | } 62 | 63 | // MPIConfig is config object for `backend: mpi` 64 | type MPIConfig struct { 65 | Slots *int32 `json:"slots,omitempty"`
66 | } 67 | 68 | // +resource:path=chainerjobs 69 | // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object 70 | 71 | // ChainerJobList is a list of ChainerJob clusters. 72 | type ChainerJobList struct { 73 | metav1.TypeMeta `json:",inline"` 74 | // Standard list metadata 75 | // More info: http://releases.k8s.io/HEAD/docs/devel/api-conventions.md#metadata 76 | metav1.ListMeta `json:"metadata,omitempty"` 77 | // Items is a list of ChainerJobs 78 | Items []ChainerJob `json:"items"` 79 | } 80 | -------------------------------------------------------------------------------- /pkg/apis/chainer/v1alpha1/register.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package v1alpha1 16 | 17 | import ( 18 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 19 | "k8s.io/apimachinery/pkg/runtime" 20 | "k8s.io/apimachinery/pkg/runtime/schema" 21 | ) 22 | 23 | var ( 24 | // TODO: move SchemeBuilder with zz_generated.deepcopy.go to k8s.io/api. 25 | // localSchemeBuilder and AddToScheme will stay in k8s.io/kubernetes. 26 | SchemeBuilder runtime.SchemeBuilder 27 | localSchemeBuilder = &SchemeBuilder 28 | AddToScheme = localSchemeBuilder.AddToScheme 29 | ) 30 | 31 | const ( 32 | // GroupName is the group name use in this package. 33 | GroupName = "kubeflow.org" 34 | // Kind is the kind name. 
35 | Kind = "ChainerJob" 36 | // GroupVersion is the version. 37 | GroupVersion = "v1alpha1" 38 | // Plural is the Plural for ChainerJob. 39 | Plural = "chainerjobs" 40 | // Singular is the singular for ChainerJob. 41 | Singular = "chainerjob" 42 | ) 43 | 44 | var ( 45 | // SchemeGroupVersion is the group version used to register these objects. 46 | SchemeGroupVersion = schema.GroupVersion{ 47 | Group: GroupName, 48 | Version: GroupVersion, 49 | } 50 | // SchemeGroupVersionKind is the GroupVersionKind of the resource. 51 | SchemeGroupVersionKind = SchemeGroupVersion.WithKind(Kind) 52 | ) 53 | 54 | func init() { 55 | // We only register manually written functions here. The registration of the 56 | // generated functions takes place in the generated files. The separation 57 | // makes the code compile even when the generated files are missing. 58 | localSchemeBuilder.Register(addKnownTypes) 59 | localSchemeBuilder.Register(addDefaultingFuncs) 60 | } 61 | 62 | // Resource takes an unqualified resource and returns a Group-qualified GroupResource. 63 | func Resource(resource string) schema.GroupResource { 64 | return SchemeGroupVersion.WithResource(resource).GroupResource() 65 | } 66 | 67 | // addKnownTypes adds the set of types defined in this package to the supplied scheme. 68 | func addKnownTypes(scheme *runtime.Scheme) error { 69 | scheme.AddKnownTypes(SchemeGroupVersion, 70 | &ChainerJob{}, 71 | &ChainerJobList{}, 72 | ) 73 | metav1.AddToGroupVersion(scheme, SchemeGroupVersion) 74 | return nil 75 | } 76 | -------------------------------------------------------------------------------- /pkg/client/clientset/versioned/typed/chainer/v1alpha1/chainer_client.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by client-gen. DO NOT EDIT. 16 | 17 | package v1alpha1 18 | 19 | import ( 20 | v1alpha1 "github.com/kubeflow/chainer-operator/pkg/apis/chainer/v1alpha1" 21 | "github.com/kubeflow/chainer-operator/pkg/client/clientset/versioned/scheme" 22 | serializer "k8s.io/apimachinery/pkg/runtime/serializer" 23 | rest "k8s.io/client-go/rest" 24 | ) 25 | 26 | type KubeflowV1alpha1Interface interface { 27 | RESTClient() rest.Interface 28 | ChainerJobsGetter 29 | } 30 | 31 | // KubeflowV1alpha1Client is used to interact with features provided by the kubeflow.org group. 32 | type KubeflowV1alpha1Client struct { 33 | restClient rest.Interface 34 | } 35 | 36 | func (c *KubeflowV1alpha1Client) ChainerJobs(namespace string) ChainerJobInterface { 37 | return newChainerJobs(c, namespace) 38 | } 39 | 40 | // NewForConfig creates a new KubeflowV1alpha1Client for the given config. 41 | func NewForConfig(c *rest.Config) (*KubeflowV1alpha1Client, error) { 42 | config := *c 43 | if err := setConfigDefaults(&config); err != nil { 44 | return nil, err 45 | } 46 | client, err := rest.RESTClientFor(&config) 47 | if err != nil { 48 | return nil, err 49 | } 50 | return &KubeflowV1alpha1Client{client}, nil 51 | } 52 | 53 | // NewForConfigOrDie creates a new KubeflowV1alpha1Client for the given config and 54 | // panics if there is an error in the config. 
55 | func NewForConfigOrDie(c *rest.Config) *KubeflowV1alpha1Client { 56 | client, err := NewForConfig(c) 57 | if err != nil { 58 | panic(err) 59 | } 60 | return client 61 | } 62 | 63 | // New creates a new KubeflowV1alpha1Client for the given RESTClient. 64 | func New(c rest.Interface) *KubeflowV1alpha1Client { 65 | return &KubeflowV1alpha1Client{c} 66 | } 67 | 68 | func setConfigDefaults(config *rest.Config) error { 69 | gv := v1alpha1.SchemeGroupVersion 70 | config.GroupVersion = &gv 71 | config.APIPath = "/apis" 72 | config.NegotiatedSerializer = serializer.DirectCodecFactory{CodecFactory: scheme.Codecs} 73 | 74 | if config.UserAgent == "" { 75 | config.UserAgent = rest.DefaultKubernetesUserAgent() 76 | } 77 | 78 | return nil 79 | } 80 | 81 | // RESTClient returns a RESTClient that is used to communicate 82 | // with API server by this client implementation. 83 | func (c *KubeflowV1alpha1Client) RESTClient() rest.Interface { 84 | if c == nil { 85 | return nil 86 | } 87 | return c.restClient 88 | } 89 | -------------------------------------------------------------------------------- /examples/chainerjob-reference.yaml: -------------------------------------------------------------------------------- 1 | # This is a reference of ChainerJob 2 | apiVersion: kubeflow.org/v1alpha1 3 | kind: ChainerJob 4 | metadata: 5 | name: example-job-mn 6 | 7 | # ChainerJob in distributed mode consits of Master and multiple WorkerSets. 8 | # Master is the pod (job technically) to boot your entire distributed job. 9 | # WorkerSet is the set of homogenous pod (statefulset technically). 10 | # You can define multiple WorkerSets to make heterogeneous WorkerSets. 11 | spec: 12 | 13 | # 'backend' defines the protocol to initiate process groups and exchange 14 | # tensor data among the processes. Current supported backend is "mpi". 15 | backend: mpi 16 | 17 | # master is responsible for spawning/exiting entire distributed learning 18 | # process. 
You can configure 'activeDeadlineSeconds'/'backoffLimit' to 19 | # customize retry behavior on failure. 20 | master: 21 | 22 | # slots in autogenerated hostfile is configurable. 23 | # Default slot is 1 or the number of GPUs you requested on 24 | # 'chainer' container. 25 | mpiConfig: 26 | slots: 1 27 | 28 | # Retry behaviors on failure of master 29 | activeDeadlineSeconds: 6000 30 | backoffLimit: 60 31 | 32 | # You can put any pod template here. There are several exceptions: 33 | # - a container "chainer" must exist. this is the place your mpi processes run. 34 | # - Only `restartPolicy` equal to `Never` or `OnFailure` is allowed. Default is `Never`. 35 | template: 36 | spec: 37 | containers: 38 | - name: chainer 39 | image: everpeace/chainermn:1.3.0 40 | command: 41 | - sh 42 | - -c 43 | - | 44 | mpiexec -n 3 -N 1 --allow-run-as-root --display-map --mca mpi_cuda_support 0 \ 45 | python3 /train_mnist.py -e 2 -b 1000 -u 100 46 | 47 | # You can define multiple WorkerSets to have heterogeneous WorkerSets 48 | # When the master completes (success or failure), all WorkerSets will be scaled-down to 0. 49 | workerSets: 50 | 51 | # 'ws0' is the name of this workerset 52 | ws0: 53 | 54 | # The number of worker replicas that make up the workerset 55 | replicas: 2 56 | 57 | # slots in autogenerated hostfile is configurable. 58 | # Default slot is 1 or the number of GPUs you requested on 59 | # 'chainer' container. 60 | mpiConfig: 61 | slots: 1 62 | 63 | # You can put any pod template here. There are several exceptions: 64 | # - a container "chainer" must exist. this is the place your mpi processes run. 65 | # - `restartPolicy` equal to `Never` or `OnFailure` is allowed. Default is `Never` 66 | # 67 | # In a typical use case, 'chainer' containers in workersets just wait forever.
68 | template: 69 | spec: 70 | containers: 71 | - name: chainer 72 | image: everpeace/chainermn:1.3.0 73 | command: 74 | - sh 75 | - -c 76 | - | 77 | while true; do sleep 1 & wait; done 78 | -------------------------------------------------------------------------------- /cmd/chainer-operator/main.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package main 16 | 17 | import ( 18 | "flag" 19 | "time" 20 | 21 | "github.com/golang/glog" 22 | kubeinformers "k8s.io/client-go/informers" 23 | "k8s.io/client-go/kubernetes" 24 | "k8s.io/client-go/tools/clientcmd" 25 | 26 | clientset "github.com/kubeflow/chainer-operator/pkg/client/clientset/versioned" 27 | informers "github.com/kubeflow/chainer-operator/pkg/client/informers/externalversions" 28 | 29 | "github.com/kubeflow/chainer-operator/pkg/controllers" 30 | "github.com/kubeflow/chainer-operator/pkg/util/signals" 31 | ) 32 | 33 | var ( 34 | masterURL string 35 | kubeConfig string 36 | ) 37 | 38 | func main() { 39 | flag.Parse() 40 | 41 | // set up signals so we handle the first shutdown signal gracefully 42 | stopCh := signals.SetupSignalHandler() 43 | 44 | cfg, err := clientcmd.BuildConfigFromFlags(masterURL, kubeConfig) 45 | if err != nil { 46 | glog.Fatalf("Error building kubeConfig: %s", err.Error()) 47 | } 48 | 49 | kubeClient, err := kubernetes.NewForConfig(cfg) 50 | if err != nil { 51 | glog.Fatalf("Error building kubernetes clientset: %s", err.Error()) 52 | } 53 | 54 | kubeflowClient, err := clientset.NewForConfig(cfg) 55 | if err != nil { 56 | glog.Fatalf("Error building kubeflow clientset: %s", err.Error()) 57 | } 58 | 59 | kubeInformerFactory := kubeinformers.NewSharedInformerFactory(kubeClient, time.Second*30) 60 | kubeflowInformerFactory := informers.NewSharedInformerFactory(kubeflowClient, time.Second*30) 61 | 62 | controller := controllers.NewChainerJobController( 63 | kubeClient, 64 | kubeflowClient, 65 | kubeInformerFactory.Core().V1().ConfigMaps(), 66 | kubeInformerFactory.Core().V1().ServiceAccounts(), 67 | kubeInformerFactory.Rbac().V1().Roles(), 68 | kubeInformerFactory.Rbac().V1().RoleBindings(), 69 | kubeInformerFactory.Apps().V1().StatefulSets(), 70 | kubeInformerFactory.Batch().V1().Jobs(), 71 | kubeflowInformerFactory.Kubeflow().V1alpha1().ChainerJobs()) 72 | 73 | go kubeInformerFactory.Start(stopCh) 74 | go 
kubeflowInformerFactory.Start(stopCh) 75 | 76 | if err = controller.Run(2, stopCh); err != nil { 77 | glog.Fatalf("Error running controller: %s", err.Error()) 78 | } 79 | } 80 | 81 | func init() { 82 | flag.StringVar(&kubeConfig, "kubeconfig", "", "Path to a kubeconfig. Only required if out-of-cluster.") 83 | flag.StringVar(&masterURL, "master", "", "The address of the Kubernetes API server. Overrides any value in kubeConfig. Only required if out-of-cluster.") 84 | } 85 | -------------------------------------------------------------------------------- /pkg/client/clientset/versioned/fake/clientset_generated.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by client-gen. DO NOT EDIT. 
16 | 17 | package fake 18 | 19 | import ( 20 | clientset "github.com/kubeflow/chainer-operator/pkg/client/clientset/versioned" 21 | kubeflowv1alpha1 "github.com/kubeflow/chainer-operator/pkg/client/clientset/versioned/typed/chainer/v1alpha1" 22 | fakekubeflowv1alpha1 "github.com/kubeflow/chainer-operator/pkg/client/clientset/versioned/typed/chainer/v1alpha1/fake" 23 | "k8s.io/apimachinery/pkg/runtime" 24 | "k8s.io/apimachinery/pkg/watch" 25 | "k8s.io/client-go/discovery" 26 | fakediscovery "k8s.io/client-go/discovery/fake" 27 | "k8s.io/client-go/testing" 28 | ) 29 | 30 | // NewSimpleClientset returns a clientset that will respond with the provided objects. 31 | // It's backed by a very simple object tracker that processes creates, updates and deletions as-is, 32 | // without applying any validations and/or defaults. It shouldn't be considered a replacement 33 | // for a real clientset and is mostly useful in simple unit tests. 34 | func NewSimpleClientset(objects ...runtime.Object) *Clientset { 35 | o := testing.NewObjectTracker(scheme, codecs.UniversalDecoder()) 36 | for _, obj := range objects { 37 | if err := o.Add(obj); err != nil { 38 | panic(err) 39 | } 40 | } 41 | 42 | cs := &Clientset{} 43 | cs.discovery = &fakediscovery.FakeDiscovery{Fake: &cs.Fake} 44 | cs.AddReactor("*", "*", testing.ObjectReaction(o)) 45 | cs.AddWatchReactor("*", func(action testing.Action) (handled bool, ret watch.Interface, err error) { 46 | gvr := action.GetResource() 47 | ns := action.GetNamespace() 48 | watch, err := o.Watch(gvr, ns) 49 | if err != nil { 50 | return false, nil, err 51 | } 52 | return true, watch, nil 53 | }) 54 | 55 | return cs 56 | } 57 | 58 | // Clientset implements clientset.Interface. Meant to be embedded into a 59 | // struct to get a default implementation. This makes faking out just the method 60 | // you want to test easier. 
61 | type Clientset struct { 62 | testing.Fake 63 | discovery *fakediscovery.FakeDiscovery 64 | } 65 | 66 | func (c *Clientset) Discovery() discovery.DiscoveryInterface { 67 | return c.discovery 68 | } 69 | 70 | var _ clientset.Interface = &Clientset{} 71 | 72 | // KubeflowV1alpha1 retrieves the KubeflowV1alpha1Client 73 | func (c *Clientset) KubeflowV1alpha1() kubeflowv1alpha1.KubeflowV1alpha1Interface { 74 | return &fakekubeflowv1alpha1.FakeKubeflowV1alpha1{Fake: &c.Fake} 75 | } 76 | 77 | // Kubeflow retrieves the KubeflowV1alpha1Client 78 | func (c *Clientset) Kubeflow() kubeflowv1alpha1.KubeflowV1alpha1Interface { 79 | return &fakekubeflowv1alpha1.FakeKubeflowV1alpha1{Fake: &c.Fake} 80 | } 81 | -------------------------------------------------------------------------------- /pkg/controllers/backends/mpi/assets.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package mpi 16 | 17 | import ( 18 | "strings" 19 | 20 | apisv1alpha1 "github.com/kubeflow/chainer-operator/pkg/apis/chainer/v1alpha1" 21 | ) 22 | 23 | var ( 24 | kubexecSh = strings.Replace(strings.TrimSpace(`#! 
/bin/sh 25 | pod=$1 26 | shift 27 | if [ "$pod" = $(hostname) ]; then 28 | $@ 29 | else 30 | ${KUBECTL_DIR:-/kubeflow/chainer-operator/kube}/kubectl exec -i $pod -c %%JOB_CONTAINER_NAME%% -- $@ 31 | fi 32 | `), "%%JOB_CONTAINER_NAME%%", apisv1alpha1.DefaultContainerName, -1) 33 | 34 | kubectlDownloadSh = strings.TrimSpace(`#! /bin/sh 35 | set -ex 36 | 37 | TARGET=${1:-/kubeflow/chainer-operator/kube} 38 | MAX_TRY=${2:-10} 39 | SLEEP_SECS=${3:-5} 40 | TRIED=0 41 | COMMAND_STATUS=1 42 | 43 | until [ $COMMAND_STATUS -eq 0 ] || [ $TRIED -eq $MAX_TRY ]; do 44 | curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl && chmod +x kubectl && mv kubectl $TARGET/kubectl 45 | COMMAND_STATUS=$? 46 | sleep $SLEEP_SECS 47 | TRIED=$(expr $TRIED + 1) 48 | done 49 | `) 50 | 51 | genHostfileSh = `#! /bin/sh 52 | set -xev 53 | 54 | KUBECTL_DIR=${KUBECTL_DIR:-/kubeflow/chainer-operator/kube} 55 | ASSETS_DIR=${ASSETS_DIR:-/kubeflow/chainer-operator/assets} 56 | KUBECTL=${KUBECTL_DIR}/kubectl 57 | 58 | TARGET=${1} 59 | MASTER_POD_NAME=${2} 60 | MASTER_SLOTS=${3:-slots=1} 61 | STATEFUL_SETS_AND_SLOTS_FILE=${4:-${ASSETS_DIR}/statefulsets_and_slots} 62 | MAX_TRY=${5:-100} 63 | SLEEP_SECS=${6:-5} 64 | 65 | trap "rm -f ${TARGET}_new" EXIT TERM INT KILL 66 | 67 | cluster_size=1 68 | for ss in $(cat ${STATEFUL_SETS_AND_SLOTS_FILE} | cut -d' ' -f 1); do 69 | replicas=$($KUBECTL get statefulsets ${ss} -o=jsonpath='{.status.replicas}') 70 | cluster_size=$(expr $cluster_size + $replicas) 71 | done 72 | 73 | tried=0 74 | until [ "$(wc -l < ${TARGET}_new)" -eq $cluster_size ]; do 75 | rm -f ${TARGET}_new 76 | 77 | cat ${STATEFUL_SETS_AND_SLOTS_FILE} | while read ss_s; do 78 | ss=$(echo "${ss_s}" | cut -d' ' -f 1) 79 | slots=$(echo "${ss_s}" | cut -d' ' -f 2) 80 | replicas=$($KUBECTL get statefulsets "${ss}" -o=jsonpath='{.status.replicas}') 81 | 82 | for i in $(seq 0 ${replicas} | sed 
-e '$,$d'); do 83 | phase=$($KUBECTL get pod "${ss}-${i}" -o=jsonpath='{.status.phase}') 84 | if [ "$phase" = "Running" ]; then 85 | echo "${ss}-${i} ${slots}" >> ${TARGET}_new 86 | fi 87 | done 88 | done 89 | 90 | echo "${MASTER_POD_NAME} ${MASTER_SLOTS}" >> ${TARGET}_new 91 | 92 | tried=$(expr $tried + 1) 93 | if [ $tried -ge $MAX_TRY ]; then 94 | break 95 | fi 96 | sleep $SLEEP_SECS 97 | done 98 | 99 | if [ -e ${TARGET}_new ]; then 100 | mv ${TARGET}_new ${TARGET} 101 | fi 102 | ` 103 | ) 104 | -------------------------------------------------------------------------------- /pkg/client/clientset/versioned/clientset.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by client-gen. DO NOT EDIT. 16 | 17 | package versioned 18 | 19 | import ( 20 | kubeflowv1alpha1 "github.com/kubeflow/chainer-operator/pkg/client/clientset/versioned/typed/chainer/v1alpha1" 21 | discovery "k8s.io/client-go/discovery" 22 | rest "k8s.io/client-go/rest" 23 | flowcontrol "k8s.io/client-go/util/flowcontrol" 24 | ) 25 | 26 | type Interface interface { 27 | Discovery() discovery.DiscoveryInterface 28 | KubeflowV1alpha1() kubeflowv1alpha1.KubeflowV1alpha1Interface 29 | // Deprecated: please explicitly pick a version if possible. 
30 | Kubeflow() kubeflowv1alpha1.KubeflowV1alpha1Interface 31 | } 32 | 33 | // Clientset contains the clients for groups. Each group has exactly one 34 | // version included in a Clientset. 35 | type Clientset struct { 36 | *discovery.DiscoveryClient 37 | kubeflowV1alpha1 *kubeflowv1alpha1.KubeflowV1alpha1Client 38 | } 39 | 40 | // KubeflowV1alpha1 retrieves the KubeflowV1alpha1Client 41 | func (c *Clientset) KubeflowV1alpha1() kubeflowv1alpha1.KubeflowV1alpha1Interface { 42 | return c.kubeflowV1alpha1 43 | } 44 | 45 | // Deprecated: Kubeflow retrieves the default version of KubeflowClient. 46 | // Please explicitly pick a version. 47 | func (c *Clientset) Kubeflow() kubeflowv1alpha1.KubeflowV1alpha1Interface { 48 | return c.kubeflowV1alpha1 49 | } 50 | 51 | // Discovery retrieves the DiscoveryClient 52 | func (c *Clientset) Discovery() discovery.DiscoveryInterface { 53 | if c == nil { 54 | return nil 55 | } 56 | return c.DiscoveryClient 57 | } 58 | 59 | // NewForConfig creates a new Clientset for the given config. 60 | func NewForConfig(c *rest.Config) (*Clientset, error) { 61 | configShallowCopy := *c 62 | if configShallowCopy.RateLimiter == nil && configShallowCopy.QPS > 0 { 63 | configShallowCopy.RateLimiter = flowcontrol.NewTokenBucketRateLimiter(configShallowCopy.QPS, configShallowCopy.Burst) 64 | } 65 | var cs Clientset 66 | var err error 67 | cs.kubeflowV1alpha1, err = kubeflowv1alpha1.NewForConfig(&configShallowCopy) 68 | if err != nil { 69 | return nil, err 70 | } 71 | 72 | cs.DiscoveryClient, err = discovery.NewDiscoveryClientForConfig(&configShallowCopy) 73 | if err != nil { 74 | return nil, err 75 | } 76 | return &cs, nil 77 | } 78 | 79 | // NewForConfigOrDie creates a new Clientset for the given config and 80 | // panics if there is an error in the config. 
81 | func NewForConfigOrDie(c *rest.Config) *Clientset { 82 | var cs Clientset 83 | cs.kubeflowV1alpha1 = kubeflowv1alpha1.NewForConfigOrDie(c) 84 | 85 | cs.DiscoveryClient = discovery.NewDiscoveryClientForConfigOrDie(c) 86 | return &cs 87 | } 88 | 89 | // New creates a new Clientset for the given RESTClient. 90 | func New(c rest.Interface) *Clientset { 91 | var cs Clientset 92 | cs.kubeflowV1alpha1 = kubeflowv1alpha1.New(c) 93 | 94 | cs.DiscoveryClient = discovery.NewDiscoveryClient(c) 95 | return &cs 96 | } 97 | -------------------------------------------------------------------------------- /pkg/client/listers/chainer/v1alpha1/chainerjob.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by lister-gen. DO NOT EDIT. 16 | 17 | package v1alpha1 18 | 19 | import ( 20 | v1alpha1 "github.com/kubeflow/chainer-operator/pkg/apis/chainer/v1alpha1" 21 | "k8s.io/apimachinery/pkg/api/errors" 22 | "k8s.io/apimachinery/pkg/labels" 23 | "k8s.io/client-go/tools/cache" 24 | ) 25 | 26 | // ChainerJobLister helps list ChainerJobs. 27 | type ChainerJobLister interface { 28 | // List lists all ChainerJobs in the indexer. 29 | List(selector labels.Selector) (ret []*v1alpha1.ChainerJob, err error) 30 | // ChainerJobs returns an object that can list and get ChainerJobs. 
31 | ChainerJobs(namespace string) ChainerJobNamespaceLister 32 | ChainerJobListerExpansion 33 | } 34 | 35 | // chainerJobLister implements the ChainerJobLister interface. 36 | type chainerJobLister struct { 37 | indexer cache.Indexer 38 | } 39 | 40 | // NewChainerJobLister returns a new ChainerJobLister. 41 | func NewChainerJobLister(indexer cache.Indexer) ChainerJobLister { 42 | return &chainerJobLister{indexer: indexer} 43 | } 44 | 45 | // List lists all ChainerJobs in the indexer. 46 | func (s *chainerJobLister) List(selector labels.Selector) (ret []*v1alpha1.ChainerJob, err error) { 47 | err = cache.ListAll(s.indexer, selector, func(m interface{}) { 48 | ret = append(ret, m.(*v1alpha1.ChainerJob)) 49 | }) 50 | return ret, err 51 | } 52 | 53 | // ChainerJobs returns an object that can list and get ChainerJobs. 54 | func (s *chainerJobLister) ChainerJobs(namespace string) ChainerJobNamespaceLister { 55 | return chainerJobNamespaceLister{indexer: s.indexer, namespace: namespace} 56 | } 57 | 58 | // ChainerJobNamespaceLister helps list and get ChainerJobs. 59 | type ChainerJobNamespaceLister interface { 60 | // List lists all ChainerJobs in the indexer for a given namespace. 61 | List(selector labels.Selector) (ret []*v1alpha1.ChainerJob, err error) 62 | // Get retrieves the ChainerJob from the indexer for a given namespace and name. 63 | Get(name string) (*v1alpha1.ChainerJob, error) 64 | ChainerJobNamespaceListerExpansion 65 | } 66 | 67 | // chainerJobNamespaceLister implements the ChainerJobNamespaceLister 68 | // interface. 69 | type chainerJobNamespaceLister struct { 70 | indexer cache.Indexer 71 | namespace string 72 | } 73 | 74 | // List lists all ChainerJobs in the indexer for a given namespace. 
75 | func (s chainerJobNamespaceLister) List(selector labels.Selector) (ret []*v1alpha1.ChainerJob, err error) { 76 | err = cache.ListAllByNamespace(s.indexer, s.namespace, selector, func(m interface{}) { 77 | ret = append(ret, m.(*v1alpha1.ChainerJob)) 78 | }) 79 | return ret, err 80 | } 81 | 82 | // Get retrieves the ChainerJob from the indexer for a given namespace and name. 83 | func (s chainerJobNamespaceLister) Get(name string) (*v1alpha1.ChainerJob, error) { 84 | obj, exists, err := s.indexer.GetByKey(s.namespace + "/" + name) 85 | if err != nil { 86 | return nil, err 87 | } 88 | if !exists { 89 | return nil, errors.NewNotFound(v1alpha1.Resource("chainerjob"), name) 90 | } 91 | return obj.(*v1alpha1.ChainerJob), nil 92 | } 93 | -------------------------------------------------------------------------------- /scripts/run-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2018 The Kubernetes Authors. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
# This shell script is used to build a cluster and create a namespace from our
# argo workflow


set -xe
set -o pipefail

export CLUSTER_NAME="${CLUSTER_NAME}"
export ZONE="${GCP_ZONE}"
export PROJECT="${GCP_PROJECT}"
export K8S_NAMESPACE="${DEPLOY_NAMESPACE}"
export REGISTRY="${GCP_REGISTRY}"
KFCTL_DIR="${KFCTL_DIR}"
WORK_DIR=$(mktemp -d)
VERSION=$(git describe --tags --always --dirty)
# Quote the script directory so that paths containing spaces still work.
SCRIPT_DIR=$(dirname "$0")
source "${SCRIPT_DIR}/kfctl-util.sh"
source "${SCRIPT_DIR}/gcloud-util.sh"

# Number of checks, and the pause in seconds between them, used by wait_for.
MAX_TRIES=30
SLEEP_SECS=10

# wait_for CHECK [ARGS...]
# Runs CHECK up to MAX_TRIES times, SLEEP_SECS apart, and returns 0 as soon as
# it succeeds. Success is tracked by the return status rather than by the
# remaining-tries counter, so a check that passes on the very last attempt is
# not reported as a timeout.
wait_for() {
  local remaining=${MAX_TRIES}
  until "$@"; do
    remaining=$(( remaining - 1 ))
    if [[ ${remaining} -le 0 ]]; then
      echo "Timed out waiting for: $*" >&2
      return 1
    fi
    kubectl -n "${K8S_NAMESPACE}" get pods || true
    sleep "${SLEEP_SECS}"
  done
}

# Succeeds once a chainer-operator pod reports 1/1 containers ready.
operator_ready() {
  kubectl get pods -n "${K8S_NAMESPACE}" | grep chainer-operator | grep 1/1
}

# chainerjob_succeeded NAME
# Succeeds once the ChainerJob NAME reports .status.succeeded == 1; otherwise
# prints the job for debugging and fails.
chainerjob_succeeded() {
  local succeeded
  succeeded=$(kubectl -n "${K8S_NAMESPACE}" get chj "$1" -ojsonpath={.status.succeeded}) || true
  if [[ "${succeeded}" == 1 ]]; then
    return 0
  fi
  kubectl -n "${K8S_NAMESPACE}" get chj "$1" || true
  return 1
}

gcloud::auth_activate
echo "Configuring kubectl"
gcloud --project "${PROJECT}" container clusters get-credentials "${CLUSTER_NAME}" \
  --zone "${ZONE}"

cd "${WORK_DIR}"

kfctl::init "${KFCTL_DIR}" "${CLUSTER_NAME}" "${PROJECT}"

cd "${CLUSTER_NAME}"
cat env.sh # for debugging
export CLIENT_ID=dummy
export CLIENT_SECRET=dummy
kfctl::generate "${KFCTL_DIR}" all

cd "$(source env.sh; echo "${KUBEFLOW_KS_DIR}")"

# kfctl.sh generate remove default env
ks env add default --namespace "${K8S_NAMESPACE}"

echo "Disable spartakus"
ks param set spartakus reportUsage false

echo "Install the operator"
ks pkg install kubeflow/chainer-job
ks generate chainer-operator chainer-operator --image="${REGISTRY}/${REPO_NAME}:${VERSION}"
ks apply default -c chainer-operator
wait_for operator_ready || exit 1

echo "Run 'ChainerJob' test"
MNIST_TEST="chainer-mnist-test"
ks generate chainer-job-simple "${MNIST_TEST}" \
  --image=everpeace/chainer:4.1.0 \
  --gpus=0 \
  --command=python3 \
  --args='/train_mnist.py,-e,2,-b,1000,-u,100,--noplot'
ks apply default -c "${MNIST_TEST}"
wait_for chainerjob_succeeded "${MNIST_TEST}" || exit 1

MN_MNIST_TEST_IMAGE="chainermn-mnist-test"
ks generate chainer-job "${MN_MNIST_TEST_IMAGE}" \
  --image=everpeace/chainermn:1.3.0 \
  --workers=1 \
  --workerSetName=ws \
  --gpus=0 \
  --command=python3 \
  --args='/train_mnist.py,-e,2,-b,1000,-u,100'
ks apply default -c "${MN_MNIST_TEST_IMAGE}"
wait_for chainerjob_succeeded "${MN_MNIST_TEST_IMAGE}" || exit 1
// ChainerJobInformer provides access to a shared informer and lister for
// ChainerJobs.
//
// NOTE: this file is produced by informer-gen; lasting changes belong in the
// generator inputs, not in this file.
type ChainerJobInformer interface {
	// Informer returns the shared index informer for ChainerJobs.
	Informer() cache.SharedIndexInformer
	// Lister returns a ChainerJobLister backed by the informer's indexer.
	Lister() v1alpha1.ChainerJobLister
}

// chainerJobInformer is the factory-backed implementation of
// ChainerJobInformer.
type chainerJobInformer struct {
	// factory is asked for the shared informer instance (see Informer).
	factory internalinterfaces.SharedInformerFactory
	// tweakListOptions, when non-nil, is applied to the list/watch options
	// before each request (see NewFilteredChainerJobInformer).
	tweakListOptions internalinterfaces.TweakListOptionsFunc
	// namespace is passed to the ChainerJobs client for list and watch calls.
	namespace string
}

// NewChainerJobInformer constructs a new informer for ChainerJob type.
// Always prefer using an informer factory to get a shared informer instead of getting an independent
// one. This reduces memory footprint and number of connections to the server.
//
// It is NewFilteredChainerJobInformer with a nil tweakListOptions.
func NewChainerJobInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer {
	return NewFilteredChainerJobInformer(client, namespace, resyncPeriod, indexers, nil)
}

// NewFilteredChainerJobInformer constructs a new informer for ChainerJob type.
// Always prefer using an informer factory to get a shared informer instead of getting an independent
// one. This reduces memory footprint and number of connections to the server.
//
// The list and watch functions both call tweakListOptions (when it is not nil)
// on the options before issuing the request through client.
func NewFilteredChainerJobInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers, tweakListOptions internalinterfaces.TweakListOptionsFunc) cache.SharedIndexInformer {
	return cache.NewSharedIndexInformer(
		&cache.ListWatch{
			ListFunc: func(options v1.ListOptions) (runtime.Object, error) {
				// options is a by-value copy, so the tweak only affects this call.
				if tweakListOptions != nil {
					tweakListOptions(&options)
				}
				return client.KubeflowV1alpha1().ChainerJobs(namespace).List(options)
			},
			WatchFunc: func(options v1.ListOptions) (watch.Interface, error) {
				if tweakListOptions != nil {
					tweakListOptions(&options)
				}
				return client.KubeflowV1alpha1().ChainerJobs(namespace).Watch(options)
			},
		},
		&chainer_v1alpha1.ChainerJob{},
		resyncPeriod,
		indexers,
	)
}

// defaultInformer builds the informer handed to the factory: it is scoped to
// f.namespace, indexed by namespace, and carries f.tweakListOptions.
func (f *chainerJobInformer) defaultInformer(client versioned.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer {
	return NewFilteredChainerJobInformer(client, f.namespace, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}, f.tweakListOptions)
}

// Informer obtains the ChainerJob informer from the factory, passing
// defaultInformer as the constructor the factory may use.
func (f *chainerJobInformer) Informer() cache.SharedIndexInformer {
	return f.factory.InformerFor(&chainer_v1alpha1.ChainerJob{}, f.defaultInformer)
}

// Lister returns a ChainerJobLister that reads from the indexer of the
// informer returned by Informer.
func (f *chainerJobInformer) Lister() v1alpha1.ChainerJobLister {
	return v1alpha1.NewChainerJobLister(f.Informer().GetIndexer())
}
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package v1alpha1 16 | 17 | import ( 18 | "github.com/golang/glog" 19 | "k8s.io/apimachinery/pkg/runtime" 20 | ) 21 | 22 | // Int32 is a helper routine that allocates a new int32 value 23 | // to store v and returns a pointer to it. 24 | func Int32(v int32) *int32 { 25 | return &v 26 | } 27 | 28 | func addDefaultingFuncs(scheme *runtime.Scheme) error { 29 | return RegisterDefaults(scheme) 30 | } 31 | 32 | func setWorkerSetSpecReplicas(spec *WorkerSetSpec, name string) { 33 | if spec.Replicas == nil { 34 | spec.Replicas = Int32(1) 35 | glog.V(4).Infof("setting spec.WorkerSets[%s].Replicas to 1", name) 36 | } 37 | } 38 | 39 | func setWorkerSetSpecContainerName(spec *WorkerSetSpec, name string) { 40 | containers := spec.Template.Spec.Containers 41 | if len(containers) == 1 && containers[0].Name == "" { 42 | containers[0].Name = DefaultContainerName 43 | glog.V(4).Infof("setting spec.WorkerSets[%s].Template.Spec.Containers[0].Name to %s", name, DefaultContainerName) 44 | } 45 | } 46 | 47 | func setWorkerSetMPISlots(spec *WorkerSetSpec, name string) { 48 | if spec.MPIConfig == nil || spec.MPIConfig.Slots == nil { 49 | for _, container := range spec.Template.Spec.Containers { 50 | if container.Name == DefaultContainerName { 51 | slots := Int32(DefaultSlots) 52 | if q, ok := container.Resources.Limits["nvidia.com/gpu"]; ok { 53 | slots = Int32(int32(q.Value())) 54 | } 55 | spec.MPIConfig = defaultMPIConfig(slots) 56 | glog.V(4).Infof("setting spec.WorkerSets[%+v].MPIConfig.Slots to %+v", name, *spec.MPIConfig.Slots) 
57 | return 58 | } 59 | } 60 | spec.MPIConfig = defaultMPIConfig(Int32(DefaultSlots)) 61 | } 62 | } 63 | 64 | func setMPIMasterSpecContainerName(spec *MasterSpec) { 65 | containers := spec.Template.Spec.Containers 66 | if len(containers) == 1 && containers[0].Name == "" { 67 | containers[0].Name = DefaultContainerName 68 | glog.V(4).Infof("setting spec.MasterSpec.Template.Spec.Containers[0].Name to %s", DefaultContainerName) 69 | } 70 | } 71 | 72 | func defaultMPIConfig(slots *int32) *MPIConfig { 73 | return &MPIConfig{ 74 | Slots: slots, 75 | } 76 | } 77 | 78 | func setMasterSpecMPISlots(spec *MasterSpec) { 79 | if spec.MPIConfig == nil || spec.MPIConfig.Slots == nil { 80 | for _, container := range spec.Template.Spec.Containers { 81 | if container.Name == DefaultContainerName { 82 | slots := Int32(DefaultSlots) 83 | if q, ok := container.Resources.Limits["nvidia.com/gpu"]; ok { 84 | slots = Int32(int32(q.Value())) 85 | } 86 | spec.MPIConfig = defaultMPIConfig(slots) 87 | glog.V(4).Infof("setting spec.Master.MPIConfig.Slots to %+v", *spec.MPIConfig.Slots) 88 | return 89 | } 90 | } 91 | spec.MPIConfig = defaultMPIConfig(Int32(DefaultSlots)) 92 | } 93 | } 94 | 95 | func setMPIMasterRetryPolicy(spec *MasterSpec) { 96 | if spec.Template.Spec.RestartPolicy == "" { 97 | spec.Template.Spec.RestartPolicy = DefaultRestartPolicy 98 | glog.V(4).Infof("setting spec.MPISpec.MasterSpec.Spec.RestartPolicy to %s", DefaultRestartPolicy) 99 | } 100 | } 101 | 102 | func setDefaultBackend(chainerjob *ChainerJob) { 103 | if chainerjob.Spec.Backend == "" { 104 | chainerjob.Spec.Backend = BackendTypeMPI 105 | } 106 | } 107 | 108 | // SetDefaults_ChainerJob sets any unspecified values to defaults. 
109 | func SetDefaults_ChainerJob(chainerjob *ChainerJob) { 110 | glog.V(4).Infof("start setting default to %s", chainerjob.Name) 111 | setMPIMasterSpecContainerName(&chainerjob.Spec.Master) 112 | setMPIMasterRetryPolicy(&chainerjob.Spec.Master) 113 | 114 | if IsDistributed(chainerjob) { 115 | setDefaultBackend(chainerjob) 116 | for name, workerSetSpec := range chainerjob.Spec.WorkerSets { 117 | setWorkerSetSpecReplicas(workerSetSpec, name) 118 | setWorkerSetSpecContainerName(workerSetSpec, name) 119 | } 120 | switch chainerjob.Spec.Backend { 121 | case BackendTypeMPI: 122 | setMasterSpecMPISlots(&chainerjob.Spec.Master) 123 | for name, workerSetSpec := range chainerjob.Spec.WorkerSets { 124 | setWorkerSetMPISlots(workerSetSpec, name) 125 | } 126 | default: 127 | // nop 128 | } 129 | } 130 | glog.V(4).Infof("finish setting default to %s", chainerjob.Name) 131 | } 132 | -------------------------------------------------------------------------------- /pkg/controllers/backends/none/none_backend.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package none 16 | 17 | import ( 18 | "fmt" 19 | 20 | batchv1 "k8s.io/api/batch/v1" 21 | corev1 "k8s.io/api/core/v1" 22 | rbacv1 "k8s.io/api/rbac/v1" 23 | "k8s.io/client-go/kubernetes" 24 | batchlister "k8s.io/client-go/listers/batch/v1" 25 | corelister "k8s.io/client-go/listers/core/v1" 26 | rbaclister "k8s.io/client-go/listers/rbac/v1" 27 | "k8s.io/client-go/tools/record" 28 | 29 | "github.com/golang/glog" 30 | apisv1alpha1 "github.com/kubeflow/chainer-operator/pkg/apis/chainer/v1alpha1" 31 | clientset "github.com/kubeflow/chainer-operator/pkg/client/clientset/versioned" 32 | "github.com/kubeflow/chainer-operator/pkg/controllers/backends" 33 | ) 34 | 35 | // Backend defines behavior of non-Disributed ChainerJob 36 | type Backend struct { 37 | kubeClient kubernetes.Interface 38 | kubeflowClient clientset.Interface 39 | serviceAccountLister corelister.ServiceAccountLister 40 | roleLister rbaclister.RoleLister 41 | roleBindingLister rbaclister.RoleBindingLister 42 | jobLister batchlister.JobLister 43 | recorder record.EventRecorder 44 | } 45 | 46 | // NewBackend is a constructor for none.Backend 47 | func NewBackend( 48 | kubeClient kubernetes.Interface, 49 | kubeflowClient clientset.Interface, 50 | serviceAccountLister corelister.ServiceAccountLister, 51 | roleLister rbaclister.RoleLister, 52 | roleBindingLister rbaclister.RoleBindingLister, 53 | jobLister batchlister.JobLister, 54 | recorder record.EventRecorder, 55 | ) backends.Backend { 56 | return &Backend{ 57 | kubeClient: kubeClient, 58 | kubeflowClient: kubeflowClient, 59 | serviceAccountLister: serviceAccountLister, 60 | roleLister: roleLister, 61 | roleBindingLister: roleBindingLister, 62 | jobLister: jobLister, 63 | recorder: recorder, 64 | } 65 | } 66 | 67 | // SyncChainerJob syncs non-Distributed ChainerJobs. 
68 | func (b *Backend) SyncChainerJob(chjob *apisv1alpha1.ChainerJob) error { 69 | 70 | if apisv1alpha1.IsDistributed(chjob) { 71 | return fmt.Errorf("not syncing %s because it is a distributed job", chjob.Name) 72 | } 73 | 74 | sa, err := b.syncServiceAccount(chjob) 75 | if sa == nil || err != nil { 76 | return err 77 | } 78 | glog.V(4).Infof("syncing %s: serviceaccount %s synced.", chjob.Name, sa.Name) 79 | 80 | role, err := b.syncRole(chjob) 81 | if role == nil || err != nil { 82 | return err 83 | } 84 | glog.V(4).Infof("syncing %s: role %s synced.", chjob.Name, role.Name) 85 | 86 | rolebinding, err := b.syncRoleBinding(chjob) 87 | if rolebinding == nil || err != nil { 88 | return err 89 | } 90 | glog.V(4).Infof("syncing %s: rolebinding %s synced.", chjob.Name, rolebinding.Name) 91 | 92 | master, err := b.syncMaster(chjob) 93 | if master == nil || err != nil { 94 | return err 95 | } 96 | glog.V(4).Infof("syncing %s: job %s synced.", chjob.Name, master.Name) 97 | 98 | err = backends.UpdateChainerJobStatus(chjob, &master.Status, b.kubeflowClient) 99 | if err != nil { 100 | return err 101 | } 102 | glog.V(4).Infof("syncing %s: status updated.", chjob.Name) 103 | 104 | return nil 105 | } 106 | 107 | func (b *Backend) syncServiceAccount(chjob *apisv1alpha1.ChainerJob) (*corev1.ServiceAccount, error) { 108 | return backends.CreateServiceAccountIfNotExist( 109 | chjob, 110 | b.kubeClient, 111 | b.serviceAccountLister, 112 | b.recorder, 113 | backends.NewServiceAccount, 114 | ) 115 | } 116 | 117 | func (b *Backend) syncRole(chjob *apisv1alpha1.ChainerJob) (*rbacv1.Role, error) { 118 | return backends.CreateOrUpdateRole( 119 | chjob, 120 | b.kubeClient, 121 | b.roleLister, 122 | b.recorder, 123 | backends.NewRole, 124 | ) 125 | } 126 | 127 | func (b *Backend) syncRoleBinding(chjob *apisv1alpha1.ChainerJob) (*rbacv1.RoleBinding, error) { 128 | return backends.CreateOrUpdateRoleBinding( 129 | chjob, 130 | b.kubeClient, 131 | b.roleBindingLister, 132 | b.recorder, 133 | 
backends.NewRoleBindings, 134 | ) 135 | } 136 | 137 | func (b *Backend) syncMaster(chjob *apisv1alpha1.ChainerJob) (*batchv1.Job, error) { 138 | return backends.CreateJobIfNotExist( 139 | chjob, 140 | b.kubeClient, 141 | b.jobLister, 142 | b.recorder, 143 | newMasterJob, 144 | ) 145 | } 146 | 147 | func newMasterJob(chjob *apisv1alpha1.ChainerJob) *batchv1.Job { 148 | return backends.NewMasterJob(chjob) 149 | } 150 | -------------------------------------------------------------------------------- /pkg/controllers/backends/none/none_backend_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | package none 15 | 16 | import ( 17 | "encoding/json" 18 | "fmt" 19 | "reflect" 20 | "testing" 21 | 22 | "github.com/kubeflow/chainer-operator/pkg/controllers/backends" 23 | 24 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 25 | 26 | apisv1alpha1 "github.com/kubeflow/chainer-operator/pkg/apis/chainer/v1alpha1" 27 | batchv1 "k8s.io/api/batch/v1" 28 | corev1 "k8s.io/api/core/v1" 29 | ) 30 | 31 | func Int64(v int64) *int64 { 32 | return &v 33 | } 34 | 35 | func Bool(b bool) *bool { 36 | return &b 37 | } 38 | 39 | func ownerReference(name string) []metav1.OwnerReference { 40 | return []metav1.OwnerReference{ 41 | { 42 | APIVersion: apisv1alpha1.GroupName + "/" + apisv1alpha1.GroupVersion, 43 | Kind: apisv1alpha1.Kind, 44 | Name: name, 45 | UID: "", 46 | Controller: Bool(true), 47 | BlockOwnerDeletion: Bool(true), 48 | }, 49 | } 50 | } 51 | 52 | func simpleContainer(name string, image string) corev1.Container { 53 | return corev1.Container{ 54 | Name: name, 55 | Image: image, 56 | } 57 | } 58 | 59 | func simpleContainer2(name string, image string, envVars []corev1.EnvVar, volumeMounts []corev1.VolumeMount) corev1.Container { 60 | return corev1.Container{ 61 | Name: name, 62 | Image: image, 63 | Env: envVars, 64 | VolumeMounts: volumeMounts, 65 | } 66 | } 67 | 68 | func TestNewMaster(t *testing.T) { 69 | // single 70 | type testCase struct { 71 | name string 72 | in *apisv1alpha1.ChainerJob 73 | expected *batchv1.Job 74 | } 75 | testCases := []testCase{ 76 | { 77 | name: "single", 78 | in: &apisv1alpha1.ChainerJob{ 79 | ObjectMeta: metav1.ObjectMeta{ 80 | Name: "chj", 81 | Namespace: "ch", 82 | Labels: map[string]string{ 83 | "custom": "value", 84 | }, 85 | }, 86 | Spec: apisv1alpha1.ChainerJobSpec{ 87 | Master: apisv1alpha1.MasterSpec{ 88 | ActiveDeadlineSeconds: Int64(1000), 89 | BackoffLimit: apisv1alpha1.Int32(10), 90 | Template: corev1.PodTemplateSpec{ 91 | ObjectMeta: metav1.ObjectMeta{ 92 | Labels: map[string]string{ 93 | "custom2": "value2", 94 | }, 95 | }, 
96 | Spec: corev1.PodSpec{ 97 | RestartPolicy: "Never", 98 | Containers: []corev1.Container{ 99 | simpleContainer("chainer", "dummy"), 100 | }, 101 | }, 102 | }, 103 | }, 104 | }, 105 | }, 106 | expected: &batchv1.Job{ 107 | ObjectMeta: metav1.ObjectMeta{ 108 | Name: "chj" + backends.MasterSuffix, 109 | Namespace: "ch", 110 | Labels: map[string]string{ 111 | "custom": "value", 112 | backends.JobLabelKey: "chj", 113 | backends.RoleLabelKey: backends.RoleMaster, 114 | }, 115 | OwnerReferences: ownerReference("chj"), 116 | }, 117 | Spec: batchv1.JobSpec{ 118 | Parallelism: apisv1alpha1.Int32(1), 119 | Completions: apisv1alpha1.Int32(1), 120 | ActiveDeadlineSeconds: Int64(1000), 121 | BackoffLimit: apisv1alpha1.Int32(10), 122 | Template: corev1.PodTemplateSpec{ 123 | ObjectMeta: metav1.ObjectMeta{ 124 | Labels: map[string]string{ 125 | "custom2": "value2", 126 | backends.JobLabelKey: "chj", 127 | backends.RoleLabelKey: backends.RoleMaster, 128 | }, 129 | }, 130 | Spec: corev1.PodSpec{ 131 | ServiceAccountName: "chj" + backends.ServiceAccountSuffix, 132 | RestartPolicy: "Never", 133 | Containers: []corev1.Container{ 134 | simpleContainer("chainer", "dummy"), 135 | }, 136 | }, 137 | }, 138 | }, 139 | }, 140 | }, 141 | } 142 | 143 | for _, c := range testCases { 144 | actualJob := newMasterJob(c.in) 145 | actualJSON, errActual := json.MarshalIndent(actualJob, "", " ") 146 | actual := "" 147 | if errActual != nil { 148 | t.Errorf("Couldn't pretty format %v, error: %v", actualJob, errActual) 149 | actual = fmt.Sprintf("%+v", actualJob) 150 | } else { 151 | actual = string(actualJSON) 152 | } 153 | 154 | expectedJSON, errExpected := json.MarshalIndent(c.expected, "", " ") 155 | expected := "" 156 | if errExpected != nil { 157 | t.Errorf("Couldn't pretty format %v, error: %v", c.expected, errExpected) 158 | expected = fmt.Sprintf("%v", c.expected) 159 | } else { 160 | expected = string(expectedJSON) 161 | } 162 | 163 | if !reflect.DeepEqual(actual, expected) { 164 | 
t.Errorf("TestCase: %v\nWant:%+v\nGot:%+v", c.name, expected, actual) 165 | } 166 | } 167 | } 168 | -------------------------------------------------------------------------------- /pkg/client/clientset/versioned/typed/chainer/v1alpha1/fake/fake_chainerjob.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by client-gen. DO NOT EDIT. 16 | 17 | package fake 18 | 19 | import ( 20 | v1alpha1 "github.com/kubeflow/chainer-operator/pkg/apis/chainer/v1alpha1" 21 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 22 | labels "k8s.io/apimachinery/pkg/labels" 23 | schema "k8s.io/apimachinery/pkg/runtime/schema" 24 | types "k8s.io/apimachinery/pkg/types" 25 | watch "k8s.io/apimachinery/pkg/watch" 26 | testing "k8s.io/client-go/testing" 27 | ) 28 | 29 | // FakeChainerJobs implements ChainerJobInterface 30 | type FakeChainerJobs struct { 31 | Fake *FakeKubeflowV1alpha1 32 | ns string 33 | } 34 | 35 | var chainerjobsResource = schema.GroupVersionResource{Group: "kubeflow.org", Version: "v1alpha1", Resource: "chainerjobs"} 36 | 37 | var chainerjobsKind = schema.GroupVersionKind{Group: "kubeflow.org", Version: "v1alpha1", Kind: "ChainerJob"} 38 | 39 | // Get takes name of the chainerJob, and returns the corresponding chainerJob object, and an error if there is any. 
// Get takes name of the chainerJob, and returns the corresponding chainerJob object, and an error if there is any.
//
// NOTE: this file is produced by client-gen; lasting changes belong in the
// generator inputs, not in this file.
func (c *FakeChainerJobs) Get(name string, options v1.GetOptions) (result *v1alpha1.ChainerJob, err error) {
	// options is not part of the recorded get action.
	obj, err := c.Fake.
		Invokes(testing.NewGetAction(chainerjobsResource, c.ns, name), &v1alpha1.ChainerJob{})

	if obj == nil {
		return nil, err
	}
	return obj.(*v1alpha1.ChainerJob), err
}

// List takes label and field selectors, and returns the list of ChainerJobs that match those selectors.
// Only the label selector is applied to the returned items here; the other
// values extracted from opts are discarded.
func (c *FakeChainerJobs) List(opts v1.ListOptions) (result *v1alpha1.ChainerJobList, err error) {
	obj, err := c.Fake.
		Invokes(testing.NewListAction(chainerjobsResource, chainerjobsKind, c.ns, opts), &v1alpha1.ChainerJobList{})

	if obj == nil {
		return nil, err
	}

	label, _, _ := testing.ExtractFromListOptions(opts)
	// No label selector means every item matches.
	if label == nil {
		label = labels.Everything()
	}
	list := &v1alpha1.ChainerJobList{ListMeta: obj.(*v1alpha1.ChainerJobList).ListMeta}
	for _, item := range obj.(*v1alpha1.ChainerJobList).Items {
		if label.Matches(labels.Set(item.Labels)) {
			list.Items = append(list.Items, item)
		}
	}
	return list, err
}

// Watch returns a watch.Interface that watches the requested chainerJobs.
func (c *FakeChainerJobs) Watch(opts v1.ListOptions) (watch.Interface, error) {
	return c.Fake.
		InvokesWatch(testing.NewWatchAction(chainerjobsResource, c.ns, opts))

}

// Create takes the representation of a chainerJob and creates it.  Returns the server's representation of the chainerJob, and an error, if there is any.
func (c *FakeChainerJobs) Create(chainerJob *v1alpha1.ChainerJob) (result *v1alpha1.ChainerJob, err error) {
	obj, err := c.Fake.
		Invokes(testing.NewCreateAction(chainerjobsResource, c.ns, chainerJob), &v1alpha1.ChainerJob{})

	if obj == nil {
		return nil, err
	}
	return obj.(*v1alpha1.ChainerJob), err
}

// Update takes the representation of a chainerJob and updates it. Returns the server's representation of the chainerJob, and an error, if there is any.
func (c *FakeChainerJobs) Update(chainerJob *v1alpha1.ChainerJob) (result *v1alpha1.ChainerJob, err error) {
	obj, err := c.Fake.
		Invokes(testing.NewUpdateAction(chainerjobsResource, c.ns, chainerJob), &v1alpha1.ChainerJob{})

	if obj == nil {
		return nil, err
	}
	return obj.(*v1alpha1.ChainerJob), err
}

// Delete takes name of the chainerJob and deletes it. Returns an error if one occurs.
func (c *FakeChainerJobs) Delete(name string, options *v1.DeleteOptions) error {
	// options is not part of the recorded delete action.
	_, err := c.Fake.
		Invokes(testing.NewDeleteAction(chainerjobsResource, c.ns, name), &v1alpha1.ChainerJob{})

	return err
}

// DeleteCollection deletes a collection of objects.
func (c *FakeChainerJobs) DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error {
	// Only listOptions is recorded on the action; options is unused.
	action := testing.NewDeleteCollectionAction(chainerjobsResource, c.ns, listOptions)

	_, err := c.Fake.Invokes(action, &v1alpha1.ChainerJobList{})
	return err
}

// Patch applies the patch and returns the patched chainerJob.
func (c *FakeChainerJobs) Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha1.ChainerJob, err error) {
	// pt is not passed to the recorded patch action.
	obj, err := c.Fake.
		Invokes(testing.NewPatchSubresourceAction(chainerjobsResource, c.ns, name, data, subresources...), &v1alpha1.ChainerJob{})

	if obj == nil {
		return nil, err
	}
	return obj.(*v1alpha1.ChainerJob), err
}
fn.mapContainersWithName(names, f), 44 | }, 45 | statefulSet:: k8s.apps.v1beta1.statefulSet + { 46 | mapContainers(f):: fn.mapContainers(f), 47 | mapContainersWithName(names, f):: fn.mapContainersWithName(names, f), 48 | }, 49 | }, 50 | v1beta2:: k8s.apps.v1beta2 + { 51 | daemonSet:: k8s.apps.v1beta2.daemonSet + { 52 | mapContainers(f):: fn.mapContainers(f), 53 | mapContainersWithName(names, f):: fn.mapContainersWithName(names, f), 54 | }, 55 | deployment:: k8s.apps.v1beta2.deployment + { 56 | mapContainers(f):: fn.mapContainers(f), 57 | mapContainersWithName(names, f):: fn.mapContainersWithName(names, f), 58 | }, 59 | replicaSet:: k8s.apps.v1beta2.replicaSet + { 60 | mapContainers(f):: fn.mapContainers(f), 61 | mapContainersWithName(names, f):: fn.mapContainersWithName(names, f), 62 | }, 63 | statefulSet:: k8s.apps.v1beta2.statefulSet + { 64 | mapContainers(f):: fn.mapContainers(f), 65 | mapContainersWithName(names, f):: fn.mapContainersWithName(names, f), 66 | }, 67 | }, 68 | }, 69 | batch:: k8s.batch + { 70 | v1:: k8s.batch.v1 + { 71 | job:: k8s.batch.v1.job + { 72 | mapContainers(f):: fn.mapContainers(f), 73 | mapContainersWithName(names, f):: fn.mapContainersWithName(names, f), 74 | }, 75 | }, 76 | v1beta1:: k8s.batch.v1beta1 + { 77 | cronJob:: k8s.batch.v1beta1.cronJob + { 78 | mapContainers(f):: fn.mapContainers(f), 79 | mapContainersWithName(names, f):: fn.mapContainersWithName(names, f), 80 | }, 81 | }, 82 | v2alpha1:: k8s.batch.v2alpha1 + { 83 | cronJob:: k8s.batch.v2alpha1.cronJob + { 84 | mapContainers(f):: fn.mapContainers(f), 85 | mapContainersWithName(names, f):: fn.mapContainersWithName(names, f), 86 | }, 87 | }, 88 | }, 89 | core:: k8s.core + { 90 | v1:: k8s.core.v1 + { 91 | list:: { 92 | new(items):: { 93 | apiVersion: 'v1', 94 | } + { 95 | kind: 'List', 96 | } + self.items(items), 97 | items(items):: if std.type(items) == 'array' then { items+: items } else { items+: [items] }, 98 | }, 99 | pod:: k8s.core.v1.pod + { 100 | mapContainers(f):: 
fn.mapContainers(f), 101 | mapContainersWithName(names, f):: fn.mapContainersWithName(names, f), 102 | }, 103 | podTemplate:: k8s.core.v1.podTemplate + { 104 | mapContainers(f):: fn.mapContainers(f), 105 | mapContainersWithName(names, f):: fn.mapContainersWithName(names, f), 106 | }, 107 | replicationController:: k8s.core.v1.replicationController + { 108 | mapContainers(f):: fn.mapContainers(f), 109 | mapContainersWithName(names, f):: fn.mapContainersWithName(names, f), 110 | }, 111 | }, 112 | }, 113 | extensions:: k8s.extensions + { 114 | v1beta1:: k8s.extensions.v1beta1 + { 115 | daemonSet:: k8s.extensions.v1beta1.daemonSet + { 116 | mapContainers(f):: fn.mapContainers(f), 117 | mapContainersWithName(names, f):: fn.mapContainersWithName(names, f), 118 | }, 119 | deployment:: k8s.extensions.v1beta1.deployment + { 120 | mapContainers(f):: fn.mapContainers(f), 121 | mapContainersWithName(names, f):: fn.mapContainersWithName(names, f), 122 | }, 123 | replicaSet:: k8s.extensions.v1beta1.replicaSet + { 124 | mapContainers(f):: fn.mapContainers(f), 125 | mapContainersWithName(names, f):: fn.mapContainersWithName(names, f), 126 | }, 127 | }, 128 | }, 129 | } -------------------------------------------------------------------------------- /pkg/client/clientset/versioned/typed/chainer/v1alpha1/chainerjob.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// ChainerJobsGetter has a method to return a ChainerJobInterface.
// A group's client should implement this interface.
//
// NOTE: this file is produced by client-gen; lasting changes belong in the
// generator inputs, not in this file.
type ChainerJobsGetter interface {
	ChainerJobs(namespace string) ChainerJobInterface
}

// ChainerJobInterface has methods to work with ChainerJob resources.
type ChainerJobInterface interface {
	Create(*v1alpha1.ChainerJob) (*v1alpha1.ChainerJob, error)
	Update(*v1alpha1.ChainerJob) (*v1alpha1.ChainerJob, error)
	Delete(name string, options *v1.DeleteOptions) error
	DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error
	Get(name string, options v1.GetOptions) (*v1alpha1.ChainerJob, error)
	List(opts v1.ListOptions) (*v1alpha1.ChainerJobList, error)
	Watch(opts v1.ListOptions) (watch.Interface, error)
	Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha1.ChainerJob, err error)
	ChainerJobExpansion
}

// chainerJobs implements ChainerJobInterface
type chainerJobs struct {
	// client issues the REST requests.
	client rest.Interface
	// ns is the namespace set on every request.
	ns string
}

// newChainerJobs returns a ChainerJobs
// bound to c's REST client and the given namespace.
func newChainerJobs(c *KubeflowV1alpha1Client, namespace string) *chainerJobs {
	return &chainerJobs{
		client: c.RESTClient(),
		ns:     namespace,
	}
}

// Get takes name of the chainerJob, and returns the corresponding chainerJob object, and an error if there is any.
func (c *chainerJobs) Get(name string, options v1.GetOptions) (result *v1alpha1.ChainerJob, err error) {
	result = &v1alpha1.ChainerJob{}
	err = c.client.Get().
		Namespace(c.ns).
		Resource("chainerjobs").
		Name(name).
		VersionedParams(&options, scheme.ParameterCodec).
		Do().
		Into(result)
	return
}

// List takes label and field selectors, and returns the list of ChainerJobs that match those selectors.
func (c *chainerJobs) List(opts v1.ListOptions) (result *v1alpha1.ChainerJobList, err error) {
	result = &v1alpha1.ChainerJobList{}
	err = c.client.Get().
		Namespace(c.ns).
		Resource("chainerjobs").
		VersionedParams(&opts, scheme.ParameterCodec).
		Do().
		Into(result)
	return
}

// Watch returns a watch.Interface that watches the requested chainerJobs.
func (c *chainerJobs) Watch(opts v1.ListOptions) (watch.Interface, error) {
	// opts is a by-value copy, so forcing Watch here does not touch the
	// caller's options.
	opts.Watch = true
	return c.client.Get().
		Namespace(c.ns).
		Resource("chainerjobs").
		VersionedParams(&opts, scheme.ParameterCodec).
		Watch()
}

// Create takes the representation of a chainerJob and creates it.  Returns the server's representation of the chainerJob, and an error, if there is any.
func (c *chainerJobs) Create(chainerJob *v1alpha1.ChainerJob) (result *v1alpha1.ChainerJob, err error) {
	result = &v1alpha1.ChainerJob{}
	err = c.client.Post().
		Namespace(c.ns).
		Resource("chainerjobs").
		Body(chainerJob).
		Do().
		Into(result)
	return
}

// Update takes the representation of a chainerJob and updates it. Returns the server's representation of the chainerJob, and an error, if there is any.
func (c *chainerJobs) Update(chainerJob *v1alpha1.ChainerJob) (result *v1alpha1.ChainerJob, err error) {
	result = &v1alpha1.ChainerJob{}
	// The target resource name is taken from the object being sent.
	err = c.client.Put().
		Namespace(c.ns).
		Resource("chainerjobs").
		Name(chainerJob.Name).
		Body(chainerJob).
		Do().
		Into(result)
	return
}

// Delete takes name of the chainerJob and deletes it. Returns an error if one occurs.
func (c *chainerJobs) Delete(name string, options *v1.DeleteOptions) error {
	// options is sent as the request body.
	return c.client.Delete().
		Namespace(c.ns).
		Resource("chainerjobs").
		Name(name).
		Body(options).
		Do().
		Error()
}

// DeleteCollection deletes a collection of objects.
func (c *chainerJobs) DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error {
	// listOptions goes into the query parameters; options is the request body.
	return c.client.Delete().
		Namespace(c.ns).
		Resource("chainerjobs").
		VersionedParams(&listOptions, scheme.ParameterCodec).
		Body(options).
		Do().
		Error()
}

// Patch applies the patch and returns the patched chainerJob.
func (c *chainerJobs) Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha1.ChainerJob, err error) {
	result = &v1alpha1.ChainerJob{}
	err = c.client.Patch(pt).
		Namespace(c.ns).
		Resource("chainerjobs").
		SubResource(subresources...).
		Name(name).
		Body(data).
		Do().
		Into(result)
	return
}
7 | // You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | // Code generated by deepcopy-gen. DO NOT EDIT. 18 | 19 | package v1alpha1 20 | 21 | import ( 22 | runtime "k8s.io/apimachinery/pkg/runtime" 23 | ) 24 | 25 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 26 | func (in *ChainerJob) DeepCopyInto(out *ChainerJob) { 27 | *out = *in 28 | out.TypeMeta = in.TypeMeta 29 | in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) 30 | in.Spec.DeepCopyInto(&out.Spec) 31 | in.Status.DeepCopyInto(&out.Status) 32 | return 33 | } 34 | 35 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ChainerJob. 36 | func (in *ChainerJob) DeepCopy() *ChainerJob { 37 | if in == nil { 38 | return nil 39 | } 40 | out := new(ChainerJob) 41 | in.DeepCopyInto(out) 42 | return out 43 | } 44 | 45 | // DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 46 | func (in *ChainerJob) DeepCopyObject() runtime.Object { 47 | if c := in.DeepCopy(); c != nil { 48 | return c 49 | } 50 | return nil 51 | } 52 | 53 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
54 | func (in *ChainerJobList) DeepCopyInto(out *ChainerJobList) { 55 | *out = *in 56 | out.TypeMeta = in.TypeMeta 57 | out.ListMeta = in.ListMeta 58 | if in.Items != nil { 59 | in, out := &in.Items, &out.Items 60 | *out = make([]ChainerJob, len(*in)) 61 | for i := range *in { 62 | (*in)[i].DeepCopyInto(&(*out)[i]) 63 | } 64 | } 65 | return 66 | } 67 | 68 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ChainerJobList. 69 | func (in *ChainerJobList) DeepCopy() *ChainerJobList { 70 | if in == nil { 71 | return nil 72 | } 73 | out := new(ChainerJobList) 74 | in.DeepCopyInto(out) 75 | return out 76 | } 77 | 78 | // DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 79 | func (in *ChainerJobList) DeepCopyObject() runtime.Object { 80 | if c := in.DeepCopy(); c != nil { 81 | return c 82 | } 83 | return nil 84 | } 85 | 86 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 87 | func (in *ChainerJobSpec) DeepCopyInto(out *ChainerJobSpec) { 88 | *out = *in 89 | in.Master.DeepCopyInto(&out.Master) 90 | if in.WorkerSets != nil { 91 | in, out := &in.WorkerSets, &out.WorkerSets 92 | *out = make(map[string]*WorkerSetSpec, len(*in)) 93 | for key, val := range *in { 94 | if val == nil { 95 | (*out)[key] = nil 96 | } else { 97 | (*out)[key] = new(WorkerSetSpec) 98 | val.DeepCopyInto((*out)[key]) 99 | } 100 | } 101 | } 102 | return 103 | } 104 | 105 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ChainerJobSpec. 106 | func (in *ChainerJobSpec) DeepCopy() *ChainerJobSpec { 107 | if in == nil { 108 | return nil 109 | } 110 | out := new(ChainerJobSpec) 111 | in.DeepCopyInto(out) 112 | return out 113 | } 114 | 115 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
116 | func (in *MPIConfig) DeepCopyInto(out *MPIConfig) { 117 | *out = *in 118 | if in.Slots != nil { 119 | in, out := &in.Slots, &out.Slots 120 | if *in == nil { 121 | *out = nil 122 | } else { 123 | *out = new(int32) 124 | **out = **in 125 | } 126 | } 127 | return 128 | } 129 | 130 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MPIConfig. 131 | func (in *MPIConfig) DeepCopy() *MPIConfig { 132 | if in == nil { 133 | return nil 134 | } 135 | out := new(MPIConfig) 136 | in.DeepCopyInto(out) 137 | return out 138 | } 139 | 140 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 141 | func (in *MasterSpec) DeepCopyInto(out *MasterSpec) { 142 | *out = *in 143 | if in.ActiveDeadlineSeconds != nil { 144 | in, out := &in.ActiveDeadlineSeconds, &out.ActiveDeadlineSeconds 145 | if *in == nil { 146 | *out = nil 147 | } else { 148 | *out = new(int64) 149 | **out = **in 150 | } 151 | } 152 | if in.BackoffLimit != nil { 153 | in, out := &in.BackoffLimit, &out.BackoffLimit 154 | if *in == nil { 155 | *out = nil 156 | } else { 157 | *out = new(int32) 158 | **out = **in 159 | } 160 | } 161 | if in.MPIConfig != nil { 162 | in, out := &in.MPIConfig, &out.MPIConfig 163 | if *in == nil { 164 | *out = nil 165 | } else { 166 | *out = new(MPIConfig) 167 | (*in).DeepCopyInto(*out) 168 | } 169 | } 170 | in.Template.DeepCopyInto(&out.Template) 171 | return 172 | } 173 | 174 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MasterSpec. 175 | func (in *MasterSpec) DeepCopy() *MasterSpec { 176 | if in == nil { 177 | return nil 178 | } 179 | out := new(MasterSpec) 180 | in.DeepCopyInto(out) 181 | return out 182 | } 183 | 184 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
185 | func (in *WorkerSetSpec) DeepCopyInto(out *WorkerSetSpec) { 186 | *out = *in 187 | if in.Replicas != nil { 188 | in, out := &in.Replicas, &out.Replicas 189 | if *in == nil { 190 | *out = nil 191 | } else { 192 | *out = new(int32) 193 | **out = **in 194 | } 195 | } 196 | if in.MPIConfig != nil { 197 | in, out := &in.MPIConfig, &out.MPIConfig 198 | if *in == nil { 199 | *out = nil 200 | } else { 201 | *out = new(MPIConfig) 202 | (*in).DeepCopyInto(*out) 203 | } 204 | } 205 | in.Template.DeepCopyInto(&out.Template) 206 | return 207 | } 208 | 209 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkerSetSpec. 210 | func (in *WorkerSetSpec) DeepCopy() *WorkerSetSpec { 211 | if in == nil { 212 | return nil 213 | } 214 | out := new(WorkerSetSpec) 215 | in.DeepCopyInto(out) 216 | return out 217 | } 218 | -------------------------------------------------------------------------------- /pkg/client/informers/externalversions/factory.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by informer-gen. DO NOT EDIT. 
16 | 17 | package externalversions 18 | 19 | import ( 20 | reflect "reflect" 21 | sync "sync" 22 | time "time" 23 | 24 | versioned "github.com/kubeflow/chainer-operator/pkg/client/clientset/versioned" 25 | chainer "github.com/kubeflow/chainer-operator/pkg/client/informers/externalversions/chainer" 26 | internalinterfaces "github.com/kubeflow/chainer-operator/pkg/client/informers/externalversions/internalinterfaces" 27 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 28 | runtime "k8s.io/apimachinery/pkg/runtime" 29 | schema "k8s.io/apimachinery/pkg/runtime/schema" 30 | cache "k8s.io/client-go/tools/cache" 31 | ) 32 | 33 | // SharedInformerOption defines the functional option type for SharedInformerFactory. 34 | type SharedInformerOption func(*sharedInformerFactory) *sharedInformerFactory 35 | 36 | type sharedInformerFactory struct { 37 | client versioned.Interface 38 | namespace string 39 | tweakListOptions internalinterfaces.TweakListOptionsFunc 40 | lock sync.Mutex 41 | defaultResync time.Duration 42 | customResync map[reflect.Type]time.Duration 43 | 44 | informers map[reflect.Type]cache.SharedIndexInformer 45 | // startedInformers is used for tracking which informers have been started. 46 | // This allows Start() to be called multiple times safely. 47 | startedInformers map[reflect.Type]bool 48 | } 49 | 50 | // WithCustomResyncConfig sets a custom resync period for the specified informer types. 51 | func WithCustomResyncConfig(resyncConfig map[v1.Object]time.Duration) SharedInformerOption { 52 | return func(factory *sharedInformerFactory) *sharedInformerFactory { 53 | for k, v := range resyncConfig { 54 | factory.customResync[reflect.TypeOf(k)] = v 55 | } 56 | return factory 57 | } 58 | } 59 | 60 | // WithTweakListOptions sets a custom filter on all listers of the configured SharedInformerFactory. 
61 | func WithTweakListOptions(tweakListOptions internalinterfaces.TweakListOptionsFunc) SharedInformerOption { 62 | return func(factory *sharedInformerFactory) *sharedInformerFactory { 63 | factory.tweakListOptions = tweakListOptions 64 | return factory 65 | } 66 | } 67 | 68 | // WithNamespace limits the SharedInformerFactory to the specified namespace. 69 | func WithNamespace(namespace string) SharedInformerOption { 70 | return func(factory *sharedInformerFactory) *sharedInformerFactory { 71 | factory.namespace = namespace 72 | return factory 73 | } 74 | } 75 | 76 | // NewSharedInformerFactory constructs a new instance of sharedInformerFactory for all namespaces. 77 | func NewSharedInformerFactory(client versioned.Interface, defaultResync time.Duration) SharedInformerFactory { 78 | return NewSharedInformerFactoryWithOptions(client, defaultResync) 79 | } 80 | 81 | // NewFilteredSharedInformerFactory constructs a new instance of sharedInformerFactory. 82 | // Listers obtained via this SharedInformerFactory will be subject to the same filters 83 | // as specified here. 84 | // Deprecated: Please use NewSharedInformerFactoryWithOptions instead 85 | func NewFilteredSharedInformerFactory(client versioned.Interface, defaultResync time.Duration, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) SharedInformerFactory { 86 | return NewSharedInformerFactoryWithOptions(client, defaultResync, WithNamespace(namespace), WithTweakListOptions(tweakListOptions)) 87 | } 88 | 89 | // NewSharedInformerFactoryWithOptions constructs a new instance of a SharedInformerFactory with additional options. 
90 | func NewSharedInformerFactoryWithOptions(client versioned.Interface, defaultResync time.Duration, options ...SharedInformerOption) SharedInformerFactory { 91 | factory := &sharedInformerFactory{ 92 | client: client, 93 | namespace: v1.NamespaceAll, 94 | defaultResync: defaultResync, 95 | informers: make(map[reflect.Type]cache.SharedIndexInformer), 96 | startedInformers: make(map[reflect.Type]bool), 97 | customResync: make(map[reflect.Type]time.Duration), 98 | } 99 | 100 | // Apply all options 101 | for _, opt := range options { 102 | factory = opt(factory) 103 | } 104 | 105 | return factory 106 | } 107 | 108 | // Start initializes all requested informers. 109 | func (f *sharedInformerFactory) Start(stopCh <-chan struct{}) { 110 | f.lock.Lock() 111 | defer f.lock.Unlock() 112 | 113 | for informerType, informer := range f.informers { 114 | if !f.startedInformers[informerType] { 115 | go informer.Run(stopCh) 116 | f.startedInformers[informerType] = true 117 | } 118 | } 119 | } 120 | 121 | // WaitForCacheSync waits for all started informers' cache were synced. 122 | func (f *sharedInformerFactory) WaitForCacheSync(stopCh <-chan struct{}) map[reflect.Type]bool { 123 | informers := func() map[reflect.Type]cache.SharedIndexInformer { 124 | f.lock.Lock() 125 | defer f.lock.Unlock() 126 | 127 | informers := map[reflect.Type]cache.SharedIndexInformer{} 128 | for informerType, informer := range f.informers { 129 | if f.startedInformers[informerType] { 130 | informers[informerType] = informer 131 | } 132 | } 133 | return informers 134 | }() 135 | 136 | res := map[reflect.Type]bool{} 137 | for informType, informer := range informers { 138 | res[informType] = cache.WaitForCacheSync(stopCh, informer.HasSynced) 139 | } 140 | return res 141 | } 142 | 143 | // InternalInformerFor returns the SharedIndexInformer for obj using an internal 144 | // client. 
145 | func (f *sharedInformerFactory) InformerFor(obj runtime.Object, newFunc internalinterfaces.NewInformerFunc) cache.SharedIndexInformer { 146 | f.lock.Lock() 147 | defer f.lock.Unlock() 148 | 149 | informerType := reflect.TypeOf(obj) 150 | informer, exists := f.informers[informerType] 151 | if exists { 152 | return informer 153 | } 154 | 155 | resyncPeriod, exists := f.customResync[informerType] 156 | if !exists { 157 | resyncPeriod = f.defaultResync 158 | } 159 | 160 | informer = newFunc(f.client, resyncPeriod) 161 | f.informers[informerType] = informer 162 | 163 | return informer 164 | } 165 | 166 | // SharedInformerFactory provides shared informers for resources in all known 167 | // API group versions. 168 | type SharedInformerFactory interface { 169 | internalinterfaces.SharedInformerFactory 170 | ForResource(resource schema.GroupVersionResource) (GenericInformer, error) 171 | WaitForCacheSync(stopCh <-chan struct{}) map[reflect.Type]bool 172 | 173 | Kubeflow() chainer.Interface 174 | } 175 | 176 | func (f *sharedInformerFactory) Kubeflow() chainer.Interface { 177 | return chainer.New(f, f.namespace, f.tweakListOptions) 178 | } 179 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## :warning: **kubeflow/chainer-operator is not maintained** 2 | 3 | This repository has been deprecated, and will be archived soon (Nov 30th, 2021). Please consider to user [normal Jobs](https://kubernetes.io/docs/tasks/job/) for non-distributed cases and [kubeflow/mpi-controller](https://github.com/kubeflow/mpi-operator) for distributed cases. 
4 | 5 | # K8s Custom Resource and Operator For Chainer/ChainerMN jobs 6 | 7 | __Experimental repo notice: This repository is experimental and currently only serves as a proof of concept for running distributed training with Chainer/ChainerMN on Kubernetes.__ 8 | 9 | ## Overview 10 | 11 | `ChainerJob` provides a Kubernetes custom resource that makes it easy to run distributed or non-distributed Chainer jobs on Kubernetes. 12 | 13 | Using a Custom Resource Definition (CRD) gives users the ability to create and manage Chainer Jobs just like builtin K8s resources. For example, to create a job: 14 | 15 | ```bash 16 | $ kubectl create -f examples/chainerjob.yaml 17 | chainerjob.kubeflow.org "example-job" created 18 | ``` 19 | 20 | To list chainer jobs: 21 | 22 | ```bash 23 | $ kubectl get chainerjobs 24 | NAME AGE 25 | example-job 12s 26 | ``` 27 | 28 | ## Installing the ChainerJob CRD and its operator on your k8s cluster 29 | 30 | ```bash 31 | kubectl create -f deploy/ 32 | ``` 33 | 34 | This will create: 35 | 36 | - `ChainerJob` Custom Resource Definition (CRD) 37 | - `chainer-operator` namespace 38 | - RBAC related resources 39 | - `ServiceAccount` 40 | - `ClusterRole` 41 | - please see [`2-rbac.yaml`](deploy/2-rbac.yaml) for detailed authorized operations 42 | - `ClusterRoleBinding` 43 | - `Deployment` for the chainer-operator 44 | 45 | ## Creating a ChainerJob 46 | 47 | Once the `ChainerJob` CRD is defined and the operator is up, you can create a job by defining a `ChainerJob` custom resource. 
48 | 49 | ```bash 50 | kubectl create -f examples/chainerjob-mn.yaml 51 | ``` 52 | 53 | In this case the job spec looks like this: 54 | 55 | ```yaml 56 | apiVersion: kubeflow.org/v1alpha1 57 | kind: ChainerJob 58 | metadata: 59 | name: example-job-mn 60 | spec: 61 | backend: mpi 62 | master: 63 | template: 64 | spec: 65 | containers: 66 | - name: chainer 67 | image: everpeace/chainermn:1.3.0 68 | command: 69 | - sh 70 | - -c 71 | - | 72 | mpiexec -n 3 -N 1 --allow-run-as-root --display-map --mca mpi_cuda_support 0 \ 73 | python3 /train_mnist.py -e 2 -b 1000 -u 100 74 | workerSets: 75 | ws0: 76 | replicas: 2 77 | template: 78 | spec: 79 | containers: 80 | - name: chainer 81 | image: everpeace/chainermn:1.3.0 82 | command: 83 | - sh 84 | - -c 85 | - | 86 | while true; do sleep 1 & wait; done 87 | ``` 88 | 89 | `ChainerJob` consists of Master/Workers. 90 | 91 | ### master 92 | 93 | - A `ChainerJob` must have only one master 94 | - `master` is a pod (job technically) to boot your entire distributed job. 95 | - The pod must contain a container named `chainer` 96 | - `master` will be restarted automatically when it fails. You can customize retry behavior with `activeDeadlineSeconds`/`backoffLimit`. Please see [examples/chainerjob-reference.yaml](examples/chainerjob-reference.yaml) for details. 97 | 98 | ### workerSets 99 | 100 | - WorkerSet is a concept of a group of pods (statefulset technically) having homogeneous configurations. 101 | - You can define multiple WorkerSets to create heterogeneous workers. 102 | - A `ChainerJob` can have 0 or more WorkerSets. 103 | - WorkerSets can have 1 or more workers. 104 | - Workers are automatically restarted if they exit 105 | 106 | ### backend 107 | 108 | - `backend` defines the protocol used to initiate process groups and exchange tensor data among the processes. 109 | - The currently supported backend is `mpi`. 
110 | 111 | #### `backend: mpi` 112 | 113 | - The operator automatically sets up the MPI environment among `master` and `workerSets` 114 | - `hostfile` and required configurations will be generated automatically 115 | - The `slots=` clause in `hostfile` is configurable. Please see [examples/chainerjob-reference.yaml](examples/chainerjob-reference.yaml) for details. 116 | - Default value is 1 or the number of GPUs you requested in a container named `chainer`. 117 | 118 | ## Using GPUs 119 | 120 | Kubernetes supports [scheduling GPUs](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/) ([instructions on GKE](https://cloud.google.com/kubernetes-engine/docs/concepts/gpus)). 121 | 122 | Once you have a GPU-equipped cluster, you can attach the `nvidia.com/gpu` resource to your `ChainerJob` definition like this. 123 | 124 | ```yaml 125 | apiVersion: kubeflow.org/v1alpha1 126 | kind: ChainerJob 127 | metadata: 128 | name: example-job-mn 129 | spec: 130 | backend: mpi 131 | master: 132 | template: 133 | spec: 134 | containers: 135 | - name: chainer 136 | image: everpeace/chainermn:1.3.0 137 | resources: 138 | limits: 139 | nvidia.com/gpu: 1 140 | ... 141 | ``` 142 | 143 | Follow [chainer's instruction](https://docs.chainer.org/en/stable/guides/gpu.html) for using GPUs in Chainer. 144 | 145 | ## Monitoring your Job 146 | 147 | To get the status of your `ChainerJob`: 148 | 149 | ```yaml 150 | $ kubectl get chainerjobs $JOB_NAME -o yaml 151 | 152 | apiVersion: kubeflow.org/v1alpha1 153 | kind: ChainerJob 154 | ... 155 | status: 156 | completionTime: 2018-06-13T02:13:47Z 157 | conditions: 158 | - lastProbeTime: 2018-06-13T02:13:47Z 159 | lastTransitionTime: 2018-06-13T02:13:47Z 160 | status: "True" 161 | type: Complete 162 | startTime: 2018-06-13T02:04:47Z 163 | succeeded: 1 164 | ``` 165 | 166 | You can also list all the pods belonging to a `ChainerJob` by using the label `chainerjob.kubeflow.org/name`. 
167 | 168 | ```bash 169 | $ kubectl get all -l chainerjob.kubeflow.org/name=example-job-mn 170 | 171 | NAME READY STATUS RESTARTS AGE 172 | pod/example-job-mn-master-jm9qw 1/1 Running 0 1m 173 | pod/example-job-mn-workerset-ws0-0 1/1 Running 0 1m 174 | pod/example-job-mn-workerset-ws0-1 1/1 Running 0 1m 175 | 176 | NAME DESIRED CURRENT AGE 177 | statefulset.apps/example-job-mn-workerset-ws0 2 2 1m 178 | 179 | NAME DESIRED SUCCESSFUL AGE 180 | job.batch/example-job-mn-master 1 0 1m 181 | ``` 182 | 183 | ## Access to the logs 184 | 185 | Once you have the names of the pods that belong to a `ChainerJob`, you can inspect their logs in the standard ways. 186 | 187 | ```bash 188 | $ kubectl logs example-job-mn-master-jm9qw 189 | Data for JOB [41689,1] offset 0 190 | 191 | ======================== JOB MAP ======================== 192 | 193 | Data for node: example-job-mn-master-8qvk2 Num slots: 1 Max slots: 0 Num procs: 1 194 | Process OMPI jobid: [41689,1] App: 0 Process rank: 0 Bound: UNBOUND 195 | 196 | Data for node: example-job-mn-workerset-ws0-0 Num slots: 1 Max slots: 0 Num procs: 1 197 | Process OMPI jobid: [41689,1] App: 0 Process rank: 1 Bound: UNBOUND 198 | 199 | Data for node: example-job-mn-workerset-ws0-1 Num slots: 1 Max slots: 0 Num procs: 1 200 | Process OMPI jobid: [41689,1] App: 0 Process rank: 2 Bound: UNBOUND 201 | 202 | ============================================================= 203 | Warning: using naive communicator because only naive supports CPU-only execution 204 | Warning: using naive communicator because only naive supports CPU-only execution 205 | Warning: using naive communicator because only naive supports CPU-only execution 206 | ========================================== 207 | Num process (COMM_WORLD): 3 208 | Using hierarchical communicator 209 | Num unit: 100 210 | Num Minibatch-size: 1000 211 | Num epoch: 2 212 | ========================================== 213 | epoch main/loss validation/main/loss main/accuracy validation/main/accuracy elapsed_time 214 
| 1 1.68413 0.87129 0.5325 0.807938 10.3654 215 | 2 0.58754 0.403208 0.8483 0.884564 16.4705 216 | ... 217 | ``` 218 | 219 | -------------------------------------------------------------------------------- /test/workflows/components/e2e.jsonnet: -------------------------------------------------------------------------------- 1 | local k = import 'k.libsonnet'; 2 | local util = import 'util.libsonnet'; 3 | 4 | // Default parameters; the e2e component params from ksonnet are added on top and override them. 5 | local defaultParams = { 6 | project:: "kubeflow-ci", 7 | zone:: "us-east1-d", 8 | // Default registry to use. 9 | registry:: "gcr.io/" + defaultParams.project, 10 | 11 | // The image tag to use. 12 | // Defaults to a value based on the name. 13 | versionTag:: null, 14 | 15 | // The name of the secret containing GCP credentials. 16 | gcpCredentialsSecretName:: "kubeflow-testing-credentials", 17 | }; 18 | 19 | local params = defaultParams + std.extVar("__ksonnet/params").components.e2e; 20 | local namespace = params.namespace; 21 | local name = params.name; 22 | local prowEnv = util.parseEnv(params.prow_env); 23 | local bucket = params.bucket; 24 | 25 | local workflow = { 26 | // mountPath is the directory where the volume to store the test data 27 | // should be mounted. 28 | local mountPath = "/mnt/" + "test-data-volume", 29 | // testDir is the root directory for all data for a particular test run. 30 | local testDir = mountPath + "/" + name, 31 | // outputDir is the directory to sync to GCS to contain the output for this job. 32 | local outputDir = testDir + "/output", 33 | local artifactsDir = outputDir + "/artifacts", 34 | local goDir = testDir + "/go", 35 | // Source directory where all repos should be checked out 36 | local srcRootDir = testDir + "/src", 37 | // The directory containing the kubeflow/chainer-operator repo 38 | local srcDir = srcRootDir + "/kubeflow/chainer-operator", 39 | local testWorkerImage = "gcr.io/kubeflow-ci/test-worker", 40 | // The name of the NFS volume claim to use for test files. 
41 | // local nfsVolumeClaim = "kubeflow-testing"; 42 | local nfsVolumeClaim = "nfs-external", 43 | // The name to use for the volume to use to contain test data. 44 | local dataVolume = "kubeflow-test-volume", 45 | local versionTag = if params.versionTag != null then 46 | params.versionTag 47 | else name, 48 | local chainerJobImage = params.registry + "/chainer-operator:" + versionTag, 49 | 50 | // The namespace on the cluster we spin up to deploy into. 51 | local deployNamespace = "kubeflow", 52 | // The directory within the kubeflow_testing submodule containing 53 | // py scripts to use. 54 | local k8sPy = srcDir, 55 | local kubeflowPy = srcRootDir + "/kubeflow/testing/py", 56 | local kfctlDir = srcRootDir + "/kubeflow/kubeflow", 57 | 58 | local project = params.project, 59 | // GKE cluster to use 60 | // Names longer than 20 characters are cut down to 20: "z" plus the last 19 characters of name. 61 | // NOTE(review): this comment used to cite a 40-character limit; confirm which bound is intended. 62 | // We expect the suffix of the cluster name to be unique salt. 63 | // We prepend a z because cluster name must start with an alphanumeric character 64 | // and if we cut the prefix we might end up starting with "-" or other invalid 65 | // character for first character. 66 | local cluster = 67 | if std.length(name) > 20 then 68 | "z" + std.substr(name, std.length(name) - 19, 19) 69 | else 70 | name, 71 | local zone = params.zone, 72 | local registry = params.registry, 73 | local chart = srcDir + "/chainer-operator-chart", 74 | 75 | // Build an Argo template to execute a particular command. 76 | // step_name: Name for the template; image: container image the step runs in. 77 | // command: List to pass as the container command. 78 | buildTemplate(step_name, image, command) :: { 79 | name: step_name, 80 | container: { 81 | command: command, 82 | image: image, 83 | workingDir: srcDir, 84 | env: [ 85 | { 86 | // Add the source directories to the python path. 
87 | name: "PYTHONPATH", 88 | value: k8sPy + ":" + kubeflowPy, 89 | }, 90 | { 91 | // Set the GOPATH 92 | name: "GOPATH", 93 | value: goDir, 94 | }, 95 | { 96 | name: "KFCTL_DIR", 97 | value: kfctlDir, 98 | }, 99 | { 100 | name: "CLUSTER_NAME", 101 | value: cluster, 102 | }, 103 | { 104 | name: "GCP_ZONE", 105 | value: zone, 106 | }, 107 | { 108 | name: "GCP_PROJECT", 109 | value: project, 110 | }, 111 | { 112 | name: "GCP_REGISTRY", 113 | value: registry, 114 | }, 115 | { 116 | name: "DEPLOY_NAMESPACE", 117 | value: deployNamespace, 118 | }, 119 | { 120 | name: "GOOGLE_APPLICATION_CREDENTIALS", 121 | value: "/secret/gcp-credentials/key.json", 122 | }, 123 | { 124 | name: "GIT_TOKEN", 125 | valueFrom: { 126 | secretKeyRef: { 127 | name: "github-token", 128 | key: "github_token", 129 | }, 130 | }, 131 | }, 132 | ] + prowEnv, 133 | volumeMounts: [ 134 | { 135 | name: dataVolume, 136 | mountPath: mountPath, 137 | }, 138 | { 139 | name: "github-token", 140 | mountPath: "/secret/github-token", 141 | }, 142 | { 143 | name: "gcp-credentials", 144 | mountPath: "/secret/gcp-credentials", 145 | }, 146 | ], 147 | }, 148 | }, // buildTemplate 149 | 150 | apiVersion: "argoproj.io/v1alpha1", 151 | kind: "Workflow", 152 | metadata: { 153 | name: name, 154 | namespace: namespace, 155 | }, 156 | // Cleanup (teardown-cluster, copy-artifacts) runs through the onExit handler set in this spec. 157 | spec: { 158 | entrypoint: "e2e", 159 | volumes: [ 160 | { 161 | name: "github-token", 162 | secret: { 163 | secretName: "github-token", 164 | }, 165 | }, 166 | { 167 | name: "gcp-credentials", 168 | secret: { 169 | secretName: params.gcpCredentialsSecretName, 170 | }, 171 | }, 172 | { 173 | name: dataVolume, 174 | persistentVolumeClaim: { 175 | claimName: nfsVolumeClaim, 176 | }, 177 | }, 178 | ], // volumes 179 | // onExit specifies the template that should always run when the workflow completes. 
180 | onExit: "exit-handler", 181 | templates: [ 182 | { 183 | name: "e2e", 184 | steps: [ 185 | [{ 186 | name: "checkout", 187 | template: "checkout", 188 | }], 189 | [ 190 | { 191 | name: "build", 192 | template: "build", 193 | }, 194 | { 195 | name: "create-pr-symlink", 196 | template: "create-pr-symlink", 197 | }, 198 | ], 199 | [ // Setup cluster needs to run after build because we depend on the chart 200 | // created by the build statement. 201 | { 202 | name: "setup-cluster", 203 | template: "setup-cluster", 204 | }, 205 | ], 206 | [ 207 | { 208 | name: "run-tests", 209 | template: "run-tests", 210 | }, 211 | ], 212 | ], 213 | }, 214 | { 215 | name: "exit-handler", 216 | steps: [ 217 | [{ 218 | name: "teardown-cluster", 219 | template: "teardown-cluster", 220 | 221 | }], 222 | [{ 223 | name: "copy-artifacts", 224 | template: "copy-artifacts", 225 | }], 226 | ], 227 | }, 228 | { 229 | name: "checkout", 230 | container: { 231 | command: [ 232 | "/usr/local/bin/checkout.sh", 233 | srcRootDir, 234 | ], 235 | env: prowEnv + [{ 236 | name: "EXTRA_REPOS", 237 | value: "kubeflow/kubeflow@HEAD;kubeflow/testing@HEAD", 238 | }], 239 | image: testWorkerImage, 240 | volumeMounts: [ 241 | { 242 | name: dataVolume, 243 | mountPath: mountPath, 244 | }, 245 | ], 246 | }, 247 | }, // checkout 248 | workflow.buildTemplate("setup-cluster",testWorkerImage, [ 249 | "scripts/create-cluster.sh", 250 | ]), // setup cluster 251 | workflow.buildTemplate("run-tests", testWorkerImage, [ 252 | "scripts/run-test.sh", 253 | ]), // run tests 254 | workflow.buildTemplate("create-pr-symlink", testWorkerImage, [ 255 | "python", 256 | "-m", 257 | "kubeflow.testing.prow_artifacts", 258 | "--artifacts_dir=" + outputDir, 259 | "create_pr_symlink", 260 | "--bucket=" + bucket, 261 | ]), // create-pr-symlink 262 | workflow.buildTemplate("teardown-cluster",testWorkerImage, [ 263 | "scripts/delete-cluster.sh", 264 | ]), // teardown cluster 265 | workflow.buildTemplate("copy-artifacts", testWorkerImage, 
[ 266 | "python", 267 | "-m", 268 | "kubeflow.testing.prow_artifacts", 269 | "--artifacts_dir=" + outputDir, 270 | "copy_artifacts", 271 | "--bucket=" + bucket, 272 | ]), // copy-artifacts 273 | workflow.buildTemplate("build", testWorkerImage, [ 274 | "scripts/build.sh", 275 | ]), // build 276 | ], // templates 277 | }, 278 | }; 279 | 280 | std.prune(k.core.v1.list.new([workflow])) 281 | -------------------------------------------------------------------------------- /pkg/apis/chainer/v1alpha1/defaults_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package v1alpha1 16 | 17 | import ( 18 | "encoding/json" 19 | "fmt" 20 | "reflect" 21 | "testing" 22 | 23 | "k8s.io/apimachinery/pkg/api/resource" 24 | 25 | corev1 "k8s.io/api/core/v1" 26 | ) 27 | 28 | func TestSetDefaults_ChainerJob(t *testing.T) { 29 | type testCase struct { 30 | name string 31 | in *ChainerJob 32 | expected *ChainerJob 33 | } 34 | 35 | testCases := []testCase{ 36 | { 37 | name: "normal case(backend, slots, container name, replicas, restartPolicy)", 38 | in: &ChainerJob{ 39 | Spec: ChainerJobSpec{ 40 | Master: MasterSpec{ 41 | Template: corev1.PodTemplateSpec{ 42 | Spec: corev1.PodSpec{ 43 | Containers: []corev1.Container{ 44 | { 45 | Image: "dummy", 46 | }, 47 | }, 48 | }, 49 | }, 50 | }, 51 | WorkerSets: map[string]*WorkerSetSpec{ 52 | "ws": &WorkerSetSpec{ 53 | Template: corev1.PodTemplateSpec{ 54 | Spec: corev1.PodSpec{ 55 | Containers: []corev1.Container{ 56 | { 57 | Image: "dummy", 58 | }, 59 | }, 60 | }, 61 | }, 62 | }, 63 | }, 64 | }, 65 | }, 66 | expected: &ChainerJob{ 67 | Spec: ChainerJobSpec{ 68 | Backend: BackendTypeMPI, 69 | Master: MasterSpec{ 70 | MPIConfig: &MPIConfig{ 71 | Slots: Int32(DefaultSlots), 72 | }, 73 | Template: corev1.PodTemplateSpec{ 74 | Spec: corev1.PodSpec{ 75 | RestartPolicy: DefaultRestartPolicy, 76 | Containers: []corev1.Container{ 77 | { 78 | Name: DefaultContainerName, 79 | Image: "dummy", 80 | }, 81 | }, 82 | }, 83 | }, 84 | }, 85 | WorkerSets: map[string]*WorkerSetSpec{ 86 | "ws": &WorkerSetSpec{ 87 | Replicas: Int32(1), 88 | MPIConfig: &MPIConfig{ 89 | Slots: Int32(DefaultSlots), 90 | }, 91 | Template: corev1.PodTemplateSpec{ 92 | Spec: corev1.PodSpec{ 93 | Containers: []corev1.Container{ 94 | { 95 | Name: DefaultContainerName, 96 | Image: "dummy", 97 | }, 98 | }, 99 | }, 100 | }, 101 | }, 102 | }, 103 | }, 104 | }, 105 | }, 106 | { 107 | name: "only master (container name, replicas, restartPolicy)", 108 | in: &ChainerJob{ 109 | Spec: ChainerJobSpec{ 110 | Master: MasterSpec{ 111 | Template: 
corev1.PodTemplateSpec{ 112 | Spec: corev1.PodSpec{ 113 | Containers: []corev1.Container{ 114 | { 115 | Image: "dummy", 116 | }, 117 | }, 118 | }, 119 | }, 120 | }, 121 | }, 122 | }, 123 | expected: &ChainerJob{ 124 | Spec: ChainerJobSpec{ 125 | Master: MasterSpec{ 126 | Template: corev1.PodTemplateSpec{ 127 | Spec: corev1.PodSpec{ 128 | RestartPolicy: DefaultRestartPolicy, 129 | Containers: []corev1.Container{ 130 | { 131 | Name: DefaultContainerName, 132 | Image: "dummy", 133 | }, 134 | }, 135 | }, 136 | }, 137 | }, 138 | }, 139 | }, 140 | }, 141 | { 142 | name: "nvidia.com/gpus to be slots", 143 | in: &ChainerJob{ 144 | Spec: ChainerJobSpec{ 145 | Backend: BackendTypeMPI, 146 | Master: MasterSpec{ 147 | Template: corev1.PodTemplateSpec{ 148 | Spec: corev1.PodSpec{ 149 | Containers: []corev1.Container{ 150 | { 151 | Image: "dummy", 152 | Resources: corev1.ResourceRequirements{ 153 | Limits: corev1.ResourceList{ 154 | "nvidia.com/gpu": resource.MustParse("2"), 155 | }, 156 | }, 157 | }, 158 | }, 159 | }, 160 | }, 161 | }, 162 | WorkerSets: map[string]*WorkerSetSpec{ 163 | "ws": &WorkerSetSpec{ 164 | Template: corev1.PodTemplateSpec{ 165 | Spec: corev1.PodSpec{ 166 | Containers: []corev1.Container{ 167 | { 168 | Image: "dummy", 169 | Resources: corev1.ResourceRequirements{ 170 | Limits: corev1.ResourceList{ 171 | "nvidia.com/gpu": resource.MustParse("3"), 172 | }, 173 | }, 174 | }, 175 | }, 176 | }, 177 | }, 178 | }, 179 | }, 180 | }, 181 | }, 182 | expected: &ChainerJob{ 183 | Spec: ChainerJobSpec{ 184 | Backend: BackendTypeMPI, 185 | Master: MasterSpec{ 186 | MPIConfig: &MPIConfig{ 187 | Slots: Int32(2), 188 | }, 189 | Template: corev1.PodTemplateSpec{ 190 | Spec: corev1.PodSpec{ 191 | RestartPolicy: DefaultRestartPolicy, 192 | Containers: []corev1.Container{ 193 | { 194 | Name: DefaultContainerName, 195 | Image: "dummy", 196 | Resources: corev1.ResourceRequirements{ 197 | Limits: corev1.ResourceList{ 198 | "nvidia.com/gpu": resource.MustParse("2"), 199 | }, 200 
| }, 201 | }, 202 | }, 203 | }, 204 | }, 205 | }, 206 | WorkerSets: map[string]*WorkerSetSpec{ 207 | "ws": &WorkerSetSpec{ 208 | MPIConfig: &MPIConfig{ 209 | Slots: Int32(3), 210 | }, 211 | Replicas: Int32(1), 212 | Template: corev1.PodTemplateSpec{ 213 | Spec: corev1.PodSpec{ 214 | Containers: []corev1.Container{ 215 | { 216 | Name: DefaultContainerName, 217 | Image: "dummy", 218 | Resources: corev1.ResourceRequirements{ 219 | Limits: corev1.ResourceList{ 220 | "nvidia.com/gpu": resource.MustParse("3"), 221 | }, 222 | }, 223 | }, 224 | }, 225 | }, 226 | }, 227 | }, 228 | }, 229 | }, 230 | }, 231 | }, 232 | { 233 | name: "multiple containers case doensn't set default ContainerName but set default Slots", 234 | in: &ChainerJob{ 235 | Spec: ChainerJobSpec{ 236 | Backend: BackendTypeMPI, 237 | Master: MasterSpec{ 238 | Template: corev1.PodTemplateSpec{ 239 | Spec: corev1.PodSpec{ 240 | Containers: []corev1.Container{ 241 | { 242 | Image: "dummy", 243 | }, 244 | { 245 | Image: "dummy", 246 | }, 247 | }, 248 | }, 249 | }, 250 | }, 251 | WorkerSets: map[string]*WorkerSetSpec{ 252 | "ws": &WorkerSetSpec{ 253 | Template: corev1.PodTemplateSpec{ 254 | Spec: corev1.PodSpec{ 255 | Containers: []corev1.Container{ 256 | { 257 | Image: "dummy", 258 | }, 259 | { 260 | Image: "dummy", 261 | }, 262 | }, 263 | }, 264 | }, 265 | }, 266 | }, 267 | }, 268 | }, 269 | expected: &ChainerJob{ 270 | Spec: ChainerJobSpec{ 271 | Backend: BackendTypeMPI, 272 | Master: MasterSpec{ 273 | MPIConfig: &MPIConfig{ 274 | Slots: Int32(DefaultSlots), 275 | }, 276 | Template: corev1.PodTemplateSpec{ 277 | Spec: corev1.PodSpec{ 278 | RestartPolicy: DefaultRestartPolicy, 279 | Containers: []corev1.Container{ 280 | { 281 | Image: "dummy", 282 | }, 283 | { 284 | Image: "dummy", 285 | }, 286 | }, 287 | }, 288 | }, 289 | }, 290 | WorkerSets: map[string]*WorkerSetSpec{ 291 | "ws": &WorkerSetSpec{ 292 | MPIConfig: &MPIConfig{ 293 | Slots: Int32(DefaultSlots), 294 | }, 295 | Replicas: Int32(1), 296 | Template: 
corev1.PodTemplateSpec{ 297 | Spec: corev1.PodSpec{ 298 | Containers: []corev1.Container{ 299 | { 300 | Image: "dummy", 301 | }, 302 | { 303 | Image: "dummy", 304 | }, 305 | }, 306 | }, 307 | }, 308 | }, 309 | }, 310 | }, 311 | }, 312 | }, 313 | { 314 | name: "everyghing is already filled", 315 | in: &ChainerJob{ 316 | Spec: ChainerJobSpec{ 317 | Backend: BackendTypeMPI, 318 | Master: MasterSpec{ 319 | MPIConfig: &MPIConfig{ 320 | Slots: Int32(3), 321 | }, 322 | Template: corev1.PodTemplateSpec{ 323 | Spec: corev1.PodSpec{ 324 | RestartPolicy: corev1.RestartPolicyOnFailure, 325 | Containers: []corev1.Container{ 326 | { 327 | Name: "dummy", 328 | Image: "dummy", 329 | }, 330 | }, 331 | }, 332 | }, 333 | }, 334 | WorkerSets: map[string]*WorkerSetSpec{ 335 | "ws": &WorkerSetSpec{ 336 | Replicas: Int32(5), 337 | MPIConfig: &MPIConfig{ 338 | Slots: Int32(2), 339 | }, 340 | Template: corev1.PodTemplateSpec{ 341 | Spec: corev1.PodSpec{ 342 | Containers: []corev1.Container{ 343 | { 344 | Name: "dummy", 345 | Image: "dummy", 346 | }, 347 | }, 348 | }, 349 | }, 350 | }, 351 | }, 352 | }, 353 | }, 354 | expected: &ChainerJob{ 355 | Spec: ChainerJobSpec{ 356 | Backend: BackendTypeMPI, 357 | Master: MasterSpec{ 358 | MPIConfig: &MPIConfig{ 359 | Slots: Int32(3), 360 | }, 361 | Template: corev1.PodTemplateSpec{ 362 | Spec: corev1.PodSpec{ 363 | RestartPolicy: corev1.RestartPolicyOnFailure, 364 | Containers: []corev1.Container{ 365 | { 366 | Name: "dummy", 367 | Image: "dummy", 368 | }, 369 | }, 370 | }, 371 | }, 372 | }, 373 | WorkerSets: map[string]*WorkerSetSpec{ 374 | "ws": &WorkerSetSpec{ 375 | Replicas: Int32(5), 376 | MPIConfig: &MPIConfig{ 377 | Slots: Int32(2), 378 | }, 379 | Template: corev1.PodTemplateSpec{ 380 | Spec: corev1.PodSpec{ 381 | Containers: []corev1.Container{ 382 | { 383 | Name: "dummy", 384 | Image: "dummy", 385 | }, 386 | }, 387 | }, 388 | }, 389 | }, 390 | }, 391 | }, 392 | }, 393 | }, 394 | } 395 | 396 | for _, c := range testCases { 397 | 
SetDefaults_ChainerJob(c.in) 398 | actualJSON, errActual := json.MarshalIndent(c.in, "", " ") 399 | actual := "" 400 | if errActual != nil { 401 | t.Errorf("Couldn't pretty format %v, error: %v", c.in, errActual) 402 | actual = fmt.Sprintf("%+v", c.in) 403 | } else { 404 | actual = string(actualJSON) 405 | } 406 | 407 | expectedJSON, errExpected := json.MarshalIndent(c.expected, "", " ") 408 | expected := "" 409 | if errExpected != nil { 410 | t.Errorf("Couldn't pretty format %v, error: %v", c.expected, errExpected) 411 | expected = fmt.Sprintf("%v", c.expected) 412 | } else { 413 | expected = string(expectedJSON) 414 | } 415 | 416 | if !reflect.DeepEqual(actual, expected) { 417 | t.Errorf("TestCase: %v\nWant:%+v\nGot:%+v", c.name, expected, actual) 418 | } 419 | } 420 | } 421 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Gopkg.lock: -------------------------------------------------------------------------------- 1 | # This file is autogenerated, do not edit; changes may be undone by the next 'dep ensure'. 
2 | 3 | 4 | [[projects]] 5 | name = "github.com/davecgh/go-spew" 6 | packages = ["spew"] 7 | revision = "346938d642f2ec3594ed81d874461961cd0faa76" 8 | version = "v1.1.0" 9 | 10 | [[projects]] 11 | name = "github.com/ghodss/yaml" 12 | packages = ["."] 13 | revision = "0ca9ea5df5451ffdf184b4428c902747c2c11cd7" 14 | version = "v1.0.0" 15 | 16 | [[projects]] 17 | name = "github.com/gogo/protobuf" 18 | packages = [ 19 | "proto", 20 | "sortkeys" 21 | ] 22 | revision = "1adfc126b41513cc696b209667c8656ea7aac67c" 23 | version = "v1.0.0" 24 | 25 | [[projects]] 26 | branch = "master" 27 | name = "github.com/golang/glog" 28 | packages = ["."] 29 | revision = "23def4e6c14b4da8ac2ed8007337bc5eb5007998" 30 | 31 | [[projects]] 32 | branch = "master" 33 | name = "github.com/golang/groupcache" 34 | packages = ["lru"] 35 | revision = "24b0969c4cb722950103eed87108c8d291a8df00" 36 | 37 | [[projects]] 38 | name = "github.com/golang/protobuf" 39 | packages = [ 40 | "proto", 41 | "ptypes", 42 | "ptypes/any", 43 | "ptypes/duration", 44 | "ptypes/timestamp" 45 | ] 46 | revision = "b4deda0973fb4c70b50d226b1af49f3da59f5265" 47 | version = "v1.1.0" 48 | 49 | [[projects]] 50 | branch = "master" 51 | name = "github.com/google/gofuzz" 52 | packages = ["."] 53 | revision = "24818f796faf91cd76ec7bddd72458fbced7a6c1" 54 | 55 | [[projects]] 56 | name = "github.com/googleapis/gnostic" 57 | packages = [ 58 | "OpenAPIv2", 59 | "compiler", 60 | "extensions" 61 | ] 62 | revision = "7c663266750e7d82587642f65e60bc4083f1f84e" 63 | version = "v0.2.0" 64 | 65 | [[projects]] 66 | branch = "master" 67 | name = "github.com/hashicorp/golang-lru" 68 | packages = [ 69 | ".", 70 | "simplelru" 71 | ] 72 | revision = "0fb14efe8c47ae851c0034ed7a448854d3d34cf3" 73 | 74 | [[projects]] 75 | branch = "master" 76 | name = "github.com/howeyc/gopass" 77 | packages = ["."] 78 | revision = "bf9dde6d0d2c004a008c27aaee91170c786f6db8" 79 | 80 | [[projects]] 81 | name = "github.com/imdario/mergo" 82 | packages = ["."] 83 | revision 
= "9d5f1277e9a8ed20c3684bda8fde67c05628518c" 84 | version = "v0.3.4" 85 | 86 | [[projects]] 87 | name = "github.com/json-iterator/go" 88 | packages = ["."] 89 | revision = "ca39e5af3ece67bbcda3d0f4f56a8e24d9f2dad4" 90 | version = "1.1.3" 91 | 92 | [[projects]] 93 | name = "github.com/modern-go/concurrent" 94 | packages = ["."] 95 | revision = "bacd9c7ef1dd9b15be4a9909b8ac7a4e313eec94" 96 | version = "1.0.3" 97 | 98 | [[projects]] 99 | name = "github.com/modern-go/reflect2" 100 | packages = ["."] 101 | revision = "1df9eeb2bb81f327b96228865c5687bc2194af3f" 102 | version = "1.0.0" 103 | 104 | [[projects]] 105 | name = "github.com/spf13/pflag" 106 | packages = ["."] 107 | revision = "583c0c0531f06d5278b7d917446061adc344b5cd" 108 | version = "v1.0.1" 109 | 110 | [[projects]] 111 | branch = "master" 112 | name = "golang.org/x/crypto" 113 | packages = ["ssh/terminal"] 114 | revision = "b47b1587369238182299fe4dad77d05b8b461e06" 115 | 116 | [[projects]] 117 | branch = "master" 118 | name = "golang.org/x/net" 119 | packages = [ 120 | "context", 121 | "http/httpguts", 122 | "http2", 123 | "http2/hpack", 124 | "idna" 125 | ] 126 | revision = "1e491301e022f8f977054da4c2d852decd59571f" 127 | 128 | [[projects]] 129 | branch = "master" 130 | name = "golang.org/x/sys" 131 | packages = [ 132 | "unix", 133 | "windows" 134 | ] 135 | revision = "9527bec2660bd847c050fda93a0f0c6dee0800bb" 136 | 137 | [[projects]] 138 | name = "golang.org/x/text" 139 | packages = [ 140 | "collate", 141 | "collate/build", 142 | "internal/colltab", 143 | "internal/gen", 144 | "internal/tag", 145 | "internal/triegen", 146 | "internal/ucd", 147 | "language", 148 | "secure/bidirule", 149 | "transform", 150 | "unicode/bidi", 151 | "unicode/cldr", 152 | "unicode/norm", 153 | "unicode/rangetable" 154 | ] 155 | revision = "f21a4dfb5e38f5895301dc265a8def02365cc3d0" 156 | version = "v0.3.0" 157 | 158 | [[projects]] 159 | branch = "master" 160 | name = "golang.org/x/time" 161 | packages = ["rate"] 162 | revision = 
"fbb02b2291d28baffd63558aa44b4b56f178d650" 163 | 164 | [[projects]] 165 | branch = "master" 166 | name = "golang.org/x/tools" 167 | packages = [ 168 | "go/ast/astutil", 169 | "imports", 170 | "internal/fastwalk" 171 | ] 172 | revision = "a5b4c53f6e8bdcafa95a94671bf2d1203365858b" 173 | 174 | [[projects]] 175 | name = "gopkg.in/inf.v0" 176 | packages = ["."] 177 | revision = "d2d2541c53f18d2a059457998ce2876cc8e67cbf" 178 | version = "v0.9.1" 179 | 180 | [[projects]] 181 | name = "gopkg.in/yaml.v2" 182 | packages = ["."] 183 | revision = "5420a8b6744d3b0345ab293f6fcba19c978f1183" 184 | version = "v2.2.1" 185 | 186 | [[projects]] 187 | name = "k8s.io/api" 188 | packages = [ 189 | "admissionregistration/v1alpha1", 190 | "admissionregistration/v1beta1", 191 | "apps/v1", 192 | "apps/v1beta1", 193 | "apps/v1beta2", 194 | "authentication/v1", 195 | "authentication/v1beta1", 196 | "authorization/v1", 197 | "authorization/v1beta1", 198 | "autoscaling/v1", 199 | "autoscaling/v2beta1", 200 | "batch/v1", 201 | "batch/v1beta1", 202 | "batch/v2alpha1", 203 | "certificates/v1beta1", 204 | "core/v1", 205 | "events/v1beta1", 206 | "extensions/v1beta1", 207 | "networking/v1", 208 | "policy/v1beta1", 209 | "rbac/v1", 210 | "rbac/v1alpha1", 211 | "rbac/v1beta1", 212 | "scheduling/v1alpha1", 213 | "settings/v1alpha1", 214 | "storage/v1", 215 | "storage/v1alpha1", 216 | "storage/v1beta1" 217 | ] 218 | revision = "feb48db456a5912850dcccbd42a3535382ba76de" 219 | version = "kubernetes-1.10.3" 220 | 221 | [[projects]] 222 | name = "k8s.io/apimachinery" 223 | packages = [ 224 | "pkg/api/errors", 225 | "pkg/api/meta", 226 | "pkg/api/resource", 227 | "pkg/apis/meta/internalversion", 228 | "pkg/apis/meta/v1", 229 | "pkg/apis/meta/v1/unstructured", 230 | "pkg/apis/meta/v1beta1", 231 | "pkg/conversion", 232 | "pkg/conversion/queryparams", 233 | "pkg/fields", 234 | "pkg/labels", 235 | "pkg/runtime", 236 | "pkg/runtime/schema", 237 | "pkg/runtime/serializer", 238 | "pkg/runtime/serializer/json", 239 
| "pkg/runtime/serializer/protobuf", 240 | "pkg/runtime/serializer/recognizer", 241 | "pkg/runtime/serializer/streaming", 242 | "pkg/runtime/serializer/versioning", 243 | "pkg/selection", 244 | "pkg/types", 245 | "pkg/util/cache", 246 | "pkg/util/clock", 247 | "pkg/util/diff", 248 | "pkg/util/errors", 249 | "pkg/util/framer", 250 | "pkg/util/intstr", 251 | "pkg/util/json", 252 | "pkg/util/mergepatch", 253 | "pkg/util/net", 254 | "pkg/util/runtime", 255 | "pkg/util/sets", 256 | "pkg/util/strategicpatch", 257 | "pkg/util/validation", 258 | "pkg/util/validation/field", 259 | "pkg/util/wait", 260 | "pkg/util/yaml", 261 | "pkg/version", 262 | "pkg/watch", 263 | "third_party/forked/golang/json", 264 | "third_party/forked/golang/reflect" 265 | ] 266 | revision = "31dade610c053669d8054bfd847da657251e8c1a" 267 | version = "kubernetes-1.10.3" 268 | 269 | [[projects]] 270 | name = "k8s.io/client-go" 271 | packages = [ 272 | "discovery", 273 | "discovery/fake", 274 | "informers", 275 | "informers/admissionregistration", 276 | "informers/admissionregistration/v1alpha1", 277 | "informers/admissionregistration/v1beta1", 278 | "informers/apps", 279 | "informers/apps/v1", 280 | "informers/apps/v1beta1", 281 | "informers/apps/v1beta2", 282 | "informers/autoscaling", 283 | "informers/autoscaling/v1", 284 | "informers/autoscaling/v2beta1", 285 | "informers/batch", 286 | "informers/batch/v1", 287 | "informers/batch/v1beta1", 288 | "informers/batch/v2alpha1", 289 | "informers/certificates", 290 | "informers/certificates/v1beta1", 291 | "informers/core", 292 | "informers/core/v1", 293 | "informers/events", 294 | "informers/events/v1beta1", 295 | "informers/extensions", 296 | "informers/extensions/v1beta1", 297 | "informers/internalinterfaces", 298 | "informers/networking", 299 | "informers/networking/v1", 300 | "informers/policy", 301 | "informers/policy/v1beta1", 302 | "informers/rbac", 303 | "informers/rbac/v1", 304 | "informers/rbac/v1alpha1", 305 | "informers/rbac/v1beta1", 306 | 
"informers/scheduling", 307 | "informers/scheduling/v1alpha1", 308 | "informers/settings", 309 | "informers/settings/v1alpha1", 310 | "informers/storage", 311 | "informers/storage/v1", 312 | "informers/storage/v1alpha1", 313 | "informers/storage/v1beta1", 314 | "kubernetes", 315 | "kubernetes/scheme", 316 | "kubernetes/typed/admissionregistration/v1alpha1", 317 | "kubernetes/typed/admissionregistration/v1beta1", 318 | "kubernetes/typed/apps/v1", 319 | "kubernetes/typed/apps/v1beta1", 320 | "kubernetes/typed/apps/v1beta2", 321 | "kubernetes/typed/authentication/v1", 322 | "kubernetes/typed/authentication/v1beta1", 323 | "kubernetes/typed/authorization/v1", 324 | "kubernetes/typed/authorization/v1beta1", 325 | "kubernetes/typed/autoscaling/v1", 326 | "kubernetes/typed/autoscaling/v2beta1", 327 | "kubernetes/typed/batch/v1", 328 | "kubernetes/typed/batch/v1beta1", 329 | "kubernetes/typed/batch/v2alpha1", 330 | "kubernetes/typed/certificates/v1beta1", 331 | "kubernetes/typed/core/v1", 332 | "kubernetes/typed/events/v1beta1", 333 | "kubernetes/typed/extensions/v1beta1", 334 | "kubernetes/typed/networking/v1", 335 | "kubernetes/typed/policy/v1beta1", 336 | "kubernetes/typed/rbac/v1", 337 | "kubernetes/typed/rbac/v1alpha1", 338 | "kubernetes/typed/rbac/v1beta1", 339 | "kubernetes/typed/scheduling/v1alpha1", 340 | "kubernetes/typed/settings/v1alpha1", 341 | "kubernetes/typed/storage/v1", 342 | "kubernetes/typed/storage/v1alpha1", 343 | "kubernetes/typed/storage/v1beta1", 344 | "listers/admissionregistration/v1alpha1", 345 | "listers/admissionregistration/v1beta1", 346 | "listers/apps/v1", 347 | "listers/apps/v1beta1", 348 | "listers/apps/v1beta2", 349 | "listers/autoscaling/v1", 350 | "listers/autoscaling/v2beta1", 351 | "listers/batch/v1", 352 | "listers/batch/v1beta1", 353 | "listers/batch/v2alpha1", 354 | "listers/certificates/v1beta1", 355 | "listers/core/v1", 356 | "listers/events/v1beta1", 357 | "listers/extensions/v1beta1", 358 | "listers/networking/v1", 359 | 
"listers/policy/v1beta1", 360 | "listers/rbac/v1", 361 | "listers/rbac/v1alpha1", 362 | "listers/rbac/v1beta1", 363 | "listers/scheduling/v1alpha1", 364 | "listers/settings/v1alpha1", 365 | "listers/storage/v1", 366 | "listers/storage/v1alpha1", 367 | "listers/storage/v1beta1", 368 | "pkg/apis/clientauthentication", 369 | "pkg/apis/clientauthentication/v1alpha1", 370 | "pkg/version", 371 | "plugin/pkg/client/auth/exec", 372 | "rest", 373 | "rest/watch", 374 | "testing", 375 | "tools/auth", 376 | "tools/cache", 377 | "tools/clientcmd", 378 | "tools/clientcmd/api", 379 | "tools/clientcmd/api/latest", 380 | "tools/clientcmd/api/v1", 381 | "tools/metrics", 382 | "tools/pager", 383 | "tools/record", 384 | "tools/reference", 385 | "transport", 386 | "util/buffer", 387 | "util/cert", 388 | "util/flowcontrol", 389 | "util/homedir", 390 | "util/integer", 391 | "util/retry", 392 | "util/workqueue" 393 | ] 394 | revision = "29ae1f00c3d8bb759d6246c357573a9af3c659c1" 395 | version = "kubernetes-1.10.3" 396 | 397 | [[projects]] 398 | branch = "master" 399 | name = "k8s.io/code-generator" 400 | packages = [ 401 | "cmd/client-gen", 402 | "cmd/client-gen/args", 403 | "cmd/client-gen/generators", 404 | "cmd/client-gen/generators/fake", 405 | "cmd/client-gen/generators/scheme", 406 | "cmd/client-gen/generators/util", 407 | "cmd/client-gen/path", 408 | "cmd/client-gen/types", 409 | "pkg/util" 410 | ] 411 | revision = "4c99649af8fee16989100e4cc3e6a75143530b28" 412 | 413 | [[projects]] 414 | branch = "master" 415 | name = "k8s.io/gengo" 416 | packages = [ 417 | "args", 418 | "generator", 419 | "namer", 420 | "parser", 421 | "types" 422 | ] 423 | revision = "2e1a79edcaecf0bfbde129a1fd55624b66adb699" 424 | 425 | [[projects]] 426 | branch = "master" 427 | name = "k8s.io/kube-openapi" 428 | packages = ["pkg/util/proto"] 429 | revision = "8a9b82f00b3a86eac24681da3f9fe6c34c01cea2" 430 | 431 | [solve-meta] 432 | analyzer-name = "dep" 433 | analyzer-version = 1 434 | inputs-digest = 
"ebc13e03c2b93197e18061ad0c0ae45bf7537cb6f5d44ebe1d2e2c97c798a81e" 435 | solver-name = "gps-cdcl" 436 | solver-version = 1 437 | -------------------------------------------------------------------------------- /pkg/controllers/backends/util.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package backends 16 | 17 | import ( 18 | "fmt" 19 | 20 | apisv1alpha1 "github.com/kubeflow/chainer-operator/pkg/apis/chainer/v1alpha1" 21 | appsv1 "k8s.io/api/apps/v1" 22 | batchv1 "k8s.io/api/batch/v1" 23 | corev1 "k8s.io/api/core/v1" 24 | rbacv1 "k8s.io/api/rbac/v1" 25 | "k8s.io/apimachinery/pkg/api/errors" 26 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 27 | 28 | "k8s.io/client-go/kubernetes" 29 | appslister "k8s.io/client-go/listers/apps/v1" 30 | batchlister "k8s.io/client-go/listers/batch/v1" 31 | corelister "k8s.io/client-go/listers/core/v1" 32 | rbaclister "k8s.io/client-go/listers/rbac/v1" 33 | "k8s.io/client-go/tools/record" 34 | 35 | clientset "github.com/kubeflow/chainer-operator/pkg/client/clientset/versioned" 36 | ) 37 | 38 | func NewServiceAccount(chjob *apisv1alpha1.ChainerJob) *corev1.ServiceAccount { 39 | return &corev1.ServiceAccount{ 40 | ObjectMeta: metav1.ObjectMeta{ 41 | Name: chjob.Name + ServiceAccountSuffix, 42 | Namespace: chjob.Namespace, 43 | Labels: 
map[string]string{ 44 | JobLabelKey: chjob.Name, 45 | }, 46 | OwnerReferences: []metav1.OwnerReference{ 47 | *metav1.NewControllerRef(chjob, apisv1alpha1.SchemeGroupVersionKind), 48 | }, 49 | }, 50 | } 51 | } 52 | 53 | func NewRole(chjob *apisv1alpha1.ChainerJob) *rbacv1.Role { 54 | return &rbacv1.Role{ 55 | ObjectMeta: metav1.ObjectMeta{ 56 | Name: chjob.Name + RoleSuffix, 57 | Namespace: chjob.Namespace, 58 | Labels: map[string]string{ 59 | JobLabelKey: chjob.Name, 60 | }, 61 | OwnerReferences: []metav1.OwnerReference{ 62 | *metav1.NewControllerRef(chjob, apisv1alpha1.SchemeGroupVersionKind), 63 | }, 64 | }, 65 | Rules: []rbacv1.PolicyRule{}, 66 | } 67 | } 68 | 69 | func NewRoleBindings(chjob *apisv1alpha1.ChainerJob) *rbacv1.RoleBinding { 70 | return &rbacv1.RoleBinding{ 71 | ObjectMeta: metav1.ObjectMeta{ 72 | Name: chjob.Name + RolebindingSuffix, 73 | Namespace: chjob.Namespace, 74 | Labels: map[string]string{ 75 | JobLabelKey: chjob.Name, 76 | }, 77 | OwnerReferences: []metav1.OwnerReference{ 78 | *metav1.NewControllerRef(chjob, apisv1alpha1.SchemeGroupVersionKind), 79 | }, 80 | }, 81 | Subjects: []rbacv1.Subject{ 82 | { 83 | Kind: rbacv1.ServiceAccountKind, 84 | Name: chjob.Name + ServiceAccountSuffix, 85 | Namespace: chjob.Namespace, 86 | }, 87 | }, 88 | RoleRef: rbacv1.RoleRef{ 89 | APIGroup: rbacv1.GroupName, 90 | Kind: "Role", 91 | Name: chjob.Name + RoleSuffix, 92 | }, 93 | } 94 | } 95 | 96 | func NewMasterJob(chjob *apisv1alpha1.ChainerJob) *batchv1.Job { 97 | masterSpec := chjob.Spec.Master 98 | 99 | // manged parts to be injected. 
100 | managedLabels := map[string]string{ 101 | JobLabelKey: chjob.Name, 102 | RoleLabelKey: RoleMaster, 103 | } 104 | 105 | // inject aboves to user defined template 106 | jobLabels := map[string]string{} 107 | for k, v := range chjob.Labels { 108 | jobLabels[k] = v 109 | } 110 | podTemplate := masterSpec.Template.DeepCopy() 111 | if podTemplate.Labels == nil { 112 | podTemplate.Labels = make(map[string]string, len(managedLabels)) 113 | } 114 | for k, v := range managedLabels { 115 | podTemplate.Labels[k] = v 116 | jobLabels[k] = v 117 | } 118 | podTemplate.Spec.ServiceAccountName = chjob.Name + ServiceAccountSuffix 119 | 120 | return &batchv1.Job{ 121 | ObjectMeta: metav1.ObjectMeta{ 122 | Name: chjob.Name + MasterSuffix, 123 | Namespace: chjob.Namespace, 124 | Labels: jobLabels, 125 | OwnerReferences: []metav1.OwnerReference{ 126 | *metav1.NewControllerRef(chjob, apisv1alpha1.SchemeGroupVersionKind), 127 | }, 128 | }, 129 | Spec: batchv1.JobSpec{ 130 | Completions: apisv1alpha1.Int32(1), 131 | Parallelism: apisv1alpha1.Int32(1), 132 | BackoffLimit: masterSpec.BackoffLimit, 133 | ActiveDeadlineSeconds: masterSpec.ActiveDeadlineSeconds, 134 | Template: *podTemplate, 135 | }, 136 | } 137 | } 138 | 139 | func NewWorkerSet(chjob *apisv1alpha1.ChainerJob, done bool, name string) *appsv1.StatefulSet { 140 | workerSetSpec := chjob.Spec.WorkerSets[name] 141 | 142 | var targetReplicas *int32 143 | if done { 144 | targetReplicas = apisv1alpha1.Int32(0) 145 | } else { 146 | targetReplicas = chjob.Spec.WorkerSets[name].Replicas 147 | } 148 | 149 | // manged parts to be injected. 
150 | managedLabels := map[string]string{ 151 | JobLabelKey: chjob.Name, 152 | RoleLabelKey: RoleWorkerSet, 153 | WorkersetLabelKey: name, 154 | } 155 | 156 | // inject aboves to user defined template 157 | ssLabels := map[string]string{} 158 | for k, v := range chjob.Labels { 159 | ssLabels[k] = v 160 | } 161 | podTemplate := workerSetSpec.Template.DeepCopy() 162 | if podTemplate.Labels == nil { 163 | podTemplate.Labels = make(map[string]string, len(managedLabels)) 164 | } 165 | for k, v := range managedLabels { 166 | podTemplate.Labels[k] = v 167 | ssLabels[k] = v 168 | } 169 | podTemplate.Spec.ServiceAccountName = chjob.Name + ServiceAccountSuffix 170 | 171 | return &appsv1.StatefulSet{ 172 | ObjectMeta: metav1.ObjectMeta{ 173 | Name: chjob.Name + WorkerSetSuffix + "-" + name, 174 | Namespace: chjob.Namespace, 175 | Labels: ssLabels, 176 | OwnerReferences: []metav1.OwnerReference{ 177 | *metav1.NewControllerRef(chjob, apisv1alpha1.SchemeGroupVersionKind), 178 | }, 179 | }, 180 | Spec: appsv1.StatefulSetSpec{ 181 | PodManagementPolicy: PodManagementPolicy, 182 | Replicas: targetReplicas, 183 | Selector: &metav1.LabelSelector{ 184 | MatchLabels: podTemplate.Labels, 185 | }, 186 | ServiceName: chjob.Name + WorkerSetSuffix + "-" + name, 187 | Template: *podTemplate, 188 | }, 189 | } 190 | } 191 | 192 | func CreateOrUpdateConfigMap( 193 | chjob *apisv1alpha1.ChainerJob, 194 | kubeClient kubernetes.Interface, 195 | cmLister corelister.ConfigMapLister, 196 | recorder record.EventRecorder, 197 | newResource func(chj *apisv1alpha1.ChainerJob) *corev1.ConfigMap, 198 | ) (*corev1.ConfigMap, error) { 199 | desired := newResource(chjob) 200 | client := kubeClient.CoreV1().ConfigMaps(desired.Namespace) 201 | lister := cmLister.ConfigMaps(desired.Namespace) 202 | 203 | rs, err := lister.Get(desired.Name) 204 | 205 | created := false 206 | if errors.IsNotFound(err) { 207 | rs, err = client.Create(desired) 208 | created = true 209 | } 210 | 211 | if err != nil { 212 | // If an 
error occurs during Get, we'll requeue the item so we can 213 | // attempt processing again later. This could have been caused by a 214 | // temporary network failure, or any other transient reason. 215 | return nil, err 216 | } 217 | 218 | // If the resource is not controlled by this ChainerJob resource, we should log 219 | // a warning to the event recorder and return. 220 | if !metav1.IsControlledBy(rs, chjob) { 221 | msg := fmt.Sprintf(MessageResourceExists, rs.Name) 222 | recorder.Event(chjob, corev1.EventTypeWarning, ErrResourceExists, msg) 223 | return rs, fmt.Errorf(msg) 224 | } 225 | 226 | if !created { 227 | rs, err = client.Update(desired) 228 | if err != nil { 229 | return rs, err 230 | } 231 | } 232 | 233 | return rs, nil 234 | } 235 | 236 | func CreateServiceAccountIfNotExist( 237 | chjob *apisv1alpha1.ChainerJob, 238 | kubeClient kubernetes.Interface, 239 | saLister corelister.ServiceAccountLister, 240 | recorder record.EventRecorder, 241 | newResource func(chj *apisv1alpha1.ChainerJob) *corev1.ServiceAccount, 242 | ) (*corev1.ServiceAccount, error) { 243 | desired := newResource(chjob) 244 | client := kubeClient.CoreV1().ServiceAccounts(desired.Namespace) 245 | lister := saLister.ServiceAccounts(desired.Namespace) 246 | 247 | rs, err := lister.Get(desired.Name) 248 | 249 | if errors.IsNotFound(err) { 250 | rs, err = client.Create(desired) 251 | } 252 | 253 | if err != nil { 254 | // If an error occurs during Get, we'll requeue the item so we can 255 | // attempt processing again later. This could have been caused by a 256 | // temporary network failure, or any other transient reason. 257 | return nil, err 258 | } 259 | 260 | // If the resource is not controlled by this ChainerJob resource, we should log 261 | // a warning to the event recorder and return. 
262 | if !metav1.IsControlledBy(rs, chjob) { 263 | msg := fmt.Sprintf(MessageResourceExists, rs.Name) 264 | recorder.Event(chjob, corev1.EventTypeWarning, ErrResourceExists, msg) 265 | return rs, fmt.Errorf(msg) 266 | } 267 | 268 | return rs, nil 269 | } 270 | 271 | func CreateOrUpdateRole( 272 | chjob *apisv1alpha1.ChainerJob, 273 | kubeClient kubernetes.Interface, 274 | roleLister rbaclister.RoleLister, 275 | recorder record.EventRecorder, 276 | newResource func(chj *apisv1alpha1.ChainerJob) *rbacv1.Role, 277 | ) (*rbacv1.Role, error) { 278 | desired := newResource(chjob) 279 | client := kubeClient.RbacV1().Roles(desired.Namespace) 280 | lister := roleLister.Roles(desired.Namespace) 281 | 282 | rs, err := lister.Get(desired.Name) 283 | 284 | created := false 285 | if errors.IsNotFound(err) { 286 | rs, err = client.Create(desired) 287 | created = true 288 | } 289 | 290 | if err != nil { 291 | // If an error occurs during Get, we'll requeue the item so we can 292 | // attempt processing again later. This could have been caused by a 293 | // temporary network failure, or any other transient reason. 294 | return nil, err 295 | } 296 | 297 | // If the resource is not controlled by this ChainerJob resource, we should log 298 | // a warning to the event recorder and return. 
299 | if !metav1.IsControlledBy(rs, chjob) { 300 | msg := fmt.Sprintf(MessageResourceExists, rs.Name) 301 | recorder.Event(chjob, corev1.EventTypeWarning, ErrResourceExists, msg) 302 | return rs, fmt.Errorf(msg) 303 | } 304 | 305 | if !created { 306 | rs, err = client.Update(desired) 307 | if err != nil { 308 | return rs, err 309 | } 310 | } 311 | 312 | return rs, nil 313 | } 314 | 315 | func CreateOrUpdateRoleBinding( 316 | chjob *apisv1alpha1.ChainerJob, 317 | kubeClient kubernetes.Interface, 318 | roleBindingLister rbaclister.RoleBindingLister, 319 | recorder record.EventRecorder, 320 | newResource func(chj *apisv1alpha1.ChainerJob) *rbacv1.RoleBinding, 321 | ) (*rbacv1.RoleBinding, error) { 322 | desired := newResource(chjob) 323 | client := kubeClient.RbacV1().RoleBindings(desired.Namespace) 324 | lister := roleBindingLister.RoleBindings(desired.Namespace) 325 | 326 | rs, err := lister.Get(desired.Name) 327 | 328 | created := false 329 | if errors.IsNotFound(err) { 330 | rs, err = client.Create(desired) 331 | created = true 332 | } 333 | 334 | if err != nil { 335 | // If an error occurs during Get, we'll requeue the item so we can 336 | // attempt processing again later. This could have been caused by a 337 | // temporary network failure, or any other transient reason. 338 | return nil, err 339 | } 340 | 341 | // If the resource is not controlled by this ChainerJob resource, we should log 342 | // a warning to the event recorder and return. 
343 | if !metav1.IsControlledBy(rs, chjob) { 344 | msg := fmt.Sprintf(MessageResourceExists, rs.Name) 345 | recorder.Event(chjob, corev1.EventTypeWarning, ErrResourceExists, msg) 346 | return rs, fmt.Errorf(msg) 347 | } 348 | 349 | if !created { 350 | rs, err = client.Update(desired) 351 | if err != nil { 352 | return rs, err 353 | } 354 | } 355 | 356 | return rs, nil 357 | } 358 | 359 | func CreateJobIfNotExist( 360 | chjob *apisv1alpha1.ChainerJob, 361 | kubeClient kubernetes.Interface, 362 | jobLister batchlister.JobLister, 363 | recorder record.EventRecorder, 364 | newResource func(chj *apisv1alpha1.ChainerJob) *batchv1.Job, 365 | ) (*batchv1.Job, error) { 366 | desired := newResource(chjob) 367 | client := kubeClient.BatchV1().Jobs(desired.Namespace) 368 | lister := jobLister.Jobs(desired.Namespace) 369 | 370 | rs, err := lister.Get(desired.Name) 371 | 372 | if errors.IsNotFound(err) { 373 | rs, err = client.Create(desired) 374 | } 375 | 376 | if err != nil { 377 | // If an error occurs during Get, we'll requeue the item so we can 378 | // attempt processing again later. This could have been caused by a 379 | // temporary network failure, or any other transient reason. 380 | return nil, err 381 | } 382 | 383 | // If the resource is not controlled by this ChainerJob resource, we should log 384 | // a warning to the event recorder and return. 
385 | if !metav1.IsControlledBy(rs, chjob) { 386 | msg := fmt.Sprintf(MessageResourceExists, rs.Name) 387 | recorder.Event(chjob, corev1.EventTypeWarning, ErrResourceExists, msg) 388 | return rs, fmt.Errorf(msg) 389 | } 390 | 391 | return rs, nil 392 | } 393 | 394 | func CreateOrUpdateStatefulSet( 395 | chjob *apisv1alpha1.ChainerJob, 396 | kubeClient kubernetes.Interface, 397 | statefulSetLister appslister.StatefulSetLister, 398 | recorder record.EventRecorder, 399 | newResource func(chj *apisv1alpha1.ChainerJob) *appsv1.StatefulSet, 400 | ) (*appsv1.StatefulSet, error) { 401 | desired := newResource(chjob) 402 | client := kubeClient.AppsV1().StatefulSets(desired.Namespace) 403 | lister := statefulSetLister.StatefulSets(desired.Namespace) 404 | 405 | rs, err := lister.Get(desired.Name) 406 | 407 | created := false 408 | if errors.IsNotFound(err) { 409 | rs, err = client.Create(desired) 410 | created = true 411 | } 412 | 413 | if err != nil { 414 | // If an error occurs during Get, we'll requeue the item so we can 415 | // attempt processing again later. This could have been caused by a 416 | // temporary network failure, or any other transient reason. 417 | return nil, err 418 | } 419 | 420 | // If the resource is not controlled by this ChainerJob resource, we should log 421 | // a warning to the event recorder and return. 
422 | if !metav1.IsControlledBy(rs, chjob) { 423 | msg := fmt.Sprintf(MessageResourceExists, rs.Name) 424 | recorder.Event(chjob, corev1.EventTypeWarning, ErrResourceExists, msg) 425 | return rs, fmt.Errorf(msg) 426 | } 427 | 428 | if !created { 429 | rs, err = client.Update(desired) 430 | if err != nil { 431 | return rs, err 432 | } 433 | } 434 | 435 | return rs, nil 436 | } 437 | 438 | func UpdateChainerJobStatus( 439 | chjob *apisv1alpha1.ChainerJob, 440 | status *batchv1.JobStatus, 441 | kubeflowClient clientset.Interface, 442 | ) error { 443 | chjobCopy := chjob.DeepCopy() 444 | chjobCopy.Status = *status.DeepCopy() 445 | 446 | _, err := kubeflowClient.KubeflowV1alpha1().ChainerJobs(chjob.Namespace).Update(chjobCopy) 447 | 448 | return err 449 | } 450 | -------------------------------------------------------------------------------- /pkg/controllers/backends/mpi/mpi_backend.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The Kubeflow Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package mpi 16 | 17 | import ( 18 | "bytes" 19 | "fmt" 20 | "strconv" 21 | 22 | "github.com/golang/glog" 23 | 24 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 25 | 26 | appsv1 "k8s.io/api/apps/v1" 27 | batchv1 "k8s.io/api/batch/v1" 28 | corev1 "k8s.io/api/core/v1" 29 | rbacv1 "k8s.io/api/rbac/v1" 30 | 31 | "k8s.io/client-go/kubernetes" 32 | appslister "k8s.io/client-go/listers/apps/v1" 33 | batchlister "k8s.io/client-go/listers/batch/v1" 34 | corelister "k8s.io/client-go/listers/core/v1" 35 | rbaclister "k8s.io/client-go/listers/rbac/v1" 36 | "k8s.io/client-go/tools/record" 37 | 38 | apisv1alpha1 "github.com/kubeflow/chainer-operator/pkg/apis/chainer/v1alpha1" 39 | clientset "github.com/kubeflow/chainer-operator/pkg/client/clientset/versioned" 40 | 41 | "github.com/kubeflow/chainer-operator/pkg/controllers/backends" 42 | ) 43 | 44 | const ( 45 | assetsSuffix = "-assets" 46 | 47 | kubectlDownloaderContainerName = "chainer-operator-kubectl-downloader" 48 | hostfileGeneratorContainerName = "chainer-operator-hostfile-generator" 49 | 50 | kubectlDownloadScriptName = "download_kubectl.sh" 51 | kubexecScriptName = "kubexeb.sh" 52 | genHostfileScriptName = "gen_hostfile.sh" 53 | statefulSetsAndSlotsFileName = "statefulsets_and_slots" 54 | 55 | volumeNameBase = "chainer-operator-volume" 56 | assetsVolumeName = volumeNameBase + "-assets" 57 | kubectlVolumeName = volumeNameBase + "-kubectl" 58 | hostfileVolumeName = volumeNameBase + "-generated" 59 | mountPathBase = "/kubeflow/chainer-operator" 60 | assetsMountPath = mountPathBase + "/assets" 61 | kubectlMountPath = mountPathBase + "/kube" 62 | hostfileMountPath = mountPathBase + "/generated" 63 | hostfileName = "hostfile" 64 | 65 | kubectlDownloaderContainerImage = "tutum/curl" 66 | hostfileGeneratorContainerImage = "alpine" 67 | kubectlDirEnv = "KUBECTL_DIR" 68 | 69 | podManagementPolicy = appsv1.ParallelPodManagement 70 | ) 71 | 72 | // Backend is responsible for syncing ChainerJob with backend:mpi 73 | type 
Backend struct { 74 | kubeClient kubernetes.Interface 75 | kubeflowClient clientset.Interface 76 | serviceAccountLister corelister.ServiceAccountLister 77 | roleLister rbaclister.RoleLister 78 | roleBindingLister rbaclister.RoleBindingLister 79 | configMapLister corelister.ConfigMapLister 80 | jobLister batchlister.JobLister 81 | statefulSetLister appslister.StatefulSetLister 82 | 83 | recorder record.EventRecorder 84 | } 85 | 86 | // NewBackend is constructor for Backend 87 | func NewBackend( 88 | kubeClient kubernetes.Interface, 89 | kubeflowClient clientset.Interface, 90 | serviceAccountLister corelister.ServiceAccountLister, 91 | roleLister rbaclister.RoleLister, 92 | roleBindingLister rbaclister.RoleBindingLister, 93 | configMapLister corelister.ConfigMapLister, 94 | jobLister batchlister.JobLister, 95 | statefulSetLister appslister.StatefulSetLister, 96 | recorder record.EventRecorder, 97 | ) backends.Backend { 98 | return &Backend{ 99 | kubeClient: kubeClient, 100 | kubeflowClient: kubeflowClient, 101 | serviceAccountLister: serviceAccountLister, 102 | roleLister: roleLister, 103 | roleBindingLister: roleBindingLister, 104 | configMapLister: configMapLister, 105 | jobLister: jobLister, 106 | statefulSetLister: statefulSetLister, 107 | recorder: recorder, 108 | } 109 | } 110 | 111 | // SyncChainerJob is main function to sync ChainerJob with "backend: mpi" 112 | func (b *Backend) SyncChainerJob(chjob *apisv1alpha1.ChainerJob) error { 113 | if !apisv1alpha1.IsDistributed(chjob) || chjob.Spec.Backend != apisv1alpha1.BackendTypeMPI { 114 | return fmt.Errorf("not syncing %s because it is not a mpi backended distributed job", chjob.Name) 115 | } 116 | 117 | sa, err := b.syncServiceAccount(chjob) 118 | if sa == nil || err != nil { 119 | return err 120 | } 121 | glog.V(4).Infof("syncing %s: serviceaccount %s synced.", chjob.Name, sa.Name) 122 | 123 | role, err := b.syncRole(chjob) 124 | if role == nil || err != nil { 125 | return err 126 | } 127 | 
glog.V(4).Infof("syncing %s: role %s synced.", chjob.Name, role.Name) 128 | 129 | rolebinding, err := b.syncRoleBinding(chjob) 130 | if rolebinding == nil || err != nil { 131 | return err 132 | } 133 | glog.V(4).Infof("syncing %s: rolebinding %s synced.", chjob.Name, rolebinding.Name) 134 | 135 | if apisv1alpha1.IsDistributed(chjob) { 136 | cm, err := b.syncConfigMap(chjob) 137 | if cm == nil || err != nil { 138 | return err 139 | } 140 | glog.V(4).Infof("syncing %s: configmap %s synced.", chjob.Name, cm.Name) 141 | } 142 | 143 | master, err := b.syncMaster(chjob) 144 | if master == nil || err != nil { 145 | return err 146 | } 147 | glog.V(4).Infof("syncing %s: job %s synced.", chjob.Name, master.Name) 148 | 149 | if apisv1alpha1.IsDistributed(chjob) { 150 | workerSets, err := b.syncWorkerSets(chjob, isJobDone(master)) 151 | if len(workerSets) == 0 || err != nil { 152 | return err 153 | } 154 | for _, ws := range workerSets { 155 | glog.V(4).Infof("syncing %s: statefulset %s synced.", chjob.Name, ws.Name) 156 | } 157 | } 158 | 159 | err = backends.UpdateChainerJobStatus(chjob, &master.Status, b.kubeflowClient) 160 | if err != nil { 161 | return err 162 | } 163 | glog.V(4).Infof("syncing %s: updated status.", chjob.Name) 164 | 165 | return nil 166 | } 167 | 168 | func (b *Backend) syncServiceAccount(chjob *apisv1alpha1.ChainerJob) (*corev1.ServiceAccount, error) { 169 | return backends.CreateServiceAccountIfNotExist( 170 | chjob, 171 | b.kubeClient, 172 | b.serviceAccountLister, 173 | b.recorder, 174 | backends.NewServiceAccount, 175 | ) 176 | } 177 | 178 | func (b *Backend) syncRole(chjob *apisv1alpha1.ChainerJob) (*rbacv1.Role, error) { 179 | return backends.CreateOrUpdateRole( 180 | chjob, 181 | b.kubeClient, 182 | b.roleLister, 183 | b.recorder, 184 | newRole, 185 | ) 186 | } 187 | 188 | func newRole(chjob *apisv1alpha1.ChainerJob) *rbacv1.Role { 189 | podNames := make([]string, 0) 190 | ssNames := make([]string, 0) 191 | for name, workerSpec := range 
chjob.Spec.WorkerSets { 192 | ssName := fmt.Sprintf("%s%s-%s", chjob.Name, backends.WorkerSetSuffix, name) 193 | ssNames = append(ssNames, ssName) 194 | for i := 0; i < int(*workerSpec.Replicas); i++ { 195 | podNames = append(podNames, fmt.Sprintf("%s-%d", ssName, i)) 196 | } 197 | } 198 | rules := []rbacv1.PolicyRule{ 199 | rbacv1.PolicyRule{ 200 | APIGroups: []string{"apps"}, 201 | Verbs: []string{"get"}, 202 | Resources: []string{"statefulsets"}, 203 | ResourceNames: ssNames, 204 | }, 205 | rbacv1.PolicyRule{ 206 | APIGroups: []string{""}, 207 | Verbs: []string{"get", "list"}, 208 | Resources: []string{"pods"}, 209 | ResourceNames: podNames, 210 | }, 211 | rbacv1.PolicyRule{ 212 | APIGroups: []string{""}, 213 | Verbs: []string{"create"}, 214 | Resources: []string{"pods/exec"}, 215 | ResourceNames: podNames, 216 | }, 217 | } 218 | role := backends.NewRole(chjob) 219 | role.Rules = rules 220 | return role 221 | } 222 | 223 | func (b *Backend) syncRoleBinding(chjob *apisv1alpha1.ChainerJob) (*rbacv1.RoleBinding, error) { 224 | return backends.CreateOrUpdateRoleBinding( 225 | chjob, 226 | b.kubeClient, 227 | b.roleBindingLister, 228 | b.recorder, 229 | backends.NewRoleBindings, 230 | ) 231 | } 232 | 233 | func (b *Backend) syncConfigMap(chjob *apisv1alpha1.ChainerJob) (*corev1.ConfigMap, error) { 234 | return backends.CreateOrUpdateConfigMap( 235 | chjob, 236 | b.kubeClient, 237 | b.configMapLister, 238 | b.recorder, 239 | newConfigMap, 240 | ) 241 | } 242 | 243 | func (b *Backend) syncMaster(chjob *apisv1alpha1.ChainerJob) (*batchv1.Job, error) { 244 | return backends.CreateJobIfNotExist( 245 | chjob, 246 | b.kubeClient, 247 | b.jobLister, 248 | b.recorder, 249 | newMasterJob, 250 | ) 251 | } 252 | 253 | func (b *Backend) syncWorkerSets(chjob *apisv1alpha1.ChainerJob, done bool) ([]*appsv1.StatefulSet, error) { 254 | sss := make([]*appsv1.StatefulSet, 0) 255 | 256 | for name := range chjob.Spec.WorkerSets { 257 | ss, err := b.syncWorkerSet(chjob, done, name) 258 | 
if err != nil { 259 | return sss, err 260 | } 261 | sss = append(sss, ss) 262 | } 263 | 264 | return sss, nil 265 | } 266 | 267 | func (b *Backend) syncWorkerSet(chjob *apisv1alpha1.ChainerJob, done bool, name string) (*appsv1.StatefulSet, error) { 268 | return backends.CreateOrUpdateStatefulSet( 269 | chjob, 270 | b.kubeClient, 271 | b.statefulSetLister, 272 | b.recorder, 273 | func(chj *apisv1alpha1.ChainerJob) *appsv1.StatefulSet { 274 | return newWorkerSet(chj, done, name) 275 | }, 276 | ) 277 | } 278 | 279 | func newConfigMap(chjob *apisv1alpha1.ChainerJob) *corev1.ConfigMap { 280 | var ssAndSlots bytes.Buffer 281 | for name, spec := range chjob.Spec.WorkerSets { 282 | ssAndSlots.WriteString(fmt.Sprintf("%s%s-%s slots=%d\n", chjob.Name, backends.WorkerSetSuffix, name, *spec.MPIConfig.Slots)) 283 | } 284 | return &corev1.ConfigMap{ 285 | ObjectMeta: metav1.ObjectMeta{ 286 | Name: chjob.Name + assetsSuffix, 287 | Namespace: chjob.Namespace, 288 | Labels: map[string]string{ 289 | backends.JobLabelKey: chjob.Name, 290 | }, 291 | OwnerReferences: []metav1.OwnerReference{ 292 | *metav1.NewControllerRef(chjob, apisv1alpha1.SchemeGroupVersionKind), 293 | }, 294 | }, 295 | Data: map[string]string{ 296 | kubectlDownloadScriptName: kubectlDownloadSh, 297 | kubexecScriptName: kubexecSh, 298 | genHostfileScriptName: genHostfileSh, 299 | statefulSetsAndSlotsFileName: ssAndSlots.String(), 300 | }, 301 | } 302 | } 303 | 304 | func newManagedMPIVolumes(chjob *apisv1alpha1.ChainerJob) []corev1.Volume { 305 | scriptMode := int32(0555) 306 | return []corev1.Volume{ 307 | corev1.Volume{ 308 | Name: hostfileVolumeName, 309 | VolumeSource: corev1.VolumeSource{ 310 | EmptyDir: &corev1.EmptyDirVolumeSource{}, 311 | }, 312 | }, 313 | corev1.Volume{ 314 | Name: kubectlVolumeName, 315 | VolumeSource: corev1.VolumeSource{ 316 | EmptyDir: &corev1.EmptyDirVolumeSource{}, 317 | }, 318 | }, 319 | corev1.Volume{ 320 | Name: assetsVolumeName, 321 | VolumeSource: corev1.VolumeSource{ 322 | 
ConfigMap: &corev1.ConfigMapVolumeSource{ 323 | LocalObjectReference: corev1.LocalObjectReference{ 324 | Name: chjob.Name + assetsSuffix, 325 | }, 326 | Items: []corev1.KeyToPath{ 327 | corev1.KeyToPath{ 328 | Key: kubectlDownloadScriptName, 329 | Path: kubectlDownloadScriptName, 330 | Mode: &scriptMode, 331 | }, 332 | corev1.KeyToPath{ 333 | Key: kubexecScriptName, 334 | Path: kubexecScriptName, 335 | Mode: &scriptMode, 336 | }, 337 | corev1.KeyToPath{ 338 | Key: genHostfileScriptName, 339 | Path: genHostfileScriptName, 340 | Mode: &scriptMode, 341 | }, 342 | corev1.KeyToPath{ 343 | Key: statefulSetsAndSlotsFileName, 344 | Path: statefulSetsAndSlotsFileName, 345 | Mode: &scriptMode, 346 | }, 347 | }, 348 | }, 349 | }, 350 | }, 351 | } 352 | } 353 | 354 | func newManagedMPIVolumeMounts(chjob *apisv1alpha1.ChainerJob) []corev1.VolumeMount { 355 | return []corev1.VolumeMount{ 356 | corev1.VolumeMount{ 357 | Name: hostfileVolumeName, 358 | MountPath: hostfileMountPath, 359 | }, 360 | corev1.VolumeMount{ 361 | Name: kubectlVolumeName, 362 | MountPath: kubectlMountPath, 363 | }, 364 | corev1.VolumeMount{ 365 | Name: assetsVolumeName, 366 | MountPath: assetsMountPath, 367 | }, 368 | } 369 | } 370 | 371 | func newHostfileGeneratorContainer(volumeMounts []corev1.VolumeMount, masterSlots *int32) []corev1.Container { 372 | return []corev1.Container{ 373 | corev1.Container{ 374 | Name: hostfileGeneratorContainerName, 375 | Image: "alpine", 376 | VolumeMounts: volumeMounts, 377 | Env: []corev1.EnvVar{ 378 | corev1.EnvVar{ 379 | Name: "POD_NAME", 380 | ValueFrom: &corev1.EnvVarSource{ 381 | FieldRef: &corev1.ObjectFieldSelector{ 382 | FieldPath: "metadata.name", 383 | }, 384 | }, 385 | }, 386 | }, 387 | Command: []string{ 388 | assetsMountPath + "/" + genHostfileScriptName, 389 | }, 390 | Args: []string{ 391 | hostfileMountPath + "/" + hostfileName, 392 | "$(POD_NAME)", 393 | "slots=" + strconv.Itoa(int(*masterSlots)), 394 | assetsMountPath + "/" + statefulSetsAndSlotsFileName, 
395 | }, 396 | }, 397 | } 398 | } 399 | 400 | func newKubectlDownloaderContainer(volumeMounts []corev1.VolumeMount) []corev1.Container { 401 | return []corev1.Container{ 402 | corev1.Container{ 403 | Name: kubectlDownloaderContainerName, 404 | Image: kubectlDownloaderContainerImage, 405 | Command: []string{ 406 | assetsMountPath + "/" + kubectlDownloadScriptName, 407 | }, 408 | Args: []string{kubectlMountPath}, 409 | VolumeMounts: volumeMounts, 410 | }, 411 | } 412 | } 413 | 414 | func newManagedMPIEnvVars() []corev1.EnvVar { 415 | return []corev1.EnvVar{ 416 | corev1.EnvVar{ 417 | Name: kubectlDirEnv, 418 | Value: kubectlMountPath, 419 | }, 420 | corev1.EnvVar{ 421 | Name: "OMPI_MCA_plm_rsh_agent", 422 | Value: assetsMountPath + "/" + kubexecScriptName, 423 | }, 424 | corev1.EnvVar{ 425 | Name: "OMPI_MCA_orte_keep_fqdn_hostnames", 426 | Value: "t", 427 | }, 428 | corev1.EnvVar{ 429 | Name: "OMPI_MCA_orte_default_hostfile", 430 | Value: hostfileMountPath + "/" + hostfileName, 431 | }, 432 | corev1.EnvVar{ 433 | Name: "OMPI_MCA_btl_tcp_if_exclude", 434 | Value: "lo,docker0", 435 | }, 436 | } 437 | } 438 | 439 | func newMasterJob(chjob *apisv1alpha1.ChainerJob) *batchv1.Job { 440 | // generate base job 441 | job := backends.NewMasterJob(chjob) 442 | 443 | // decorating base job 444 | // manged parts to be injected. 445 | managedVolumes := newManagedMPIVolumes(chjob) 446 | managedVolumeMounts := newManagedMPIVolumeMounts(chjob) 447 | managedInitContainers := append( 448 | newKubectlDownloaderContainer(managedVolumeMounts), 449 | newHostfileGeneratorContainer(managedVolumeMounts, chjob.Spec.Master.MPIConfig.Slots)...) 450 | managedEnv := newManagedMPIEnvVars() 451 | 452 | // inject aboves to user defined podTemplate 453 | podTemplate := job.Spec.Template.DeepCopy() 454 | podTemplate.Spec.Volumes = append(podTemplate.Spec.Volumes, managedVolumes...) 
455 | modifiedInitContainers := make([]corev1.Container, len(podTemplate.Spec.InitContainers)) 456 | for i, initContainer := range podTemplate.Spec.InitContainers { 457 | modifiedInitContainer := initContainer.DeepCopy() 458 | modifiedInitContainer.VolumeMounts = append(modifiedInitContainer.VolumeMounts, managedVolumeMounts...) 459 | modifiedInitContainers[i] = *modifiedInitContainer 460 | } 461 | podTemplate.Spec.InitContainers = append(modifiedInitContainers, managedInitContainers...) 462 | modifiedContainers := make([]corev1.Container, len(podTemplate.Spec.Containers)) 463 | for i, container := range podTemplate.Spec.Containers { 464 | modifiedContainer := container.DeepCopy() 465 | modifiedContainer.Env = append(modifiedContainer.Env, managedEnv...) 466 | modifiedContainer.VolumeMounts = append(modifiedContainer.VolumeMounts, managedVolumeMounts...) 467 | modifiedContainers[i] = *modifiedContainer 468 | } 469 | podTemplate.Spec.Containers = modifiedContainers 470 | 471 | return &batchv1.Job{ 472 | ObjectMeta: job.ObjectMeta, 473 | Spec: batchv1.JobSpec{ 474 | Completions: job.Spec.Completions, 475 | Parallelism: job.Spec.Parallelism, 476 | BackoffLimit: job.Spec.BackoffLimit, 477 | ActiveDeadlineSeconds: job.Spec.ActiveDeadlineSeconds, 478 | Template: *podTemplate, 479 | }, 480 | } 481 | } 482 | 483 | func newWorkerSet(chjob *apisv1alpha1.ChainerJob, done bool, name string) *appsv1.StatefulSet { 484 | workerSet := backends.NewWorkerSet(chjob, done, name) 485 | 486 | // manged parts to be injected. 487 | managedVolumes := newManagedMPIVolumes(chjob) 488 | managedVolumeMounts := newManagedMPIVolumeMounts(chjob) 489 | managedInitContainers := newKubectlDownloaderContainer(managedVolumeMounts) 490 | managedEnv := newManagedMPIEnvVars() 491 | 492 | // inject aboves to user defined template 493 | podTemplate := workerSet.Spec.Template.DeepCopy() 494 | podTemplate.Spec.Volumes = append(podTemplate.Spec.Volumes, managedVolumes...) 
495 | modifiedInitContainers := make([]corev1.Container, len(podTemplate.Spec.InitContainers)) 496 | for i, initContainer := range podTemplate.Spec.InitContainers { 497 | modifiedInitContainer := initContainer.DeepCopy() 498 | modifiedInitContainer.VolumeMounts = append(modifiedInitContainer.VolumeMounts, managedVolumeMounts...) 499 | modifiedInitContainers[i] = *modifiedInitContainer 500 | } 501 | podTemplate.Spec.InitContainers = append(modifiedInitContainers, managedInitContainers...) 502 | modifiedContainers := make([]corev1.Container, len(podTemplate.Spec.Containers)) 503 | for i, container := range podTemplate.Spec.Containers { 504 | modifiedContainer := container.DeepCopy() 505 | modifiedContainer.Env = append(modifiedContainer.Env, managedEnv...) 506 | modifiedContainer.VolumeMounts = append(modifiedContainer.VolumeMounts, managedVolumeMounts...) 507 | modifiedContainers[i] = *modifiedContainer 508 | } 509 | podTemplate.Spec.Containers = modifiedContainers 510 | 511 | return &appsv1.StatefulSet{ 512 | ObjectMeta: workerSet.ObjectMeta, 513 | Spec: appsv1.StatefulSetSpec{ 514 | PodManagementPolicy: workerSet.Spec.PodManagementPolicy, 515 | Replicas: workerSet.Spec.Replicas, 516 | Selector: workerSet.Spec.Selector, 517 | ServiceName: workerSet.Spec.ServiceName, 518 | Template: *podTemplate, 519 | }, 520 | } 521 | } 522 | 523 | func isJobDone(job *batchv1.Job) bool { 524 | succeeded := false 525 | failed := false 526 | if job != nil { 527 | for _, cond := range job.Status.Conditions { 528 | if cond.Type == "Complete" { 529 | succeeded = (cond.Status == "True") 530 | } 531 | } 532 | 533 | for _, cond := range job.Status.Conditions { 534 | if cond.Type == "Failed" { 535 | failed = (cond.Status == "True") 536 | } 537 | } 538 | } 539 | return job != nil && (succeeded || failed) 540 | } 541 | --------------------------------------------------------------------------------