├── nimbix-app ├── Dockerfile.base ├── Dockerfile.app ├── job_template.json ├── README.md └── jarvice_submit.py ├── scheduler ├── Dockerfile ├── scheduler.go └── factory.go ├── deploy ├── sample-job.yaml └── k8s-custom-sched.json └── README.md /nimbix-app/Dockerfile.base: -------------------------------------------------------------------------------- 1 | FROM python:2.7 2 | 3 | RUN pip install jarviceclient 4 | RUN pip install jinja2 5 | 6 | ADD jarvice_submit.py / 7 | ADD job_template.json / 8 | ENTRYPOINT ["python", "/jarvice_submit.py"] 9 | -------------------------------------------------------------------------------- /nimbix-app/Dockerfile.app: -------------------------------------------------------------------------------- 1 | FROM jarvice/powerai 2 | 3 | RUN apt-get update && apt-get install -y \ 4 | libffi-dev 5 | RUN pip install jarviceclient 6 | RUN pip install jinja2 7 | 8 | ADD jarvice_submit.py / 9 | ADD job_template.json / 10 | 11 | WORKDIR / 12 | 13 | ENTRYPOINT ["python", "/jarvice_submit.py"] 14 | -------------------------------------------------------------------------------- /nimbix-app/job_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "{{ app_name }}", 3 | "staging": false, 4 | "checkedout": false, 5 | "application": { 6 | "command": "{{ command }}", 7 | "parameters": { 8 | "command": "{{ command_args }}" 9 | }, 10 | "geometry": "1424x692" 11 | }, 12 | "machine": { 13 | "type": "{{ mc_type }}", 14 | "nodes": 1 15 | }, 16 | "vault": { 17 | "name": "drop.jarvice.com", 18 | "force": false, 19 | "readonly": false 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /scheduler/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.8 2 | 3 | RUN git clone https://github.com/kubernetes/kubernetes.git 4 | RUN apt-get update && apt-get install -y rsync 5 | ADD scheduler.go factory.go / 6 | RUN cd kubernetes && git checkout -b v1.8.3 v1.8.3 && \ 7 | cp /scheduler.go plugin/pkg/scheduler/scheduler.go && \ 8 | cp /factory.go plugin/pkg/scheduler/factory/factory.go && \ 9 | make all WHAT=plugin/cmd/kube-scheduler/ &&\ 10 | cp _output/bin/kube-scheduler /kube-scheduler-nimbix 11 | 12 | FROM debian:jessie 13 | COPY --from=0 /kube-scheduler-nimbix / 14 | CMD ["/kube-scheduler-nimbix"] 15 | -------------------------------------------------------------------------------- /deploy/sample-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: nimbix-job 5 | spec: 6 | template: 7 | metadata: 8 | name: nimbix-gpu 9 | labels: 10 | task-type: Nimbix 11 | spec: 12 | schedulerName: k8s-custom-sched 13 | restartPolicy: Never 14 | containers: 15 | - name: nimbix-job 16 | image: poweraijob 17 | imagePullPolicy: "Never" 18 | env: 19 | - name: "APP_NAME" 20 | value: "power8-ubuntu-mldl" 21 | - name: "APP_COMMAND" 22 | value: "run" 23 | - name: "APP_COMMAND_ARGS" 24 | value: "source /opt/DL/bazel/bin/bazel-activate && source /opt/DL/tensorflow/bin/tensorflow-activate && tensorflow-test" 25 | - name: "ARCH" 26 | value: "POWER" 27 | - name: "NUM_CPUS" 28 | value: "60" 29 | - name: "NUM_GPUS" 30 | value: "2" 31 | - name: "USERNAME" 32 | value: "" 33 | - name: "APIKEY" 34 | value: "" 35 | resources: 36 | limits: 37 | alpha.kubernetes.io/nvidia-gpu: 2 38 | command: ["python", "/jarvice_submit.py"] 39 | 
--------------------------------------------------------------------------------
/deploy/k8s-custom-sched.json:
--------------------------------------------------------------------------------
1 | {
2 |   "apiVersion": "v1",
3 |   "kind": "Pod",
4 |   "metadata": {
5 |     "name": "k8s-custom-sched",
6 |     "namespace": "kube-system"
7 |   },
8 |   "spec":{
9 |     "hostNetwork": true,
10 |     "containers":[
11 |       {
12 |         "name": "k8s-custom-sched",
13 |         "image": "nimbix-sched",
14 |         "imagePullPolicy": "IfNotPresent",
15 |         "command": [
16 |           "/kube-scheduler-nimbix",
17 |           "--master=https://MASTER_IP:8001",
18 |           "--leader-elect=false",
19 |           "--kubeconfig=/etc/cfc/conf/kube-scheduler-config.yaml",
20 |           "--v=2",
21 |           "--scheduler-name=k8s-custom-sched",
22 |           "--port=PORT_NUM"
23 |         ],
24 |         "volumeMounts": [
25 |           {
26 |             "name": "data",
27 |             "mountPath": "/etc/cfc/conf"
28 |           },
29 |           {
30 |             "name": "audit",
31 |             "mountPath": "/var/lib/icp/audit"
32 |           }
33 |         ]
34 |       }
35 |     ],
36 |     "nodeName": "MASTER_NODE_NAME",
37 |     "volumes": [
38 |       {
39 |         "name": "data",
40 |         "hostPath": {
41 |           "path": "/etc/cfc/conf"
42 |         }
43 |       },
44 |       {
45 |         "name": "audit",
46 |         "hostPath": {
47 |           "path": "/var/lib/icp/audit"
48 |         }
49 |       }
50 |     ]
51 |   }
52 | }
53 |
--------------------------------------------------------------------------------
/nimbix-app/README.md:
--------------------------------------------------------------------------------
1 | ## Introduction
2 | This is an example of using a docker image to provision a job in Nimbix (https://www.nimbix.net/),
3 | leveraging the Nimbix CLI.
4 | Read more about the Nimbix CLI here - https://www.nimbix.net/jarvice-quick-start-guide/
5 |
6 | There are two Dockerfiles.
7 | Dockerfile.base - Used to build the docker image that spawns a job in Nimbix.
8 | It uses certain environment variables to specify the required details.
9 |
10 | Dockerfile.app - Used to build an example docker image that can be run either in the local cluster
11 | or in Nimbix. Nimbix expects a specific layout of the application environment in the docker image, and hence a specific base
12 | image needs to be used.
13 | In this example we have used a Power LE (ppc64le) PowerAI Nimbix base image. You can use your own base image depending on your
14 | requirements.
15 |
16 | You can read more about creating docker images for Nimbix here - http://jarvice.readthedocs.io/en/latest/cicd/
17 |
18 | ## Build the Docker Images
19 |
20 | ```bash
21 | $ sudo docker build -t nimbix -f Dockerfile.base .
22 | ```
23 |
24 | ```bash
25 | $ sudo docker build -t ppc64le/powerai -f Dockerfile.app .
26 | ```
27 |
28 |
29 | ## Example Runs
30 | ### Submit a job and wait for it to finish/terminate
31 |
32 | The following command will provision a job in the Nimbix cluster:
33 |
34 | ```bash
35 | $ sudo docker run -it \
36 |     -e USERNAME=USERNAME \
37 |     -e APIKEY=123456abcdefgh2974 \
38 |     -e APP_NAME=my_app \
39 |     -e APP_COMMAND=run \
40 |     -e APP_COMMAND_ARGS="/run_training.sh" \
41 |     -e REMOTE=1 \
42 |     -e ARCH=POWER \
43 |     -e NUM_CPUS=60 \
44 |     -e NUM_GPUS=2 \
45 |     nimbix
46 | ```
47 |
48 | The following command will run the job in your local cluster:
49 |
50 | ```bash
51 | sudo docker run -it \
52 |     -e "USERNAME=user" \
53 |     -e "APIKEY=123456789" \
54 |     -e "APP_NAME=power8-ubuntu-mldl" \
55 |     -e "APP_COMMAND=run" \
56 |     -e "APP_COMMAND_ARGS='source /opt/DL/bazel/bin/bazel-activate && source /opt/DL/tensorflow/bin/tensorflow-activate && tensorflow-test'" \
57 |     ppc64le/powerai
58 | ```
59 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Demonstrating Cloud Bursting Using a Custom Kubernetes Scheduler
2 | This is an example demonstrating how to create a custom Kubernetes scheduler.
3 | The custom scheduler modifies the Kubernetes default scheduler so that a GPU
4 | related job can be provisioned in the [Nimbix cloud](https://www.nimbix.net/) if the
5 | local cluster is unable to satisfy the GPU resource requests.
6 |
7 | ### Project Organization
8 | nimbix-app: Base image to be used for building application docker images
9 | scheduler: Custom K8s scheduler files for Kubernetes version 1.8.3
10 | deploy: Sample deployment files
11 |
12 | ### How to build the custom scheduler
13 | ```bash
14 | git clone https://github.com/IBM/k8s-custom-scheduler.git
15 | cd k8s-custom-scheduler/scheduler
16 | docker build -t nimbix-sched .
17 | ```
18 | ### Deploy the Nimbix scheduler on IBM Cloud Private (ICP)
19 | While these instructions are specific to
20 | [ICP](https://www.ibm.com/cloud-computing/products/ibm-cloud-private/), they
21 | should apply to any Kubernetes setup with minor modifications.
22 |
23 | 1.**Create a secret for the certificate files to access the apiserver over HTTPS**
24 |
25 | In ICP, the certificate files reside in /etc/cfc/conf on the master node. Use the following command to create the secret:
26 | ```bash
27 | kubectl create secret generic certs --from-file=kube-scheduler-config.yaml --from-file=kube-scheduler.crt --from-file=kube-scheduler.key
28 | ```
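To sanity-check the secret before moving on (assuming it was created in the namespace you will deploy the sample job from), something like the following can be used:

```bash
# Confirm the secret exists and lists the three expected data keys
kubectl describe secret certs
```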
29 | 2.**Build the images for the custom scheduler and the sample Nimbix job**
30 |
31 | Use the Dockerfiles in the **scheduler** and **nimbix-app** directories, as sketched below.
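For reference, a possible build sequence from the repository root, reusing the commands already shown in this README and in nimbix-app/README.md; the final `docker tag` is only a suggestion so that the image name matches the `poweraijob` image referenced by deploy/sample-job.yaml:

```bash
# Custom scheduler image (same command as the build step above)
cd scheduler && docker build -t nimbix-sched . && cd ..

# Nimbix submission base image and the example PowerAI application image
cd nimbix-app
docker build -t nimbix -f Dockerfile.base .
docker build -t ppc64le/powerai -f Dockerfile.app .
cd ..

# deploy/sample-job.yaml refers to the application image as "poweraijob" with
# imagePullPolicy: Never, so tag the image accordingly on the worker node(s)
docker tag ppc64le/powerai poweraijob
```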
32 |
33 | 3.**Deploy the scheduler**
34 |
35 | An example pod spec is available at deploy/k8s-custom-sched.json. Update it with your MASTER_IP, PORT_NUM and MASTER_NODE_NAME values.
36 | ```bash
37 | kubectl create -f k8s-custom-sched.json
38 | ```
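Per deploy/k8s-custom-sched.json the scheduler runs as a pod named `k8s-custom-sched` in the `kube-system` namespace, pinned to the master node, so a quick sanity check might look like:

```bash
# The scheduler pod lives in kube-system on the master node
kubectl -n kube-system get pod k8s-custom-sched

# Its logs should show it watching for pods that set
# schedulerName: k8s-custom-sched
kubectl -n kube-system logs k8s-custom-sched
```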
39 |
40 | 4.**Create an appropriate role binding so that the custom scheduler, running as system:kube-scheduler, can modify pods in the default namespace**
42 | ```bash
43 | kubectl create rolebinding someRole --clusterrole=admin --user=system:kube-scheduler --namespace=default
44 | ```
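If your credentials permit user impersonation, the binding can be verified with something like:

```bash
# Check that system:kube-scheduler is now allowed to modify pods in "default"
kubectl auth can-i update pods --namespace=default --as=system:kube-scheduler
```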
45 | 5.**Deploy a sample GPU job using the custom scheduler**
46 |
47 | An example job spec is available at deploy/sample-job.yaml. Update it with your Nimbix USERNAME and APIKEY.
48 | The job will be provisioned to the Nimbix cloud if its resource requirements cannot be met in the local cluster.
49 | ```bash
50 | kubectl create -f sample-job.yaml
51 | ```
52 | ### Authors
53 | Abhishek Dasgupta (abdasgupta@in.ibm.com)
54 | Pradipta Kumar Banerjee (bpradipt@in.ibm.com) 55 | -------------------------------------------------------------------------------- /nimbix-app/jarvice_submit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #A wrapper around jarvice_cli to submit a job and wait for it to end 3 | 4 | import os 5 | from subprocess import check_output, check_call 6 | import json 7 | import jinja2 8 | import argparse 9 | import sys 10 | import traceback 11 | import logging 12 | 13 | #Username and Apikey 14 | username = "" 15 | apikey = "" 16 | 17 | logging.basicConfig(level=logging.INFO) 18 | logger = logging.getLogger(__name__) 19 | 20 | def exec_and_wait(job_json): 21 | out = check_output(["jarvice_cli", "-username", username, "-apikey", apikey, "submit", "-j", job_json]) 22 | data = json.loads(out) 23 | job_id = str(data['number']) 24 | print "job id is " + job_id 25 | logger.info("job id is %s ", job_id) 26 | if job_id: 27 | out = check_output(["jarvice_cli", "-username", username, "-apikey", apikey, "wait_for", "-number", job_id ]) 28 | logger.info("job %s terminated", job_id) 29 | return 0 30 | print "failed to submit job" 31 | logger.error("failed to submit job") 32 | return 1 33 | 34 | 35 | def exec_and_wait_dry_run(job_json): 36 | logger.info("command to be executed is:") 37 | logger.info("jarvice_cli -username %s -apikey %s submit -j %s", username, apikey, job_json) 38 | return 0 39 | 40 | #Find relevant machine type based on number of CPUs and GPUs 41 | #This method assumes that the required cpus and gpus exactly matches 42 | #with Nimbix provided resource types 43 | #Example JSON o/p 44 | ''' 45 | { 46 | "nc3": { 47 | "mc_scale_max": 128, 48 | "mc_slave_gpus": 0, 49 | "mc_scratch": 100, 50 | "mc_scale_min": 1, 51 | "mc_ram": 128, 52 | "mc_swap": 64, 53 | "mc_description": "16 core, 128GB RAM (accelerated OpenGL on master)", 54 | "mc_price": 2.5, 55 | "mc_gpus": 1, 56 | "mc_slave_ram": 128, 57 | "mc_cores": 16, 58 | "mc_scale_select": "" 59 | }, 60 | "nc5": { 61 | "mc_scale_max": 2, 62 | "mc_slave_gpus": 0, 63 | "mc_scratch": 300, 64 | "mc_scale_min": 1, 65 | "mc_ram": 512, 66 | "mc_swap": 64, 67 | "mc_description": "16 core, 512GB RAM (accelerated OpenGL on master)", 68 | "mc_price": 6.52, 69 | "mc_gpus": 0, 70 | "mc_slave_ram": 512, 71 | "mc_cores": 16, 72 | "mc_scale_select": "" 73 | } 74 | } 75 | ''' 76 | def get_mc_type(num_cpus, num_gpus, arch): 77 | 78 | if arch == "INTEL": 79 | default_mc_type = "nc3" 80 | if arch == "POWER": 81 | default_mc_type = "np8c0" 82 | 83 | out = check_output(["jarvice_cli", "-username", username, "-apikey", apikey, "machines"]) 84 | data = json.loads(out) 85 | #{mc_type: [ ram, cores, gpus] 86 | mc_list = {} 87 | logger.debug("List of machines in Nimbix %s", data) 88 | for key, value in data.iteritems(): 89 | logger.debug("key: %s, mc_ram: %d, mc_cores: %d, mc_gpus: %d", key, value['mc_ram'], value['mc_cores'], value['mc_gpus']) 90 | logger.debug("mc_description: %s", value['mc_description']) 91 | if arch not in value['mc_description']: 92 | continue 93 | mc_list[key] = [ value['mc_ram'], value['mc_cores'], value['mc_gpus']] 94 | 95 | for mc_type, values in mc_list.iteritems(): 96 | if str(values[1]) == str(num_cpus) and str(values[2]) == str(num_gpus): 97 | return mc_type 98 | return default_mc_type 99 | 100 | def find_best_fit(num_res, sorted_list): 101 | logger.debug("sorted_list: %s, num_res: %d", sorted_list, num_res) 102 | return min(sorted_list, key=lambda x:abs(x - num_res)) 103 | 104 | ''' 
105 | Get the machine type which is closest to the required resources 106 | ''' 107 | def get_mc_type_best_fit(num_cpus, num_gpus, arch): 108 | 109 | if arch == "INTEL": 110 | default_mc_type = "nc3" 111 | if arch == "POWER": 112 | default_mc_type = "np8c0" 113 | 114 | cpu_list = [] 115 | gpu_list = [] 116 | out = check_output(["jarvice_cli", "-username", username, "-apikey", apikey, "machines"]) 117 | data = json.loads(out) 118 | #{mc_type: [ ram, cores, gpus] 119 | mc_list = {} 120 | for key, value in data.iteritems(): 121 | logger.debug("key: %s, mc_ram: %d, mc_cores: %d, mc_gpus: %d", key, value['mc_ram'], value['mc_cores'], value['mc_gpus']) 122 | logger.debug("mc_description: %s", value['mc_description']) 123 | if arch not in value['mc_description']: 124 | continue 125 | mc_list[key] = [ value['mc_ram'], value['mc_cores'], value['mc_gpus']] 126 | cpu_list.append(value['mc_cores']) 127 | gpu_list.append(value['mc_gpus']) 128 | 129 | logger.debug("CPU list: %s", cpu_list) 130 | logger.debug("GPU list: %s", gpu_list) 131 | logger.debug("Machine list: %s", mc_list) 132 | cpus = find_best_fit(num_cpus, sorted(set(cpu_list))) 133 | logger.info("Required cpus: %d", cpus) 134 | gpus = find_best_fit(num_gpus, sorted(set(gpu_list))) 135 | logger.info("Required gpus: %d", gpus) 136 | 137 | for mc_type, values in mc_list.iteritems(): 138 | #Find best fit machine having required cpu and gpus 139 | if str(values[1]) == str(cpus) and str(values[2]) == str(gpus): 140 | return mc_type 141 | #What if best fit is not available in the list 142 | return default_mc_type 143 | 144 | 145 | def create_job_json(app_name, app_command, app_command_args, mc_type): 146 | template_loader = jinja2.FileSystemLoader( searchpath="./") 147 | template_env = jinja2.Environment( loader=template_loader ) 148 | TEMPLATE_FILE = "/job_template.json" 149 | template = template_env.get_template( TEMPLATE_FILE ) 150 | json_text = template.render(app_name=app_name, command=app_command, command_args=app_command_args, mc_type=mc_type) 151 | 152 | logger.debug("Json Job Description: %s", json_text) 153 | 154 | with open('/job.json', 'wb') as outfile: 155 | outfile.write(json_text) 156 | #This is required otherwise JSON parsing will fail 157 | outfile.write('\n') 158 | 159 | outfile.close() 160 | 161 | def remote_exec(): 162 | global username 163 | global apikey 164 | parser = argparse.ArgumentParser() 165 | parser.add_argument('--log_level', type=str, default='INFO', help='Log level - INFO, DEBUG') 166 | parser.add_argument('--dry_run', action='store_true', help='Dry Run - just dump the complete remote command') 167 | args = parser.parse_args() 168 | 169 | if args.log_level == "INFO": 170 | logger.setLevel(logging.INFO) 171 | if args.log_level == "DEBUG": 172 | logger.setLevel(logging.DEBUG) 173 | 174 | 175 | username = os.environ.get("USERNAME") 176 | apikey = os.environ.get("APIKEY") 177 | app_name = os.environ.get("APP_NAME") 178 | app_command = os.environ.get("APP_COMMAND") 179 | #App command args is optional. For batch jobs its required though 180 | app_command_args = os.environ.get("APP_COMMAND_ARGS") 181 | 182 | if username == None or apikey == None or app_name == None or app_command == None: 183 | logger.critical("Username, APIKey, App Name and Command are must") 184 | sys.exit() 185 | 186 | #Get required CPUs and GPUs. 
Default 1 187 | num_cpus = os.getenv("NUM_CPUS", 1) 188 | num_gpus = os.getenv("NUM_GPUS", 1) 189 | arch = os.getenv("ARCH", "POWER") 190 | 191 | logger.debug("APP_NAME: %s APP_COMMAND: %s, APP_COMMAND_ARGS: %s, NUM_CPUS: %s, NUM_GPUS: %s", app_name, app_command, 192 | app_command_args, num_cpus, num_gpus) 193 | 194 | try: 195 | if args.dry_run: 196 | exec_and_wait_dry_run("/job.json") 197 | else: 198 | mc_type = get_mc_type_best_fit(int(num_cpus), int(num_gpus), arch) 199 | logger.info("machine type to be used: %s", mc_type) 200 | create_job_json(app_name, app_command, app_command_args, mc_type) 201 | if exec_and_wait("/job.json"): 202 | logger.info("Error in running jarvice job") 203 | 204 | except Exception, e: 205 | logger.error('Unexpected error when running jarvice_cli', exc_info=True) 206 | 207 | def main(): 208 | if os.environ.get("REMOTE") : 209 | remote_exec() 210 | else: 211 | #Execute the command as-is 212 | app_command_args = os.environ.get("APP_COMMAND_ARGS") 213 | try: 214 | check_call(["/bin/bash", "-c", app_command_args]) 215 | except Exception, e: 216 | logger.error('Unexpected error when running command', exc_info=True) 217 | 218 | if __name__== "__main__": 219 | main() 220 | -------------------------------------------------------------------------------- /scheduler/scheduler.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2014 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package scheduler 18 | 19 | import ( 20 | "time" 21 | 22 | "k8s.io/api/core/v1" 23 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 24 | "k8s.io/apimachinery/pkg/util/sets" 25 | "k8s.io/apimachinery/pkg/util/wait" 26 | utilfeature "k8s.io/apiserver/pkg/util/feature" 27 | clientset "k8s.io/client-go/kubernetes" 28 | corelisters "k8s.io/client-go/listers/core/v1" 29 | "k8s.io/client-go/tools/cache" 30 | "k8s.io/client-go/tools/record" 31 | "k8s.io/kubernetes/pkg/features" 32 | "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm" 33 | schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api" 34 | "k8s.io/kubernetes/plugin/pkg/scheduler/core" 35 | "k8s.io/kubernetes/plugin/pkg/scheduler/metrics" 36 | "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache" 37 | "k8s.io/kubernetes/plugin/pkg/scheduler/util" 38 | 39 | "github.com/golang/glog" 40 | ) 41 | 42 | // Binder knows how to write a binding. 43 | type Binder interface { 44 | Bind(binding *v1.Binding) error 45 | } 46 | 47 | // PodConditionUpdater updates the condition of a pod based on the passed 48 | // PodCondition 49 | type PodConditionUpdater interface { 50 | Update(pod *v1.Pod, podCondition *v1.PodCondition) error 51 | } 52 | 53 | // PodPreemptor has methods needed to delete a pod and to update 54 | // annotations of the preemptor pod. 
55 | type PodPreemptor interface { 56 | GetUpdatedPod(pod *v1.Pod) (*v1.Pod, error) 57 | DeletePod(pod *v1.Pod) error 58 | UpdatePodAnnotations(pod *v1.Pod, annots map[string]string) error 59 | } 60 | 61 | // PodWriter delete the pod and create a new pod based on the passed pod. 62 | type PodWriter interface { 63 | UpdatePod(oldPod, newPod *v1.Pod) error 64 | } 65 | 66 | // Scheduler watches for new unscheduled pods. It attempts to find 67 | // nodes that they fit on and writes bindings back to the api server. 68 | type Scheduler struct { 69 | config *Config 70 | } 71 | 72 | // StopEverything closes the scheduler config's StopEverything channel, to shut 73 | // down the Scheduler. 74 | func (sched *Scheduler) StopEverything() { 75 | close(sched.config.StopEverything) 76 | } 77 | 78 | // Configurator defines I/O, caching, and other functionality needed to 79 | // construct a new scheduler. An implementation of this can be seen in 80 | // factory.go. 81 | type Configurator interface { 82 | GetPriorityFunctionConfigs(priorityKeys sets.String) ([]algorithm.PriorityConfig, error) 83 | GetPriorityMetadataProducer() (algorithm.MetadataProducer, error) 84 | GetPredicateMetadataProducer() (algorithm.PredicateMetadataProducer, error) 85 | GetPredicates(predicateKeys sets.String) (map[string]algorithm.FitPredicate, error) 86 | GetHardPodAffinitySymmetricWeight() int 87 | GetSchedulerName() string 88 | MakeDefaultErrorFunc(backoff *util.PodBackoff, podQueue *cache.FIFO) func(pod *v1.Pod, err error) 89 | 90 | // Probably doesn't need to be public. But exposed for now in case. 91 | ResponsibleForPod(pod *v1.Pod) bool 92 | 93 | // Needs to be exposed for things like integration tests where we want to make fake nodes. 94 | GetNodeLister() corelisters.NodeLister 95 | GetClient() clientset.Interface 96 | GetScheduledPodLister() corelisters.PodLister 97 | 98 | Create() (*Config, error) 99 | CreateFromProvider(providerName string) (*Config, error) 100 | CreateFromConfig(policy schedulerapi.Policy) (*Config, error) 101 | CreateFromKeys(predicateKeys, priorityKeys sets.String, extenders []algorithm.SchedulerExtender) (*Config, error) 102 | } 103 | 104 | // Config is an implementation of the Scheduler's configured input data. 105 | // TODO over time we should make this struct a hidden implementation detail of the scheduler. 106 | type Config struct { 107 | // It is expected that changes made via SchedulerCache will be observed 108 | // by NodeLister and Algorithm. 109 | SchedulerCache schedulercache.Cache 110 | // Ecache is used for optimistically invalid affected cache items after 111 | // successfully binding a pod 112 | Ecache *core.EquivalenceCache 113 | NodeLister algorithm.NodeLister 114 | Algorithm algorithm.ScheduleAlgorithm 115 | Binder Binder 116 | // PodConditionUpdater is used only in case of scheduling errors. If we succeed 117 | // with scheduling, PodScheduled condition will be updated in apiserver in /bind 118 | // handler so that binding and setting PodCondition it is atomic. 119 | PodConditionUpdater PodConditionUpdater 120 | // PodPreemptor is used to evict pods and update pod annotations. 121 | PodPreemptor PodPreemptor 122 | 123 | // PodWriter is only used to schedule Pods those can be run on Nimbix. 124 | PodWriter PodWriter 125 | 126 | // NextPod should be a function that blocks until the next pod 127 | // is available. We don't use a channel for this, because scheduling 128 | // a pod may take some amount of time and we don't want pods to get 129 | // stale while they sit in a channel. 
130 | NextPod func() *v1.Pod 131 | 132 | // WaitForCacheSync waits for scheduler cache to populate. 133 | // It returns true if it was successful, false if the controller should shutdown. 134 | WaitForCacheSync func() bool 135 | 136 | // Error is called if there is an error. It is passed the pod in 137 | // question, and the error 138 | Error func(*v1.Pod, error) 139 | 140 | // Recorder is the EventRecorder to use 141 | Recorder record.EventRecorder 142 | 143 | // Close this to shut down the scheduler. 144 | StopEverything chan struct{} 145 | } 146 | 147 | // NewFromConfigurator returns a new scheduler that is created entirely by the Configurator. Assumes Create() is implemented. 148 | // Supports intermediate Config mutation for now if you provide modifier functions which will run after Config is created. 149 | func NewFromConfigurator(c Configurator, modifiers ...func(c *Config)) (*Scheduler, error) { 150 | cfg, err := c.Create() 151 | if err != nil { 152 | return nil, err 153 | } 154 | // Mutate it if any functions were provided, changes might be required for certain types of tests (i.e. change the recorder). 155 | for _, modifier := range modifiers { 156 | modifier(cfg) 157 | } 158 | // From this point on the config is immutable to the outside. 159 | s := &Scheduler{ 160 | config: cfg, 161 | } 162 | metrics.Register() 163 | return s, nil 164 | } 165 | 166 | // Run begins watching and scheduling. It waits for cache to be synced, then starts a goroutine and returns immediately. 167 | func (sched *Scheduler) Run() { 168 | if !sched.config.WaitForCacheSync() { 169 | return 170 | } 171 | 172 | go wait.Until(sched.scheduleOne, 0, sched.config.StopEverything) 173 | } 174 | 175 | // Config return scheduler's config pointer. It is exposed for testing purposes. 176 | func (sched *Scheduler) Config() *Config { 177 | return sched.config 178 | } 179 | 180 | // schedule implements the scheduling algorithm and returns the suggested host. 181 | func (sched *Scheduler) schedule(pod *v1.Pod) (string, error) { 182 | host, err := sched.config.Algorithm.Schedule(pod, sched.config.NodeLister) 183 | if err != nil { 184 | glog.V(1).Infof("Failed to schedule pod: %v/%v", pod.Namespace, pod.Name) 185 | pod = pod.DeepCopy() 186 | sched.config.Error(pod, err) 187 | sched.config.Recorder.Eventf(pod, v1.EventTypeWarning, "FailedScheduling", "%v", err) 188 | sched.config.PodConditionUpdater.Update(pod, &v1.PodCondition{ 189 | Type: v1.PodScheduled, 190 | Status: v1.ConditionFalse, 191 | Reason: v1.PodReasonUnschedulable, 192 | Message: err.Error(), 193 | }) 194 | return "", err 195 | } 196 | return host, err 197 | } 198 | 199 | func (sched *Scheduler) preempt(preemptor *v1.Pod, scheduleErr error) (string, error) { 200 | if !utilfeature.DefaultFeatureGate.Enabled(features.PodPriority) { 201 | glog.V(3).Infof("Pod priority feature is not enabled. 
No preemption is performed.") 202 | return "", nil 203 | } 204 | preemptor, err := sched.config.PodPreemptor.GetUpdatedPod(preemptor) 205 | if err != nil { 206 | glog.Errorf("Error getting the updated preemptor pod object: %v", err) 207 | return "", err 208 | } 209 | node, victims, err := sched.config.Algorithm.Preempt(preemptor, sched.config.NodeLister, scheduleErr) 210 | if err != nil { 211 | glog.Errorf("Error preempting victims to make room for %v/%v.", preemptor.Namespace, preemptor.Name) 212 | return "", err 213 | } 214 | if node == nil { 215 | return "", err 216 | } 217 | glog.Infof("Preempting %d pod(s) on node %v to make room for %v/%v.", len(victims), node.Name, preemptor.Namespace, preemptor.Name) 218 | annotations := map[string]string{core.NominatedNodeAnnotationKey: node.Name} 219 | err = sched.config.PodPreemptor.UpdatePodAnnotations(preemptor, annotations) 220 | if err != nil { 221 | glog.Errorf("Error in preemption process. Cannot update pod %v annotations: %v", preemptor.Name, err) 222 | return "", err 223 | } 224 | for _, victim := range victims { 225 | if err := sched.config.PodPreemptor.DeletePod(victim); err != nil { 226 | glog.Errorf("Error preempting pod %v/%v: %v", victim.Namespace, victim.Name, err) 227 | return "", err 228 | } 229 | sched.config.Recorder.Eventf(victim, v1.EventTypeNormal, "Preempted", "by %v/%v on node %v", preemptor.Namespace, preemptor.Name, node.Name) 230 | } 231 | return node.Name, err 232 | } 233 | 234 | // assume signals to the cache that a pod is already in the cache, so that binding can be asynchronous. 235 | // assume modifies `assumed`. 236 | func (sched *Scheduler) assume(assumed *v1.Pod, host string) error { 237 | // Optimistically assume that the binding will succeed and send it to apiserver 238 | // in the background. 239 | // If the binding fails, scheduler will release resources allocated to assumed pod 240 | // immediately. 241 | assumed.Spec.NodeName = host 242 | if err := sched.config.SchedulerCache.AssumePod(assumed); err != nil { 243 | glog.Errorf("scheduler cache AssumePod failed: %v", err) 244 | 245 | // This is most probably result of a BUG in retrying logic. 246 | // We report an error here so that pod scheduling can be retried. 247 | // This relies on the fact that Error will check if the pod has been bound 248 | // to a node and if so will not add it back to the unscheduled pods queue 249 | // (otherwise this would cause an infinite loop). 250 | sched.config.Error(assumed, err) 251 | sched.config.Recorder.Eventf(assumed, v1.EventTypeWarning, "FailedScheduling", "AssumePod failed: %v", err) 252 | sched.config.PodConditionUpdater.Update(assumed, &v1.PodCondition{ 253 | Type: v1.PodScheduled, 254 | Status: v1.ConditionFalse, 255 | Reason: "SchedulerError", 256 | Message: err.Error(), 257 | }) 258 | return err 259 | } 260 | 261 | // Optimistically assume that the binding will succeed, so we need to invalidate affected 262 | // predicates in equivalence cache. 263 | // If the binding fails, these invalidated item will not break anything. 264 | if sched.config.Ecache != nil { 265 | sched.config.Ecache.InvalidateCachedPredicateItemForPodAdd(assumed, host) 266 | } 267 | return nil 268 | } 269 | 270 | // bind binds a pod to a given node defined in a binding object. We expect this to run asynchronously, so we 271 | // handle binding metrics internally. 
272 | func (sched *Scheduler) bind(assumed *v1.Pod, b *v1.Binding) error { 273 | bindingStart := time.Now() 274 | // If binding succeeded then PodScheduled condition will be updated in apiserver so that 275 | // it's atomic with setting host. 276 | err := sched.config.Binder.Bind(b) 277 | if err := sched.config.SchedulerCache.FinishBinding(assumed); err != nil { 278 | glog.Errorf("scheduler cache FinishBinding failed: %v", err) 279 | } 280 | if err != nil { 281 | glog.V(1).Infof("Failed to bind pod: %v/%v", assumed.Namespace, assumed.Name) 282 | if err := sched.config.SchedulerCache.ForgetPod(assumed); err != nil { 283 | glog.Errorf("scheduler cache ForgetPod failed: %v", err) 284 | } 285 | sched.config.Error(assumed, err) 286 | sched.config.Recorder.Eventf(assumed, v1.EventTypeWarning, "FailedScheduling", "Binding rejected: %v", err) 287 | sched.config.PodConditionUpdater.Update(assumed, &v1.PodCondition{ 288 | Type: v1.PodScheduled, 289 | Status: v1.ConditionFalse, 290 | Reason: "BindingRejected", 291 | }) 292 | return err 293 | } 294 | 295 | metrics.BindingLatency.Observe(metrics.SinceInMicroseconds(bindingStart)) 296 | sched.config.Recorder.Eventf(assumed, v1.EventTypeNormal, "Scheduled", "Successfully assigned %v to %v", assumed.Name, b.Target.Name) 297 | return nil 298 | } 299 | 300 | // scheduleOne does the entire scheduling workflow for a single pod. It is serialized on the scheduling algorithm's host fitting. 301 | func (sched *Scheduler) scheduleOne() { 302 | pod := sched.config.NextPod() 303 | if pod.DeletionTimestamp != nil { 304 | sched.config.Recorder.Eventf(pod, v1.EventTypeWarning, "FailedScheduling", "skip schedule deleting pod: %v/%v", pod.Namespace, pod.Name) 305 | glog.V(3).Infof("Skip schedule deleting pod: %v/%v", pod.Namespace, pod.Name) 306 | return 307 | } 308 | 309 | glog.V(3).Infof("Attempting to schedule pod: %v/%v", pod.Namespace, pod.Name) 310 | 311 | // Synchronously attempt to find a fit for the pod. 312 | start := time.Now() 313 | suggestedHost, err := sched.schedule(pod) 314 | if len(suggestedHost) == 0 { 315 | if sched.checkPodLabelForNimbix(pod) { 316 | if len(pod.Spec.Containers) > 1 { 317 | glog.Errorf("Pods expected to be run in Nimbix must have no more than one container") 318 | return 319 | } 320 | 321 | // Modify the Pod specification to run it in any node with nimbix access. 322 | pod.DeletionTimestamp = nil 323 | newPod := sched.modifyPodForNimbix(pod) 324 | 325 | if updateErr := sched.config.PodWriter.UpdatePod(pod, newPod); updateErr != nil { 326 | glog.Errorf("Pod could not be updated on API server: %v", updateErr) 327 | } 328 | return 329 | } 330 | } 331 | metrics.SchedulingAlgorithmLatency.Observe(metrics.SinceInMicroseconds(start)) 332 | if err != nil { 333 | // schedule() may have failed because the pod would not fit on any host, so we try to 334 | // preempt, with the expectation that the next time the pod is tried for scheduling it 335 | // will fit due to the preemption. It is also possible that a different pod will schedule 336 | // into the resources that were preempted, but this is harmless. 337 | if fitError, ok := err.(*core.FitError); ok { 338 | sched.preempt(pod, fitError) 339 | } 340 | return 341 | } 342 | 343 | // Tell the cache to assume that a pod now is running on a given node, even though it hasn't been bound yet. 344 | // This allows us to keep scheduling without waiting on binding to occur. 
345 | assumedPod := *pod 346 | // assume modifies `assumedPod` by setting NodeName=suggestedHost 347 | err = sched.assume(&assumedPod, suggestedHost) 348 | if err != nil { 349 | return 350 | } 351 | 352 | // bind the pod to its host asynchronously (we can do this b/c of the assumption step above). 353 | go func() { 354 | err := sched.bind(&assumedPod, &v1.Binding{ 355 | ObjectMeta: metav1.ObjectMeta{Namespace: assumedPod.Namespace, Name: assumedPod.Name, UID: assumedPod.UID}, 356 | Target: v1.ObjectReference{ 357 | Kind: "Node", 358 | Name: suggestedHost, 359 | }, 360 | }) 361 | metrics.E2eSchedulingLatency.Observe(metrics.SinceInMicroseconds(start)) 362 | if err != nil { 363 | glog.Errorf("Internal error binding pod: (%v)", err) 364 | } 365 | }() 366 | } 367 | 368 | // Check whether the pod contains label task-type "Nimbix" 369 | func (sched *Scheduler) checkPodLabelForNimbix(pod *v1.Pod) bool { 370 | if pod.Labels["task-type"] == "Nimbix" { 371 | return true 372 | } 373 | 374 | return false 375 | } 376 | 377 | func (sched *Scheduler) modifyPodForNimbix(oldPod *v1.Pod) *v1.Pod { 378 | oldContainer := oldPod.Spec.Containers[0] 379 | newEnv := []v1.EnvVar{ 380 | { 381 | Name: "REMOTE", 382 | Value: "1", 383 | }, 384 | } 385 | pod := &v1.Pod{ 386 | TypeMeta: metav1.TypeMeta{ 387 | Kind: oldPod.Kind, 388 | APIVersion: oldPod.APIVersion, 389 | }, 390 | ObjectMeta: metav1.ObjectMeta{ 391 | Name: oldPod.GetName(), 392 | Namespace: oldPod.GetNamespace(), 393 | Labels: oldPod.GetLabels(), 394 | Annotations: oldPod.GetAnnotations(), 395 | OwnerReferences: oldPod.GetOwnerReferences(), 396 | Initializers: oldPod.GetInitializers(), 397 | Finalizers: oldPod.GetFinalizers(), 398 | ClusterName: oldPod.GetClusterName(), 399 | }, 400 | Spec: v1.PodSpec{ 401 | 402 | Volumes: oldPod.Spec.Volumes, 403 | InitContainers: oldPod.Spec.InitContainers, 404 | Containers: []v1.Container{ 405 | { 406 | Name: oldContainer.Name, 407 | Image: oldContainer.Image, 408 | Command: oldContainer.Command, 409 | Args: oldContainer.Args, 410 | WorkingDir: oldContainer.WorkingDir, 411 | Ports: oldContainer.Ports, 412 | EnvFrom: oldContainer.EnvFrom, 413 | Env: append(oldContainer.Env, newEnv...), 414 | VolumeMounts: oldContainer.VolumeMounts, 415 | LivenessProbe: oldContainer.LivenessProbe, 416 | ReadinessProbe: oldContainer.ReadinessProbe, 417 | Lifecycle: oldContainer.Lifecycle, 418 | TerminationMessagePath: oldContainer.TerminationMessagePath, 419 | TerminationMessagePolicy: oldContainer.TerminationMessagePolicy, 420 | ImagePullPolicy: oldContainer.ImagePullPolicy, 421 | SecurityContext: oldContainer.SecurityContext, 422 | Stdin: oldContainer.Stdin, 423 | StdinOnce: oldContainer.StdinOnce, 424 | TTY: oldContainer.TTY, 425 | }, 426 | }, 427 | RestartPolicy: oldPod.Spec.RestartPolicy, 428 | TerminationGracePeriodSeconds: oldPod.Spec.TerminationGracePeriodSeconds, 429 | ActiveDeadlineSeconds: oldPod.Spec.ActiveDeadlineSeconds, 430 | DNSPolicy: oldPod.Spec.DNSPolicy, 431 | NodeSelector: oldPod.Spec.NodeSelector, 432 | ServiceAccountName: oldPod.Spec.ServiceAccountName, 433 | AutomountServiceAccountToken: oldPod.Spec.AutomountServiceAccountToken, 434 | NodeName: oldPod.Spec.NodeName, 435 | HostNetwork: oldPod.Spec.HostNetwork, 436 | HostPID: oldPod.Spec.HostPID, 437 | HostIPC: oldPod.Spec.HostIPC, 438 | SecurityContext: oldPod.Spec.SecurityContext, 439 | ImagePullSecrets: oldPod.Spec.ImagePullSecrets, 440 | Hostname: oldPod.Spec.Hostname, 441 | Subdomain: oldPod.Spec.Subdomain, 442 | Affinity: oldPod.Spec.Affinity, 443 | 
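// Resources from oldContainer are not carried over into the new container
// spec above, so the recreated pod drops its GPU request and can be placed
// on any node; the injected REMOTE=1 env then makes jarvice_submit.py submit
// the work to Nimbix instead of running the command locally.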
SchedulerName: oldPod.Spec.SchedulerName, 444 | Tolerations: oldPod.Spec.Tolerations, 445 | HostAliases: oldPod.Spec.HostAliases, 446 | }, 447 | } 448 | return pod 449 | } 450 | -------------------------------------------------------------------------------- /scheduler/factory.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2014 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Package factory can set up a scheduler. This code is here instead of 18 | // plugin/cmd/scheduler for both testability and reuse. 19 | package factory 20 | 21 | import ( 22 | "fmt" 23 | "reflect" 24 | "sync" 25 | "time" 26 | 27 | "k8s.io/api/core/v1" 28 | "k8s.io/apimachinery/pkg/api/errors" 29 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 30 | "k8s.io/apimachinery/pkg/fields" 31 | "k8s.io/apimachinery/pkg/labels" 32 | "k8s.io/apimachinery/pkg/runtime/schema" 33 | "k8s.io/apimachinery/pkg/types" 34 | "k8s.io/apimachinery/pkg/util/runtime" 35 | "k8s.io/apimachinery/pkg/util/sets" 36 | appsinformers "k8s.io/client-go/informers/apps/v1beta1" 37 | coreinformers "k8s.io/client-go/informers/core/v1" 38 | extensionsinformers "k8s.io/client-go/informers/extensions/v1beta1" 39 | clientset "k8s.io/client-go/kubernetes" 40 | appslisters "k8s.io/client-go/listers/apps/v1beta1" 41 | corelisters "k8s.io/client-go/listers/core/v1" 42 | extensionslisters "k8s.io/client-go/listers/extensions/v1beta1" 43 | "k8s.io/client-go/tools/cache" 44 | "k8s.io/kubernetes/pkg/api/helper" 45 | podutil "k8s.io/kubernetes/pkg/api/v1/pod" 46 | kubeletapis "k8s.io/kubernetes/pkg/kubelet/apis" 47 | "k8s.io/kubernetes/plugin/pkg/scheduler" 48 | "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm" 49 | "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/predicates" 50 | schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api" 51 | "k8s.io/kubernetes/plugin/pkg/scheduler/api/validation" 52 | "k8s.io/kubernetes/plugin/pkg/scheduler/core" 53 | "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache" 54 | "k8s.io/kubernetes/plugin/pkg/scheduler/util" 55 | 56 | "github.com/golang/glog" 57 | ) 58 | 59 | const ( 60 | initialGetBackoff = 100 * time.Millisecond 61 | maximalGetBackoff = time.Minute 62 | ) 63 | 64 | var ( 65 | serviceAffinitySet = sets.NewString("ServiceAffinity") 66 | maxPDVolumeCountPredicateSet = sets.NewString("MaxPDVolumeCountPredicate") 67 | matchInterPodAffinitySet = sets.NewString("MatchInterPodAffinity") 68 | generalPredicatesSets = sets.NewString("GeneralPredicates") 69 | noDiskConflictSet = sets.NewString("NoDiskConflict") 70 | ) 71 | 72 | // ConfigFactory is the default implementation of the scheduler.Configurator interface. 73 | // TODO make this private if possible, so that only its interface is externally used. 74 | type ConfigFactory struct { 75 | client clientset.Interface 76 | // queue for pods that need scheduling 77 | podQueue *cache.FIFO 78 | // a means to list all known scheduled pods. 
79 | scheduledPodLister corelisters.PodLister 80 | // a means to list all known scheduled pods and pods assumed to have been scheduled. 81 | podLister algorithm.PodLister 82 | // a means to list all nodes 83 | nodeLister corelisters.NodeLister 84 | // a means to list all PersistentVolumes 85 | pVLister corelisters.PersistentVolumeLister 86 | // a means to list all PersistentVolumeClaims 87 | pVCLister corelisters.PersistentVolumeClaimLister 88 | // a means to list all services 89 | serviceLister corelisters.ServiceLister 90 | // a means to list all controllers 91 | controllerLister corelisters.ReplicationControllerLister 92 | // a means to list all replicasets 93 | replicaSetLister extensionslisters.ReplicaSetLister 94 | // a means to list all statefulsets 95 | statefulSetLister appslisters.StatefulSetLister 96 | 97 | // Close this to stop all reflectors 98 | StopEverything chan struct{} 99 | 100 | scheduledPodsHasSynced cache.InformerSynced 101 | 102 | schedulerCache schedulercache.Cache 103 | 104 | // SchedulerName of a scheduler is used to select which pods will be 105 | // processed by this scheduler, based on pods's "spec.SchedulerName". 106 | schedulerName string 107 | 108 | // RequiredDuringScheduling affinity is not symmetric, but there is an implicit PreferredDuringScheduling affinity rule 109 | // corresponding to every RequiredDuringScheduling affinity rule. 110 | // HardPodAffinitySymmetricWeight represents the weight of implicit PreferredDuringScheduling affinity rule, in the range 0-100. 111 | hardPodAffinitySymmetricWeight int 112 | 113 | // Equivalence class cache 114 | equivalencePodCache *core.EquivalenceCache 115 | 116 | // Enable equivalence class cache 117 | enableEquivalenceClassCache bool 118 | } 119 | 120 | // NewConfigFactory initializes the default implementation of a Configurator To encourage eventual privatization of the struct type, we only 121 | // return the interface. 
122 | func NewConfigFactory( 123 | schedulerName string, 124 | client clientset.Interface, 125 | nodeInformer coreinformers.NodeInformer, 126 | podInformer coreinformers.PodInformer, 127 | pvInformer coreinformers.PersistentVolumeInformer, 128 | pvcInformer coreinformers.PersistentVolumeClaimInformer, 129 | replicationControllerInformer coreinformers.ReplicationControllerInformer, 130 | replicaSetInformer extensionsinformers.ReplicaSetInformer, 131 | statefulSetInformer appsinformers.StatefulSetInformer, 132 | serviceInformer coreinformers.ServiceInformer, 133 | hardPodAffinitySymmetricWeight int, 134 | enableEquivalenceClassCache bool, 135 | ) scheduler.Configurator { 136 | stopEverything := make(chan struct{}) 137 | schedulerCache := schedulercache.New(30*time.Second, stopEverything) 138 | 139 | c := &ConfigFactory{ 140 | client: client, 141 | podLister: schedulerCache, 142 | podQueue: cache.NewFIFO(cache.MetaNamespaceKeyFunc), 143 | pVLister: pvInformer.Lister(), 144 | pVCLister: pvcInformer.Lister(), 145 | serviceLister: serviceInformer.Lister(), 146 | controllerLister: replicationControllerInformer.Lister(), 147 | replicaSetLister: replicaSetInformer.Lister(), 148 | statefulSetLister: statefulSetInformer.Lister(), 149 | schedulerCache: schedulerCache, 150 | StopEverything: stopEverything, 151 | schedulerName: schedulerName, 152 | hardPodAffinitySymmetricWeight: hardPodAffinitySymmetricWeight, 153 | enableEquivalenceClassCache: enableEquivalenceClassCache, 154 | } 155 | 156 | c.scheduledPodsHasSynced = podInformer.Informer().HasSynced 157 | // scheduled pod cache 158 | podInformer.Informer().AddEventHandler( 159 | cache.FilteringResourceEventHandler{ 160 | FilterFunc: func(obj interface{}) bool { 161 | switch t := obj.(type) { 162 | case *v1.Pod: 163 | return assignedNonTerminatedPod(t) 164 | default: 165 | runtime.HandleError(fmt.Errorf("unable to handle object in %T: %T", c, obj)) 166 | return false 167 | } 168 | }, 169 | Handler: cache.ResourceEventHandlerFuncs{ 170 | AddFunc: c.addPodToCache, 171 | UpdateFunc: c.updatePodInCache, 172 | DeleteFunc: c.deletePodFromCache, 173 | }, 174 | }, 175 | ) 176 | // unscheduled pod queue 177 | podInformer.Informer().AddEventHandler( 178 | cache.FilteringResourceEventHandler{ 179 | FilterFunc: func(obj interface{}) bool { 180 | switch t := obj.(type) { 181 | case *v1.Pod: 182 | return unassignedNonTerminatedPod(t) 183 | default: 184 | runtime.HandleError(fmt.Errorf("unable to handle object in %T: %T", c, obj)) 185 | return false 186 | } 187 | }, 188 | Handler: cache.ResourceEventHandlerFuncs{ 189 | AddFunc: func(obj interface{}) { 190 | if err := c.podQueue.Add(obj); err != nil { 191 | runtime.HandleError(fmt.Errorf("unable to queue %T: %v", obj, err)) 192 | } 193 | }, 194 | UpdateFunc: func(oldObj, newObj interface{}) { 195 | if err := c.podQueue.Update(newObj); err != nil { 196 | runtime.HandleError(fmt.Errorf("unable to update %T: %v", newObj, err)) 197 | } 198 | }, 199 | DeleteFunc: func(obj interface{}) { 200 | if err := c.podQueue.Delete(obj); err != nil { 201 | runtime.HandleError(fmt.Errorf("unable to dequeue %T: %v", obj, err)) 202 | } 203 | }, 204 | }, 205 | }, 206 | ) 207 | // ScheduledPodLister is something we provide to plug-in functions that 208 | // they may need to call. 
209 | c.scheduledPodLister = assignedPodLister{podInformer.Lister()} 210 | 211 | // Only nodes in the "Ready" condition with status == "True" are schedulable 212 | nodeInformer.Informer().AddEventHandlerWithResyncPeriod( 213 | cache.ResourceEventHandlerFuncs{ 214 | AddFunc: c.addNodeToCache, 215 | UpdateFunc: c.updateNodeInCache, 216 | DeleteFunc: c.deleteNodeFromCache, 217 | }, 218 | 0, 219 | ) 220 | c.nodeLister = nodeInformer.Lister() 221 | 222 | // On add and delete of PVs, it will affect equivalence cache items 223 | // related to persistent volume 224 | pvInformer.Informer().AddEventHandlerWithResyncPeriod( 225 | cache.ResourceEventHandlerFuncs{ 226 | // MaxPDVolumeCountPredicate: since it relies on the counts of PV. 227 | AddFunc: c.onPvAdd, 228 | DeleteFunc: c.onPvDelete, 229 | }, 230 | 0, 231 | ) 232 | c.pVLister = pvInformer.Lister() 233 | 234 | // This is for MaxPDVolumeCountPredicate: add/delete PVC will affect counts of PV when it is bound. 235 | pvcInformer.Informer().AddEventHandlerWithResyncPeriod( 236 | cache.ResourceEventHandlerFuncs{ 237 | AddFunc: c.onPvcAdd, 238 | DeleteFunc: c.onPvcDelete, 239 | }, 240 | 0, 241 | ) 242 | c.pVCLister = pvcInformer.Lister() 243 | 244 | // This is for ServiceAffinity: affected by the selector of the service is updated. 245 | // Also, if new service is added, equivalence cache will also become invalid since 246 | // existing pods may be "captured" by this service and change this predicate result. 247 | serviceInformer.Informer().AddEventHandlerWithResyncPeriod( 248 | cache.ResourceEventHandlerFuncs{ 249 | AddFunc: c.onServiceAdd, 250 | UpdateFunc: c.onServiceUpdate, 251 | DeleteFunc: c.onServiceDelete, 252 | }, 253 | 0, 254 | ) 255 | c.serviceLister = serviceInformer.Lister() 256 | 257 | // Existing equivalence cache should not be affected by add/delete RC/Deployment etc, 258 | // it only make sense when pod is scheduled or deleted 259 | 260 | return c 261 | } 262 | 263 | func (c *ConfigFactory) onPvAdd(obj interface{}) { 264 | if c.enableEquivalenceClassCache { 265 | pv, ok := obj.(*v1.PersistentVolume) 266 | if !ok { 267 | glog.Errorf("cannot convert to *v1.PersistentVolume: %v", obj) 268 | return 269 | } 270 | c.invalidatePredicatesForPv(pv) 271 | } 272 | } 273 | 274 | func (c *ConfigFactory) onPvDelete(obj interface{}) { 275 | if c.enableEquivalenceClassCache { 276 | var pv *v1.PersistentVolume 277 | switch t := obj.(type) { 278 | case *v1.PersistentVolume: 279 | pv = t 280 | case cache.DeletedFinalStateUnknown: 281 | var ok bool 282 | pv, ok = t.Obj.(*v1.PersistentVolume) 283 | if !ok { 284 | glog.Errorf("cannot convert to *v1.PersistentVolume: %v", t.Obj) 285 | return 286 | } 287 | default: 288 | glog.Errorf("cannot convert to *v1.PersistentVolume: %v", t) 289 | return 290 | } 291 | c.invalidatePredicatesForPv(pv) 292 | } 293 | } 294 | 295 | func (c *ConfigFactory) invalidatePredicatesForPv(pv *v1.PersistentVolume) { 296 | invalidPredicates := sets.NewString("MaxPDVolumeCountPredicate") 297 | if pv.Spec.AWSElasticBlockStore != nil { 298 | invalidPredicates.Insert("MaxEBSVolumeCount") 299 | } 300 | if pv.Spec.GCEPersistentDisk != nil { 301 | invalidPredicates.Insert("MaxGCEPDVolumeCount") 302 | } 303 | if pv.Spec.AzureDisk != nil { 304 | invalidPredicates.Insert("MaxAzureDiskVolumeCount") 305 | } 306 | c.equivalencePodCache.InvalidateCachedPredicateItemOfAllNodes(invalidPredicates) 307 | } 308 | 309 | func (c *ConfigFactory) onPvcAdd(obj interface{}) { 310 | if c.enableEquivalenceClassCache { 311 | pvc, ok := 
obj.(*v1.PersistentVolumeClaim) 312 | if !ok { 313 | glog.Errorf("cannot convert to *v1.PersistentVolumeClaim: %v", obj) 314 | return 315 | } 316 | c.invalidatePredicatesForPvc(pvc) 317 | } 318 | } 319 | 320 | func (c *ConfigFactory) onPvcDelete(obj interface{}) { 321 | if c.enableEquivalenceClassCache { 322 | var pvc *v1.PersistentVolumeClaim 323 | switch t := obj.(type) { 324 | case *v1.PersistentVolumeClaim: 325 | pvc = t 326 | case cache.DeletedFinalStateUnknown: 327 | var ok bool 328 | pvc, ok = t.Obj.(*v1.PersistentVolumeClaim) 329 | if !ok { 330 | glog.Errorf("cannot convert to *v1.PersistentVolumeClaim: %v", t.Obj) 331 | return 332 | } 333 | default: 334 | glog.Errorf("cannot convert to *v1.PersistentVolumeClaim: %v", t) 335 | return 336 | } 337 | c.invalidatePredicatesForPvc(pvc) 338 | } 339 | } 340 | 341 | func (c *ConfigFactory) invalidatePredicatesForPvc(pvc *v1.PersistentVolumeClaim) { 342 | if pvc.Spec.VolumeName != "" { 343 | c.equivalencePodCache.InvalidateCachedPredicateItemOfAllNodes(maxPDVolumeCountPredicateSet) 344 | } 345 | } 346 | 347 | func (c *ConfigFactory) onServiceAdd(obj interface{}) { 348 | if c.enableEquivalenceClassCache { 349 | c.equivalencePodCache.InvalidateCachedPredicateItemOfAllNodes(serviceAffinitySet) 350 | } 351 | } 352 | 353 | func (c *ConfigFactory) onServiceUpdate(oldObj interface{}, newObj interface{}) { 354 | if c.enableEquivalenceClassCache { 355 | // TODO(resouer) We may need to invalidate this for specified group of pods only 356 | oldService := oldObj.(*v1.Service) 357 | newService := newObj.(*v1.Service) 358 | if !reflect.DeepEqual(oldService.Spec.Selector, newService.Spec.Selector) { 359 | c.equivalencePodCache.InvalidateCachedPredicateItemOfAllNodes(serviceAffinitySet) 360 | } 361 | } 362 | } 363 | 364 | func (c *ConfigFactory) onServiceDelete(obj interface{}) { 365 | if c.enableEquivalenceClassCache { 366 | c.equivalencePodCache.InvalidateCachedPredicateItemOfAllNodes(serviceAffinitySet) 367 | } 368 | } 369 | 370 | // GetNodeStore provides the cache to the nodes, mostly internal use, but may also be called by mock-tests. 371 | func (c *ConfigFactory) GetNodeLister() corelisters.NodeLister { 372 | return c.nodeLister 373 | } 374 | 375 | func (c *ConfigFactory) GetHardPodAffinitySymmetricWeight() int { 376 | return c.hardPodAffinitySymmetricWeight 377 | } 378 | 379 | func (f *ConfigFactory) GetSchedulerName() string { 380 | return f.schedulerName 381 | } 382 | 383 | // GetClient provides a kubernetes client, mostly internal use, but may also be called by mock-tests. 384 | func (f *ConfigFactory) GetClient() clientset.Interface { 385 | return f.client 386 | } 387 | 388 | // GetScheduledPodListerIndexer provides a pod lister, mostly internal use, but may also be called by mock-tests. 389 | func (c *ConfigFactory) GetScheduledPodLister() corelisters.PodLister { 390 | return c.scheduledPodLister 391 | } 392 | 393 | func (c *ConfigFactory) addPodToCache(obj interface{}) { 394 | pod, ok := obj.(*v1.Pod) 395 | if !ok { 396 | glog.Errorf("cannot convert to *v1.Pod: %v", obj) 397 | return 398 | } 399 | 400 | if err := c.schedulerCache.AddPod(pod); err != nil { 401 | glog.Errorf("scheduler cache AddPod failed: %v", err) 402 | } 403 | // NOTE: Updating equivalence cache of addPodToCache has been 404 | // handled optimistically in InvalidateCachedPredicateItemForPodAdd. 
405 | } 406 | 407 | func (c *ConfigFactory) updatePodInCache(oldObj, newObj interface{}) { 408 | oldPod, ok := oldObj.(*v1.Pod) 409 | if !ok { 410 | glog.Errorf("cannot convert oldObj to *v1.Pod: %v", oldObj) 411 | return 412 | } 413 | newPod, ok := newObj.(*v1.Pod) 414 | if !ok { 415 | glog.Errorf("cannot convert newObj to *v1.Pod: %v", newObj) 416 | return 417 | } 418 | 419 | if err := c.schedulerCache.UpdatePod(oldPod, newPod); err != nil { 420 | glog.Errorf("scheduler cache UpdatePod failed: %v", err) 421 | } 422 | 423 | c.invalidateCachedPredicatesOnUpdatePod(newPod, oldPod) 424 | } 425 | 426 | func (c *ConfigFactory) invalidateCachedPredicatesOnUpdatePod(newPod *v1.Pod, oldPod *v1.Pod) { 427 | if c.enableEquivalenceClassCache { 428 | // if the pod does not have binded node, updating equivalence cache is meaningless; 429 | // if pod's binded node has been changed, that case should be handled by pod add & delete. 430 | if len(newPod.Spec.NodeName) != 0 && newPod.Spec.NodeName == oldPod.Spec.NodeName { 431 | if !reflect.DeepEqual(oldPod.GetLabels(), newPod.GetLabels()) { 432 | // MatchInterPodAffinity need to be reconsidered for this node, 433 | // as well as all nodes in its same failure domain. 434 | c.equivalencePodCache.InvalidateCachedPredicateItemOfAllNodes( 435 | matchInterPodAffinitySet) 436 | } 437 | // if requested container resource changed, invalidate GeneralPredicates of this node 438 | if !reflect.DeepEqual(predicates.GetResourceRequest(newPod), 439 | predicates.GetResourceRequest(oldPod)) { 440 | c.equivalencePodCache.InvalidateCachedPredicateItem( 441 | newPod.Spec.NodeName, generalPredicatesSets) 442 | } 443 | } 444 | } 445 | } 446 | 447 | func (c *ConfigFactory) deletePodFromCache(obj interface{}) { 448 | var pod *v1.Pod 449 | switch t := obj.(type) { 450 | case *v1.Pod: 451 | pod = t 452 | case cache.DeletedFinalStateUnknown: 453 | var ok bool 454 | pod, ok = t.Obj.(*v1.Pod) 455 | if !ok { 456 | glog.Errorf("cannot convert to *v1.Pod: %v", t.Obj) 457 | return 458 | } 459 | default: 460 | glog.Errorf("cannot convert to *v1.Pod: %v", t) 461 | return 462 | } 463 | if err := c.schedulerCache.RemovePod(pod); err != nil { 464 | glog.Errorf("scheduler cache RemovePod failed: %v", err) 465 | } 466 | 467 | c.invalidateCachedPredicatesOnDeletePod(pod) 468 | } 469 | 470 | func (c *ConfigFactory) invalidateCachedPredicatesOnDeletePod(pod *v1.Pod) { 471 | if c.enableEquivalenceClassCache { 472 | // part of this case is the same as pod add. 473 | c.equivalencePodCache.InvalidateCachedPredicateItemForPodAdd(pod, pod.Spec.NodeName) 474 | // MatchInterPodAffinity need to be reconsidered for this node, 475 | // as well as all nodes in its same failure domain. 476 | // TODO(resouer) can we just do this for nodes in the same failure domain 477 | c.equivalencePodCache.InvalidateCachedPredicateItemOfAllNodes( 478 | matchInterPodAffinitySet) 479 | 480 | // if this pod have these PV, cached result of disk conflict will become invalid. 
481 | for _, volume := range pod.Spec.Volumes { 482 | if volume.GCEPersistentDisk != nil || volume.AWSElasticBlockStore != nil || 483 | volume.RBD != nil || volume.ISCSI != nil { 484 | c.equivalencePodCache.InvalidateCachedPredicateItem( 485 | pod.Spec.NodeName, noDiskConflictSet) 486 | } 487 | } 488 | } 489 | } 490 | 491 | func (c *ConfigFactory) addNodeToCache(obj interface{}) { 492 | node, ok := obj.(*v1.Node) 493 | if !ok { 494 | glog.Errorf("cannot convert to *v1.Node: %v", obj) 495 | return 496 | } 497 | 498 | if err := c.schedulerCache.AddNode(node); err != nil { 499 | glog.Errorf("scheduler cache AddNode failed: %v", err) 500 | } 501 | 502 | // NOTE: add a new node does not affect existing predicates in equivalence cache 503 | } 504 | 505 | func (c *ConfigFactory) updateNodeInCache(oldObj, newObj interface{}) { 506 | oldNode, ok := oldObj.(*v1.Node) 507 | if !ok { 508 | glog.Errorf("cannot convert oldObj to *v1.Node: %v", oldObj) 509 | return 510 | } 511 | newNode, ok := newObj.(*v1.Node) 512 | if !ok { 513 | glog.Errorf("cannot convert newObj to *v1.Node: %v", newObj) 514 | return 515 | } 516 | 517 | if err := c.schedulerCache.UpdateNode(oldNode, newNode); err != nil { 518 | glog.Errorf("scheduler cache UpdateNode failed: %v", err) 519 | } 520 | 521 | c.invalidateCachedPredicatesOnNodeUpdate(newNode, oldNode) 522 | } 523 | 524 | func (c *ConfigFactory) invalidateCachedPredicatesOnNodeUpdate(newNode *v1.Node, oldNode *v1.Node) { 525 | if c.enableEquivalenceClassCache { 526 | // Begin to update equivalence cache based on node update 527 | // TODO(resouer): think about lazily initialize this set 528 | invalidPredicates := sets.NewString() 529 | 530 | oldTaints, oldErr := helper.GetTaintsFromNodeAnnotations(oldNode.GetAnnotations()) 531 | if oldErr != nil { 532 | glog.Errorf("Failed to get taints from old node annotation for equivalence cache") 533 | } 534 | newTaints, newErr := helper.GetTaintsFromNodeAnnotations(newNode.GetAnnotations()) 535 | if newErr != nil { 536 | glog.Errorf("Failed to get taints from new node annotation for equivalence cache") 537 | } 538 | 539 | if !reflect.DeepEqual(oldNode.Status.Allocatable, newNode.Status.Allocatable) { 540 | invalidPredicates.Insert("GeneralPredicates") // "PodFitsResources" 541 | } 542 | if !reflect.DeepEqual(oldNode.GetLabels(), newNode.GetLabels()) { 543 | invalidPredicates.Insert("GeneralPredicates", "ServiceAffinity") // "PodSelectorMatches" 544 | for k, v := range oldNode.GetLabels() { 545 | // any label can be topology key of pod, we have to invalidate in all cases 546 | if v != newNode.GetLabels()[k] { 547 | invalidPredicates.Insert("MatchInterPodAffinity") 548 | } 549 | // NoVolumeZoneConflict will only be affected by zone related label change 550 | if k == kubeletapis.LabelZoneFailureDomain || k == kubeletapis.LabelZoneRegion { 551 | if v != newNode.GetLabels()[k] { 552 | invalidPredicates.Insert("NoVolumeZoneConflict") 553 | } 554 | } 555 | } 556 | } 557 | if !reflect.DeepEqual(oldTaints, newTaints) { 558 | invalidPredicates.Insert("PodToleratesNodeTaints") 559 | } 560 | if !reflect.DeepEqual(oldNode.Status.Conditions, newNode.Status.Conditions) { 561 | oldConditions := make(map[v1.NodeConditionType]v1.ConditionStatus) 562 | newConditions := make(map[v1.NodeConditionType]v1.ConditionStatus) 563 | for _, cond := range oldNode.Status.Conditions { 564 | oldConditions[cond.Type] = cond.Status 565 | } 566 | for _, cond := range newNode.Status.Conditions { 567 | newConditions[cond.Type] = cond.Status 568 | } 569 | if 
oldConditions[v1.NodeMemoryPressure] != newConditions[v1.NodeMemoryPressure] { 570 | invalidPredicates.Insert("CheckNodeMemoryPressure") 571 | } 572 | if oldConditions[v1.NodeDiskPressure] != newConditions[v1.NodeDiskPressure] { 573 | invalidPredicates.Insert("CheckNodeDiskPressure") 574 | } 575 | if oldConditions[v1.NodeReady] != newConditions[v1.NodeReady] || 576 | oldConditions[v1.NodeOutOfDisk] != newConditions[v1.NodeOutOfDisk] || 577 | oldConditions[v1.NodeNetworkUnavailable] != newConditions[v1.NodeNetworkUnavailable] || 578 | newNode.Spec.Unschedulable != oldNode.Spec.Unschedulable { 579 | invalidPredicates.Insert("CheckNodeCondition") 580 | } 581 | } 582 | c.equivalencePodCache.InvalidateCachedPredicateItem(newNode.GetName(), invalidPredicates) 583 | } 584 | } 585 | 586 | func (c *ConfigFactory) deleteNodeFromCache(obj interface{}) { 587 | var node *v1.Node 588 | switch t := obj.(type) { 589 | case *v1.Node: 590 | node = t 591 | case cache.DeletedFinalStateUnknown: 592 | var ok bool 593 | node, ok = t.Obj.(*v1.Node) 594 | if !ok { 595 | glog.Errorf("cannot convert to *v1.Node: %v", t.Obj) 596 | return 597 | } 598 | default: 599 | glog.Errorf("cannot convert to *v1.Node: %v", t) 600 | return 601 | } 602 | if err := c.schedulerCache.RemoveNode(node); err != nil { 603 | glog.Errorf("scheduler cache RemoveNode failed: %v", err) 604 | } 605 | if c.enableEquivalenceClassCache { 606 | c.equivalencePodCache.InvalidateAllCachedPredicateItemOfNode(node.GetName()) 607 | } 608 | } 609 | 610 | // Create creates a scheduler with the default algorithm provider. 611 | func (f *ConfigFactory) Create() (*scheduler.Config, error) { 612 | return f.CreateFromProvider(DefaultProvider) 613 | } 614 | 615 | // Creates a scheduler from the name of a registered algorithm provider. 
616 | func (f *ConfigFactory) CreateFromProvider(providerName string) (*scheduler.Config, error) { 617 | glog.V(2).Infof("Creating scheduler from algorithm provider '%v'", providerName) 618 | provider, err := GetAlgorithmProvider(providerName) 619 | if err != nil { 620 | return nil, err 621 | } 622 | 623 | return f.CreateFromKeys(provider.FitPredicateKeys, provider.PriorityFunctionKeys, []algorithm.SchedulerExtender{}) 624 | } 625 | 626 | // Creates a scheduler from the configuration file 627 | func (f *ConfigFactory) CreateFromConfig(policy schedulerapi.Policy) (*scheduler.Config, error) { 628 | glog.V(2).Infof("Creating scheduler from configuration: %v", policy) 629 | 630 | // validate the policy configuration 631 | if err := validation.ValidatePolicy(policy); err != nil { 632 | return nil, err 633 | } 634 | 635 | predicateKeys := sets.NewString() 636 | for _, predicate := range policy.Predicates { 637 | glog.V(2).Infof("Registering predicate: %s", predicate.Name) 638 | predicateKeys.Insert(RegisterCustomFitPredicate(predicate)) 639 | } 640 | 641 | priorityKeys := sets.NewString() 642 | for _, priority := range policy.Priorities { 643 | glog.V(2).Infof("Registering priority: %s", priority.Name) 644 | priorityKeys.Insert(RegisterCustomPriorityFunction(priority)) 645 | } 646 | 647 | extenders := make([]algorithm.SchedulerExtender, 0) 648 | if len(policy.ExtenderConfigs) != 0 { 649 | for ii := range policy.ExtenderConfigs { 650 | glog.V(2).Infof("Creating extender with config %+v", policy.ExtenderConfigs[ii]) 651 | if extender, err := core.NewHTTPExtender(&policy.ExtenderConfigs[ii]); err != nil { 652 | return nil, err 653 | } else { 654 | extenders = append(extenders, extender) 655 | } 656 | } 657 | } 658 | // Providing HardPodAffinitySymmetricWeight in the policy config is the new and preferred way of providing the value. 659 | // Give it higher precedence than scheduler CLI configuration when it is provided. 660 | if policy.HardPodAffinitySymmetricWeight != 0 { 661 | f.hardPodAffinitySymmetricWeight = policy.HardPodAffinitySymmetricWeight 662 | } 663 | return f.CreateFromKeys(predicateKeys, priorityKeys, extenders) 664 | } 665 | 666 | // getBinder returns an extender that supports bind or a default binder. 667 | func (f *ConfigFactory) getBinder(extenders []algorithm.SchedulerExtender) scheduler.Binder { 668 | for i := range extenders { 669 | if extenders[i].IsBinder() { 670 | return extenders[i] 671 | } 672 | } 673 | return &binder{f.client} 674 | } 675 | 676 | // Creates a scheduler from a set of registered fit predicate keys and priority keys. 
677 | func (f *ConfigFactory) CreateFromKeys(predicateKeys, priorityKeys sets.String, extenders []algorithm.SchedulerExtender) (*scheduler.Config, error) { 678 | glog.V(2).Infof("Creating scheduler with fit predicates '%v' and priority functions '%v'", predicateKeys, priorityKeys) 679 | 680 | if f.GetHardPodAffinitySymmetricWeight() < 1 || f.GetHardPodAffinitySymmetricWeight() > 100 { 681 | return nil, fmt.Errorf("invalid hardPodAffinitySymmetricWeight: %d, must be in the range 1-100", f.GetHardPodAffinitySymmetricWeight()) 682 | } 683 | 684 | predicateFuncs, err := f.GetPredicates(predicateKeys) 685 | if err != nil { 686 | return nil, err 687 | } 688 | 689 | priorityConfigs, err := f.GetPriorityFunctionConfigs(priorityKeys) 690 | if err != nil { 691 | return nil, err 692 | } 693 | 694 | priorityMetaProducer, err := f.GetPriorityMetadataProducer() 695 | if err != nil { 696 | return nil, err 697 | } 698 | 699 | predicateMetaProducer, err := f.GetPredicateMetadataProducer() 700 | if err != nil { 701 | return nil, err 702 | } 703 | 704 | // Init equivalence class cache 705 | if f.enableEquivalenceClassCache && getEquivalencePodFunc != nil { 706 | f.equivalencePodCache = core.NewEquivalenceCache(getEquivalencePodFunc) 707 | glog.Info("Created equivalence class cache") 708 | } 709 | algo := core.NewGenericScheduler(f.schedulerCache, f.equivalencePodCache, predicateFuncs, predicateMetaProducer, priorityConfigs, priorityMetaProducer, extenders) 710 | 711 | podBackoff := util.CreateDefaultPodBackoff() 712 | return &scheduler.Config{ 713 | SchedulerCache: f.schedulerCache, 714 | Ecache: f.equivalencePodCache, 715 | // The scheduler only needs to consider schedulable nodes. 716 | NodeLister: &nodeLister{f.nodeLister}, 717 | Algorithm: algo, 718 | Binder: f.getBinder(extenders), 719 | PodConditionUpdater: &podConditionUpdater{f.client}, 720 | PodPreemptor: &podPreemptor{f.client}, 721 | PodWriter: &podWriter{Client: f.client}, 722 | WaitForCacheSync: func() bool { 723 | return cache.WaitForCacheSync(f.StopEverything, f.scheduledPodsHasSynced) 724 | }, 725 | NextPod: func() *v1.Pod { 726 | return f.getNextPod() 727 | }, 728 | Error: f.MakeDefaultErrorFunc(podBackoff, f.podQueue), 729 | StopEverything: f.StopEverything, 730 | }, nil 731 | } 732 | 733 | type nodeLister struct { 734 | corelisters.NodeLister 735 | } 736 | 737 | func (n *nodeLister) List() ([]*v1.Node, error) { 738 | return n.NodeLister.List(labels.Everything()) 739 | } 740 | 741 | func (f *ConfigFactory) GetPriorityFunctionConfigs(priorityKeys sets.String) ([]algorithm.PriorityConfig, error) { 742 | pluginArgs, err := f.getPluginArgs() 743 | if err != nil { 744 | return nil, err 745 | } 746 | 747 | return getPriorityFunctionConfigs(priorityKeys, *pluginArgs) 748 | } 749 | 750 | func (f *ConfigFactory) GetPriorityMetadataProducer() (algorithm.MetadataProducer, error) { 751 | pluginArgs, err := f.getPluginArgs() 752 | if err != nil { 753 | return nil, err 754 | } 755 | 756 | return getPriorityMetadataProducer(*pluginArgs) 757 | } 758 | 759 | func (f *ConfigFactory) GetPredicateMetadataProducer() (algorithm.PredicateMetadataProducer, error) { 760 | pluginArgs, err := f.getPluginArgs() 761 | if err != nil { 762 | return nil, err 763 | } 764 | return getPredicateMetadataProducer(*pluginArgs) 765 | } 766 | 767 | func (f *ConfigFactory) GetPredicates(predicateKeys sets.String) (map[string]algorithm.FitPredicate, error) { 768 | pluginArgs, err := f.getPluginArgs() 769 | if err != nil { 770 | return nil, err 771 | } 772 | 773 | return 
getFitPredicateFunctions(predicateKeys, *pluginArgs) 774 | } 775 | 776 | func (f *ConfigFactory) getPluginArgs() (*PluginFactoryArgs, error) { 777 | return &PluginFactoryArgs{ 778 | PodLister: f.podLister, 779 | ServiceLister: f.serviceLister, 780 | ControllerLister: f.controllerLister, 781 | ReplicaSetLister: f.replicaSetLister, 782 | StatefulSetLister: f.statefulSetLister, 783 | NodeLister: &nodeLister{f.nodeLister}, 784 | NodeInfo: &predicates.CachedNodeInfo{NodeLister: f.nodeLister}, 785 | PVInfo: &predicates.CachedPersistentVolumeInfo{PersistentVolumeLister: f.pVLister}, 786 | PVCInfo: &predicates.CachedPersistentVolumeClaimInfo{PersistentVolumeClaimLister: f.pVCLister}, 787 | HardPodAffinitySymmetricWeight: f.hardPodAffinitySymmetricWeight, 788 | }, nil 789 | } 790 | 791 | func (f *ConfigFactory) getNextPod() *v1.Pod { 792 | for { 793 | pod := cache.Pop(f.podQueue).(*v1.Pod) 794 | if f.ResponsibleForPod(pod) { 795 | glog.V(4).Infof("About to try and schedule pod %v", pod.Name) 796 | return pod 797 | } 798 | } 799 | } 800 | 801 | func (f *ConfigFactory) ResponsibleForPod(pod *v1.Pod) bool { 802 | return f.schedulerName == pod.Spec.SchedulerName 803 | } 804 | 805 | // unassignedNonTerminatedPod selects pods that are unassigned and non-terminal. 806 | func unassignedNonTerminatedPod(pod *v1.Pod) bool { 807 | if len(pod.Spec.NodeName) != 0 { 808 | return false 809 | } 810 | if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed { 811 | return false 812 | } 813 | return true 814 | } 815 | 816 | // assignedNonTerminatedPod selects pods that are assigned and non-terminal (scheduled and running). 817 | func assignedNonTerminatedPod(pod *v1.Pod) bool { 818 | if len(pod.Spec.NodeName) == 0 { 819 | return false 820 | } 821 | if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed { 822 | return false 823 | } 824 | return true 825 | } 826 | 827 | // assignedPodLister filters the pods returned from a PodLister to 828 | // only include those that have a node name set. 829 | type assignedPodLister struct { 830 | corelisters.PodLister 831 | } 832 | 833 | // List lists all Pods in the indexer for a given namespace. 834 | func (l assignedPodLister) List(selector labels.Selector) ([]*v1.Pod, error) { 835 | list, err := l.PodLister.List(selector) 836 | if err != nil { 837 | return nil, err 838 | } 839 | filtered := make([]*v1.Pod, 0, len(list)) 840 | for _, pod := range list { 841 | if len(pod.Spec.NodeName) > 0 { 842 | filtered = append(filtered, pod) 843 | } 844 | } 845 | return filtered, nil 846 | } 847 | 848 | // List lists all Pods in the indexer for a given namespace. 849 | func (l assignedPodLister) Pods(namespace string) corelisters.PodNamespaceLister { 850 | return assignedPodNamespaceLister{l.PodLister.Pods(namespace)} 851 | } 852 | 853 | // assignedPodNamespaceLister filters the pods returned from a PodNamespaceLister to 854 | // only include those that have a node name set. 855 | type assignedPodNamespaceLister struct { 856 | corelisters.PodNamespaceLister 857 | } 858 | 859 | // List lists all Pods in the indexer for a given namespace. 
860 | func (l assignedPodNamespaceLister) List(selector labels.Selector) (ret []*v1.Pod, err error) { 861 | list, err := l.PodNamespaceLister.List(selector) 862 | if err != nil { 863 | return nil, err 864 | } 865 | filtered := make([]*v1.Pod, 0, len(list)) 866 | for _, pod := range list { 867 | if len(pod.Spec.NodeName) > 0 { 868 | filtered = append(filtered, pod) 869 | } 870 | } 871 | return filtered, nil 872 | } 873 | 874 | // Get retrieves the Pod from the indexer for a given namespace and name. 875 | func (l assignedPodNamespaceLister) Get(name string) (*v1.Pod, error) { 876 | pod, err := l.PodNamespaceLister.Get(name) 877 | if err != nil { 878 | return nil, err 879 | } 880 | if len(pod.Spec.NodeName) > 0 { 881 | return pod, nil 882 | } 883 | return nil, errors.NewNotFound(schema.GroupResource{Resource: string(v1.ResourcePods)}, name) 884 | } 885 | 886 | type podInformer struct { 887 | informer cache.SharedIndexInformer 888 | } 889 | 890 | func (i *podInformer) Informer() cache.SharedIndexInformer { 891 | return i.informer 892 | } 893 | 894 | func (i *podInformer) Lister() corelisters.PodLister { 895 | return corelisters.NewPodLister(i.informer.GetIndexer()) 896 | } 897 | 898 | // NewPodInformer creates a shared index informer that returns only non-terminal pods. 899 | func NewPodInformer(client clientset.Interface, resyncPeriod time.Duration) coreinformers.PodInformer { 900 | selector := fields.ParseSelectorOrDie("status.phase!=" + string(v1.PodSucceeded) + ",status.phase!=" + string(v1.PodFailed)) 901 | lw := cache.NewListWatchFromClient(client.CoreV1().RESTClient(), string(v1.ResourcePods), metav1.NamespaceAll, selector) 902 | return &podInformer{ 903 | informer: cache.NewSharedIndexInformer(lw, &v1.Pod{}, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}), 904 | } 905 | } 906 | 907 | func (factory *ConfigFactory) MakeDefaultErrorFunc(backoff *util.PodBackoff, podQueue *cache.FIFO) func(pod *v1.Pod, err error) { 908 | return func(pod *v1.Pod, err error) { 909 | if err == core.ErrNoNodesAvailable { 910 | glog.V(4).Infof("Unable to schedule %v %v: no nodes are registered to the cluster; waiting", pod.Namespace, pod.Name) 911 | } else { 912 | if _, ok := err.(*core.FitError); ok { 913 | glog.V(4).Infof("Unable to schedule %v %v: no fit: %v; waiting", pod.Namespace, pod.Name, err) 914 | } else { 915 | glog.Errorf("Error scheduling %v %v: %v; retrying", pod.Namespace, pod.Name, err) 916 | } 917 | } 918 | backoff.Gc() 919 | // Retry asynchronously. 920 | // Note that this is extremely rudimentary and we need a more real error handling path. 921 | go func() { 922 | defer runtime.HandleCrash() 923 | podID := types.NamespacedName{ 924 | Namespace: pod.Namespace, 925 | Name: pod.Name, 926 | } 927 | 928 | entry := backoff.GetEntry(podID) 929 | if !entry.TryWait(backoff.MaxDuration()) { 930 | glog.Warningf("Request for pod %v already in flight, abandoning", podID) 931 | return 932 | } 933 | // Get the pod again; it may have changed/been scheduled already. 
934 | getBackoff := initialGetBackoff 935 | for { 936 | pod, err := factory.client.CoreV1().Pods(podID.Namespace).Get(podID.Name, metav1.GetOptions{}) 937 | if err == nil { 938 | if len(pod.Spec.NodeName) == 0 { 939 | podQueue.AddIfNotPresent(pod) 940 | } 941 | break 942 | } 943 | if errors.IsNotFound(err) { 944 | glog.Warningf("A pod %v no longer exists", podID) 945 | return 946 | } 947 | glog.Errorf("Error getting pod %v for retry: %v; retrying...", podID, err) 948 | if getBackoff = getBackoff * 2; getBackoff > maximalGetBackoff { 949 | getBackoff = maximalGetBackoff 950 | } 951 | time.Sleep(getBackoff) 952 | } 953 | }() 954 | } 955 | } 956 | 957 | // nodeEnumerator allows a cache.Poller to enumerate items in an v1.NodeList 958 | type nodeEnumerator struct { 959 | *v1.NodeList 960 | } 961 | 962 | // Len returns the number of items in the node list. 963 | func (ne *nodeEnumerator) Len() int { 964 | if ne.NodeList == nil { 965 | return 0 966 | } 967 | return len(ne.Items) 968 | } 969 | 970 | // Get returns the item (and ID) with the particular index. 971 | func (ne *nodeEnumerator) Get(index int) interface{} { 972 | return &ne.Items[index] 973 | } 974 | 975 | type binder struct { 976 | Client clientset.Interface 977 | } 978 | 979 | // Bind just does a POST binding RPC. 980 | func (b *binder) Bind(binding *v1.Binding) error { 981 | glog.V(3).Infof("Attempting to bind %v to %v", binding.Name, binding.Target.Name) 982 | return b.Client.CoreV1().Pods(binding.Namespace).Bind(binding) 983 | } 984 | 985 | type podConditionUpdater struct { 986 | Client clientset.Interface 987 | } 988 | 989 | func (p *podConditionUpdater) Update(pod *v1.Pod, condition *v1.PodCondition) error { 990 | glog.V(2).Infof("Updating pod condition for %s/%s to (%s==%s)", pod.Namespace, pod.Name, condition.Type, condition.Status) 991 | if podutil.UpdatePodCondition(&pod.Status, condition) { 992 | _, err := p.Client.CoreV1().Pods(pod.Namespace).UpdateStatus(pod) 993 | return err 994 | } 995 | return nil 996 | } 997 | 998 | type podPreemptor struct { 999 | Client clientset.Interface 1000 | } 1001 | 1002 | func (p *podPreemptor) GetUpdatedPod(pod *v1.Pod) (*v1.Pod, error) { 1003 | return p.Client.CoreV1().Pods(pod.Namespace).Get(pod.Name, metav1.GetOptions{}) 1004 | } 1005 | 1006 | func (p *podPreemptor) DeletePod(pod *v1.Pod) error { 1007 | return p.Client.CoreV1().Pods(pod.Namespace).Delete(pod.Name, &metav1.DeleteOptions{}) 1008 | } 1009 | 1010 | //TODO(bsalamat): change this to patch PodStatus to avoid overwriting potential pending status updates. 
1011 | func (p *podPreemptor) UpdatePodAnnotations(pod *v1.Pod, annotations map[string]string) error {
1012 | 	podCopy := pod.DeepCopy()
1013 | 	if podCopy.Annotations == nil {
1014 | 		podCopy.Annotations = map[string]string{}
1015 | 	}
1016 | 	for k, v := range annotations {
1017 | 		podCopy.Annotations[k] = v
1018 | 	}
1019 | 	_, err := p.Client.CoreV1().Pods(podCopy.Namespace).UpdateStatus(podCopy)
1020 | 	return err
1021 | }
1022 | 
1023 | // podWriter deletes and recreates pods on behalf of the scheduler; it is wired in
1024 | // through the PodWriter field of scheduler.Config in CreateFromKeys above.
1025 | type podWriter struct {
1026 | 	Client clientset.Interface
1027 | 	mutex  sync.Mutex
1028 | }
1029 | 
1030 | // UpdatePod replaces a pod labeled task-type=Nimbix that could not be scheduled:
1031 | // the old pod is force-deleted (zero grace period) and newPod is created in its
1032 | // place. Pods without the Nimbix label are left untouched.
1033 | func (pw *podWriter) UpdatePod(oldPod, newPod *v1.Pod) error {
1034 | 	pw.mutex.Lock()
1035 | 	defer pw.mutex.Unlock()
1036 | 	graceperiod := int64(0)
1037 | 	if oldPod.Labels["task-type"] == "Nimbix" {
1038 | 		glog.V(2).Infof("Deleting pod %s/%s as it could not be scheduled", oldPod.Namespace, oldPod.Name)
1039 | 		err := pw.Client.CoreV1().Pods(oldPod.Namespace).Delete(oldPod.Name, &metav1.DeleteOptions{
1040 | 			GracePeriodSeconds: &graceperiod,
1041 | 		})
1042 | 		if err != nil {
1043 | 			return err
1044 | 		}
1045 | 
1046 | 		glog.V(2).Infof("Creating new pod %s/%s", newPod.Namespace, newPod.Name)
1047 | 		pod, err := pw.Client.CoreV1().Pods(oldPod.Namespace).Create(newPod)
1048 | 		if err != nil {
1049 | 			return err
1050 | 		}
1051 | 		glog.V(2).Infof("Created new pod %s/%s", pod.Namespace, pod.Name)
1052 | 	}
1053 | 	return nil
1054 | }
1055 | 
--------------------------------------------------------------------------------
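The `podWriter` type at the end of `factory.go` is the main local change to the stock Kubernetes 1.8 factory: `CreateFromKeys` exposes it to the scheduler through the `PodWriter` field of `scheduler.Config`, and it acts only on pods labeled `task-type=Nimbix`, force-deleting the unschedulable pod and creating the replacement it is given (the decision of when to call it presumably lives in the modified `scheduler.go`, not shown in this file). The sketch below is a minimal in-package test of that behaviour using client-go's fake clientset; the test file name, pod names, and namespace are illustrative assumptions, not part of this repository.

```go
// factory_podwriter_test.go (hypothetical file name, same package as factory.go)
package factory

import (
	"testing"

	"k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes/fake"
)

func TestPodWriterReplacesNimbixPod(t *testing.T) {
	// A pod the custom scheduler failed to place; only the task-type=Nimbix
	// label matters to podWriter.UpdatePod.
	oldPod := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "nimbix-gpu-old",
			Namespace: "default",
			Labels:    map[string]string{"task-type": "Nimbix"},
		},
	}
	newPod := oldPod.DeepCopy()
	newPod.Name = "nimbix-gpu-new"

	// Seed a fake clientset with the old pod and run the replacement.
	client := fake.NewSimpleClientset(oldPod)
	pw := &podWriter{Client: client}
	if err := pw.UpdatePod(oldPod, newPod); err != nil {
		t.Fatalf("UpdatePod returned error: %v", err)
	}

	// The old pod should have been force-deleted and the replacement created.
	if _, err := client.CoreV1().Pods("default").Get("nimbix-gpu-old", metav1.GetOptions{}); err == nil {
		t.Errorf("expected old pod to be deleted")
	}
	if _, err := client.CoreV1().Pods("default").Get("nimbix-gpu-new", metav1.GetOptions{}); err != nil {
		t.Errorf("expected replacement pod to exist: %v", err)
	}
}
```

Because `podWriter` is unexported, a check like this has to live in the same `factory` package; with the fake clientset the delete/create pair can be exercised without a running cluster.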