├── nimbix-app ├── Dockerfile.base ├── Dockerfile.app ├── job_template.json ├── README.md └── jarvice_submit.py ├── scheduler ├── Dockerfile ├── scheduler.go └── factory.go ├── deploy ├── sample-job.yaml └── k8s-custom-sched.json └── README.md /nimbix-app/Dockerfile.base: -------------------------------------------------------------------------------- 1 | FROM python:2.7 2 | 3 | RUN pip install jarviceclient 4 | RUN pip install jinja2 5 | 6 | ADD jarvice_submit.py / 7 | ADD job_template.json / 8 | ENTRYPOINT ["python", "/jarvice_submit.py"] 9 | -------------------------------------------------------------------------------- /nimbix-app/Dockerfile.app: -------------------------------------------------------------------------------- 1 | FROM jarvice/powerai 2 | 3 | RUN apt-get update && apt-get install -y \ 4 | libffi-dev 5 | RUN pip install jarviceclient 6 | RUN pip install jinja2 7 | 8 | ADD jarvice_submit.py / 9 | ADD job_template.json / 10 | 11 | WORKDIR / 12 | 13 | ENTRYPOINT ["python", "/jarvice_submit.py"] 14 | -------------------------------------------------------------------------------- /nimbix-app/job_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "{{ app_name }}", 3 | "staging": false, 4 | "checkedout": false, 5 | "application": { 6 | "command": "{{ command }}", 7 | "parameters": { 8 | "command": "{{ command_args }}" 9 | }, 10 | "geometry": "1424x692" 11 | }, 12 | "machine": { 13 | "type": "{{ mc_type }}", 14 | "nodes": 1 15 | }, 16 | "vault": { 17 | "name": "drop.jarvice.com", 18 | "force": false, 19 | "readonly": false 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /scheduler/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.8 2 | 3 | RUN git clone https://github.com/kubernetes/kubernetes.git 4 | RUN apt-get update && apt-get install -y rsync 5 | ADD scheduler.go factory.go / 6 | RUN cd kubernetes && git checkout -b v1.8.3 v1.8.3 && \ 7 | cp /scheduler.go plugin/pkg/scheduler/scheduler.go && \ 8 | cp /factory.go plugin/pkg/scheduler/factory/factory.go && \ 9 | make all WHAT=plugin/cmd/kube-scheduler/ &&\ 10 | cp _output/bin/kube-scheduler /kube-scheduler-nimbix 11 | 12 | FROM debian:jessie 13 | COPY --from=0 /kube-scheduler-nimbix / 14 | CMD ["/kube-scheduler-nimbix"] 15 | -------------------------------------------------------------------------------- /deploy/sample-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: nimbix-job 5 | spec: 6 | template: 7 | metadata: 8 | name: nimbix-gpu 9 | labels: 10 | task-type: Nimbix 11 | spec: 12 | schedulerName: k8s-custom-sched 13 | restartPolicy: Never 14 | containers: 15 | - name: nimbix-job 16 | image: poweraijob 17 | imagePullPolicy: "Never" 18 | env: 19 | - name: "APP_NAME" 20 | value: "power8-ubuntu-mldl" 21 | - name: "APP_COMMAND" 22 | value: "run" 23 | - name: "APP_COMMAND_ARGS" 24 | value: "source /opt/DL/bazel/bin/bazel-activate && source /opt/DL/tensorflow/bin/tensorflow-activate && tensorflow-test" 25 | - name: "ARCH" 26 | value: "POWER" 27 | - name: "NUM_CPUS" 28 | value: "60" 29 | - name: "NUM_GPUS" 30 | value: "2" 31 | - name: "USERNAME" 32 | value: "" 33 | - name: "APIKEY" 34 | value: "" 35 | resources: 36 | limits: 37 | alpha.kubernetes.io/nvidia-gpu: 2 38 | command: ["python", "/jarvice_submit.py"] 39 | 
--------------------------------------------------------------------------------
/deploy/k8s-custom-sched.json:
--------------------------------------------------------------------------------
1 | {
2 |   "apiVersion": "v1",
3 |   "kind": "Pod",
4 |   "metadata": {
5 |     "name": "k8s-custom-sched",
6 |     "namespace": "kube-system"
7 |   },
8 |   "spec":{
9 |     "hostNetwork": true,
10 |     "containers":[
11 |       {
12 |         "name": "k8s-custom-sched",
13 |         "image": "nimbix-sched",
14 |         "imagePullPolicy": "IfNotPresent",
15 |         "command": [
16 |           "/kube-scheduler-nimbix",
17 |           "--master=https://MASTER_IP:8001",
18 |           "--leader-elect=false",
19 |           "--kubeconfig=/etc/cfc/conf/kube-scheduler-config.yaml",
20 |           "--v=2",
21 |           "--scheduler-name=k8s-custom-sched",
22 |           "--port=PORT_NUM"
23 |         ],
24 |         "volumeMounts": [
25 |           {
26 |             "name": "data",
27 |             "mountPath": "/etc/cfc/conf"
28 |           },
29 |           {
30 |             "name": "audit",
31 |             "mountPath": "/var/lib/icp/audit"
32 |           }
33 |         ]
34 |       }
35 |     ],
36 |     "nodeName": "MASTER_NODE_NAME",
37 |     "volumes": [
38 |       {
39 |         "name": "data",
40 |         "hostPath": {
41 |           "path": "/etc/cfc/conf"
42 |         }
43 |       },
44 |       {
45 |         "name": "audit",
46 |         "hostPath": {
47 |           "path": "/var/lib/icp/audit"
48 |         }
49 |       }
50 |     ]
51 |   }
52 | }
53 |
--------------------------------------------------------------------------------
/nimbix-app/README.md:
--------------------------------------------------------------------------------
1 | ## Introduction
2 | This is an example of using a docker image to provision a job in Nimbix (https://www.nimbix.net/),
3 | leveraging the Nimbix CLI.
4 | Read more about the Nimbix CLI here - https://www.nimbix.net/jarvice-quick-start-guide/
5 |
6 | There are two Dockerfiles.
7 | Dockerfile.base - Used to build the docker image that spawns a job in Nimbix.
8 | It uses certain environment variables to specify the required details.
9 |
10 | Dockerfile.app - Used to build an example docker image that can be run either in the local cluster
11 | or in Nimbix. Nimbix expects a specific layout of the application environment in the docker image, and hence a specific base
12 | image needs to be used.
13 | In this example we have used a Power LE (ppc64le) PowerAI Nimbix base image. You can use your own base image depending on your
14 | requirements.
15 |
16 | You can read more about creating docker images for Nimbix here - http://jarvice.readthedocs.io/en/latest/cicd/
17 |
18 | ## Build the Docker Images
19 |
20 | ```bash
21 | $ sudo docker build -t nimbix -f Dockerfile.base .
22 | ```
23 |
24 | ```bash
25 | $ sudo docker build -t ppc64le/powerai -f Dockerfile.app .
26 | ```
27 |
28 |
29 | ## Example Runs
30 | ### Submit a job and wait for it to finish/terminate
31 |
32 | The following command will provision a job in the Nimbix cluster:
33 |
34 | ```bash
35 | $ sudo docker run -it \
36 |     -e USERNAME=USERNAME \
37 |     -e APIKEY=123456abcdefgh2974 \
38 |     -e APP_NAME=my_app \
39 |     -e APP_COMMAND=run \
40 |     -e APP_COMMAND_ARGS="/run_training.sh" \
41 |     -e REMOTE=1 \
42 |     -e ARCH=POWER \
43 |     -e NUM_CPUS=60 \
44 |     -e NUM_GPUS=2 \
45 |     nimbix
46 | ```
47 |
48 | The following command will run the job in your local cluster:
49 |
50 | ```bash
51 | sudo docker run -it \
52 |     -e "USERNAME=user" \
53 |     -e "APIKEY=123456789" \
54 |     -e "APP_NAME=power8-ubuntu-mldl" \
55 |     -e "APP_COMMAND=run" \
56 |     -e "APP_COMMAND_ARGS='source /opt/DL/bazel/bin/bazel-activate && source /opt/DL/tensorflow/bin/tensorflow-activate && tensorflow-test'" \
57 |     ppc64le/powerai
58 | ```
59 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Demonstrating Cloud Bursting Using a Custom Kubernetes Scheduler
2 | This is an example demonstrating how to create a custom Kubernetes scheduler.
3 | The custom scheduler modifies the Kubernetes default scheduler so that a GPU
4 | related job can be provisioned in the [Nimbix cloud](https://www.nimbix.net/) if the
5 | local cluster is unable to satisfy the GPU resource requests.
6 |
7 | ### Project Organization
8 | nimbix-app: Base image to be used for building application docker images
9 | scheduler: Custom K8s scheduler files for Kubernetes version 1.8.3
10 | deploy: Sample deployment files
11 |
12 | ### How to build the custom scheduler
13 | ```bash
14 | git clone https://github.com/IBM/k8s-custom-scheduler.git
15 | cd k8s-custom-scheduler/scheduler
16 | docker build -t nimbix-sched .
17 | ```
18 | ### Deploy the Nimbix scheduler on IBM Cloud Private (ICP)
19 | While these instructions are specific to
20 | [ICP](https://www.ibm.com/cloud-computing/products/ibm-cloud-private/), they
21 | should apply to any Kubernetes setup with minor modifications.
22 |
23 | 1.**Create a secret for the certificate files to access the apiserver over HTTPS**
24 |
25 | In ICP, the certificate files reside in /etc/cfc/conf on the master node. Use the following command to create the secret:
26 | ```bash
27 | kubectl create secret generic certs --from-file=kube-scheduler-config.yaml --from-file=kube-scheduler.crt --from-file=kube-scheduler.key
28 | ```
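To sanity-check the secret before moving on (assuming it was created in the namespace you will deploy the sample job from), something like the following can be used:

```bash
# Confirm the secret exists and lists the three expected data keys
kubectl describe secret certs
```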
29 | 2.**Build the images for the custom scheduler and the sample Nimbix job**
30 |
31 | Use the Dockerfiles in the **scheduler** and **nimbix-app** directories, as sketched below.
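For reference, a possible build sequence from the repository root, reusing the commands already shown in this README and in nimbix-app/README.md; the final `docker tag` is only a suggestion so that the image name matches the `poweraijob` image referenced by deploy/sample-job.yaml:

```bash
# Custom scheduler image (same command as the build step above)
cd scheduler && docker build -t nimbix-sched . && cd ..

# Nimbix submission base image and the example PowerAI application image
cd nimbix-app
docker build -t nimbix -f Dockerfile.base .
docker build -t ppc64le/powerai -f Dockerfile.app .
cd ..

# deploy/sample-job.yaml refers to the application image as "poweraijob" with
# imagePullPolicy: Never, so tag the image accordingly on the worker node(s)
docker tag ppc64le/powerai poweraijob
```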
32 |
33 | 3.**Deploy the scheduler**
34 |
35 | An example pod spec is available at deploy/k8s-custom-sched.json. Update it with your MASTER_IP, PORT_NUM and MASTER_NODE_NAME values.
36 | ```bash
37 | kubectl create -f k8s-custom-sched.json
38 | ```
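Per deploy/k8s-custom-sched.json the scheduler runs as a pod named `k8s-custom-sched` in the `kube-system` namespace, pinned to the master node, so a quick sanity check might look like:

```bash
# The scheduler pod lives in kube-system on the master node
kubectl -n kube-system get pod k8s-custom-sched

# Its logs should show it watching for pods that set
# schedulerName: k8s-custom-sched
kubectl -n kube-system logs k8s-custom-sched
```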
39 |
40 | 4.**Create an appropriate role binding so that the custom scheduler, running as system:kube-scheduler, can modify pods in the default namespace**
42 | ```bash
43 | kubectl create rolebinding someRole --clusterrole=admin --user=system:kube-scheduler --namespace=default
44 | ```
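If your credentials permit user impersonation, the binding can be verified with something like:

```bash
# Check that system:kube-scheduler is now allowed to modify pods in "default"
kubectl auth can-i update pods --namespace=default --as=system:kube-scheduler
```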
45 | 5.**Deploy a sample GPU job using the custom scheduler**
46 |
47 | An example job spec is available at deploy/sample-job.yaml. Update it with your Nimbix USERNAME and APIKEY.
48 | The job will be provisioned to the Nimbix cloud if its resource requirements cannot be met in the local cluster.
49 | ```bash
50 | kubectl create -f sample-job.yaml
51 | ```
52 | ### Authors
53 | Abhishek Dasgupta (abdasgupta@in.ibm.com)
54 | Pradipta Kumar Banerjee (bpradipt@in.ibm.com) 55 | -------------------------------------------------------------------------------- /nimbix-app/jarvice_submit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #A wrapper around jarvice_cli to submit a job and wait for it to end 3 | 4 | import os 5 | from subprocess import check_output, check_call 6 | import json 7 | import jinja2 8 | import argparse 9 | import sys 10 | import traceback 11 | import logging 12 | 13 | #Username and Apikey 14 | username = "" 15 | apikey = "" 16 | 17 | logging.basicConfig(level=logging.INFO) 18 | logger = logging.getLogger(__name__) 19 | 20 | def exec_and_wait(job_json): 21 | out = check_output(["jarvice_cli", "-username", username, "-apikey", apikey, "submit", "-j", job_json]) 22 | data = json.loads(out) 23 | job_id = str(data['number']) 24 | print "job id is " + job_id 25 | logger.info("job id is %s ", job_id) 26 | if job_id: 27 | out = check_output(["jarvice_cli", "-username", username, "-apikey", apikey, "wait_for", "-number", job_id ]) 28 | logger.info("job %s terminated", job_id) 29 | return 0 30 | print "failed to submit job" 31 | logger.error("failed to submit job") 32 | return 1 33 | 34 | 35 | def exec_and_wait_dry_run(job_json): 36 | logger.info("command to be executed is:") 37 | logger.info("jarvice_cli -username %s -apikey %s submit -j %s", username, apikey, job_json) 38 | return 0 39 | 40 | #Find relevant machine type based on number of CPUs and GPUs 41 | #This method assumes that the required cpus and gpus exactly matches 42 | #with Nimbix provided resource types 43 | #Example JSON o/p 44 | ''' 45 | { 46 | "nc3": { 47 | "mc_scale_max": 128, 48 | "mc_slave_gpus": 0, 49 | "mc_scratch": 100, 50 | "mc_scale_min": 1, 51 | "mc_ram": 128, 52 | "mc_swap": 64, 53 | "mc_description": "16 core, 128GB RAM (accelerated OpenGL on master)", 54 | "mc_price": 2.5, 55 | "mc_gpus": 1, 56 | "mc_slave_ram": 128, 57 | "mc_cores": 16, 58 | "mc_scale_select": "" 59 | }, 60 | "nc5": { 61 | "mc_scale_max": 2, 62 | "mc_slave_gpus": 0, 63 | "mc_scratch": 300, 64 | "mc_scale_min": 1, 65 | "mc_ram": 512, 66 | "mc_swap": 64, 67 | "mc_description": "16 core, 512GB RAM (accelerated OpenGL on master)", 68 | "mc_price": 6.52, 69 | "mc_gpus": 0, 70 | "mc_slave_ram": 512, 71 | "mc_cores": 16, 72 | "mc_scale_select": "" 73 | } 74 | } 75 | ''' 76 | def get_mc_type(num_cpus, num_gpus, arch): 77 | 78 | if arch == "INTEL": 79 | default_mc_type = "nc3" 80 | if arch == "POWER": 81 | default_mc_type = "np8c0" 82 | 83 | out = check_output(["jarvice_cli", "-username", username, "-apikey", apikey, "machines"]) 84 | data = json.loads(out) 85 | #{mc_type: [ ram, cores, gpus] 86 | mc_list = {} 87 | logger.debug("List of machines in Nimbix %s", data) 88 | for key, value in data.iteritems(): 89 | logger.debug("key: %s, mc_ram: %d, mc_cores: %d, mc_gpus: %d", key, value['mc_ram'], value['mc_cores'], value['mc_gpus']) 90 | logger.debug("mc_description: %s", value['mc_description']) 91 | if arch not in value['mc_description']: 92 | continue 93 | mc_list[key] = [ value['mc_ram'], value['mc_cores'], value['mc_gpus']] 94 | 95 | for mc_type, values in mc_list.iteritems(): 96 | if str(values[1]) == str(num_cpus) and str(values[2]) == str(num_gpus): 97 | return mc_type 98 | return default_mc_type 99 | 100 | def find_best_fit(num_res, sorted_list): 101 | logger.debug("sorted_list: %s, num_res: %d", sorted_list, num_res) 102 | return min(sorted_list, key=lambda x:abs(x - num_res)) 103 | 104 | ''' 
105 | Get the machine type which is closest to the required resources 106 | ''' 107 | def get_mc_type_best_fit(num_cpus, num_gpus, arch): 108 | 109 | if arch == "INTEL": 110 | default_mc_type = "nc3" 111 | if arch == "POWER": 112 | default_mc_type = "np8c0" 113 | 114 | cpu_list = [] 115 | gpu_list = [] 116 | out = check_output(["jarvice_cli", "-username", username, "-apikey", apikey, "machines"]) 117 | data = json.loads(out) 118 | #{mc_type: [ ram, cores, gpus] 119 | mc_list = {} 120 | for key, value in data.iteritems(): 121 | logger.debug("key: %s, mc_ram: %d, mc_cores: %d, mc_gpus: %d", key, value['mc_ram'], value['mc_cores'], value['mc_gpus']) 122 | logger.debug("mc_description: %s", value['mc_description']) 123 | if arch not in value['mc_description']: 124 | continue 125 | mc_list[key] = [ value['mc_ram'], value['mc_cores'], value['mc_gpus']] 126 | cpu_list.append(value['mc_cores']) 127 | gpu_list.append(value['mc_gpus']) 128 | 129 | logger.debug("CPU list: %s", cpu_list) 130 | logger.debug("GPU list: %s", gpu_list) 131 | logger.debug("Machine list: %s", mc_list) 132 | cpus = find_best_fit(num_cpus, sorted(set(cpu_list))) 133 | logger.info("Required cpus: %d", cpus) 134 | gpus = find_best_fit(num_gpus, sorted(set(gpu_list))) 135 | logger.info("Required gpus: %d", gpus) 136 | 137 | for mc_type, values in mc_list.iteritems(): 138 | #Find best fit machine having required cpu and gpus 139 | if str(values[1]) == str(cpus) and str(values[2]) == str(gpus): 140 | return mc_type 141 | #What if best fit is not available in the list 142 | return default_mc_type 143 | 144 | 145 | def create_job_json(app_name, app_command, app_command_args, mc_type): 146 | template_loader = jinja2.FileSystemLoader( searchpath="./") 147 | template_env = jinja2.Environment( loader=template_loader ) 148 | TEMPLATE_FILE = "/job_template.json" 149 | template = template_env.get_template( TEMPLATE_FILE ) 150 | json_text = template.render(app_name=app_name, command=app_command, command_args=app_command_args, mc_type=mc_type) 151 | 152 | logger.debug("Json Job Description: %s", json_text) 153 | 154 | with open('/job.json', 'wb') as outfile: 155 | outfile.write(json_text) 156 | #This is required otherwise JSON parsing will fail 157 | outfile.write('\n') 158 | 159 | outfile.close() 160 | 161 | def remote_exec(): 162 | global username 163 | global apikey 164 | parser = argparse.ArgumentParser() 165 | parser.add_argument('--log_level', type=str, default='INFO', help='Log level - INFO, DEBUG') 166 | parser.add_argument('--dry_run', action='store_true', help='Dry Run - just dump the complete remote command') 167 | args = parser.parse_args() 168 | 169 | if args.log_level == "INFO": 170 | logger.setLevel(logging.INFO) 171 | if args.log_level == "DEBUG": 172 | logger.setLevel(logging.DEBUG) 173 | 174 | 175 | username = os.environ.get("USERNAME") 176 | apikey = os.environ.get("APIKEY") 177 | app_name = os.environ.get("APP_NAME") 178 | app_command = os.environ.get("APP_COMMAND") 179 | #App command args is optional. For batch jobs its required though 180 | app_command_args = os.environ.get("APP_COMMAND_ARGS") 181 | 182 | if username == None or apikey == None or app_name == None or app_command == None: 183 | logger.critical("Username, APIKey, App Name and Command are must") 184 | sys.exit() 185 | 186 | #Get required CPUs and GPUs. 
Default 1 187 | num_cpus = os.getenv("NUM_CPUS", 1) 188 | num_gpus = os.getenv("NUM_GPUS", 1) 189 | arch = os.getenv("ARCH", "POWER") 190 | 191 | logger.debug("APP_NAME: %s APP_COMMAND: %s, APP_COMMAND_ARGS: %s, NUM_CPUS: %s, NUM_GPUS: %s", app_name, app_command, 192 | app_command_args, num_cpus, num_gpus) 193 | 194 | try: 195 | if args.dry_run: 196 | exec_and_wait_dry_run("/job.json") 197 | else: 198 | mc_type = get_mc_type_best_fit(int(num_cpus), int(num_gpus), arch) 199 | logger.info("machine type to be used: %s", mc_type) 200 | create_job_json(app_name, app_command, app_command_args, mc_type) 201 | if exec_and_wait("/job.json"): 202 | logger.info("Error in running jarvice job") 203 | 204 | except Exception, e: 205 | logger.error('Unexpected error when running jarvice_cli', exc_info=True) 206 | 207 | def main(): 208 | if os.environ.get("REMOTE") : 209 | remote_exec() 210 | else: 211 | #Execute the command as-is 212 | app_command_args = os.environ.get("APP_COMMAND_ARGS") 213 | try: 214 | check_call(["/bin/bash", "-c", app_command_args]) 215 | except Exception, e: 216 | logger.error('Unexpected error when running command', exc_info=True) 217 | 218 | if __name__== "__main__": 219 | main() 220 | -------------------------------------------------------------------------------- /scheduler/scheduler.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2014 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package scheduler 18 | 19 | import ( 20 | "time" 21 | 22 | "k8s.io/api/core/v1" 23 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 24 | "k8s.io/apimachinery/pkg/util/sets" 25 | "k8s.io/apimachinery/pkg/util/wait" 26 | utilfeature "k8s.io/apiserver/pkg/util/feature" 27 | clientset "k8s.io/client-go/kubernetes" 28 | corelisters "k8s.io/client-go/listers/core/v1" 29 | "k8s.io/client-go/tools/cache" 30 | "k8s.io/client-go/tools/record" 31 | "k8s.io/kubernetes/pkg/features" 32 | "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm" 33 | schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api" 34 | "k8s.io/kubernetes/plugin/pkg/scheduler/core" 35 | "k8s.io/kubernetes/plugin/pkg/scheduler/metrics" 36 | "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache" 37 | "k8s.io/kubernetes/plugin/pkg/scheduler/util" 38 | 39 | "github.com/golang/glog" 40 | ) 41 | 42 | // Binder knows how to write a binding. 43 | type Binder interface { 44 | Bind(binding *v1.Binding) error 45 | } 46 | 47 | // PodConditionUpdater updates the condition of a pod based on the passed 48 | // PodCondition 49 | type PodConditionUpdater interface { 50 | Update(pod *v1.Pod, podCondition *v1.PodCondition) error 51 | } 52 | 53 | // PodPreemptor has methods needed to delete a pod and to update 54 | // annotations of the preemptor pod. 
55 | type PodPreemptor interface { 56 | GetUpdatedPod(pod *v1.Pod) (*v1.Pod, error) 57 | DeletePod(pod *v1.Pod) error 58 | UpdatePodAnnotations(pod *v1.Pod, annots map[string]string) error 59 | } 60 | 61 | // PodWriter delete the pod and create a new pod based on the passed pod. 62 | type PodWriter interface { 63 | UpdatePod(oldPod, newPod *v1.Pod) error 64 | } 65 | 66 | // Scheduler watches for new unscheduled pods. It attempts to find 67 | // nodes that they fit on and writes bindings back to the api server. 68 | type Scheduler struct { 69 | config *Config 70 | } 71 | 72 | // StopEverything closes the scheduler config's StopEverything channel, to shut 73 | // down the Scheduler. 74 | func (sched *Scheduler) StopEverything() { 75 | close(sched.config.StopEverything) 76 | } 77 | 78 | // Configurator defines I/O, caching, and other functionality needed to 79 | // construct a new scheduler. An implementation of this can be seen in 80 | // factory.go. 81 | type Configurator interface { 82 | GetPriorityFunctionConfigs(priorityKeys sets.String) ([]algorithm.PriorityConfig, error) 83 | GetPriorityMetadataProducer() (algorithm.MetadataProducer, error) 84 | GetPredicateMetadataProducer() (algorithm.PredicateMetadataProducer, error) 85 | GetPredicates(predicateKeys sets.String) (map[string]algorithm.FitPredicate, error) 86 | GetHardPodAffinitySymmetricWeight() int 87 | GetSchedulerName() string 88 | MakeDefaultErrorFunc(backoff *util.PodBackoff, podQueue *cache.FIFO) func(pod *v1.Pod, err error) 89 | 90 | // Probably doesn't need to be public. But exposed for now in case. 91 | ResponsibleForPod(pod *v1.Pod) bool 92 | 93 | // Needs to be exposed for things like integration tests where we want to make fake nodes. 94 | GetNodeLister() corelisters.NodeLister 95 | GetClient() clientset.Interface 96 | GetScheduledPodLister() corelisters.PodLister 97 | 98 | Create() (*Config, error) 99 | CreateFromProvider(providerName string) (*Config, error) 100 | CreateFromConfig(policy schedulerapi.Policy) (*Config, error) 101 | CreateFromKeys(predicateKeys, priorityKeys sets.String, extenders []algorithm.SchedulerExtender) (*Config, error) 102 | } 103 | 104 | // Config is an implementation of the Scheduler's configured input data. 105 | // TODO over time we should make this struct a hidden implementation detail of the scheduler. 106 | type Config struct { 107 | // It is expected that changes made via SchedulerCache will be observed 108 | // by NodeLister and Algorithm. 109 | SchedulerCache schedulercache.Cache 110 | // Ecache is used for optimistically invalid affected cache items after 111 | // successfully binding a pod 112 | Ecache *core.EquivalenceCache 113 | NodeLister algorithm.NodeLister 114 | Algorithm algorithm.ScheduleAlgorithm 115 | Binder Binder 116 | // PodConditionUpdater is used only in case of scheduling errors. If we succeed 117 | // with scheduling, PodScheduled condition will be updated in apiserver in /bind 118 | // handler so that binding and setting PodCondition it is atomic. 119 | PodConditionUpdater PodConditionUpdater 120 | // PodPreemptor is used to evict pods and update pod annotations. 121 | PodPreemptor PodPreemptor 122 | 123 | // PodWriter is only used to schedule Pods those can be run on Nimbix. 124 | PodWriter PodWriter 125 | 126 | // NextPod should be a function that blocks until the next pod 127 | // is available. We don't use a channel for this, because scheduling 128 | // a pod may take some amount of time and we don't want pods to get 129 | // stale while they sit in a channel. 
130 | NextPod func() *v1.Pod 131 | 132 | // WaitForCacheSync waits for scheduler cache to populate. 133 | // It returns true if it was successful, false if the controller should shutdown. 134 | WaitForCacheSync func() bool 135 | 136 | // Error is called if there is an error. It is passed the pod in 137 | // question, and the error 138 | Error func(*v1.Pod, error) 139 | 140 | // Recorder is the EventRecorder to use 141 | Recorder record.EventRecorder 142 | 143 | // Close this to shut down the scheduler. 144 | StopEverything chan struct{} 145 | } 146 | 147 | // NewFromConfigurator returns a new scheduler that is created entirely by the Configurator. Assumes Create() is implemented. 148 | // Supports intermediate Config mutation for now if you provide modifier functions which will run after Config is created. 149 | func NewFromConfigurator(c Configurator, modifiers ...func(c *Config)) (*Scheduler, error) { 150 | cfg, err := c.Create() 151 | if err != nil { 152 | return nil, err 153 | } 154 | // Mutate it if any functions were provided, changes might be required for certain types of tests (i.e. change the recorder). 155 | for _, modifier := range modifiers { 156 | modifier(cfg) 157 | } 158 | // From this point on the config is immutable to the outside. 159 | s := &Scheduler{ 160 | config: cfg, 161 | } 162 | metrics.Register() 163 | return s, nil 164 | } 165 | 166 | // Run begins watching and scheduling. It waits for cache to be synced, then starts a goroutine and returns immediately. 167 | func (sched *Scheduler) Run() { 168 | if !sched.config.WaitForCacheSync() { 169 | return 170 | } 171 | 172 | go wait.Until(sched.scheduleOne, 0, sched.config.StopEverything) 173 | } 174 | 175 | // Config return scheduler's config pointer. It is exposed for testing purposes. 176 | func (sched *Scheduler) Config() *Config { 177 | return sched.config 178 | } 179 | 180 | // schedule implements the scheduling algorithm and returns the suggested host. 181 | func (sched *Scheduler) schedule(pod *v1.Pod) (string, error) { 182 | host, err := sched.config.Algorithm.Schedule(pod, sched.config.NodeLister) 183 | if err != nil { 184 | glog.V(1).Infof("Failed to schedule pod: %v/%v", pod.Namespace, pod.Name) 185 | pod = pod.DeepCopy() 186 | sched.config.Error(pod, err) 187 | sched.config.Recorder.Eventf(pod, v1.EventTypeWarning, "FailedScheduling", "%v", err) 188 | sched.config.PodConditionUpdater.Update(pod, &v1.PodCondition{ 189 | Type: v1.PodScheduled, 190 | Status: v1.ConditionFalse, 191 | Reason: v1.PodReasonUnschedulable, 192 | Message: err.Error(), 193 | }) 194 | return "", err 195 | } 196 | return host, err 197 | } 198 | 199 | func (sched *Scheduler) preempt(preemptor *v1.Pod, scheduleErr error) (string, error) { 200 | if !utilfeature.DefaultFeatureGate.Enabled(features.PodPriority) { 201 | glog.V(3).Infof("Pod priority feature is not enabled. 
No preemption is performed.") 202 | return "", nil 203 | } 204 | preemptor, err := sched.config.PodPreemptor.GetUpdatedPod(preemptor) 205 | if err != nil { 206 | glog.Errorf("Error getting the updated preemptor pod object: %v", err) 207 | return "", err 208 | } 209 | node, victims, err := sched.config.Algorithm.Preempt(preemptor, sched.config.NodeLister, scheduleErr) 210 | if err != nil { 211 | glog.Errorf("Error preempting victims to make room for %v/%v.", preemptor.Namespace, preemptor.Name) 212 | return "", err 213 | } 214 | if node == nil { 215 | return "", err 216 | } 217 | glog.Infof("Preempting %d pod(s) on node %v to make room for %v/%v.", len(victims), node.Name, preemptor.Namespace, preemptor.Name) 218 | annotations := map[string]string{core.NominatedNodeAnnotationKey: node.Name} 219 | err = sched.config.PodPreemptor.UpdatePodAnnotations(preemptor, annotations) 220 | if err != nil { 221 | glog.Errorf("Error in preemption process. Cannot update pod %v annotations: %v", preemptor.Name, err) 222 | return "", err 223 | } 224 | for _, victim := range victims { 225 | if err := sched.config.PodPreemptor.DeletePod(victim); err != nil { 226 | glog.Errorf("Error preempting pod %v/%v: %v", victim.Namespace, victim.Name, err) 227 | return "", err 228 | } 229 | sched.config.Recorder.Eventf(victim, v1.EventTypeNormal, "Preempted", "by %v/%v on node %v", preemptor.Namespace, preemptor.Name, node.Name) 230 | } 231 | return node.Name, err 232 | } 233 | 234 | // assume signals to the cache that a pod is already in the cache, so that binding can be asynchronous. 235 | // assume modifies `assumed`. 236 | func (sched *Scheduler) assume(assumed *v1.Pod, host string) error { 237 | // Optimistically assume that the binding will succeed and send it to apiserver 238 | // in the background. 239 | // If the binding fails, scheduler will release resources allocated to assumed pod 240 | // immediately. 241 | assumed.Spec.NodeName = host 242 | if err := sched.config.SchedulerCache.AssumePod(assumed); err != nil { 243 | glog.Errorf("scheduler cache AssumePod failed: %v", err) 244 | 245 | // This is most probably result of a BUG in retrying logic. 246 | // We report an error here so that pod scheduling can be retried. 247 | // This relies on the fact that Error will check if the pod has been bound 248 | // to a node and if so will not add it back to the unscheduled pods queue 249 | // (otherwise this would cause an infinite loop). 250 | sched.config.Error(assumed, err) 251 | sched.config.Recorder.Eventf(assumed, v1.EventTypeWarning, "FailedScheduling", "AssumePod failed: %v", err) 252 | sched.config.PodConditionUpdater.Update(assumed, &v1.PodCondition{ 253 | Type: v1.PodScheduled, 254 | Status: v1.ConditionFalse, 255 | Reason: "SchedulerError", 256 | Message: err.Error(), 257 | }) 258 | return err 259 | } 260 | 261 | // Optimistically assume that the binding will succeed, so we need to invalidate affected 262 | // predicates in equivalence cache. 263 | // If the binding fails, these invalidated item will not break anything. 264 | if sched.config.Ecache != nil { 265 | sched.config.Ecache.InvalidateCachedPredicateItemForPodAdd(assumed, host) 266 | } 267 | return nil 268 | } 269 | 270 | // bind binds a pod to a given node defined in a binding object. We expect this to run asynchronously, so we 271 | // handle binding metrics internally. 
272 | func (sched *Scheduler) bind(assumed *v1.Pod, b *v1.Binding) error { 273 | bindingStart := time.Now() 274 | // If binding succeeded then PodScheduled condition will be updated in apiserver so that 275 | // it's atomic with setting host. 276 | err := sched.config.Binder.Bind(b) 277 | if err := sched.config.SchedulerCache.FinishBinding(assumed); err != nil { 278 | glog.Errorf("scheduler cache FinishBinding failed: %v", err) 279 | } 280 | if err != nil { 281 | glog.V(1).Infof("Failed to bind pod: %v/%v", assumed.Namespace, assumed.Name) 282 | if err := sched.config.SchedulerCache.ForgetPod(assumed); err != nil { 283 | glog.Errorf("scheduler cache ForgetPod failed: %v", err) 284 | } 285 | sched.config.Error(assumed, err) 286 | sched.config.Recorder.Eventf(assumed, v1.EventTypeWarning, "FailedScheduling", "Binding rejected: %v", err) 287 | sched.config.PodConditionUpdater.Update(assumed, &v1.PodCondition{ 288 | Type: v1.PodScheduled, 289 | Status: v1.ConditionFalse, 290 | Reason: "BindingRejected", 291 | }) 292 | return err 293 | } 294 | 295 | metrics.BindingLatency.Observe(metrics.SinceInMicroseconds(bindingStart)) 296 | sched.config.Recorder.Eventf(assumed, v1.EventTypeNormal, "Scheduled", "Successfully assigned %v to %v", assumed.Name, b.Target.Name) 297 | return nil 298 | } 299 | 300 | // scheduleOne does the entire scheduling workflow for a single pod. It is serialized on the scheduling algorithm's host fitting. 301 | func (sched *Scheduler) scheduleOne() { 302 | pod := sched.config.NextPod() 303 | if pod.DeletionTimestamp != nil { 304 | sched.config.Recorder.Eventf(pod, v1.EventTypeWarning, "FailedScheduling", "skip schedule deleting pod: %v/%v", pod.Namespace, pod.Name) 305 | glog.V(3).Infof("Skip schedule deleting pod: %v/%v", pod.Namespace, pod.Name) 306 | return 307 | } 308 | 309 | glog.V(3).Infof("Attempting to schedule pod: %v/%v", pod.Namespace, pod.Name) 310 | 311 | // Synchronously attempt to find a fit for the pod. 312 | start := time.Now() 313 | suggestedHost, err := sched.schedule(pod) 314 | if len(suggestedHost) == 0 { 315 | if sched.checkPodLabelForNimbix(pod) { 316 | if len(pod.Spec.Containers) > 1 { 317 | glog.Errorf("Pods expected to be run in Nimbix must have no more than one container") 318 | return 319 | } 320 | 321 | // Modify the Pod specification to run it in any node with nimbix access. 322 | pod.DeletionTimestamp = nil 323 | newPod := sched.modifyPodForNimbix(pod) 324 | 325 | if updateErr := sched.config.PodWriter.UpdatePod(pod, newPod); updateErr != nil { 326 | glog.Errorf("Pod could not be updated on API server: %v", updateErr) 327 | } 328 | return 329 | } 330 | } 331 | metrics.SchedulingAlgorithmLatency.Observe(metrics.SinceInMicroseconds(start)) 332 | if err != nil { 333 | // schedule() may have failed because the pod would not fit on any host, so we try to 334 | // preempt, with the expectation that the next time the pod is tried for scheduling it 335 | // will fit due to the preemption. It is also possible that a different pod will schedule 336 | // into the resources that were preempted, but this is harmless. 337 | if fitError, ok := err.(*core.FitError); ok { 338 | sched.preempt(pod, fitError) 339 | } 340 | return 341 | } 342 | 343 | // Tell the cache to assume that a pod now is running on a given node, even though it hasn't been bound yet. 344 | // This allows us to keep scheduling without waiting on binding to occur. 
345 | assumedPod := *pod 346 | // assume modifies `assumedPod` by setting NodeName=suggestedHost 347 | err = sched.assume(&assumedPod, suggestedHost) 348 | if err != nil { 349 | return 350 | } 351 | 352 | // bind the pod to its host asynchronously (we can do this b/c of the assumption step above). 353 | go func() { 354 | err := sched.bind(&assumedPod, &v1.Binding{ 355 | ObjectMeta: metav1.ObjectMeta{Namespace: assumedPod.Namespace, Name: assumedPod.Name, UID: assumedPod.UID}, 356 | Target: v1.ObjectReference{ 357 | Kind: "Node", 358 | Name: suggestedHost, 359 | }, 360 | }) 361 | metrics.E2eSchedulingLatency.Observe(metrics.SinceInMicroseconds(start)) 362 | if err != nil { 363 | glog.Errorf("Internal error binding pod: (%v)", err) 364 | } 365 | }() 366 | } 367 | 368 | // Check whether the pod contains label task-type "Nimbix" 369 | func (sched *Scheduler) checkPodLabelForNimbix(pod *v1.Pod) bool { 370 | if pod.Labels["task-type"] == "Nimbix" { 371 | return true 372 | } 373 | 374 | return false 375 | } 376 | 377 | func (sched *Scheduler) modifyPodForNimbix(oldPod *v1.Pod) *v1.Pod { 378 | oldContainer := oldPod.Spec.Containers[0] 379 | newEnv := []v1.EnvVar{ 380 | { 381 | Name: "REMOTE", 382 | Value: "1", 383 | }, 384 | } 385 | pod := &v1.Pod{ 386 | TypeMeta: metav1.TypeMeta{ 387 | Kind: oldPod.Kind, 388 | APIVersion: oldPod.APIVersion, 389 | }, 390 | ObjectMeta: metav1.ObjectMeta{ 391 | Name: oldPod.GetName(), 392 | Namespace: oldPod.GetNamespace(), 393 | Labels: oldPod.GetLabels(), 394 | Annotations: oldPod.GetAnnotations(), 395 | OwnerReferences: oldPod.GetOwnerReferences(), 396 | Initializers: oldPod.GetInitializers(), 397 | Finalizers: oldPod.GetFinalizers(), 398 | ClusterName: oldPod.GetClusterName(), 399 | }, 400 | Spec: v1.PodSpec{ 401 | 402 | Volumes: oldPod.Spec.Volumes, 403 | InitContainers: oldPod.Spec.InitContainers, 404 | Containers: []v1.Container{ 405 | { 406 | Name: oldContainer.Name, 407 | Image: oldContainer.Image, 408 | Command: oldContainer.Command, 409 | Args: oldContainer.Args, 410 | WorkingDir: oldContainer.WorkingDir, 411 | Ports: oldContainer.Ports, 412 | EnvFrom: oldContainer.EnvFrom, 413 | Env: append(oldContainer.Env, newEnv...), 414 | VolumeMounts: oldContainer.VolumeMounts, 415 | LivenessProbe: oldContainer.LivenessProbe, 416 | ReadinessProbe: oldContainer.ReadinessProbe, 417 | Lifecycle: oldContainer.Lifecycle, 418 | TerminationMessagePath: oldContainer.TerminationMessagePath, 419 | TerminationMessagePolicy: oldContainer.TerminationMessagePolicy, 420 | ImagePullPolicy: oldContainer.ImagePullPolicy, 421 | SecurityContext: oldContainer.SecurityContext, 422 | Stdin: oldContainer.Stdin, 423 | StdinOnce: oldContainer.StdinOnce, 424 | TTY: oldContainer.TTY, 425 | }, 426 | }, 427 | RestartPolicy: oldPod.Spec.RestartPolicy, 428 | TerminationGracePeriodSeconds: oldPod.Spec.TerminationGracePeriodSeconds, 429 | ActiveDeadlineSeconds: oldPod.Spec.ActiveDeadlineSeconds, 430 | DNSPolicy: oldPod.Spec.DNSPolicy, 431 | NodeSelector: oldPod.Spec.NodeSelector, 432 | ServiceAccountName: oldPod.Spec.ServiceAccountName, 433 | AutomountServiceAccountToken: oldPod.Spec.AutomountServiceAccountToken, 434 | NodeName: oldPod.Spec.NodeName, 435 | HostNetwork: oldPod.Spec.HostNetwork, 436 | HostPID: oldPod.Spec.HostPID, 437 | HostIPC: oldPod.Spec.HostIPC, 438 | SecurityContext: oldPod.Spec.SecurityContext, 439 | ImagePullSecrets: oldPod.Spec.ImagePullSecrets, 440 | Hostname: oldPod.Spec.Hostname, 441 | Subdomain: oldPod.Spec.Subdomain, 442 | Affinity: oldPod.Spec.Affinity, 443 | 
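// Resources from oldContainer are not carried over into the new container
// spec above, so the recreated pod drops its GPU request and can be placed
// on any node; the injected REMOTE=1 env then makes jarvice_submit.py submit
// the work to Nimbix instead of running the command locally.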
SchedulerName: oldPod.Spec.SchedulerName, 444 | Tolerations: oldPod.Spec.Tolerations, 445 | HostAliases: oldPod.Spec.HostAliases, 446 | }, 447 | } 448 | return pod 449 | } 450 | -------------------------------------------------------------------------------- /scheduler/factory.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2014 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Package factory can set up a scheduler. This code is here instead of 18 | // plugin/cmd/scheduler for both testability and reuse. 19 | package factory 20 | 21 | import ( 22 | "fmt" 23 | "reflect" 24 | "sync" 25 | "time" 26 | 27 | "k8s.io/api/core/v1" 28 | "k8s.io/apimachinery/pkg/api/errors" 29 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 30 | "k8s.io/apimachinery/pkg/fields" 31 | "k8s.io/apimachinery/pkg/labels" 32 | "k8s.io/apimachinery/pkg/runtime/schema" 33 | "k8s.io/apimachinery/pkg/types" 34 | "k8s.io/apimachinery/pkg/util/runtime" 35 | "k8s.io/apimachinery/pkg/util/sets" 36 | appsinformers "k8s.io/client-go/informers/apps/v1beta1" 37 | coreinformers "k8s.io/client-go/informers/core/v1" 38 | extensionsinformers "k8s.io/client-go/informers/extensions/v1beta1" 39 | clientset "k8s.io/client-go/kubernetes" 40 | appslisters "k8s.io/client-go/listers/apps/v1beta1" 41 | corelisters "k8s.io/client-go/listers/core/v1" 42 | extensionslisters "k8s.io/client-go/listers/extensions/v1beta1" 43 | "k8s.io/client-go/tools/cache" 44 | "k8s.io/kubernetes/pkg/api/helper" 45 | podutil "k8s.io/kubernetes/pkg/api/v1/pod" 46 | kubeletapis "k8s.io/kubernetes/pkg/kubelet/apis" 47 | "k8s.io/kubernetes/plugin/pkg/scheduler" 48 | "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm" 49 | "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/predicates" 50 | schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api" 51 | "k8s.io/kubernetes/plugin/pkg/scheduler/api/validation" 52 | "k8s.io/kubernetes/plugin/pkg/scheduler/core" 53 | "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache" 54 | "k8s.io/kubernetes/plugin/pkg/scheduler/util" 55 | 56 | "github.com/golang/glog" 57 | ) 58 | 59 | const ( 60 | initialGetBackoff = 100 * time.Millisecond 61 | maximalGetBackoff = time.Minute 62 | ) 63 | 64 | var ( 65 | serviceAffinitySet = sets.NewString("ServiceAffinity") 66 | maxPDVolumeCountPredicateSet = sets.NewString("MaxPDVolumeCountPredicate") 67 | matchInterPodAffinitySet = sets.NewString("MatchInterPodAffinity") 68 | generalPredicatesSets = sets.NewString("GeneralPredicates") 69 | noDiskConflictSet = sets.NewString("NoDiskConflict") 70 | ) 71 | 72 | // ConfigFactory is the default implementation of the scheduler.Configurator interface. 73 | // TODO make this private if possible, so that only its interface is externally used. 74 | type ConfigFactory struct { 75 | client clientset.Interface 76 | // queue for pods that need scheduling 77 | podQueue *cache.FIFO 78 | // a means to list all known scheduled pods. 
79 | scheduledPodLister corelisters.PodLister 80 | // a means to list all known scheduled pods and pods assumed to have been scheduled. 81 | podLister algorithm.PodLister 82 | // a means to list all nodes 83 | nodeLister corelisters.NodeLister 84 | // a means to list all PersistentVolumes 85 | pVLister corelisters.PersistentVolumeLister 86 | // a means to list all PersistentVolumeClaims 87 | pVCLister corelisters.PersistentVolumeClaimLister 88 | // a means to list all services 89 | serviceLister corelisters.ServiceLister 90 | // a means to list all controllers 91 | controllerLister corelisters.ReplicationControllerLister 92 | // a means to list all replicasets 93 | replicaSetLister extensionslisters.ReplicaSetLister 94 | // a means to list all statefulsets 95 | statefulSetLister appslisters.StatefulSetLister 96 | 97 | // Close this to stop all reflectors 98 | StopEverything chan struct{} 99 | 100 | scheduledPodsHasSynced cache.InformerSynced 101 | 102 | schedulerCache schedulercache.Cache 103 | 104 | // SchedulerName of a scheduler is used to select which pods will be 105 | // processed by this scheduler, based on pods's "spec.SchedulerName". 106 | schedulerName string 107 | 108 | // RequiredDuringScheduling affinity is not symmetric, but there is an implicit PreferredDuringScheduling affinity rule 109 | // corresponding to every RequiredDuringScheduling affinity rule. 110 | // HardPodAffinitySymmetricWeight represents the weight of implicit PreferredDuringScheduling affinity rule, in the range 0-100. 111 | hardPodAffinitySymmetricWeight int 112 | 113 | // Equivalence class cache 114 | equivalencePodCache *core.EquivalenceCache 115 | 116 | // Enable equivalence class cache 117 | enableEquivalenceClassCache bool 118 | } 119 | 120 | // NewConfigFactory initializes the default implementation of a Configurator To encourage eventual privatization of the struct type, we only 121 | // return the interface. 
122 | func NewConfigFactory( 123 | schedulerName string, 124 | client clientset.Interface, 125 | nodeInformer coreinformers.NodeInformer, 126 | podInformer coreinformers.PodInformer, 127 | pvInformer coreinformers.PersistentVolumeInformer, 128 | pvcInformer coreinformers.PersistentVolumeClaimInformer, 129 | replicationControllerInformer coreinformers.ReplicationControllerInformer, 130 | replicaSetInformer extensionsinformers.ReplicaSetInformer, 131 | statefulSetInformer appsinformers.StatefulSetInformer, 132 | serviceInformer coreinformers.ServiceInformer, 133 | hardPodAffinitySymmetricWeight int, 134 | enableEquivalenceClassCache bool, 135 | ) scheduler.Configurator { 136 | stopEverything := make(chan struct{}) 137 | schedulerCache := schedulercache.New(30*time.Second, stopEverything) 138 | 139 | c := &ConfigFactory{ 140 | client: client, 141 | podLister: schedulerCache, 142 | podQueue: cache.NewFIFO(cache.MetaNamespaceKeyFunc), 143 | pVLister: pvInformer.Lister(), 144 | pVCLister: pvcInformer.Lister(), 145 | serviceLister: serviceInformer.Lister(), 146 | controllerLister: replicationControllerInformer.Lister(), 147 | replicaSetLister: replicaSetInformer.Lister(), 148 | statefulSetLister: statefulSetInformer.Lister(), 149 | schedulerCache: schedulerCache, 150 | StopEverything: stopEverything, 151 | schedulerName: schedulerName, 152 | hardPodAffinitySymmetricWeight: hardPodAffinitySymmetricWeight, 153 | enableEquivalenceClassCache: enableEquivalenceClassCache, 154 | } 155 | 156 | c.scheduledPodsHasSynced = podInformer.Informer().HasSynced 157 | // scheduled pod cache 158 | podInformer.Informer().AddEventHandler( 159 | cache.FilteringResourceEventHandler{ 160 | FilterFunc: func(obj interface{}) bool { 161 | switch t := obj.(type) { 162 | case *v1.Pod: 163 | return assignedNonTerminatedPod(t) 164 | default: 165 | runtime.HandleError(fmt.Errorf("unable to handle object in %T: %T", c, obj)) 166 | return false 167 | } 168 | }, 169 | Handler: cache.ResourceEventHandlerFuncs{ 170 | AddFunc: c.addPodToCache, 171 | UpdateFunc: c.updatePodInCache, 172 | DeleteFunc: c.deletePodFromCache, 173 | }, 174 | }, 175 | ) 176 | // unscheduled pod queue 177 | podInformer.Informer().AddEventHandler( 178 | cache.FilteringResourceEventHandler{ 179 | FilterFunc: func(obj interface{}) bool { 180 | switch t := obj.(type) { 181 | case *v1.Pod: 182 | return unassignedNonTerminatedPod(t) 183 | default: 184 | runtime.HandleError(fmt.Errorf("unable to handle object in %T: %T", c, obj)) 185 | return false 186 | } 187 | }, 188 | Handler: cache.ResourceEventHandlerFuncs{ 189 | AddFunc: func(obj interface{}) { 190 | if err := c.podQueue.Add(obj); err != nil { 191 | runtime.HandleError(fmt.Errorf("unable to queue %T: %v", obj, err)) 192 | } 193 | }, 194 | UpdateFunc: func(oldObj, newObj interface{}) { 195 | if err := c.podQueue.Update(newObj); err != nil { 196 | runtime.HandleError(fmt.Errorf("unable to update %T: %v", newObj, err)) 197 | } 198 | }, 199 | DeleteFunc: func(obj interface{}) { 200 | if err := c.podQueue.Delete(obj); err != nil { 201 | runtime.HandleError(fmt.Errorf("unable to dequeue %T: %v", obj, err)) 202 | } 203 | }, 204 | }, 205 | }, 206 | ) 207 | // ScheduledPodLister is something we provide to plug-in functions that 208 | // they may need to call. 
209 | c.scheduledPodLister = assignedPodLister{podInformer.Lister()} 210 | 211 | // Only nodes in the "Ready" condition with status == "True" are schedulable 212 | nodeInformer.Informer().AddEventHandlerWithResyncPeriod( 213 | cache.ResourceEventHandlerFuncs{ 214 | AddFunc: c.addNodeToCache, 215 | UpdateFunc: c.updateNodeInCache, 216 | DeleteFunc: c.deleteNodeFromCache, 217 | }, 218 | 0, 219 | ) 220 | c.nodeLister = nodeInformer.Lister() 221 | 222 | // On add and delete of PVs, it will affect equivalence cache items 223 | // related to persistent volume 224 | pvInformer.Informer().AddEventHandlerWithResyncPeriod( 225 | cache.ResourceEventHandlerFuncs{ 226 | // MaxPDVolumeCountPredicate: since it relies on the counts of PV. 227 | AddFunc: c.onPvAdd, 228 | DeleteFunc: c.onPvDelete, 229 | }, 230 | 0, 231 | ) 232 | c.pVLister = pvInformer.Lister() 233 | 234 | // This is for MaxPDVolumeCountPredicate: add/delete PVC will affect counts of PV when it is bound. 235 | pvcInformer.Informer().AddEventHandlerWithResyncPeriod( 236 | cache.ResourceEventHandlerFuncs{ 237 | AddFunc: c.onPvcAdd, 238 | DeleteFunc: c.onPvcDelete, 239 | }, 240 | 0, 241 | ) 242 | c.pVCLister = pvcInformer.Lister() 243 | 244 | // This is for ServiceAffinity: affected by the selector of the service is updated. 245 | // Also, if new service is added, equivalence cache will also become invalid since 246 | // existing pods may be "captured" by this service and change this predicate result. 247 | serviceInformer.Informer().AddEventHandlerWithResyncPeriod( 248 | cache.ResourceEventHandlerFuncs{ 249 | AddFunc: c.onServiceAdd, 250 | UpdateFunc: c.onServiceUpdate, 251 | DeleteFunc: c.onServiceDelete, 252 | }, 253 | 0, 254 | ) 255 | c.serviceLister = serviceInformer.Lister() 256 | 257 | // Existing equivalence cache should not be affected by add/delete RC/Deployment etc, 258 | // it only make sense when pod is scheduled or deleted 259 | 260 | return c 261 | } 262 | 263 | func (c *ConfigFactory) onPvAdd(obj interface{}) { 264 | if c.enableEquivalenceClassCache { 265 | pv, ok := obj.(*v1.PersistentVolume) 266 | if !ok { 267 | glog.Errorf("cannot convert to *v1.PersistentVolume: %v", obj) 268 | return 269 | } 270 | c.invalidatePredicatesForPv(pv) 271 | } 272 | } 273 | 274 | func (c *ConfigFactory) onPvDelete(obj interface{}) { 275 | if c.enableEquivalenceClassCache { 276 | var pv *v1.PersistentVolume 277 | switch t := obj.(type) { 278 | case *v1.PersistentVolume: 279 | pv = t 280 | case cache.DeletedFinalStateUnknown: 281 | var ok bool 282 | pv, ok = t.Obj.(*v1.PersistentVolume) 283 | if !ok { 284 | glog.Errorf("cannot convert to *v1.PersistentVolume: %v", t.Obj) 285 | return 286 | } 287 | default: 288 | glog.Errorf("cannot convert to *v1.PersistentVolume: %v", t) 289 | return 290 | } 291 | c.invalidatePredicatesForPv(pv) 292 | } 293 | } 294 | 295 | func (c *ConfigFactory) invalidatePredicatesForPv(pv *v1.PersistentVolume) { 296 | invalidPredicates := sets.NewString("MaxPDVolumeCountPredicate") 297 | if pv.Spec.AWSElasticBlockStore != nil { 298 | invalidPredicates.Insert("MaxEBSVolumeCount") 299 | } 300 | if pv.Spec.GCEPersistentDisk != nil { 301 | invalidPredicates.Insert("MaxGCEPDVolumeCount") 302 | } 303 | if pv.Spec.AzureDisk != nil { 304 | invalidPredicates.Insert("MaxAzureDiskVolumeCount") 305 | } 306 | c.equivalencePodCache.InvalidateCachedPredicateItemOfAllNodes(invalidPredicates) 307 | } 308 | 309 | func (c *ConfigFactory) onPvcAdd(obj interface{}) { 310 | if c.enableEquivalenceClassCache { 311 | pvc, ok := 
obj.(*v1.PersistentVolumeClaim) 312 | if !ok { 313 | glog.Errorf("cannot convert to *v1.PersistentVolumeClaim: %v", obj) 314 | return 315 | } 316 | c.invalidatePredicatesForPvc(pvc) 317 | } 318 | } 319 | 320 | func (c *ConfigFactory) onPvcDelete(obj interface{}) { 321 | if c.enableEquivalenceClassCache { 322 | var pvc *v1.PersistentVolumeClaim 323 | switch t := obj.(type) { 324 | case *v1.PersistentVolumeClaim: 325 | pvc = t 326 | case cache.DeletedFinalStateUnknown: 327 | var ok bool 328 | pvc, ok = t.Obj.(*v1.PersistentVolumeClaim) 329 | if !ok { 330 | glog.Errorf("cannot convert to *v1.PersistentVolumeClaim: %v", t.Obj) 331 | return 332 | } 333 | default: 334 | glog.Errorf("cannot convert to *v1.PersistentVolumeClaim: %v", t) 335 | return 336 | } 337 | c.invalidatePredicatesForPvc(pvc) 338 | } 339 | } 340 | 341 | func (c *ConfigFactory) invalidatePredicatesForPvc(pvc *v1.PersistentVolumeClaim) { 342 | if pvc.Spec.VolumeName != "" { 343 | c.equivalencePodCache.InvalidateCachedPredicateItemOfAllNodes(maxPDVolumeCountPredicateSet) 344 | } 345 | } 346 | 347 | func (c *ConfigFactory) onServiceAdd(obj interface{}) { 348 | if c.enableEquivalenceClassCache { 349 | c.equivalencePodCache.InvalidateCachedPredicateItemOfAllNodes(serviceAffinitySet) 350 | } 351 | } 352 | 353 | func (c *ConfigFactory) onServiceUpdate(oldObj interface{}, newObj interface{}) { 354 | if c.enableEquivalenceClassCache { 355 | // TODO(resouer) We may need to invalidate this for specified group of pods only 356 | oldService := oldObj.(*v1.Service) 357 | newService := newObj.(*v1.Service) 358 | if !reflect.DeepEqual(oldService.Spec.Selector, newService.Spec.Selector) { 359 | c.equivalencePodCache.InvalidateCachedPredicateItemOfAllNodes(serviceAffinitySet) 360 | } 361 | } 362 | } 363 | 364 | func (c *ConfigFactory) onServiceDelete(obj interface{}) { 365 | if c.enableEquivalenceClassCache { 366 | c.equivalencePodCache.InvalidateCachedPredicateItemOfAllNodes(serviceAffinitySet) 367 | } 368 | } 369 | 370 | // GetNodeStore provides the cache to the nodes, mostly internal use, but may also be called by mock-tests. 371 | func (c *ConfigFactory) GetNodeLister() corelisters.NodeLister { 372 | return c.nodeLister 373 | } 374 | 375 | func (c *ConfigFactory) GetHardPodAffinitySymmetricWeight() int { 376 | return c.hardPodAffinitySymmetricWeight 377 | } 378 | 379 | func (f *ConfigFactory) GetSchedulerName() string { 380 | return f.schedulerName 381 | } 382 | 383 | // GetClient provides a kubernetes client, mostly internal use, but may also be called by mock-tests. 384 | func (f *ConfigFactory) GetClient() clientset.Interface { 385 | return f.client 386 | } 387 | 388 | // GetScheduledPodListerIndexer provides a pod lister, mostly internal use, but may also be called by mock-tests. 389 | func (c *ConfigFactory) GetScheduledPodLister() corelisters.PodLister { 390 | return c.scheduledPodLister 391 | } 392 | 393 | func (c *ConfigFactory) addPodToCache(obj interface{}) { 394 | pod, ok := obj.(*v1.Pod) 395 | if !ok { 396 | glog.Errorf("cannot convert to *v1.Pod: %v", obj) 397 | return 398 | } 399 | 400 | if err := c.schedulerCache.AddPod(pod); err != nil { 401 | glog.Errorf("scheduler cache AddPod failed: %v", err) 402 | } 403 | // NOTE: Updating equivalence cache of addPodToCache has been 404 | // handled optimistically in InvalidateCachedPredicateItemForPodAdd. 
405 | } 406 | 407 | func (c *ConfigFactory) updatePodInCache(oldObj, newObj interface{}) { 408 | oldPod, ok := oldObj.(*v1.Pod) 409 | if !ok { 410 | glog.Errorf("cannot convert oldObj to *v1.Pod: %v", oldObj) 411 | return 412 | } 413 | newPod, ok := newObj.(*v1.Pod) 414 | if !ok { 415 | glog.Errorf("cannot convert newObj to *v1.Pod: %v", newObj) 416 | return 417 | } 418 | 419 | if err := c.schedulerCache.UpdatePod(oldPod, newPod); err != nil { 420 | glog.Errorf("scheduler cache UpdatePod failed: %v", err) 421 | } 422 | 423 | c.invalidateCachedPredicatesOnUpdatePod(newPod, oldPod) 424 | } 425 | 426 | func (c *ConfigFactory) invalidateCachedPredicatesOnUpdatePod(newPod *v1.Pod, oldPod *v1.Pod) { 427 | if c.enableEquivalenceClassCache { 428 | // if the pod does not have binded node, updating equivalence cache is meaningless; 429 | // if pod's binded node has been changed, that case should be handled by pod add & delete. 430 | if len(newPod.Spec.NodeName) != 0 && newPod.Spec.NodeName == oldPod.Spec.NodeName { 431 | if !reflect.DeepEqual(oldPod.GetLabels(), newPod.GetLabels()) { 432 | // MatchInterPodAffinity need to be reconsidered for this node, 433 | // as well as all nodes in its same failure domain. 434 | c.equivalencePodCache.InvalidateCachedPredicateItemOfAllNodes( 435 | matchInterPodAffinitySet) 436 | } 437 | // if requested container resource changed, invalidate GeneralPredicates of this node 438 | if !reflect.DeepEqual(predicates.GetResourceRequest(newPod), 439 | predicates.GetResourceRequest(oldPod)) { 440 | c.equivalencePodCache.InvalidateCachedPredicateItem( 441 | newPod.Spec.NodeName, generalPredicatesSets) 442 | } 443 | } 444 | } 445 | } 446 | 447 | func (c *ConfigFactory) deletePodFromCache(obj interface{}) { 448 | var pod *v1.Pod 449 | switch t := obj.(type) { 450 | case *v1.Pod: 451 | pod = t 452 | case cache.DeletedFinalStateUnknown: 453 | var ok bool 454 | pod, ok = t.Obj.(*v1.Pod) 455 | if !ok { 456 | glog.Errorf("cannot convert to *v1.Pod: %v", t.Obj) 457 | return 458 | } 459 | default: 460 | glog.Errorf("cannot convert to *v1.Pod: %v", t) 461 | return 462 | } 463 | if err := c.schedulerCache.RemovePod(pod); err != nil { 464 | glog.Errorf("scheduler cache RemovePod failed: %v", err) 465 | } 466 | 467 | c.invalidateCachedPredicatesOnDeletePod(pod) 468 | } 469 | 470 | func (c *ConfigFactory) invalidateCachedPredicatesOnDeletePod(pod *v1.Pod) { 471 | if c.enableEquivalenceClassCache { 472 | // part of this case is the same as pod add. 473 | c.equivalencePodCache.InvalidateCachedPredicateItemForPodAdd(pod, pod.Spec.NodeName) 474 | // MatchInterPodAffinity need to be reconsidered for this node, 475 | // as well as all nodes in its same failure domain. 476 | // TODO(resouer) can we just do this for nodes in the same failure domain 477 | c.equivalencePodCache.InvalidateCachedPredicateItemOfAllNodes( 478 | matchInterPodAffinitySet) 479 | 480 | // if this pod have these PV, cached result of disk conflict will become invalid. 
481 | for _, volume := range pod.Spec.Volumes { 482 | if volume.GCEPersistentDisk != nil || volume.AWSElasticBlockStore != nil || 483 | volume.RBD != nil || volume.ISCSI != nil { 484 | c.equivalencePodCache.InvalidateCachedPredicateItem( 485 | pod.Spec.NodeName, noDiskConflictSet) 486 | } 487 | } 488 | } 489 | } 490 | 491 | func (c *ConfigFactory) addNodeToCache(obj interface{}) { 492 | node, ok := obj.(*v1.Node) 493 | if !ok { 494 | glog.Errorf("cannot convert to *v1.Node: %v", obj) 495 | return 496 | } 497 | 498 | if err := c.schedulerCache.AddNode(node); err != nil { 499 | glog.Errorf("scheduler cache AddNode failed: %v", err) 500 | } 501 | 502 | // NOTE: add a new node does not affect existing predicates in equivalence cache 503 | } 504 | 505 | func (c *ConfigFactory) updateNodeInCache(oldObj, newObj interface{}) { 506 | oldNode, ok := oldObj.(*v1.Node) 507 | if !ok { 508 | glog.Errorf("cannot convert oldObj to *v1.Node: %v", oldObj) 509 | return 510 | } 511 | newNode, ok := newObj.(*v1.Node) 512 | if !ok { 513 | glog.Errorf("cannot convert newObj to *v1.Node: %v", newObj) 514 | return 515 | } 516 | 517 | if err := c.schedulerCache.UpdateNode(oldNode, newNode); err != nil { 518 | glog.Errorf("scheduler cache UpdateNode failed: %v", err) 519 | } 520 | 521 | c.invalidateCachedPredicatesOnNodeUpdate(newNode, oldNode) 522 | } 523 | 524 | func (c *ConfigFactory) invalidateCachedPredicatesOnNodeUpdate(newNode *v1.Node, oldNode *v1.Node) { 525 | if c.enableEquivalenceClassCache { 526 | // Begin to update equivalence cache based on node update 527 | // TODO(resouer): think about lazily initialize this set 528 | invalidPredicates := sets.NewString() 529 | 530 | oldTaints, oldErr := helper.GetTaintsFromNodeAnnotations(oldNode.GetAnnotations()) 531 | if oldErr != nil { 532 | glog.Errorf("Failed to get taints from old node annotation for equivalence cache") 533 | } 534 | newTaints, newErr := helper.GetTaintsFromNodeAnnotations(newNode.GetAnnotations()) 535 | if newErr != nil { 536 | glog.Errorf("Failed to get taints from new node annotation for equivalence cache") 537 | } 538 | 539 | if !reflect.DeepEqual(oldNode.Status.Allocatable, newNode.Status.Allocatable) { 540 | invalidPredicates.Insert("GeneralPredicates") // "PodFitsResources" 541 | } 542 | if !reflect.DeepEqual(oldNode.GetLabels(), newNode.GetLabels()) { 543 | invalidPredicates.Insert("GeneralPredicates", "ServiceAffinity") // "PodSelectorMatches" 544 | for k, v := range oldNode.GetLabels() { 545 | // any label can be topology key of pod, we have to invalidate in all cases 546 | if v != newNode.GetLabels()[k] { 547 | invalidPredicates.Insert("MatchInterPodAffinity") 548 | } 549 | // NoVolumeZoneConflict will only be affected by zone related label change 550 | if k == kubeletapis.LabelZoneFailureDomain || k == kubeletapis.LabelZoneRegion { 551 | if v != newNode.GetLabels()[k] { 552 | invalidPredicates.Insert("NoVolumeZoneConflict") 553 | } 554 | } 555 | } 556 | } 557 | if !reflect.DeepEqual(oldTaints, newTaints) { 558 | invalidPredicates.Insert("PodToleratesNodeTaints") 559 | } 560 | if !reflect.DeepEqual(oldNode.Status.Conditions, newNode.Status.Conditions) { 561 | oldConditions := make(map[v1.NodeConditionType]v1.ConditionStatus) 562 | newConditions := make(map[v1.NodeConditionType]v1.ConditionStatus) 563 | for _, cond := range oldNode.Status.Conditions { 564 | oldConditions[cond.Type] = cond.Status 565 | } 566 | for _, cond := range newNode.Status.Conditions { 567 | newConditions[cond.Type] = cond.Status 568 | } 569 | if 
oldConditions[v1.NodeMemoryPressure] != newConditions[v1.NodeMemoryPressure] { 570 | invalidPredicates.Insert("CheckNodeMemoryPressure") 571 | } 572 | if oldConditions[v1.NodeDiskPressure] != newConditions[v1.NodeDiskPressure] { 573 | invalidPredicates.Insert("CheckNodeDiskPressure") 574 | } 575 | if oldConditions[v1.NodeReady] != newConditions[v1.NodeReady] || 576 | oldConditions[v1.NodeOutOfDisk] != newConditions[v1.NodeOutOfDisk] || 577 | oldConditions[v1.NodeNetworkUnavailable] != newConditions[v1.NodeNetworkUnavailable] || 578 | newNode.Spec.Unschedulable != oldNode.Spec.Unschedulable { 579 | invalidPredicates.Insert("CheckNodeCondition") 580 | } 581 | } 582 | c.equivalencePodCache.InvalidateCachedPredicateItem(newNode.GetName(), invalidPredicates) 583 | } 584 | } 585 | 586 | func (c *ConfigFactory) deleteNodeFromCache(obj interface{}) { 587 | var node *v1.Node 588 | switch t := obj.(type) { 589 | case *v1.Node: 590 | node = t 591 | case cache.DeletedFinalStateUnknown: 592 | var ok bool 593 | node, ok = t.Obj.(*v1.Node) 594 | if !ok { 595 | glog.Errorf("cannot convert to *v1.Node: %v", t.Obj) 596 | return 597 | } 598 | default: 599 | glog.Errorf("cannot convert to *v1.Node: %v", t) 600 | return 601 | } 602 | if err := c.schedulerCache.RemoveNode(node); err != nil { 603 | glog.Errorf("scheduler cache RemoveNode failed: %v", err) 604 | } 605 | if c.enableEquivalenceClassCache { 606 | c.equivalencePodCache.InvalidateAllCachedPredicateItemOfNode(node.GetName()) 607 | } 608 | } 609 | 610 | // Create creates a scheduler with the default algorithm provider. 611 | func (f *ConfigFactory) Create() (*scheduler.Config, error) { 612 | return f.CreateFromProvider(DefaultProvider) 613 | } 614 | 615 | // Creates a scheduler from the name of a registered algorithm provider. 
616 | func (f *ConfigFactory) CreateFromProvider(providerName string) (*scheduler.Config, error) { 617 | glog.V(2).Infof("Creating scheduler from algorithm provider '%v'", providerName) 618 | provider, err := GetAlgorithmProvider(providerName) 619 | if err != nil { 620 | return nil, err 621 | } 622 | 623 | return f.CreateFromKeys(provider.FitPredicateKeys, provider.PriorityFunctionKeys, []algorithm.SchedulerExtender{}) 624 | } 625 | 626 | // Creates a scheduler from the configuration file 627 | func (f *ConfigFactory) CreateFromConfig(policy schedulerapi.Policy) (*scheduler.Config, error) { 628 | glog.V(2).Infof("Creating scheduler from configuration: %v", policy) 629 | 630 | // validate the policy configuration 631 | if err := validation.ValidatePolicy(policy); err != nil { 632 | return nil, err 633 | } 634 | 635 | predicateKeys := sets.NewString() 636 | for _, predicate := range policy.Predicates { 637 | glog.V(2).Infof("Registering predicate: %s", predicate.Name) 638 | predicateKeys.Insert(RegisterCustomFitPredicate(predicate)) 639 | } 640 | 641 | priorityKeys := sets.NewString() 642 | for _, priority := range policy.Priorities { 643 | glog.V(2).Infof("Registering priority: %s", priority.Name) 644 | priorityKeys.Insert(RegisterCustomPriorityFunction(priority)) 645 | } 646 | 647 | extenders := make([]algorithm.SchedulerExtender, 0) 648 | if len(policy.ExtenderConfigs) != 0 { 649 | for ii := range policy.ExtenderConfigs { 650 | glog.V(2).Infof("Creating extender with config %+v", policy.ExtenderConfigs[ii]) 651 | if extender, err := core.NewHTTPExtender(&policy.ExtenderConfigs[ii]); err != nil { 652 | return nil, err 653 | } else { 654 | extenders = append(extenders, extender) 655 | } 656 | } 657 | } 658 | // Providing HardPodAffinitySymmetricWeight in the policy config is the new and preferred way of providing the value. 659 | // Give it higher precedence than scheduler CLI configuration when it is provided. 660 | if policy.HardPodAffinitySymmetricWeight != 0 { 661 | f.hardPodAffinitySymmetricWeight = policy.HardPodAffinitySymmetricWeight 662 | } 663 | return f.CreateFromKeys(predicateKeys, priorityKeys, extenders) 664 | } 665 | 666 | // getBinder returns an extender that supports bind or a default binder. 667 | func (f *ConfigFactory) getBinder(extenders []algorithm.SchedulerExtender) scheduler.Binder { 668 | for i := range extenders { 669 | if extenders[i].IsBinder() { 670 | return extenders[i] 671 | } 672 | } 673 | return &binder{f.client} 674 | } 675 | 676 | // Creates a scheduler from a set of registered fit predicate keys and priority keys. 
677 | func (f *ConfigFactory) CreateFromKeys(predicateKeys, priorityKeys sets.String, extenders []algorithm.SchedulerExtender) (*scheduler.Config, error) { 678 | glog.V(2).Infof("Creating scheduler with fit predicates '%v' and priority functions '%v'", predicateKeys, priorityKeys) 679 | 680 | if f.GetHardPodAffinitySymmetricWeight() < 1 || f.GetHardPodAffinitySymmetricWeight() > 100 { 681 | return nil, fmt.Errorf("invalid hardPodAffinitySymmetricWeight: %d, must be in the range 1-100", f.GetHardPodAffinitySymmetricWeight()) 682 | } 683 | 684 | predicateFuncs, err := f.GetPredicates(predicateKeys) 685 | if err != nil { 686 | return nil, err 687 | } 688 | 689 | priorityConfigs, err := f.GetPriorityFunctionConfigs(priorityKeys) 690 | if err != nil { 691 | return nil, err 692 | } 693 | 694 | priorityMetaProducer, err := f.GetPriorityMetadataProducer() 695 | if err != nil { 696 | return nil, err 697 | } 698 | 699 | predicateMetaProducer, err := f.GetPredicateMetadataProducer() 700 | if err != nil { 701 | return nil, err 702 | } 703 | 704 | // Init equivalence class cache 705 | if f.enableEquivalenceClassCache && getEquivalencePodFunc != nil { 706 | f.equivalencePodCache = core.NewEquivalenceCache(getEquivalencePodFunc) 707 | glog.Info("Created equivalence class cache") 708 | } 709 | algo := core.NewGenericScheduler(f.schedulerCache, f.equivalencePodCache, predicateFuncs, predicateMetaProducer, priorityConfigs, priorityMetaProducer, extenders) 710 | 711 | podBackoff := util.CreateDefaultPodBackoff() 712 | return &scheduler.Config{ 713 | SchedulerCache: f.schedulerCache, 714 | Ecache: f.equivalencePodCache, 715 | // The scheduler only needs to consider schedulable nodes. 716 | NodeLister: &nodeLister{f.nodeLister}, 717 | Algorithm: algo, 718 | Binder: f.getBinder(extenders), 719 | PodConditionUpdater: &podConditionUpdater{f.client}, 720 | PodPreemptor: &podPreemptor{f.client}, 721 | PodWriter: &podWriter{Client: f.client}, 722 | WaitForCacheSync: func() bool { 723 | return cache.WaitForCacheSync(f.StopEverything, f.scheduledPodsHasSynced) 724 | }, 725 | NextPod: func() *v1.Pod { 726 | return f.getNextPod() 727 | }, 728 | Error: f.MakeDefaultErrorFunc(podBackoff, f.podQueue), 729 | StopEverything: f.StopEverything, 730 | }, nil 731 | } 732 | 733 | type nodeLister struct { 734 | corelisters.NodeLister 735 | } 736 | 737 | func (n *nodeLister) List() ([]*v1.Node, error) { 738 | return n.NodeLister.List(labels.Everything()) 739 | } 740 | 741 | func (f *ConfigFactory) GetPriorityFunctionConfigs(priorityKeys sets.String) ([]algorithm.PriorityConfig, error) { 742 | pluginArgs, err := f.getPluginArgs() 743 | if err != nil { 744 | return nil, err 745 | } 746 | 747 | return getPriorityFunctionConfigs(priorityKeys, *pluginArgs) 748 | } 749 | 750 | func (f *ConfigFactory) GetPriorityMetadataProducer() (algorithm.MetadataProducer, error) { 751 | pluginArgs, err := f.getPluginArgs() 752 | if err != nil { 753 | return nil, err 754 | } 755 | 756 | return getPriorityMetadataProducer(*pluginArgs) 757 | } 758 | 759 | func (f *ConfigFactory) GetPredicateMetadataProducer() (algorithm.PredicateMetadataProducer, error) { 760 | pluginArgs, err := f.getPluginArgs() 761 | if err != nil { 762 | return nil, err 763 | } 764 | return getPredicateMetadataProducer(*pluginArgs) 765 | } 766 | 767 | func (f *ConfigFactory) GetPredicates(predicateKeys sets.String) (map[string]algorithm.FitPredicate, error) { 768 | pluginArgs, err := f.getPluginArgs() 769 | if err != nil { 770 | return nil, err 771 | } 772 | 773 | return 
getFitPredicateFunctions(predicateKeys, *pluginArgs) 774 | } 775 | 776 | func (f *ConfigFactory) getPluginArgs() (*PluginFactoryArgs, error) { 777 | return &PluginFactoryArgs{ 778 | PodLister: f.podLister, 779 | ServiceLister: f.serviceLister, 780 | ControllerLister: f.controllerLister, 781 | ReplicaSetLister: f.replicaSetLister, 782 | StatefulSetLister: f.statefulSetLister, 783 | NodeLister: &nodeLister{f.nodeLister}, 784 | NodeInfo: &predicates.CachedNodeInfo{NodeLister: f.nodeLister}, 785 | PVInfo: &predicates.CachedPersistentVolumeInfo{PersistentVolumeLister: f.pVLister}, 786 | PVCInfo: &predicates.CachedPersistentVolumeClaimInfo{PersistentVolumeClaimLister: f.pVCLister}, 787 | HardPodAffinitySymmetricWeight: f.hardPodAffinitySymmetricWeight, 788 | }, nil 789 | } 790 | 791 | func (f *ConfigFactory) getNextPod() *v1.Pod { 792 | for { 793 | pod := cache.Pop(f.podQueue).(*v1.Pod) 794 | if f.ResponsibleForPod(pod) { 795 | glog.V(4).Infof("About to try and schedule pod %v", pod.Name) 796 | return pod 797 | } 798 | } 799 | } 800 | 801 | func (f *ConfigFactory) ResponsibleForPod(pod *v1.Pod) bool { 802 | return f.schedulerName == pod.Spec.SchedulerName 803 | } 804 | 805 | // unassignedNonTerminatedPod selects pods that are unassigned and non-terminal. 806 | func unassignedNonTerminatedPod(pod *v1.Pod) bool { 807 | if len(pod.Spec.NodeName) != 0 { 808 | return false 809 | } 810 | if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed { 811 | return false 812 | } 813 | return true 814 | } 815 | 816 | // assignedNonTerminatedPod selects pods that are assigned and non-terminal (scheduled and running). 817 | func assignedNonTerminatedPod(pod *v1.Pod) bool { 818 | if len(pod.Spec.NodeName) == 0 { 819 | return false 820 | } 821 | if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed { 822 | return false 823 | } 824 | return true 825 | } 826 | 827 | // assignedPodLister filters the pods returned from a PodLister to 828 | // only include those that have a node name set. 829 | type assignedPodLister struct { 830 | corelisters.PodLister 831 | } 832 | 833 | // List lists all Pods in the indexer for a given namespace. 834 | func (l assignedPodLister) List(selector labels.Selector) ([]*v1.Pod, error) { 835 | list, err := l.PodLister.List(selector) 836 | if err != nil { 837 | return nil, err 838 | } 839 | filtered := make([]*v1.Pod, 0, len(list)) 840 | for _, pod := range list { 841 | if len(pod.Spec.NodeName) > 0 { 842 | filtered = append(filtered, pod) 843 | } 844 | } 845 | return filtered, nil 846 | } 847 | 848 | // List lists all Pods in the indexer for a given namespace. 849 | func (l assignedPodLister) Pods(namespace string) corelisters.PodNamespaceLister { 850 | return assignedPodNamespaceLister{l.PodLister.Pods(namespace)} 851 | } 852 | 853 | // assignedPodNamespaceLister filters the pods returned from a PodNamespaceLister to 854 | // only include those that have a node name set. 855 | type assignedPodNamespaceLister struct { 856 | corelisters.PodNamespaceLister 857 | } 858 | 859 | // List lists all Pods in the indexer for a given namespace. 
860 | func (l assignedPodNamespaceLister) List(selector labels.Selector) (ret []*v1.Pod, err error) { 861 | list, err := l.PodNamespaceLister.List(selector) 862 | if err != nil { 863 | return nil, err 864 | } 865 | filtered := make([]*v1.Pod, 0, len(list)) 866 | for _, pod := range list { 867 | if len(pod.Spec.NodeName) > 0 { 868 | filtered = append(filtered, pod) 869 | } 870 | } 871 | return filtered, nil 872 | } 873 | 874 | // Get retrieves the Pod from the indexer for a given namespace and name. 875 | func (l assignedPodNamespaceLister) Get(name string) (*v1.Pod, error) { 876 | pod, err := l.PodNamespaceLister.Get(name) 877 | if err != nil { 878 | return nil, err 879 | } 880 | if len(pod.Spec.NodeName) > 0 { 881 | return pod, nil 882 | } 883 | return nil, errors.NewNotFound(schema.GroupResource{Resource: string(v1.ResourcePods)}, name) 884 | } 885 | 886 | type podInformer struct { 887 | informer cache.SharedIndexInformer 888 | } 889 | 890 | func (i *podInformer) Informer() cache.SharedIndexInformer { 891 | return i.informer 892 | } 893 | 894 | func (i *podInformer) Lister() corelisters.PodLister { 895 | return corelisters.NewPodLister(i.informer.GetIndexer()) 896 | } 897 | 898 | // NewPodInformer creates a shared index informer that returns only non-terminal pods. 899 | func NewPodInformer(client clientset.Interface, resyncPeriod time.Duration) coreinformers.PodInformer { 900 | selector := fields.ParseSelectorOrDie("status.phase!=" + string(v1.PodSucceeded) + ",status.phase!=" + string(v1.PodFailed)) 901 | lw := cache.NewListWatchFromClient(client.CoreV1().RESTClient(), string(v1.ResourcePods), metav1.NamespaceAll, selector) 902 | return &podInformer{ 903 | informer: cache.NewSharedIndexInformer(lw, &v1.Pod{}, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}), 904 | } 905 | } 906 | 907 | func (factory *ConfigFactory) MakeDefaultErrorFunc(backoff *util.PodBackoff, podQueue *cache.FIFO) func(pod *v1.Pod, err error) { 908 | return func(pod *v1.Pod, err error) { 909 | if err == core.ErrNoNodesAvailable { 910 | glog.V(4).Infof("Unable to schedule %v %v: no nodes are registered to the cluster; waiting", pod.Namespace, pod.Name) 911 | } else { 912 | if _, ok := err.(*core.FitError); ok { 913 | glog.V(4).Infof("Unable to schedule %v %v: no fit: %v; waiting", pod.Namespace, pod.Name, err) 914 | } else { 915 | glog.Errorf("Error scheduling %v %v: %v; retrying", pod.Namespace, pod.Name, err) 916 | } 917 | } 918 | backoff.Gc() 919 | // Retry asynchronously. 920 | // Note that this is extremely rudimentary and we need a more real error handling path. 921 | go func() { 922 | defer runtime.HandleCrash() 923 | podID := types.NamespacedName{ 924 | Namespace: pod.Namespace, 925 | Name: pod.Name, 926 | } 927 | 928 | entry := backoff.GetEntry(podID) 929 | if !entry.TryWait(backoff.MaxDuration()) { 930 | glog.Warningf("Request for pod %v already in flight, abandoning", podID) 931 | return 932 | } 933 | // Get the pod again; it may have changed/been scheduled already. 
934 | getBackoff := initialGetBackoff 935 | for { 936 | pod, err := factory.client.CoreV1().Pods(podID.Namespace).Get(podID.Name, metav1.GetOptions{}) 937 | if err == nil { 938 | if len(pod.Spec.NodeName) == 0 { 939 | podQueue.AddIfNotPresent(pod) 940 | } 941 | break 942 | } 943 | if errors.IsNotFound(err) { 944 | glog.Warningf("A pod %v no longer exists", podID) 945 | return 946 | } 947 | glog.Errorf("Error getting pod %v for retry: %v; retrying...", podID, err) 948 | if getBackoff = getBackoff * 2; getBackoff > maximalGetBackoff { 949 | getBackoff = maximalGetBackoff 950 | } 951 | time.Sleep(getBackoff) 952 | } 953 | }() 954 | } 955 | } 956 | 957 | // nodeEnumerator allows a cache.Poller to enumerate items in an v1.NodeList 958 | type nodeEnumerator struct { 959 | *v1.NodeList 960 | } 961 | 962 | // Len returns the number of items in the node list. 963 | func (ne *nodeEnumerator) Len() int { 964 | if ne.NodeList == nil { 965 | return 0 966 | } 967 | return len(ne.Items) 968 | } 969 | 970 | // Get returns the item (and ID) with the particular index. 971 | func (ne *nodeEnumerator) Get(index int) interface{} { 972 | return &ne.Items[index] 973 | } 974 | 975 | type binder struct { 976 | Client clientset.Interface 977 | } 978 | 979 | // Bind just does a POST binding RPC. 980 | func (b *binder) Bind(binding *v1.Binding) error { 981 | glog.V(3).Infof("Attempting to bind %v to %v", binding.Name, binding.Target.Name) 982 | return b.Client.CoreV1().Pods(binding.Namespace).Bind(binding) 983 | } 984 | 985 | type podConditionUpdater struct { 986 | Client clientset.Interface 987 | } 988 | 989 | func (p *podConditionUpdater) Update(pod *v1.Pod, condition *v1.PodCondition) error { 990 | glog.V(2).Infof("Updating pod condition for %s/%s to (%s==%s)", pod.Namespace, pod.Name, condition.Type, condition.Status) 991 | if podutil.UpdatePodCondition(&pod.Status, condition) { 992 | _, err := p.Client.CoreV1().Pods(pod.Namespace).UpdateStatus(pod) 993 | return err 994 | } 995 | return nil 996 | } 997 | 998 | type podPreemptor struct { 999 | Client clientset.Interface 1000 | } 1001 | 1002 | func (p *podPreemptor) GetUpdatedPod(pod *v1.Pod) (*v1.Pod, error) { 1003 | return p.Client.CoreV1().Pods(pod.Namespace).Get(pod.Name, metav1.GetOptions{}) 1004 | } 1005 | 1006 | func (p *podPreemptor) DeletePod(pod *v1.Pod) error { 1007 | return p.Client.CoreV1().Pods(pod.Namespace).Delete(pod.Name, &metav1.DeleteOptions{}) 1008 | } 1009 | 1010 | //TODO(bsalamat): change this to patch PodStatus to avoid overwriting potential pending status updates. 
1011 | func (p *podPreemptor) UpdatePodAnnotations(pod *v1.Pod, annotations map[string]string) error {
1012 | 	podCopy := pod.DeepCopy()
1013 | 	if podCopy.Annotations == nil {
1014 | 		podCopy.Annotations = map[string]string{}
1015 | 	}
1016 | 	for k, v := range annotations {
1017 | 		podCopy.Annotations[k] = v
1018 | 	}
1019 | 	_, err := p.Client.CoreV1().Pods(podCopy.Namespace).UpdateStatus(podCopy)
1020 | 	return err
1021 | }
1022 | 
1023 | // podWriter deletes and recreates pods on behalf of the scheduler; it is wired in
1024 | // through the PodWriter field of scheduler.Config in CreateFromKeys above.
1025 | type podWriter struct {
1026 | 	Client clientset.Interface
1027 | 	mutex  sync.Mutex
1028 | }
1029 | 
1030 | // UpdatePod replaces a pod labeled task-type=Nimbix that could not be scheduled:
1031 | // the old pod is force-deleted (zero grace period) and newPod is created in its
1032 | // place. Pods without the Nimbix label are left untouched.
1033 | func (pw *podWriter) UpdatePod(oldPod, newPod *v1.Pod) error {
1034 | 	pw.mutex.Lock()
1035 | 	defer pw.mutex.Unlock()
1036 | 	graceperiod := int64(0)
1037 | 	if oldPod.Labels["task-type"] == "Nimbix" {
1038 | 		glog.V(2).Infof("Deleting pod %s/%s as it could not be scheduled", oldPod.Namespace, oldPod.Name)
1039 | 		err := pw.Client.CoreV1().Pods(oldPod.Namespace).Delete(oldPod.Name, &metav1.DeleteOptions{
1040 | 			GracePeriodSeconds: &graceperiod,
1041 | 		})
1042 | 		if err != nil {
1043 | 			return err
1044 | 		}
1045 | 
1046 | 		glog.V(2).Infof("Creating new pod %s/%s", newPod.Namespace, newPod.Name)
1047 | 		pod, err := pw.Client.CoreV1().Pods(oldPod.Namespace).Create(newPod)
1048 | 		if err != nil {
1049 | 			return err
1050 | 		}
1051 | 		glog.V(2).Infof("Created new pod %s/%s", pod.Namespace, pod.Name)
1052 | 	}
1053 | 	return nil
1054 | }
1055 | 
--------------------------------------------------------------------------------
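The `podWriter` type at the end of `factory.go` is the main local change to the stock Kubernetes 1.8 factory: `CreateFromKeys` exposes it to the scheduler through the `PodWriter` field of `scheduler.Config`, and it acts only on pods labeled `task-type=Nimbix`, force-deleting the unschedulable pod and creating the replacement it is given (the decision of when to call it presumably lives in the modified `scheduler.go`, not shown in this file). The sketch below is a minimal in-package test of that behaviour using client-go's fake clientset; the test file name, pod names, and namespace are illustrative assumptions, not part of this repository.

```go
// factory_podwriter_test.go (hypothetical file name, same package as factory.go)
package factory

import (
	"testing"

	"k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes/fake"
)

func TestPodWriterReplacesNimbixPod(t *testing.T) {
	// A pod the custom scheduler failed to place; only the task-type=Nimbix
	// label matters to podWriter.UpdatePod.
	oldPod := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "nimbix-gpu-old",
			Namespace: "default",
			Labels:    map[string]string{"task-type": "Nimbix"},
		},
	}
	newPod := oldPod.DeepCopy()
	newPod.Name = "nimbix-gpu-new"

	// Seed a fake clientset with the old pod and run the replacement.
	client := fake.NewSimpleClientset(oldPod)
	pw := &podWriter{Client: client}
	if err := pw.UpdatePod(oldPod, newPod); err != nil {
		t.Fatalf("UpdatePod returned error: %v", err)
	}

	// The old pod should have been force-deleted and the replacement created.
	if _, err := client.CoreV1().Pods("default").Get("nimbix-gpu-old", metav1.GetOptions{}); err == nil {
		t.Errorf("expected old pod to be deleted")
	}
	if _, err := client.CoreV1().Pods("default").Get("nimbix-gpu-new", metav1.GetOptions{}); err != nil {
		t.Errorf("expected replacement pod to exist: %v", err)
	}
}
```

Because `podWriter` is unexported, a check like this has to live in the same `factory` package; with the fake clientset the delete/create pair can be exercised without a running cluster.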