├── .arcconfig
├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── Vagrantfile
├── changes_mesos_scheduler
│   ├── __init__.py
│   ├── changes_scheduler.py
│   ├── main.py
│   ├── statsreporter.py
│   └── tests
│       ├── __init__.py
│       ├── test_changes_scheduler.py
│       └── test_service.py
├── ci
│   ├── mypy-run
│   ├── mypy-setup
│   ├── run_mypy.py
│   ├── run_tests.sh
│   └── setup.sh
├── make_virtualenv.sh
├── scripts
│   └── changes-mesos-scheduler
├── setup.py
└── support
    └── bootstrap-vagrant.sh

--------------------------------------------------------------------------------
/.arcconfig:
--------------------------------------------------------------------------------
1 | {
2 |   "conduit_uri" : "https://tails.corp.dropbox.com/api/",
3 |   "copyright_holder" : "Dropbox",
4 |   "repository.callsign": "CHANGESMESOSFWK"
5 | }
6 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | build/
3 | dist/
4 | *.egg-info/
5 | *.deb
6 | /.vagrant/
7 | .idea
8 | setup.cfg
9 | 
10 | .coverage
11 | coverage.xml
12 | *junit.xml
13 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |                               Apache License
2 |                         Version 2.0, January 2004
3 |                      http://www.apache.org/licenses/
4 | 
5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 |    1. Definitions.
8 | 
9 |       "License" shall mean the terms and conditions for use, reproduction,
10 |       and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 |       "Licensor" shall mean the copyright owner or entity authorized by
13 |       the copyright owner that is granting the License.
14 | 
15 |       "Legal Entity" shall mean the union of the acting entity and all
16 |       other entities that control, are controlled by, or are under common
17 |       control with that entity. For the purposes of this definition,
18 |       "control" means (i) the power, direct or indirect, to cause the
19 |       direction or management of such entity, whether by contract or
20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 |       outstanding shares, or (iii) beneficial ownership of such entity.
22 | 
23 |       "You" (or "Your") shall mean an individual or Legal Entity
24 |       exercising permissions granted by this License.
25 | 
26 |       "Source" form shall mean the preferred form for making modifications,
27 |       including but not limited to software source code, documentation
28 |       source, and configuration files.
29 | 
30 |       "Object" form shall mean any form resulting from mechanical
31 |       transformation or translation of a Source form, including but
32 |       not limited to compiled object code, generated documentation,
33 |       and conversions to other media types.
34 | 
35 |       "Work" shall mean the work of authorship, whether in Source or
36 |       Object form, made available under the License, as indicated by a
37 |       copyright notice that is included in or attached to the work
38 |       (an example is provided in the Appendix below).
39 | 
40 |       "Derivative Works" shall mean any work, whether in Source or Object
41 |       form, that is based on (or derived from) the Work and for which the
42 |       editorial revisions, annotations, elaborations, or other modifications
43 |       represent, as a whole, an original work of authorship. For the purposes
44 |       of this License, Derivative Works shall not include works that remain
45 |       separable from, or merely link (or bind by name) to the interfaces of,
46 |       the Work and Derivative Works thereof.
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2014 Dropbox, Inc. 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
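As a concrete illustration of the appendix above: in a Python project like this one, the boilerplate becomes a comment header at the top of each source file. This is a sketch with this repository's copyright holder and year filled in; the exact placement within a file is the author's choice:

```python
# Copyright 2014 Dropbox, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
```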
202 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | PKG_NAME = changes-mesos-scheduler
2 | VERSION = 0.0.2
3 | # Revision shows date of latest commit and abbreviated commit SHA
4 | # E.g., 1438708515-753e183
5 | REV=`git show -s --format=%ct-%h HEAD`
6 | 
7 | DEB_VERSION = "$(VERSION)-$(REV)"
8 | 
9 | test:
10 | 	py.test changes_mesos_scheduler/tests/
11 | 
12 | install-test-requirements:
13 | 	pip install "file://`pwd`#egg=changes-mesos-scheduler[tests]"
14 | 
15 | coverage:
16 | 	coverage run -m py.test --junitxml=python.junit.xml changes_mesos_scheduler/tests/
17 | 	coverage xml
18 | 
19 | virtualenv:
20 | 	./make_virtualenv.sh $(PKG_NAME)
21 | 
22 | deb: virtualenv
23 | 	fpm -f -t deb -s dir -C build -n $(PKG_NAME) -v $(DEB_VERSION) -d libcurl3 -d libsvn1 -d libsasl2-modules .
24 | 
25 | install_deb: deb
26 | 	sudo dpkg -i "$(PKG_NAME)_$(DEB_VERSION)_amd64.deb" || \
27 | 	sudo apt-get install -f -y --force-yes  # Sadly, this is necessary to install any missing deps
28 | 
29 | virtualenv_coverage: install_deb
30 | 	. /usr/share/python/$(PKG_NAME)/bin/activate; \
31 | 	make coverage
32 | 	# Sanity check installed binary
33 | 	/usr/share/python/$(PKG_NAME)/bin/$(PKG_NAME) --help
34 | 
35 | virtualenv_test: install_deb
36 | 	. /usr/share/python/$(PKG_NAME)/bin/activate; \
37 | 	make test
38 | 	# Sanity check installed binary
39 | 	/usr/share/python/$(PKG_NAME)/bin/$(PKG_NAME) --help
40 | 
41 | .PHONY: deb
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ***NOTICE: THIS REPO IS NO LONGER UPDATED***
2 | 
3 | Changes Mesos Scheduler
4 | =======================
5 | Setting up the Vagrant VM:
6 | 
7 | ```shell
8 | vagrant up
9 | vagrant ssh
10 | ```
11 | 
12 | Building a deb:
13 | 
14 | ```shell
15 | cd /vagrant
16 | make deb
17 | ```
18 | 
19 | `make install_deb` will build the deb and also install it on your machine.
20 | 
21 | Running tests:
22 | ```shell
23 | cd /vagrant
24 | make test
25 | ```
26 | 
27 | You can also run tests locally (on your host machine). You need to install
28 | Mesos (`brew install mesos` on a Mac), and may need to `sudo pip install mesos`
29 | as well. After that, `make test` should work (your mileage may vary; this is
30 | only really tested on a Mac).
31 | 
32 | 
33 | Running the scheduler requires a running Mesos master, but this Vagrant VM is not yet set up for that. You can instead use a different one:
34 | 
35 | ```shell
36 | git clone git@github.com:mesosphere/playa-mesos.git
37 | cp your-changes-mesos-scheduler.deb playa-mesos/
38 | cd playa-mesos
39 | 
40 | vagrant up
41 | vagrant ssh
42 | 
43 | sudo dpkg -i /vagrant/your-changes-mesos-scheduler.deb
44 | sudo mkdir -p /etc/changes-mesos-scheduler
45 | sudo touch /etc/changes-mesos-scheduler/blacklist
46 | 
47 | /usr/share/python/changes-mesos-scheduler/bin/changes-mesos-scheduler --help
48 | /usr/share/python/changes-mesos-scheduler/bin/changes-mesos-scheduler --api-url your-changes-endpoint
49 | ```
--------------------------------------------------------------------------------
/Vagrantfile:
--------------------------------------------------------------------------------
1 | # -*- mode: ruby -*-
2 | # vi: set ft=ruby :
3 | 
4 | # Vagrantfile API/syntax version. Don't touch unless you know what you're doing!
5 | VAGRANTFILE_API_VERSION = "2"
6 | 
7 | Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
8 |   config.vm.box = "ubuntu/trusty64"
9 | 
10 |   config.ssh.forward_agent = true
11 | 
12 |   config.vm.provision :shell, :path => "support/bootstrap-vagrant.sh"
13 | end
14 | 
--------------------------------------------------------------------------------
/changes_mesos_scheduler/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dropbox/changes-mesos-framework/cbb2351d45b4231286a18e70e5fea039b121d0a4/changes_mesos_scheduler/__init__.py
--------------------------------------------------------------------------------
/changes_mesos_scheduler/changes_scheduler.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, print_function
2 | 
3 | import bisect
4 | import concurrent.futures
5 | import json
6 | import logging
7 | import os
8 | import threading
9 | import time
10 | import urllib2  # type: ignore
11 | 
12 | from changes_mesos_scheduler import statsreporter
13 | 
14 | from typing import Any, Callable, Dict, List, NamedTuple, Optional, Set, Tuple
15 | 
16 | from collections import defaultdict
17 | from threading import Event
18 | from urllib import urlencode
19 | from uuid import uuid4
20 | 
21 | from google.protobuf import text_format as _text_format  # type: ignore
22 | 
23 | from mesos.interface import Scheduler, SchedulerDriver
24 | from mesos.interface import mesos_pb2
25 | 
26 | # how long (in seconds) we'll continue trying to kill a task. After that we give up.
27 | TASK_KILL_THRESHOLD = 3600
28 | 
29 | class FileBlacklist(object):
30 |     """ File-backed blacklist for slave hostnames.
31 |     Hosts are expected to be named in the file, one per line.
32 |     Blank lines and lines beginning with '#' are ignored.
33 |     """
34 |     def __init__(self, path):
35 |         # type: (str) -> None
36 |         self._path = path  # type: str
37 |         self._mtime = 0.0
38 |         self._blacklist = set()  # type: Set[str]
39 | 
40 |     def refresh(self):
41 |         # type: () -> None
42 |         """Refresh the blacklist if the file changed."""
43 |         if os.path.getmtime(self._path) > self._mtime:
44 |             self._refresh()
45 | 
46 |     def _refresh(self):
47 |         # type: () -> None
48 |         """Unconditionally refresh the blacklist from the file."""
49 |         logging.info('Refreshing blacklist')
50 |         self._mtime = os.path.getmtime(self._path)
51 |         with open(self._path) as file:
52 |             self._blacklist = set([s.strip() for s in file.readlines() if s.strip() and not s.startswith('#')])
53 | 
54 |     def contains(self, hostname):
55 |         # type: (str) -> bool
56 |         """Returns whether the provided hostname is present in the blacklist as of last reading."""
57 |         return hostname in self._blacklist
58 | 
59 | 
60 | class APIError(Exception):
61 |     """An Exception originating from ChangesAPI.
62 |     This mostly exists so that our uncertainty of the possible Exceptions
63 |     originating from API requests doesn't muddy the error handling in the Scheduler.
64 |     """
65 |     def __init__(self, msg, cause=None):
66 |         # type: (str, Any) -> None
67 |         super(APIError, self).__init__(msg)
68 |         self.cause = cause
69 | 
70 | 
71 | class ChangesAPI(object):
72 |     """Client for the Changes API, intended for Scheduler use.
73 |     Any exceptions resulting from runtime failures should be APIErrors.
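    Example (sketch only; the endpoint URL here is hypothetical):
        api = ChangesAPI('https://changes.example.com/api/0')
        jobsteps = api.get_allocate_jobsteps(limit=10, cluster=None)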
74 | """ 75 | 76 | def __init__(self, api_url): 77 | # type: (str) -> None 78 | self._api_url = api_url 79 | 80 | @staticmethod 81 | def make_url(base_url, path, get_params=None): 82 | # type: (str, str, Optional[Dict[str,str]]) -> str 83 | # Changes insists that paths end with a slash 84 | path = path if path.endswith('/') else path + '/' 85 | # Make sure there's exactly one slash between path and the API url 86 | path = path if path.startswith('/') else '/' + path 87 | base_url = base_url.rstrip('/') 88 | full_url = base_url + path 89 | if get_params: 90 | query_string = '?' + urlencode(get_params) 91 | full_url += query_string 92 | return full_url 93 | 94 | def _api_request(self, path, body=None, get_params=None): 95 | # type: (str, Optional[Dict[str, Any]], Optional[Dict[str, Any]]) -> Dict[str, Any] 96 | full_url = ChangesAPI.make_url(self._api_url, path, get_params) 97 | try: 98 | data = json.dumps(body) if body else None 99 | req = urllib2.Request( 100 | full_url, data, 101 | {'Content-Type': 'application/json'}) 102 | # Any connectivity issues will raise an exception, as will some error statuses. 103 | content = urllib2.urlopen(req).read() 104 | return json.loads(content) 105 | except Exception as exc: 106 | # Always log exceptions so callers don't have to. 107 | logging.exception("Error POSTing to Changes at %s", full_url) 108 | raise APIError("Error POSTing to Changes at %s" % full_url, exc) 109 | 110 | def get_allocate_jobsteps(self, limit=None, cluster=None): 111 | # type: (Optional[int], Optional[str]) -> List[Dict[str, Any]] 112 | """ Returns a list of up to `limit` pending allocation jobsteps in `cluster`. 113 | The scheduler may then allocate these as it sees fit. 114 | 115 | Args: 116 | limit: maximum jobsteps to return 117 | cluster: cluster to look in. The "default" cluster 118 | returns jobsteps with no cluster specified. 119 | 120 | Returns: 121 | list: List of JobSteps (in priority order) that are pending allocation 122 | """ 123 | data = {'limit': limit} if limit else {} # type: Dict[str, Any] 124 | if cluster: 125 | data['cluster'] = cluster 126 | return self._api_request("/jobsteps/allocate/", get_params=data)['jobsteps'] 127 | 128 | def post_allocate_jobsteps(self, jobstep_ids, cluster=None): 129 | # type: (List[str], Optional[str]) -> List[str] 130 | """ Attempt to allocate the given list of JobStep ids. 131 | 132 | Args: 133 | jobstep_ids: list of JobStep ID hexs to allocate. 134 | cluster: cluster to allocate in. 135 | 136 | Returns: 137 | list: list of jobstep ID hexs that were actually allocated. 138 | """ 139 | data = {'jobstep_ids': jobstep_ids} # type: Dict[str, Any] 140 | if cluster: 141 | data['cluster'] = cluster 142 | return self._api_request("/jobsteps/allocate/", data)['allocated'] 143 | 144 | def jobstep_needs_abort(self, jobstep_ids): 145 | # type: (List[str]) -> List[str] 146 | """ Query for which jobsteps in a given list should be aborted. 147 | 148 | Args: 149 | jobstep_ids: JobStep ID hexs we are asking about. 150 | Returns: 151 | list: subset of the jobstep_ids, which should be aborted. 
152 | """ 153 | # don't bother sending the request if there are no jobstep ids 154 | if len(jobstep_ids) == 0: 155 | return [] 156 | data = {'jobstep_ids': jobstep_ids} 157 | return self._api_request("/jobsteps/needs_abort/", data)['needs_abort'] 158 | 159 | def update_jobstep(self, jobstep_id, status, result=None, hostname=None): 160 | # type: (str, str, Optional[str], Optional[str]) -> None 161 | """ Update the recorded status and possibly result of a JobStep in Changes. 162 | 163 | Args: 164 | jobstep_id: JobStep ID. 165 | status: Status (one of "finished", "queued", "in_progress"). 166 | result: Optionally one of 'failed', 'passed', 'aborted', 'skipped', or 'infra_failed'. 167 | hostname: Optional hostname of slave we are running this jobstep on 168 | """ 169 | data = {"status": status} 170 | if result: 171 | data["result"] = result 172 | if hostname: 173 | data["node"] = hostname 174 | self._api_request("/jobsteps/{}/".format(jobstep_id), data) 175 | 176 | def jobstep_console_append(self, jobstep_id, text): 177 | # type: (str, str) -> None 178 | """ Append to the JobStep's console log. 179 | Args: 180 | jobstep_id: JobStep ID. 181 | text: Text to append. 182 | """ 183 | url = '/jobsteps/%s/logappend/' % jobstep_id 184 | self._api_request(url, {'source': 'console', 'text': text}) 185 | 186 | 187 | class SlaveInfo(object): 188 | def __init__(self, hostname): 189 | # type: (str) -> None 190 | self.hostname = hostname 191 | 192 | class ChangesScheduler(Scheduler): 193 | def __init__(self, state_file, api, blacklist, stats=None, 194 | changes_request_limit=200): 195 | # type: (str, ChangesAPI, FileBlacklist, Optional[Any], int) -> None 196 | """ 197 | Args: 198 | state_file (str): Path where serialized internal state will be 199 | stored. 200 | api (ChangesAPI): API to use for interacting with Changes. 201 | blacklist (FileBlacklist): Blacklist to use. 202 | stats (statsreporter.Stats): Optional Stats instance to use. 203 | """ 204 | self.framework_id = None # type: Optional[str] 205 | self._changes_api = api 206 | self.taskJobStepMappingLock = threading.Lock() 207 | self.taskJobStepMapping = {} # type: Dict[str, str] 208 | # maps from a slave_id to general info about that slave (currently only its hostname) 209 | self.slaveIdInfo = {} # type: Dict[str, SlaveInfo] 210 | # maps from a task id to a timestamp of when we first tried killing that task 211 | self.tasksPendingKill = {} # type: Dict[str, float] 212 | self.tasksLaunched = 0 213 | self.tasksFinished = 0 214 | self.shuttingDown = Event() 215 | # Use the provided Stats or create a no-op one. 216 | self._stats = stats or statsreporter.Stats(None) 217 | self._blacklist = blacklist 218 | # Refresh now so that if it fails, it fails at startup. 219 | self._blacklist.refresh() 220 | self.state_file = state_file 221 | self.changes_request_limit = changes_request_limit 222 | self._snapshot_slave_map = defaultdict(lambda: defaultdict(float)) # type: Dict[str, Dict[str, float]] 223 | 224 | # Variables to help with polling Changes for pending jobsteps in a 225 | # separate thread. _cached_slaves_lock protects _cached_slaves. 226 | self._cached_slaves_lock = threading.Lock() 227 | self._cached_slaves = {} # type: Dict[str, ChangesScheduler.Slave] 228 | 229 | # Restore state from a previous run 230 | if not self.state_file: 231 | logging.warning("State file location not set. Not restoring old state.") 232 | elif not os.path.exists(self.state_file): 233 | logging.warning("State file not found. 
Not restoring old state.") 234 | else: 235 | try: 236 | self.restore_state() 237 | except Exception: 238 | logging.exception("Failed to restore state. Continuing as a new scheduler.") 239 | else: 240 | # Delete the old file to prevent it from being used again on a restart 241 | # as it will likely be stale. 242 | os.remove(self.state_file) 243 | 244 | def poll_changes_until_shutdown(self, driver, interval): 245 | # type: (SchedulerDriver, int) -> None 246 | """In a separate thread, periodically poll Changes for jobsteps that 247 | need to be scheduled. This method will block, waiting indefinitely 248 | until shuttingDown() is set. Then the thread will terminate (finishing 249 | any current polling activity if necessary) and this method will return. 250 | Args: 251 | driver: the MesosSchedulerDriver object 252 | interval: number of seconds in each poll loop. 253 | """ 254 | with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor: 255 | future = executor.submit(self._polling_loop, driver, interval) 256 | logging.info("Started thread at %s. Now waiting...", time.ctime()) 257 | while not future.done(): 258 | time.sleep(.01) 259 | try: 260 | future.result() 261 | except Exception: 262 | logging.exception("Polling thread failed. Exiting.") 263 | self.decline_open_offers(driver) 264 | 265 | def _polling_loop(self, driver, interval): 266 | # type: (SchedulerDriver, int) -> None 267 | """Poll Changes for new jobsteps forever, until shuttingDown is set. 268 | Args: 269 | driver: the MesosSchedulerDriver object 270 | interval: number of seconds in each poll loop. 271 | """ 272 | try: 273 | next_wait_duration = 0.0 274 | while not self.shuttingDown.wait(next_wait_duration): 275 | start_time = time.time() 276 | # Loop as long as Changes continues providing tasks to schedule. 277 | while self.poll_and_launch_once(driver): 278 | pass 279 | 280 | # kill any aborted jobsteps too 281 | self.poll_and_abort(driver) 282 | 283 | # Schedule the delay for the next iteration of the loop, 284 | # attempting to compensate for scheduling skew caused by 285 | # polling/computation time. 286 | last_poll_duration = time.time() - start_time 287 | next_wait_duration = max(0, interval - last_poll_duration) 288 | finally: 289 | # In the event of an exception in the polling thread, shut 290 | # everything down clean(ish)ly. 291 | self.shuttingDown.set() 292 | 293 | def poll_and_launch_once(self, driver): 294 | # type: (SchedulerDriver) -> bool 295 | """Poll Changes once for all jobsteps matching all clusters for which 296 | we have offers. Then assign these jobsteps to offers. Then execute the 297 | assignments by launching tasks on Mesos and informing Changes about 298 | the assignments. 299 | This is also the entry point for most testing, since it skips the 300 | annoying threading and while-loop behavior that make synchronization 301 | difficult. 302 | Args: 303 | driver: the MesosSchedulerDriver object 304 | Returns: 305 | bool: True if there are more jobsteps to fetch from Changes, False 306 | otherwise. 307 | """ 308 | # TODO: There's presently a window between post_allocate_jobsteps() and 309 | # launchTasks() where Changes thinks tasks are scheduled on Mesos, but 310 | # the tasks haven't actually been scheduled yet. If there's a shutdown 311 | # or failure in this window, it can be a long time before Changes will 312 | # figure it out and re-submit the tasks to the scheduler. 
313 |         #
314 |         # Also note that until post_allocate_jobsteps() is called, Changes will
315 |         # just keep returning the same set of jobsteps to
316 |         # get_allocate_jobsteps(). Thus we call get- and post- in a 1:1
317 |         # ratio, otherwise we could have an infinite poll loop on Changes.
318 |         #
319 |         # To that end, consider implementing something like the following:
320 |         #   1) Query Changes for jobsteps
321 |         #   2) Internally assign jobsteps to offers
322 |         #   3) Store assignments in scheduler's state.pending_assignments
323 |         #   4) Write the state file each time the state changes, rather than
324 |         #      only on shutdown, such that we'd have everything in order in the
325 |         #      event of a problem.
326 |         #   5) post_allocate_jobsteps() the assignments
327 |         #   6) Goto 1 until no more jobsteps
328 |         #   7) Launch jobsteps on mesos
329 |         #   8) Clear state.pending_assignments and write state file.
330 |         #
331 |         #   9) On startup, jobstep_deallocate any state.pending_assignments
332 |         with self._cached_slaves_lock:
333 |             # Get all slaves (composites of individual offers on the same host)
334 |             all_slaves = self._cached_slaves.values()
335 |             filtered_slaves = self._filter_slaves(all_slaves)
336 |             logging.info("Scheduling cycle with %d available slaves (%d "
337 |                          "after filtering).",
338 |                          len(all_slaves), len(filtered_slaves))
339 |             slaves_by_cluster = self._slaves_by_cluster(filtered_slaves)
340 | 
341 |             # Get all jobsteps, organized by cluster.
342 |             jobsteps_by_cluster = self._query_changes_for_jobsteps(
343 |                 driver, slaves_by_cluster.keys())
344 | 
345 |             # For each cluster, assign jobsteps to slaves, then launch the
346 |             # jobsteps on those slaves, using multiple offers if necessary.
347 |             for cluster, jobsteps in jobsteps_by_cluster.iteritems():
348 |                 self._assign_jobsteps(cluster,
349 |                                       slaves_by_cluster[cluster],
350 |                                       jobsteps_by_cluster[cluster])
351 |                 self._launch_jobsteps(driver,
352 |                                       cluster,
353 |                                       slaves_by_cluster[cluster])
354 | 
355 |         # Guess whether or not there are more jobsteps waiting on Changes by
356 |         # comparing the number of jobsteps received vs. the number of jobsteps
357 |         # requested.
358 |         return any(len(jobsteps) == self.changes_request_limit for jobsteps in jobsteps_by_cluster.itervalues())
359 | 
360 |     def poll_and_abort(self, driver):
361 |         # type: (SchedulerDriver) -> None
362 |         """Poll Changes to see if any jobsteps we're responsible for should be aborted.
363 |         We ask the Mesos master to kill the tasks for these jobsteps.
364 |         """
365 |         jobStepTaskMapping = {}
366 |         with self.taskJobStepMappingLock:
367 |             for task_id, jobstep_id in self.taskJobStepMapping.iteritems():
368 |                 jobStepTaskMapping[jobstep_id] = task_id
369 |         try:
370 |             abort_jobstep_ids = self._changes_api.jobstep_needs_abort(sorted(jobStepTaskMapping.keys()))
371 |         except APIError:
372 |             logging.warning('/jobstep/needs_abort/ failed', exc_info=True)
373 |             abort_jobstep_ids = []
374 | 
375 |         now = time.time()
376 |         for jobstep_id in abort_jobstep_ids:
377 |             task_id = jobStepTaskMapping[jobstep_id]
378 |             with self.taskJobStepMappingLock:
379 |                 # add it to tasksPendingKill if it's not already there.
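                # (dict.setdefault returns the existing timestamp if one is
                # already recorded, so repeated polls don't reset the timer.)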
380 | first_tried_to_kill = self.tasksPendingKill.setdefault(task_id, now) 381 | if now - first_tried_to_kill > TASK_KILL_THRESHOLD: 382 | # giving up on this one 383 | logging.warning("Task %s (jobstep ID %s) still hasn't been successfully killed, giving up.", task_id, jobstep_id) 384 | self._stats.incr('couldnt_abort_task') 385 | del self.taskJobStepMapping[task_id] 386 | del self.tasksPendingKill[task_id] 387 | continue 388 | logging.info('Asking Mesos to kill task %s (jobstep ID %s)', task_id, jobstep_id) 389 | driver.killTask(mesos_pb2.TaskID(value=task_id)) 390 | 391 | def decline_open_offers(self, driver): 392 | # type: (SchedulerDriver) -> None 393 | """Decline all cached Mesos pb_offers. 394 | """ 395 | with self._cached_slaves_lock: 396 | slaves = self._cached_slaves.values() 397 | for slave in slaves: 398 | self._stat_and_log_list(slave.offers(), 'decline_for_shutdown', 399 | lambda offer: "Shutting down, declining offer: %s" % offer.offer.id) 400 | self._decline_list(driver, slave.offers()) 401 | self._cached_slaves = {} 402 | 403 | def registered(self, driver, frameworkId, masterInfo): 404 | """ 405 | Invoked when the scheduler successfully registers with a Mesos master. 406 | It is called with the frameworkId, a unique ID generated by the 407 | master, and the masterInfo which is information about the master 408 | itself. 409 | """ 410 | logging.info("Registered with framework ID %s", frameworkId.value) 411 | self.framework_id = frameworkId.value 412 | 413 | def reregistered(self, driver, masterInfo): 414 | """ 415 | Invoked when the scheduler re-registers with a newly elected Mesos 416 | master. This is only called when the scheduler has previously been 417 | registered. masterInfo contains information about the newly elected 418 | master. 419 | """ 420 | logging.info("Re-Registered with new master") 421 | 422 | def disconnected(self, driver): 423 | # type: (SchedulerDriver) -> None 424 | """ 425 | Invoked when the scheduler becomes disconnected from the master, e.g. 426 | the master fails and another is taking over. 427 | Abandon all open offers and slaves. We don't decline, since there's 428 | no master to report to. The new master should provide a new batch of 429 | offers soon enough. 430 | """ 431 | logging.info("Disconnected from master. 
Abandoning all cached offer and slave info without declining.")
432 |         with self._cached_slaves_lock:
433 |             self._cached_slaves = {}
434 | 
435 |     @staticmethod
436 |     def _decode_typed_field(pb):
437 |         field_type = pb.type
438 |         if field_type == mesos_pb2.Value.SCALAR:
439 |             return pb.scalar.value
440 |         elif field_type == mesos_pb2.Value.RANGES:
441 |             return [{"begin": ra.begin, "end": ra.end} for ra in pb.ranges.range]
442 |         elif field_type == mesos_pb2.Value.SET:
443 |             return pb.set.item
444 |         elif field_type == mesos_pb2.Value.TEXT:
445 |             return pb.text.value
446 |         else:
447 |             raise Exception("Unknown field type: %s" % field_type)
448 | 
449 |     @staticmethod
450 |     def _decode_attribute(attr_pb):
451 |         return (attr_pb.name, ChangesScheduler._decode_typed_field(attr_pb))
452 | 
453 |     @staticmethod
454 |     def _decode_resource(resource_pb):
455 |         return (resource_pb.name, ChangesScheduler._decode_typed_field(resource_pb))
456 | 
457 |     @property
458 |     def activeTasks(self):
459 |         return self.tasksLaunched - self.tasksFinished
460 | 
461 |     @staticmethod
462 |     def get_cluster(offer):
463 |         attributes = dict([ChangesScheduler._decode_attribute(a) for a in offer.attributes])
464 |         return attributes.get('labels')
465 | 
466 |     @staticmethod
467 |     def get_resources(offer):
468 |         return {name: value for (name, value) in
469 |                 [ChangesScheduler._decode_resource(r) for r in offer.resources]}
470 | 
471 |     class OfferWrapper(object):
472 |         """Precompute some commonly-used fields from a Mesos Offer proto.
473 |         """
474 |         def __init__(self, pb_offer):
475 |             # type: (Any) -> None
476 |             self.offer = pb_offer
477 |             self.cluster = ChangesScheduler.get_cluster(pb_offer)
478 | 
479 |             resources = ChangesScheduler.get_resources(pb_offer)
480 |             self.cpu = resources.get('cpus', 0.0)
481 |             self.mem = resources.get('mem', 0)
482 | 
483 |         def __cmp__(self, other):
484 |             # type: (ChangesScheduler.OfferWrapper) -> int
485 |             """Comparator for sorting offers by "least loaded".
486 |             """
487 |             # we prioritize first by cpu then memory.
488 |             # (values are negated so more resources sorts as "least loaded")
489 |             us = (-self.cpu, -self.mem)
490 |             them = (-other.cpu, -other.mem)
491 |             if us < them:
492 |                 return -1
493 |             return 0 if us == them else 1
494 | 
495 |         def __str__(self):
496 |             cpu = "?"
497 |             mem = "?"
498 |             for r in self.offer.resources:
499 |                 if r.name == 'cpus':
500 |                     cpu = str(r.scalar.value)
501 |                 if r.name == 'mem':
502 |                     mem = str(r.scalar.value)
503 |             return "Offer({} {} {} cpu: {} mem: {})".format(
504 |                 self.offer.id.value, self.offer.slave_id.value,
505 |                 self.offer.hostname, cpu, mem)
506 | 
507 |     class Slave(object):
508 |         """ Aggregates all outstanding offers on a single slave. Provides numerous
509 |         conveniences including comparison (we currently use a least loaded
510 |         approach), and being able to assign jobsteps to the slave.
511 |         """
512 |         def __init__(self, slave_id, hostname, cluster):
513 |             # type: (str, str, str) -> None
514 |             self.slave_id = slave_id
515 |             self.hostname = hostname
516 |             self.cluster = cluster
517 | 
518 |             self._offers = {}  # type: Dict[str, ChangesScheduler.OfferWrapper]
519 |             self.jobsteps_assigned = []  # type: List[Dict[str, Any]]
520 | 
521 |             # Sum of all Offer resources for this slave.
522 |             self.total_cpu = 0.0
523 |             self.total_mem = 0
524 | 
525 |             # Sum of all resources for jobsteps assigned to this slave.
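            # (Available capacity for new assignments is total_* minus
            # allocated_*; see has_resources_for() below.)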
526 |             self.allocated_cpu = 0.0
527 |             self.allocated_mem = 0
528 | 
529 |         def offers(self):
530 |             # type: () -> List[ChangesScheduler.OfferWrapper]
531 |             """Returns a list of available offers on the slave.
532 |             """
533 |             return self._offers.values()
534 | 
535 |         def has_offers(self):
536 |             # type: () -> bool
537 |             """Returns True if the slave has any available offers, False
538 |             otherwise.
539 |             """
540 |             return len(self._offers) > 0
541 | 
542 |         def is_maintenanced(self, now_nanos):
543 |             # type: (int) -> bool
544 |             """Determine if a Mesos offer indicates that a maintenance window is
545 |             in progress for the slave. Treat the slave as maintenanced if ANY
546 |             offer has an active maintenance window.
547 |             Args:
548 |                 now_nanos: Timestamp of right now in nanoseconds, for comparing
549 |                     to the offer's (optional) maintenance time window.
550 |             Returns:
551 |                 True if the offer is in the maintenance window, False otherwise.
552 |             """
553 |             is_maintenanced = False
554 |             for offer in self._offers.itervalues():
555 |                 if not offer.offer.HasField('unavailability'):
556 |                     continue
557 |                 start_time = offer.offer.unavailability.start.nanoseconds
558 | 
559 |                 # If "duration" is not present use a default value of anything
560 |                 # greater than Now, to represent an unbounded maintenance time.
561 |                 # Override this with an actual end time if the "duration" field
562 |                 # is present in the protobuf.
563 |                 end_time = now_nanos + 1
564 |                 if offer.offer.unavailability.HasField('duration'):
565 |                     end_time = start_time + offer.offer.unavailability.duration.nanoseconds
566 | 
567 |                 is_maintenanced = start_time < now_nanos < end_time
568 |                 if is_maintenanced:
569 |                     break
570 |             return is_maintenanced
571 | 
572 |         def add_offer(self, offer):
573 |             # type: (ChangesScheduler.OfferWrapper) -> None
574 |             """Add an offer to this slave, and add its resources to the slave's
575 |             total resources.
576 |             """
577 |             if (offer.offer.slave_id.value != self.slave_id or
578 |                     offer.offer.hostname != self.hostname or
579 |                     offer.cluster != self.cluster):
580 |                 logging.error("A mismatched offer got mixed in with the wrong "
581 |                               "slave. Skipping. (\n  Slave: %s\n  Offer: %s)",
582 |                               self, offer)
583 |                 return
584 | 
585 |             self.total_cpu += offer.cpu
586 |             self.total_mem += offer.mem
587 |             logging.info("Slave %s: Add new offer +%f cpu, +%d mem (-> %f %d)",
588 |                          self.hostname, offer.cpu, offer.mem, self.total_cpu,
589 |                          self.total_mem)
590 |             self._offers[offer.offer.id.value] = offer
591 | 
592 |         def remove_offer(self, offer_id):
593 |             # type: (Any) -> None
594 |             """Remove an offer and its resources from this slave.
595 |             Args:
596 |                 offer_id: mesos_pb2.OfferID
597 |             """
598 |             offer = self._offers.get(offer_id.value)
599 |             if offer:
600 |                 del self._offers[offer_id.value]
601 |                 self.total_cpu -= offer.cpu
602 |                 self.total_mem -= offer.mem
603 | 
604 |         def offers_to_launch(self):
605 |             # type: () -> List[Any]
606 |             """Based on the jobsteps previously assigned, select the offers on
607 |             which to allocate the jobsteps.
608 |             Also, remove from the Slave all offers which are about to be
609 |             launched, and decrement total resources appropriately.
610 |             Returns:
611 |                 A list of mesos_pb2.OfferID protobufs identifying the offers
612 |                 on which the tasks should be scheduled. All returned offer IDs
613 |                 belong to this slave.
614 | """ 615 | current_offers = sorted(self._offers.values()) 616 | 617 | offers_to_launch = [] 618 | for offer in current_offers: 619 | # Decrement the "remaining" resources fields as we choose 620 | # offers to allocate to the jobsteps. 621 | if (self.allocated_cpu > 0 and offer.cpu > 0 or 622 | self.allocated_mem > 0 and offer.mem > 0): 623 | offers_to_launch.append(offer.offer.id) 624 | self.allocated_cpu -= offer.cpu 625 | self.allocated_mem -= offer.mem 626 | self.remove_offer(offer.offer.id) 627 | return offers_to_launch 628 | 629 | def tasks_to_launch(self): 630 | # type: () -> Tuple[List[Any], List[str]] 631 | """Generate list of mesos_pb2.Task to launch, and a second list of 632 | jobstep IDs corresponding to each task. 633 | Also, reset/clear jobsteps_assigned on the Slave. 634 | Returns: 635 | (list of tasks, list of jobstep IDs) 636 | """ 637 | tasks = [] 638 | jobstep_ids = [] 639 | for jobstep in self.jobsteps_assigned: 640 | tasks.append(self._jobstep_to_task(jobstep)) 641 | jobstep_ids.append(jobstep['id']) 642 | 643 | self.unassign_jobsteps() 644 | return tasks, jobstep_ids 645 | 646 | def unassign_jobsteps(self): 647 | # type: () -> None 648 | """Clear all assigned jobsteps from the Slave and reset required 649 | resources. 650 | """ 651 | self.jobsteps_assigned = [] 652 | self.allocated_cpu = 0.0 653 | self.allocated_mem = 0 654 | 655 | def __cmp__(self, other): 656 | # type: (ChangesScheduler.Slave) -> int 657 | # we prioritize first by cpu then memory. 658 | # (values are negated so more resources sorts as "least loaded") 659 | us = (-(self.total_cpu - self.allocated_cpu), 660 | -(self.total_mem - self.allocated_mem)) 661 | them = (-(other.total_cpu - other.allocated_cpu), 662 | -(other.total_mem - other.allocated_mem)) 663 | if us < them: 664 | return -1 665 | return 0 if us == them else 1 666 | 667 | def __str__(self, slave): 668 | return "Slave({}: {} offers, {} acpu, {} amem)".format( 669 | slave.hostname, len(slave.offers()), slave.total_cpu, 670 | slave.total_mem) 671 | 672 | def has_resources_for(self, jobstep): 673 | # type: (Dict[str, Any]) -> bool 674 | """Returns true if the slave has sufficient available resources to 675 | execute a jobstep, false otherwise. 676 | Args: 677 | jobstep: The jobstep to execute. 678 | Returns: 679 | True if the slave can host the jobstep. 680 | """ 681 | return ((self.total_cpu - self.allocated_cpu) >= jobstep['resources']['cpus'] and 682 | (self.total_mem - self.allocated_mem) >= jobstep['resources']['mem']) 683 | 684 | def assign_jobstep(self, jobstep): 685 | # type: (Dict[str, Any]) -> None 686 | """Tentatively assign a jobstep to run on this slave. The actual 687 | launching occurs elsewhere. 688 | """ 689 | assert self.has_resources_for(jobstep) 690 | self.allocated_cpu += jobstep['resources']['cpus'] 691 | self.allocated_mem += jobstep['resources']['mem'] 692 | self.jobsteps_assigned.append(jobstep) 693 | 694 | def _jobstep_to_task(self, jobstep): 695 | # type: (Dict[str, Any]) -> Any 696 | """ Given a jobstep and an offer to assign it to, returns the TaskInfo 697 | protobuf for the jobstep and updates scheduler state accordingly. 698 | Args: 699 | jobstep: The jobstep to convert to a task. 
700 | Returns: 701 | mesos_pb2.Task 702 | """ 703 | tid = uuid4().hex 704 | logging.info("Accepting offer on %s to start task %s", self.hostname, tid) 705 | 706 | task = mesos_pb2.TaskInfo() 707 | task.name = "{} {}".format( 708 | jobstep['project']['slug'], 709 | jobstep['id'], 710 | ) 711 | task.task_id.value = str(tid) 712 | task.slave_id.value = self.slave_id 713 | 714 | cmd = jobstep["cmd"] 715 | 716 | task.command.value = cmd 717 | logging.debug("Scheduling cmd: %s", cmd) 718 | 719 | cpus = task.resources.add() 720 | cpus.name = "cpus" 721 | cpus.type = mesos_pb2.Value.SCALAR 722 | cpus.scalar.value = jobstep["resources"]["cpus"] 723 | 724 | mem = task.resources.add() 725 | mem.name = "mem" 726 | mem.type = mesos_pb2.Value.SCALAR 727 | mem.scalar.value = jobstep["resources"]["mem"] 728 | 729 | return task 730 | 731 | def _get_slaves_for_snapshot(self, snapshot_id, recency_threshold_hours=12): 732 | # type: (str, int) -> List[str] 733 | """ Returns list of hostnames which have run tasks with a given 734 | snapshot_id recently. 735 | """ 736 | latest_snapshot_use = time.time() - recency_threshold_hours * 3600 737 | return [k for k, v in self._snapshot_slave_map[snapshot_id].iteritems() 738 | if v >= latest_snapshot_use] 739 | 740 | def _associate_snapshot_with_slave(self, snapshot_id, slave): 741 | self._snapshot_slave_map[snapshot_id][slave] = time.time() 742 | 743 | @staticmethod 744 | def _jobstep_snapshot(jobstep): 745 | """ Given a jobstep, return its snapshot id if set, None otherwise. 746 | """ 747 | if 'image' in jobstep and jobstep['image']: 748 | if 'snapshot' in jobstep['image'] and jobstep['image']['snapshot']: 749 | return jobstep['image']['snapshot']['id'] 750 | 751 | return None 752 | 753 | def _fetch_jobsteps(self, cluster): 754 | # type: (str) -> List[Dict[str, Any]] 755 | """Query Changes for all allocatable jobsteps for the specified cluster. 756 | """ 757 | try: 758 | with self._stats.timer('poll_changes'): 759 | possible_jobsteps = self._changes_api.get_allocate_jobsteps(limit=self.changes_request_limit, 760 | cluster=cluster) 761 | except APIError: 762 | logging.warning('/jobstep/allocate/ GET failed for cluster: %s', cluster, exc_info=True) 763 | possible_jobsteps = [] 764 | return possible_jobsteps 765 | 766 | def _assign_jobsteps(self, cluster, slaves_for_cluster, jobsteps_for_cluster): 767 | # type: (str, List[ChangesScheduler.Slave], List[Dict[str, Any]]) -> None 768 | """Make assignments for jobsteps for a cluster to offers for a cluster. 769 | Assignments are stored in the OfferWrapper, to be launched later. 770 | Args: 771 | cluster: The cluster to make assignments for. 772 | slaves_for_cluster: A list of offers for the cluster. 773 | jobsteps_for_cluster: A list of jobsteps for the cluster. 774 | """ 775 | # Changes returns JobSteps in priority order, so for each one 776 | # we attempt to put it on the machine with the least current load that 777 | # still has sufficient resources for it. This is not necessarily an 778 | # optimal algorithm--it might allocate fewer jobsteps than is possible, 779 | # and it currently prioritizes cpu over memory. 
We don't believe this
780 |         # to be an issue currently, but it may be worth improving in the future.
781 |         if len(slaves_for_cluster) == 0 or len(jobsteps_for_cluster) == 0:
782 |             return
783 | 
784 |         logging.info("Assign %s jobsteps on cluster %s", len(jobsteps_for_cluster), cluster)
785 |         sorted_slaves = sorted(slaves_for_cluster)
786 | 
787 |         for jobstep in jobsteps_for_cluster:
788 |             slave_to_use = None
789 |             snapshot_id = self._jobstep_snapshot(jobstep)
790 |             # Disable proximity check if not using a snapshot or scheduling in an explicit cluster.
791 |             # Clusters are expected to pre-populate snapshots out of band and will not benefit
792 |             # from proximity checks.
793 |             if snapshot_id and not cluster:
794 |                 slaves_with_snapshot = self._get_slaves_for_snapshot(snapshot_id)
795 |                 logging.info('Found slaves with snapshot id %s: %s',
796 |                              snapshot_id, slaves_with_snapshot)
797 | 
798 |                 if len(slaves_with_snapshot) > 0:
799 |                     for slave in sorted_slaves:
800 |                         if (slave.hostname in slaves_with_snapshot and
801 |                                 slave.has_resources_for(jobstep)):
802 |                             slave_to_use = slave
803 |                             logging.info('Scheduling jobstep %s on slave %s which might have snapshot %s',
804 |                                          jobstep, slave.hostname, snapshot_id)
805 |                             break
806 | 
807 |             # If we couldn't find a slave which is likely to have the snapshot already,
808 |             # this gives us the least-loaded slave that we could actually use for this jobstep.
809 |             if not slave_to_use:
810 |                 for slave in sorted_slaves:
811 |                     if slave.has_resources_for(jobstep):
812 |                         slave_to_use = slave
813 |                         break
814 | 
815 |             # couldn't find any slaves that would support this jobstep, move on
816 |             if not slave_to_use:
817 |                 logging.warning("No slave found to run jobstep %s.", jobstep)
818 |                 continue
819 | 
820 |             sorted_slaves.remove(slave_to_use)
821 |             if snapshot_id:
822 |                 self._associate_snapshot_with_slave(snapshot_id, slave_to_use.hostname)
823 | 
824 |             slave_to_use.assign_jobstep(jobstep)
825 |             bisect.insort(sorted_slaves, slave_to_use)
826 | 
827 |     def _stat_and_log_list(self, to_decline, stats_counter_name, reason_func):
828 |         # type: (List[Any], str, Callable[[Any], str]) -> None
829 |         """Record a stats counter and a log line for a list of offers or slaves about to be declined or ignored.
830 |         Args:
831 |             to_decline: The list of items being declined or ignored.
832 |             stats_counter_name: A counter name to increment, to track stats for
833 |                 different decline reasons.
834 |             reason_func (function(item)): A function to generate a logging
835 |                 string, to explain why this item was declined.
836 |         """
837 |         self._stats.incr(stats_counter_name, len(to_decline))
838 |         for offer in to_decline:
839 |             if reason_func:
840 |                 logging.info(reason_func(offer))
841 | 
842 |     def _decline_list(self, driver, to_decline):
843 |         # type: (SchedulerDriver, List[Any]) -> None
844 |         """Inform the Mesos master that we're declining a list of offers.
845 |         Args:
846 |             driver: the MesosSchedulerDriver object
847 |             to_decline: The list of offers to decline
848 |         """
849 |         for offer in to_decline:
850 |             driver.declineOffer(offer.offer.id)
851 | 
852 |     def _filter_slaves(self, slaves):
853 |         # type: (List[Any]) -> List[Any]
854 |         """Given a list of Slaves, drop those that are blacklisted or under
855 |         maintenance. Return a list of usable Slaves.
856 |         Args:
857 |             slaves (list of ChangesScheduler.Slave): A list of slaves, some
858 |                 of which are usable and some of which might not be usable.
859 |         Returns:
860 |             list of usable ChangesScheduler.Slave objects
861 |         """
862 |         self._blacklist.refresh()
863 |         now_nanos = int(time.time() * 1000000000)
864 |         maintenanced, blacklisted, usable = [], [], []
865 |         for slave in slaves:
866 |             if slave.is_maintenanced(now_nanos):
867 |                 maintenanced.append(slave)
868 |             elif self._blacklist.contains(slave.hostname):
869 |                 blacklisted.append(slave)
870 |             else:
871 |                 usable.append(slave)
872 | 
873 |         self._stat_and_log_list(maintenanced, 'ignore_for_maintenance',
874 |                                 lambda slave: "Ignoring slave from maintenanced hostname: %s" % slave.hostname)
875 |         self._stat_and_log_list(blacklisted, 'ignore_for_blacklist',
876 |                                 lambda slave: "Ignoring slave from blacklisted hostname: %s" % slave.hostname)
877 |         return usable
878 | 
879 |     def _launch_jobsteps(self, driver, cluster, slaves_for_cluster):
880 |         # type: (SchedulerDriver, str, List[ChangesScheduler.Slave]) -> None
881 |         """Given a list of slaves, launch all jobsteps assigned to each slave.
882 |         Remove any used offers from the slaves.
883 |         Args:
884 |             driver: the MesosSchedulerDriver object
885 |             cluster: The cluster these slaves and jobsteps belong to.
886 |             slaves_for_cluster: A list of slaves with assigned jobsteps
887 |                 already embedded. Launch the jobsteps on these slaves.
888 |         """
889 |         if len(slaves_for_cluster) == 0:
890 |             return
891 | 
892 |         # Inform Changes of where the jobsteps are going.
893 |         jobsteps_to_allocate = []
894 |         for slave in slaves_for_cluster:
895 |             jobstep_ids = [jobstep['id'] for jobstep in slave.jobsteps_assigned]
896 |             jobsteps_to_allocate.extend(jobstep_ids)
897 | 
898 |         if len(jobsteps_to_allocate) == 0:
899 |             return
900 | 
901 |         try:
902 |             jobsteps_to_allocate.sort()  # Make testing deterministic.
903 |             allocated_jobstep_ids = self._changes_api.post_allocate_jobsteps(
904 |                 jobsteps_to_allocate, cluster=cluster)
905 |         except APIError:
906 |             allocated_jobstep_ids = []
907 |         if sorted(allocated_jobstep_ids) != sorted(jobsteps_to_allocate):
908 |             # NB: cluster could be None here
909 |             logging.warning("Could not successfully allocate for cluster: %s", cluster)
910 |             # for now we just give up on this cluster entirely
911 |             for slave in slaves_for_cluster:
912 |                 slave.unassign_jobsteps()
913 | 
914 |         # we've allocated all the jobsteps we can, now we launch them
915 |         for slave in slaves_for_cluster:
916 |             if len(slave.jobsteps_assigned) == 0:
917 |                 continue
918 |             filters = mesos_pb2.Filters()
919 |             filters.refuse_seconds = 1.0
920 | 
921 |             # Note: offers_to_launch() and tasks_to_launch() remove offers and
922 |             # tasks from the slave.
923 |             offers_to_launch = slave.offers_to_launch()
924 |             tasks_to_launch, jobstep_ids = slave.tasks_to_launch()
925 | 
926 |             with self.taskJobStepMappingLock:
927 |                 for task, jobstep_id in zip(tasks_to_launch, jobstep_ids):
928 |                     self.taskJobStepMapping[task.task_id.value] = jobstep_id
929 | 
930 |             self.tasksLaunched += len(tasks_to_launch)
931 |             logging.info("Launch tasks: %d offers, %d tasks", len(offers_to_launch), len(tasks_to_launch))
932 |             driver.launchTasks(offers_to_launch, tasks_to_launch, filters)
933 | 
934 |     def resourceOffers(self, driver, pb_offers):
935 |         # type: (SchedulerDriver, List[Any]) -> None
936 |         """
937 |         Invoked when resources have been offered to this framework. A single
938 |         offer will only contain resources from a single slave. Resources
939 |         associated with an offer will not be re-offered to _this_ framework
940 |         until either (a) this framework has rejected those resources (see
941 |         SchedulerDriver.launchTasks) or (b) those resources have been
942 |         rescinded (see Scheduler.offerRescinded). Note that resources may be
943 |         concurrently offered to more than one framework at a time (depending
944 |         on the allocator being used). In that case, the first framework to
945 |         launch tasks using those resources will be able to use them while the
946 |         other frameworks will have those resources rescinded (or if a
947 |         framework has already launched tasks with those resources then those
948 |         tasks will fail with a TASK_LOST status and a message saying as much).
949 |         """
950 |         logging.info("Got %d resource offers", len(pb_offers))
951 |         self._stats.incr('offers', len(pb_offers))
952 | 
953 |         # Simply add the offers to our local cache of available offers.
954 |         # Jobsteps are allocated asynchronously, driven by
955 |         # poll_changes_until_shutdown().
956 |         with self._cached_slaves_lock:
957 |             for pb_offer in pb_offers:
958 |                 offer = ChangesScheduler.OfferWrapper(pb_offer)
959 |                 if pb_offer.slave_id.value not in self._cached_slaves:
960 |                     slave = ChangesScheduler.Slave(pb_offer.slave_id.value,
961 |                                                    pb_offer.hostname,
962 |                                                    offer.cluster)
963 |                     self._cached_slaves[pb_offer.slave_id.value] = slave
964 |                 self._cached_slaves[pb_offer.slave_id.value].add_offer(offer)
965 |                 self.slaveIdInfo[pb_offer.slave_id.value] = SlaveInfo(hostname=pb_offer.hostname)
966 | 
967 |     def _slaves_by_cluster(self, slaves):
968 |         slaves_by_cluster = defaultdict(list)
969 |         for slave in slaves:
970 |             if slave.has_offers():
971 |                 slaves_by_cluster[slave.cluster].append(slave)
972 |         return slaves_by_cluster
973 | 
974 |     def _query_changes_for_jobsteps(self, driver, clusters):
975 |         # type: (SchedulerDriver, List[str]) -> Dict[str, List[Dict[str, Any]]]
976 |         """Query Changes for the pending jobsteps for each cluster for which we
977 |         have offers available.
978 |         """
979 |         jobsteps_by_cluster = defaultdict(list)  # type: Dict[str, List[Dict[str, Any]]]
980 |         for cluster in clusters:
981 |             jobsteps = self._fetch_jobsteps(cluster)
982 |             jobsteps_by_cluster[cluster] = jobsteps
983 |         return jobsteps_by_cluster
984 | 
985 |     def offerRescinded(self, driver, offerId):
986 |         # type: (SchedulerDriver, Any) -> None
987 |         """
988 |         Invoked when an offer is no longer valid (e.g., the slave was lost or
989 |         another framework used resources in the offer.) If for whatever reason
990 |         an offer is never rescinded (e.g., dropped message, failing over
991 |         framework, etc.), a framework that attempts to launch tasks using an
992 |         invalid offer will receive TASK_LOST status updates for those tasks
993 |         (see Scheduler.resourceOffers).
994 |         Args:
995 |             driver: the MesosSchedulerDriver object
996 |             offerId: a Mesos OfferID protobuf
997 |         """
998 |         logging.info("Offer rescinded: %s", offerId.value)
999 |         with self._cached_slaves_lock:
1000 |             for slave in self._cached_slaves.itervalues():
1001 |                 slave.remove_offer(offerId)
1002 | 
1003 |     def statusUpdate(self, driver, status):
1004 |         """
1005 |         Invoked when the status of a task has changed (e.g., a slave is lost
1006 |         and so the task is lost, a task finishes and an executor sends a
1007 |         status update saying so, etc.) Note that returning from this callback
1008 |         acknowledges receipt of this status update.
If for whatever reason 1008 | the scheduler aborts during this callback (or the process exits) 1009 | another status update will be delivered. Note, however, that this is 1010 | currently not true if the slave sending the status update is lost or 1011 | fails during that time. 1012 | """ 1013 | 1014 | states = { 1015 | 0: "starting", 1016 | 1: "running", 1017 | 2: "finished", # terminal 1018 | 3: "failed", # terminal 1019 | 4: "killed", # terminal 1020 | 5: "lost", # terminal 1021 | 6: "staging", 1022 | } 1023 | terminal_states = ["finished", "failed", "killed", "lost"] 1024 | 1025 | state = states[status.state] 1026 | logging.info("Task %s is in state %d", status.task_id.value, status.state) 1027 | 1028 | if status.state == mesos_pb2.TASK_FINISHED: 1029 | self.tasksFinished += 1 1030 | 1031 | aborted = False 1032 | with self.taskJobStepMappingLock: 1033 | jobstep_id = self.taskJobStepMapping.get(status.task_id.value) 1034 | 1035 | if state in terminal_states: 1036 | self.taskJobStepMapping.pop(status.task_id.value, None) 1037 | if status.task_id.value in self.tasksPendingKill: 1038 | kill_time = self.tasksPendingKill[status.task_id.value] 1039 | del self.tasksPendingKill[status.task_id.value] 1040 | aborted = True 1041 | elapsed = time.time() - kill_time 1042 | logging.info('Successfully aborted task %s (jobstep ID %s) after %.2f seconds', status.task_id.value, jobstep_id, elapsed) 1043 | self._stats.incr('task_aborted') 1044 | 1045 | hostname = None 1046 | if self.slaveIdInfo.get(status.slave_id.value): 1047 | hostname = self.slaveIdInfo[status.slave_id.value].hostname 1048 | if hostname is None: 1049 | logging.warning('No hostname associated with task: %s (slave_id %s)', status.task_id.value, status.slave_id.value) 1050 | 1051 | if jobstep_id is None: 1052 | # TODO(nate): how does this happen? 1053 | logging.error("Task %s missing JobStep ID (state %s, message %s)", 1054 | status.task_id.value, state, 1055 | _text_format.MessageToString(status)) 1056 | self._stats.incr('missing_jobstep_id_' + state) 1057 | return 1058 | 1059 | if state == 'finished': 1060 | try: 1061 | self._changes_api.update_jobstep(jobstep_id, status="finished", hostname=hostname) 1062 | except APIError: 1063 | pass 1064 | elif state in ('killed', 'lost', 'failed') and not aborted: 1065 | self._stats.incr('task_' + state) 1066 | # Jobsteps are only intended to be executed once and should only exit non-zero or be 1067 | # lost/killed by infrastructural issues, so we don't attempt to reschedule, and we mark 1068 | # this down as an infrastructural failure. Note that this state may not mean that the 1069 | # Jobstep will necessarily stop executing, but it means that the results will be 1070 | # considered immediately invalid. 1071 | logging.warn('Task %s %s: %s', jobstep_id, state, status.message) 1072 | msg = '==> Scheduler marked task as %s (will NOT be retried):\n\n%s' % (state, status.message) 1073 | try: 1074 | self._changes_api.jobstep_console_append(jobstep_id, text=msg) 1075 | except APIError: 1076 | pass 1077 | try: 1078 | self._changes_api.update_jobstep(jobstep_id, status="finished", result="infra_failed", hostname=hostname) 1079 | except APIError: 1080 | pass 1081 | 1082 | def frameworkMessage(self, driver, executorId, slaveId, message): 1083 | """ 1084 | Invoked when an executor sends a message. These messages are best 1085 | effort; do not expect a framework message to be retransmitted in any 1086 | reliable fashion. 
1087 | """ 1088 | logging.info("Received message: %s", repr(str(message))) 1089 | 1090 | def slaveLost(self, driver, slaveId): 1091 | """ 1092 | Invoked when a slave has been determined unreachable (e.g., machine 1093 | failure, network partition.) Most frameworks will need to reschedule 1094 | any tasks launched on this slave on a new slave. 1095 | """ 1096 | logging.warn("Slave lost: %s", slaveId.value) 1097 | self._stats.incr('slave_lost') 1098 | with self._cached_slaves_lock: 1099 | slave = self._cached_slaves.pop(slaveId.value, None) 1100 | if slave: 1101 | self._stat_and_log_list(slave.offers(), 'decline_for_slave_lost', 1102 | lambda offer: "Slave lost, declining offer: %s" % offer.offer.id) 1103 | self._decline_list(driver, slave.offers()) 1104 | 1105 | def executorLost(self, driver, executorId, slaveId, status): 1106 | """ 1107 | Invoked when an executor has exited/terminated. Note that any tasks 1108 | running will have TASK_LOST status updates automatically generated. 1109 | """ 1110 | logging.warn("Executor %s lost on slave %s", executorId.value, slaveId.value) 1111 | 1112 | def error(self, driver, message): 1113 | """ 1114 | Invoked when there is an unrecoverable error in the scheduler or 1115 | scheduler driver. The driver will be aborted BEFORE invoking this 1116 | callback. 1117 | """ 1118 | logging.error("Error from Mesos: %s", message) 1119 | self._stats.incr('errors') 1120 | 1121 | def save_state(self): 1122 | """ 1123 | Save current state to a file so that a restart of the scheduler can 1124 | restore the state. 1125 | """ 1126 | state = {} 1127 | state['framework_id'] = self.framework_id 1128 | state['taskJobStepMapping'] = self.taskJobStepMapping 1129 | state['tasksPendingKill'] = self.tasksPendingKill 1130 | state['slaveIdInfo'] = {} 1131 | for slave, info in self.slaveIdInfo.iteritems(): 1132 | state['slaveIdInfo'][slave] = {'hostname': info.hostname} 1133 | state['tasksLaunched'] = self.tasksLaunched 1134 | state['tasksFinished'] = self.tasksFinished 1135 | state['snapshot_slave_map'] = self._snapshot_slave_map 1136 | logging.info('Attempting to save state for framework %s with %d running tasks to %s', 1137 | self.framework_id, len(self.taskJobStepMapping), self.state_file) 1138 | 1139 | with open(self.state_file, 'w') as f: 1140 | f.write(json.dumps(state)) 1141 | 1142 | def restore_state(self): 1143 | """ 1144 | Restores state from the previous run of the scheduler. 
1145 | """ 1146 | with open(self.state_file) as f: 1147 | json_state = f.read() 1148 | state = json.loads(json_state) 1149 | 1150 | self.framework_id = state['framework_id'] 1151 | self.taskJobStepMapping = state['taskJobStepMapping'] 1152 | self.tasksPendingKill = state.get('tasksPendingKill', {}) 1153 | self.slaveIdInfo = {} 1154 | for slave, info in state.get('slaveIdInfo', {}).iteritems(): 1155 | self.slaveIdInfo[slave] = SlaveInfo(hostname=info.get('hostname')) 1156 | self.tasksLaunched = state['tasksLaunched'] 1157 | self.tasksFinished = state['tasksFinished'] 1158 | snapshot_slave_map = state['snapshot_slave_map'] 1159 | self._snapshot_slave_map = defaultdict(lambda: defaultdict(float)) 1160 | for snapshot, slave_map in snapshot_slave_map.iteritems(): 1161 | for slave, timestamp in slave_map.iteritems(): 1162 | self._snapshot_slave_map[snapshot][slave] = timestamp 1163 | 1164 | logging.info('Restored state for framework %s with %d running tasks from %s', 1165 | self.framework_id, len(self.taskJobStepMapping), self.state_file) 1166 | 1167 | def state_json(self): 1168 | # type: () -> Dict[str, Any] 1169 | """Produce a JSON dump of the scheduler's internal state. 1170 | Returns: 1171 | A JSON-encoded dict representing the scheduler's state. 1172 | """ 1173 | def convert_attrs(attrs): 1174 | # type: (List[Any]) -> List[Dict[str, Any]] 1175 | """Convert Attribute and Resource protobuf fields to dictionaries. 1176 | Args: 1177 | attrs: List of mesos_pb2.Attribute or mesos_pb2.Resource 1178 | Returns: 1179 | {'name': str, 'type': int, 'value': any simple Python type} 1180 | """ 1181 | accum = [] 1182 | for attr in attrs: 1183 | if attr.type == mesos_pb2.Value.SCALAR: 1184 | value = attr.scalar.value 1185 | elif attr.type == mesos_pb2.Value.RANGES: 1186 | value = ', '.join(map(lambda x: '(%d, %d)' % (x.begin, x.end), attr.ranges.range)) 1187 | elif attr.type == mesos_pb2.Value.SET: 1188 | value = ', '.join(attr.set.item) 1189 | elif attr.type == mesos_pb2.Value.TEXT: 1190 | value = attr.text.value 1191 | else: 1192 | value = 'Unknown Mesos value type {} on slave {} offer {}'.format( 1193 | attr.type, slave.hostname, offer.offer.id.value) 1194 | 1195 | attr_output = { 1196 | 'name': attr.name, 1197 | 'type': attr.type, 1198 | 'value': value, 1199 | } 1200 | accum.append(attr_output) 1201 | return accum 1202 | 1203 | start_time = time.time() 1204 | with self._cached_slaves_lock: 1205 | # Build JSON output for the blacklist. 1206 | blacklist_output = { 1207 | 'path': self._blacklist._path, 1208 | 'entries': sorted(list(self._blacklist._blacklist)), 1209 | } 1210 | 1211 | # Build JSON output for all slaves. 1212 | slaves = self._cached_slaves.values() 1213 | slaves.sort(key=lambda x: x.hostname) 1214 | slaves_output = [] 1215 | for slave in slaves: 1216 | # Build JSON output for all offers on the slave. 
offers = slave._offers.values()
1218 | offers.sort(key=lambda x: x.offer.id.value)
1219 | offers_output = []
1220 | for offer in offers:
1221 | if offer.offer.url.address.hostname:
1222 | base = offer.offer.url.address.hostname
1223 | else:
1224 | base = offer.offer.url.address.ip
1225 | query = '&'.join('%s=%s' % (q.key, q.value) for q in offer.offer.url.query)
1226 | url = (offer.offer.url.scheme + '://' + base +
1227 | offer.offer.url.path +
1228 | (('?' + query) if query else '') +
1229 | (('#' + offer.offer.url.fragment) if offer.offer.url.fragment else ''))
1230 | 
1231 | offer_output = {
1232 | 'offer_id': offer.offer.id.value,
1233 | 'framework_id': offer.offer.framework_id.value,
1234 | 'url': url,
1235 | 'cpu': offer.cpu,
1236 | 'mem': offer.mem,
1237 | 'attributes': convert_attrs(offer.offer.attributes),
1238 | 'resources': convert_attrs(offer.offer.resources),
1239 | }
1240 | json.dumps(offer_output)
1241 | offers_output.append(offer_output)
1242 | slave_output = {
1243 | 'slave_id': slave.slave_id,
1244 | 'hostname': slave.hostname,
1245 | 'cluster': slave.cluster,
1246 | 'offers': offers_output,
1247 | 'total_cpu': slave.total_cpu,
1248 | 'total_mem': slave.total_mem,
1249 | 'is_maintenanced': slave.is_maintenanced(
1250 | int(start_time * 1000000000)),
1251 | }
1252 | json.dumps(slave_output)
1253 | slaves_output.append(slave_output)
1254 | 
1255 | # Put it all together.
1256 | state = {
1257 | 'framework_id': self.framework_id,
1258 | 'taskJobStepMapping': self.taskJobStepMapping,
1259 | 'tasksPendingKill': self.tasksPendingKill,
1260 | 'tasksLaunched': self.tasksLaunched,
1261 | 'tasksFinished': self.tasksFinished,
1262 | 'shuttingDown': self.shuttingDown.is_set(),
1263 | 'blacklist': blacklist_output,
1264 | 'snapshot_slave_map': self._snapshot_slave_map,
1265 | 'changes_request_limit': self.changes_request_limit,
1266 | 'cached_slaves': slaves_output,
1267 | 'build_state_json_secs': time.time() - start_time,
1268 | }
1269 | 
1270 | return state
1271 | 
--------------------------------------------------------------------------------
/changes_mesos_scheduler/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | from __future__ import absolute_import, print_function
4 | 
5 | import argparse
6 | import json
7 | import logging
8 | import os
9 | import signal
10 | import sys
11 | import threading
12 | 
13 | from time import sleep
14 | 
15 | from flask import Flask
16 | from mesos.native import MesosSchedulerDriver
17 | from mesos.interface import mesos_pb2
18 | 
19 | from .changes_scheduler import ChangesScheduler, ChangesAPI, FileBlacklist
20 | from .statsreporter import StatsReporter
21 | 
22 | # Configuration should contain the file 'blacklist' which
23 | # is a line-separated list of hosts to blacklist.
24 | #
25 | # NOTE: inside ec2, hostnames look like
26 | # ip-*-*-*-*.region.compute.internal
27 | DEFAULT_CONFIG_DIR = '/etc/changes-mesos-scheduler'
28 | 
29 | 
30 | def install_sentry_logger():
31 | try:
32 | import raven
33 | except ImportError:
34 | logging.warning('Unable to find raven library. Sentry integration disabled.')
35 | return
36 | 
37 | from raven.conf import setup_logging
38 | from raven.handlers.logging import SentryHandler
39 | 
40 | client = raven.Client()
41 | handler = SentryHandler(client, level=logging.WARN)
42 | setup_logging(handler)
43 | 
44 | 
45 | def json_handler(func):
46 | """Produce an HTTP handler which JSON-encodes a Python object and sets
47 | the Content-Type to application/json."""
48 | def wrapped_func():
49 | """Returns
50 | (str output content, int status code, dict headers (content type))
51 | """
52 | return json.dumps(func()), 200, {'Content-Type': 'application/json; charset=utf-8'}
53 | return wrapped_func
54 | 
55 | 
56 | def run(api_url, mesos_master, user, config_dir, state_file,
57 | changes_request_limit, http_port, stats=None):
58 | scheduler = ChangesScheduler(state_file, api=ChangesAPI(api_url), stats=stats,
59 | blacklist=FileBlacklist(os.path.join(config_dir, 'blacklist')),
60 | changes_request_limit=changes_request_limit)
61 | 
62 | executor = mesos_pb2.ExecutorInfo()
63 | executor.executor_id.value = "default"
64 | executor.command.value = os.path.abspath("./executor.py")
65 | executor.name = "Changes Executor"
66 | executor.source = "changes"
67 | 
68 | framework = mesos_pb2.FrameworkInfo()
69 | framework.user = user
70 | framework.name = "Changes Scheduler"
71 | framework.principal = "changes"
72 | # Give the scheduler 1 week to restart before Mesos cancels the tasks.
73 | # This is the setting recommended by the docs.
74 | framework.failover_timeout = 3600 * 24 * 7
75 | 
76 | if scheduler.framework_id:
77 | framework.id.value = scheduler.framework_id
78 | executor.framework_id.value = scheduler.framework_id
79 | 
80 | driver = MesosSchedulerDriver(
81 | scheduler,
82 | framework,
83 | mesos_master)
84 | 
85 | stopped = threading.Event()
86 | 
87 | def handle_interrupt(signal, frame):
88 | stopped.set()
89 | logging.info("Received interrupt, shutting down")
90 | logging.warning("Not saving state. Will wait for running tasks to finish.")
91 | scheduler.shuttingDown.set()
92 | while scheduler.activeTasks > 0:
93 | logging.info("Waiting for %d tasks to finish running", scheduler.activeTasks)
94 | sleep(5)
95 | driver.stop()
96 | 
97 | def handle_sigterm(signal, frame):
98 | # TODO: Avoid save_state race conditions by having handle_sigterm()
99 | # only set shuttingDown, then do the actual save-state and driver.stop()
100 | # in the main thread after all other threads are join()ed.
101 | # Also, stopped doesn't appear to be used.
102 | stopped.set()
103 | logging.info("Received sigterm, shutting down")
104 | scheduler.shuttingDown.set()
105 | if scheduler.state_file:
106 | try:
107 | scheduler.save_state()
108 | logging.info("Successfully saved state to %s.", state_file)
109 | except Exception:
110 | logging.exception("Failed to save state")
111 | driver.stop()
112 | return
113 | # With `failover` set to true, we do not tell Mesos to stop the existing tasks
114 | # started by this framework. Instead, the tasks will keep running for up to
115 | # the `failover_timeout` set above, or until a new scheduler starts with
116 | # the same framework id.
117 | driver.stop(True)
118 | else:
119 | logging.warning("State file location not set. Not saving state. Existing builds will be cancelled.")
120 | driver.stop()
121 | 
122 | signal.signal(signal.SIGINT, handle_interrupt)
123 | signal.signal(signal.SIGTERM, handle_sigterm)
124 | 
125 | driver.start()
126 | logging.info("Driver started")
127 | 
128 | app = Flask("Changes Mesos Scheduler")
129 | app.add_url_rule(
130 | '/api/state_json', 'state_json', json_handler(scheduler.state_json))
131 | http_thread = threading.Thread(target=app.run, kwargs={'port': http_port})
132 | http_thread.daemon = True  # Must not block process exit; Thread has no terminate().
133 | http_thread.start()
134 | 
135 | scheduler.poll_changes_until_shutdown(driver, 5)
136 | status = 0
137 | if driver.join() == mesos_pb2.DRIVER_STOPPED:
138 | logging.info("Driver stopped cleanly.")
139 | else:
140 | # Ensure that the driver process terminates.
141 | status = 1
142 | logging.info("Stopping driver forcibly.")
143 | driver.stop()
144 | 
145 | logging.info("Stopping HTTP server.")
146 | # The HTTP thread is a daemon thread, so it is torn down with the process.
147 | 
148 | logging.info("Clean shutdown complete. Exiting status %d.", status)
149 | sys.exit(status)
150 | 
151 | 
152 | def main():
153 | parser = argparse.ArgumentParser(description='Changes Mesos Scheduler')
154 | 
155 | parser.add_argument('--api-url', required=True,
156 | help='URL root of Changes API, including scheme. (e.g. http://localhost:5000/api/0/)')
157 | parser.add_argument('--mesos-master', default='127.0.1.1:5050',
158 | help='Location of Mesos master server. (e.g. 127.0.1.1:5050)')
159 | parser.add_argument('--user', default='root', help="User to run tasks as")
160 | parser.add_argument('--log-level', default='info', help="Level to log at. (e.g. info)")
161 | parser.add_argument('--config-dir', default=DEFAULT_CONFIG_DIR, help='Configuration directory')
162 | parser.add_argument('--state-file', default=None, help='File path to preserve state across restarts')
163 | parser.add_argument('--statsd-host', default=None, help='Host to report stats to')
164 | parser.add_argument('--statsd-port', default=8125, type=int, help='Port on the statsd host to send to')
165 | parser.add_argument('--statsd-prefix', default='changes_scheduler', help='Prefix for stats keys')
166 | parser.add_argument('--changes-request-limit', default=200, type=int,
167 | help='Maximum number of JobSteps to ask Changes for per-request')
168 | parser.add_argument('--http_port', default=5888, type=int, help='Port for Flask to listen on and serve HTTP requests.')
169 | 
170 | args = parser.parse_args(sys.argv[1:])
171 | logging.basicConfig(level=getattr(logging, args.log_level.upper()),
172 | format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
173 | install_sentry_logger()
174 | 
175 | stats = None
176 | if args.statsd_host:
177 | stats = StatsReporter({
178 | 'STATSD_HOST': args.statsd_host,
179 | 'STATSD_PORT': args.statsd_port,
180 | 'STATSD_PREFIX': args.statsd_prefix,
181 | }).stats()
182 | 
183 | try:
184 | run(args.api_url, args.mesos_master, args.user, args.config_dir,
185 | args.state_file, args.changes_request_limit, args.http_port, stats)
186 | except Exception as e:
187 | logging.exception(unicode(e))
188 | raise
189 | 
190 | if __name__ == "__main__":
191 | main()
192 | 
--------------------------------------------------------------------------------
/changes_mesos_scheduler/statsreporter.py:
--------------------------------------------------------------------------------
1 | import re
2 | import time
3 | import logging
4 | from contextlib import contextmanager
5 | 
6 | import statsd
7 | 
8 | logger = logging.getLogger('statsreporter')
9 | 
10 | 
11 | def swallow_exceptions(exn_logger):
12 | """Decorator to
catch, log, and discard any Exceptions raised in a method. 13 | :param exn_logger: logging.Logger to use for logging any exceptions. 14 | """ 15 | def decor(func): 16 | def wrapper(*args, **kwargs): 17 | try: 18 | return func(*args, **kwargs) 19 | except Exception as e: 20 | exn_logger.exception(e) 21 | return wrapper 22 | return decor 23 | 24 | 25 | class StatsReporter(object): 26 | """StatsReporter is responsible for maintaining an app-specific Stats instance. 27 | The config should specify: 28 | STATSD_HOST (address of statsd host as a string) 29 | STATSD_PORT (port statsd is listening on as an int) 30 | STATSD_PREFIX (string to be automatically prepended to all reported stats for namespacing) 31 | 32 | If STATSD_HOST isn't specified, none of the others will be used and this app will 33 | get a no-op Stats instance. 34 | """ 35 | def __init__(self, config): 36 | host = config.get('STATSD_HOST') 37 | self._stats = None 38 | if host: 39 | sd = statsd.StatsClient(host=host, 40 | prefix=config.get('STATSD_PREFIX'), 41 | port=config.get('STATSD_PORT')) 42 | self._stats = Stats(client=sd) 43 | 44 | def stats(self): 45 | """Returns a Stats object. 46 | If no statsd config has been provided, 47 | the Stats won't do anything but validate.""" 48 | if self._stats: 49 | return self._stats 50 | return Stats(client=None) 51 | 52 | 53 | class Stats(object): 54 | """ Minimalistic class for sending stats/monitoring values.""" 55 | 56 | def __init__(self, client): 57 | """ 58 | @param client - A statsd.StatsClient instance, or None for a no-op Stats. 59 | """ 60 | # A thin wrapper around Statsd rather than just Statsd so we 61 | # can pick which features to support and how to encode the data. 62 | self._client = client 63 | 64 | @swallow_exceptions(logger) 65 | def set_gauge(self, key, value): 66 | """ Set a gauge, typically a sampled instantaneous value. 67 | @param key - the name of the gauge. 68 | @param value - current value of the gauge. 69 | """ 70 | assert isinstance(value, (int, float, long)) 71 | Stats._check_key(key) 72 | if self._client: 73 | self._client.gauge(key, value) 74 | 75 | @swallow_exceptions(logger) 76 | def incr(self, key, delta=1): 77 | """ Increment a count. 78 | @param key - the name of the stat. 79 | @param delta - amount to increment the stat by. Must be positive. 80 | """ 81 | assert isinstance(delta, (int, float, long)) 82 | assert delta >= 0 83 | Stats._check_key(key) 84 | if self._client: 85 | self._client.incr(key, delta) 86 | 87 | @swallow_exceptions(logger) 88 | def log_timing(self, key, duration_ms): 89 | """ Record a millisecond timing. 
""" 90 | assert isinstance(duration_ms, (int, float, long)) 91 | Stats._check_key(key) 92 | if self._client: 93 | self._client.timing(key, duration_ms) 94 | 95 | @contextmanager 96 | def timer(self, key): 97 | """A contextmanager that reports the duration in milliseconds on exit.""" 98 | t0 = time.time() 99 | try: 100 | yield 101 | finally: 102 | duration_ms = int(1000 * (time.time() - t0)) 103 | self.log_timing(key, duration_ms) 104 | 105 | _KEY_RE = re.compile(r'^[A-Za-z0-9_-]+$') 106 | 107 | @classmethod 108 | def _check_key(cls, key): 109 | """ This is probably overly strict, but we have little use for 110 | interestingly named keys and this avoids unintentionally using them.""" 111 | if not cls._KEY_RE.match(key): 112 | raise Exception("Invalid key: {}".format(repr(key))) 113 | -------------------------------------------------------------------------------- /changes_mesos_scheduler/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dropbox/changes-mesos-framework/cbb2351d45b4231286a18e70e5fea039b121d0a4/changes_mesos_scheduler/tests/__init__.py -------------------------------------------------------------------------------- /changes_mesos_scheduler/tests/test_changes_scheduler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import json 3 | import os 4 | import shutil 5 | import tempfile 6 | import threading 7 | import time 8 | 9 | from collections import defaultdict 10 | 11 | import mock 12 | from typing import Any 13 | from unittest import TestCase 14 | 15 | # Capture debug logging output on test failure 16 | logger = logging.getLogger() 17 | logger.level = logging.DEBUG 18 | 19 | from mesos.interface import mesos_pb2 20 | from mesos.interface import Scheduler 21 | 22 | from changes_mesos_scheduler.changes_scheduler import ChangesScheduler, APIError, FileBlacklist, ChangesAPI, SlaveInfo, TASK_KILL_THRESHOLD 23 | from changes_mesos_scheduler import statsreporter 24 | 25 | def _noop_blacklist(): 26 | """Returns a blacklist instance that behaves like an empty blacklist.""" 27 | m = mock.Mock(spec=FileBlacklist) 28 | m.contains.return_value = False 29 | return m 30 | 31 | 32 | def help_resource_offers_and_poll_changes(cs, driver, new_offers): 33 | # type: (ChangesScheduler, Scheduler, List[Any]) -> None 34 | """Receive offers from the Mesos master and poll Changes for new jobsteps 35 | in a synchronous manner to facilitate simpler, more straightforward 36 | testing. Normally these two tasks run in separate threads. 37 | Args: 38 | driver: the MesosSchedulerDriver object 39 | new_offers: A list of Mesos Offer protobufs that should be offered to 40 | the scheduler. 41 | """ 42 | cs.shuttingDown.clear() # reset shuttingDown if necessary. 43 | cs.resourceOffers(driver, new_offers) # Get offers from Mesos master. 44 | assert not cs.poll_and_launch_once(driver) # Get jobsteps and launch them. 
45 | 46 | 47 | class ChangesAPITest(TestCase): 48 | url = 'https://changes.com/api/0' 49 | 50 | def test_make_url_paths(self): 51 | desired = 'https://changes.com/api/0/jobsteps/allocate/' 52 | assert ChangesAPI.make_url(self.url, '/jobsteps/allocate/') == desired 53 | assert ChangesAPI.make_url(self.url, 'jobsteps/allocate') == desired 54 | assert ChangesAPI.make_url(self.url + '/', 'jobsteps/allocate') == desired 55 | assert ChangesAPI.make_url(self.url + '/', '/jobsteps/allocate') == desired 56 | assert ChangesAPI.make_url(self.url + '//', '/jobsteps/allocate') == desired 57 | 58 | def test_make_url_query(self): 59 | desired = ['https://changes.com/api/0/jobsteps/allocate/?foo=bar&baz=xyzz', 60 | 'https://changes.com/api/0/jobsteps/allocate/?baz=xyzz&foo=bar'] 61 | full_url = ChangesAPI.make_url(self.url, '/jobsteps/allocate/', {'foo': 'bar', 'baz': 'xyzz'}) 62 | assert full_url in desired 63 | 64 | 65 | class ChangesSchedulerTest(TestCase): 66 | 67 | def setUp(self): 68 | self.test_dir = tempfile.mkdtemp() 69 | super(ChangesSchedulerTest, self).setUp() 70 | 71 | def tearDown(self): 72 | shutil.rmtree(self.test_dir) 73 | super(ChangesSchedulerTest, self).tearDown() 74 | 75 | def _make_task_status(self, id='taskid', state=mesos_pb2.TASK_FINISHED, 76 | message="foo", slave_id='slaveid', jobstep_id='1'): 77 | status = mesos_pb2.TaskStatus( 78 | task_id=mesos_pb2.TaskID(value=id), 79 | slave_id=mesos_pb2.SlaveID(value=slave_id), 80 | state=state, 81 | message=message, 82 | ) 83 | return status 84 | 85 | def _make_offer(self, 86 | hostname=None, 87 | cpus=4, 88 | mem=8192, 89 | cluster=None, 90 | id='offerid', 91 | unavailability_start_secs=None, 92 | unavailability_duration_secs=None): 93 | # Offers with different IDs will have different hostnames, unless 94 | # otherwise explicitly specified. 
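# For example (hypothetical calls): _make_offer(id='offer_1') yields an
# offer with hostname 'offer_1', while
# _make_offer(id='offer_1', hostname='host_a') yields hostname 'host_a'.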
95 | if hostname is None: 96 | hostname = id 97 | offer = mesos_pb2.Offer( 98 | id=mesos_pb2.OfferID(value=id), 99 | framework_id=mesos_pb2.FrameworkID(value="frameworkid"), 100 | slave_id=mesos_pb2.SlaveID(value="slave_id_" + hostname), 101 | hostname=hostname, 102 | ) 103 | 104 | if unavailability_start_secs is not None: 105 | offer.unavailability.start.nanoseconds = int(unavailability_start_secs * 1000000000) 106 | if unavailability_duration_secs is not None: 107 | offer.unavailability.duration.nanoseconds = int(unavailability_duration_secs * 1000000000) 108 | 109 | offer.resources.add(name="cpus", 110 | type=mesos_pb2.Value.SCALAR, 111 | scalar=mesos_pb2.Value.Scalar(value=cpus)) 112 | offer.resources.add(name="mem", 113 | type=mesos_pb2.Value.SCALAR, 114 | scalar=mesos_pb2.Value.Scalar(value=mem)) 115 | if cluster: 116 | offer.attributes.add(name="labels", 117 | type=mesos_pb2.Value.TEXT, 118 | text=mesos_pb2.Value.Text(value=cluster)) 119 | return offer 120 | 121 | def _make_changes_task(self, id, cpus=2, mem=4096, slug='foo', cmd='ls', snapshot=None): 122 | image = None 123 | if snapshot: 124 | image = {'snapshot': {'id': snapshot}} 125 | 126 | return {'project': {'slug': slug}, 'id': id, 127 | 'cmd': cmd, 'resources': {'cpus': cpus, 'mem': mem}, 128 | 'image': image} 129 | 130 | def test_save_restore_state(self): 131 | state_file = self.test_dir + '/test.json' 132 | 133 | cs = ChangesScheduler(state_file, api=mock.Mock(), 134 | blacklist=_noop_blacklist()) 135 | cs.tasksLaunched = 5 136 | cs.tasksFinished = 3 137 | cs.taskJobStepMapping['task x'] = 'jobstep x' 138 | cs.tasksPendingKill = {'task y': 100.5} 139 | cs.slaveIdInfo['slaveid'] = SlaveInfo(hostname='aHostname') 140 | cs._snapshot_slave_map = defaultdict(lambda: defaultdict(float)) 141 | cs._snapshot_slave_map['snapid']['host1'] = 1234567.0 142 | cs._snapshot_slave_map['snapid']['host2'] = 1234569.0 143 | cs.save_state() 144 | cs = None 145 | 146 | cs2 = ChangesScheduler(state_file, api=mock.Mock(), 147 | blacklist=_noop_blacklist()) 148 | assert 5 == cs2.tasksLaunched 149 | assert 3 == cs2.tasksFinished 150 | assert {'task x': 'jobstep x'} == cs2.taskJobStepMapping 151 | assert {'task y': 100.5} == cs2.tasksPendingKill 152 | assert cs2.slaveIdInfo['slaveid'].hostname == 'aHostname' 153 | assert not os.path.exists(state_file) 154 | assert {'snapid': {'host1': 1234567.0, 'host2': 1234569.0}} == cs2._snapshot_slave_map 155 | 156 | def test_save_restore_state_missing(self): 157 | state_file = self.test_dir + '/test.json' 158 | 159 | # newly added fields shouldn't be added to this dict. This is so we 160 | # can test that newly added (aka initially missing) fields are 161 | # restored to a reasonable default. 
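# restore_state() tolerates the missing keys via dict.get() defaults,
# e.g. (from changes_scheduler.py, shown earlier):
#   self.tasksPendingKill = state.get('tasksPendingKill', {})
#   for slave, info in state.get('slaveIdInfo', {}).iteritems(): ...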
162 | state = {'framework_id': 1, 163 | 'tasksLaunched': 5, 164 | 'tasksFinished': 3, 165 | 'taskJobStepMapping': {'task x': 'jobstep x'}, 166 | 'snapshot_slave_map': {} 167 | } 168 | 169 | with open(state_file, 'w') as f: 170 | f.write(json.dumps(state)) 171 | 172 | cs2 = ChangesScheduler(state_file, api=mock.Mock(), 173 | blacklist=_noop_blacklist()) 174 | assert 5 == cs2.tasksLaunched 175 | assert 3 == cs2.tasksFinished 176 | assert {'task x': 'jobstep x'} == cs2.taskJobStepMapping 177 | assert cs2.tasksPendingKill == {} 178 | assert cs2.slaveIdInfo == {} 179 | assert not os.path.exists(state_file) 180 | assert {} == cs2._snapshot_slave_map 181 | 182 | def test_task_finished(self): 183 | api = mock.Mock(spec=ChangesAPI) 184 | cs = ChangesScheduler(state_file=None, api=api, 185 | blacklist=_noop_blacklist()) 186 | cs.taskJobStepMapping = {'taskid': '1'} 187 | cs.slaveIdInfo = {'slaveid': SlaveInfo(hostname='aHostname')} 188 | driver = mock.Mock() 189 | 190 | status = self._make_task_status(id='taskid', jobstep_id='1') 191 | 192 | cs.statusUpdate(driver, status) 193 | 194 | assert cs.tasksFinished == 1 195 | assert len(cs.taskJobStepMapping) == 0 196 | 197 | api.update_jobstep.assert_called_once_with('1', status='finished', hostname='aHostname') 198 | 199 | def test_task_failed(self): 200 | api = mock.Mock(spec=ChangesAPI) 201 | cs = ChangesScheduler(state_file=None, api=api, 202 | blacklist=_noop_blacklist()) 203 | cs.taskJobStepMapping = {'taskid': '1'} 204 | cs.slaveIdInfo = {'slaveid': SlaveInfo(hostname='aHostname')} 205 | driver = mock.Mock() 206 | 207 | status = self._make_task_status(id='taskid', jobstep_id='1', state=mesos_pb2.TASK_FAILED) 208 | 209 | cs.statusUpdate(driver, status) 210 | 211 | assert cs.tasksFinished == 0 212 | assert len(cs.taskJobStepMapping) == 0 213 | 214 | assert api.jobstep_console_append.call_count == 1 215 | api.update_jobstep.assert_called_once_with('1', status='finished', result='infra_failed', hostname='aHostname') 216 | 217 | def test_task_running(self): 218 | api = mock.Mock(spec=ChangesAPI) 219 | cs = ChangesScheduler(state_file=None, api=api, 220 | blacklist=_noop_blacklist()) 221 | cs.taskJobStepMapping = {'taskid': '1'} 222 | driver = mock.Mock() 223 | 224 | status = self._make_task_status(id='taskid', jobstep_id='1', state=mesos_pb2.TASK_RUNNING) 225 | 226 | cs.statusUpdate(driver, status) 227 | 228 | assert cs.tasksFinished == 0 229 | assert len(cs.taskJobStepMapping) == 1 230 | 231 | api.jobstep_console_append.assert_not_called() 232 | api.update_jobstep.assert_not_called() 233 | 234 | def test_missing_jobstep_mapping(self): 235 | api = mock.Mock(spec=ChangesAPI) 236 | stats = mock.Mock() 237 | cs = ChangesScheduler(state_file=None, api=api, stats=stats, 238 | blacklist=_noop_blacklist()) 239 | cs.taskJobStepMapping = {} 240 | driver = mock.Mock() 241 | 242 | status = self._make_task_status(id='taskid', jobstep_id='1', state=mesos_pb2.TASK_FINISHED) 243 | 244 | cs.statusUpdate(driver, status) 245 | 246 | assert cs.tasksFinished == 1 247 | 248 | stats.incr.assert_called_once_with('missing_jobstep_id_finished') 249 | 250 | def test_missing_hostname_mapping(self): 251 | api = mock.Mock(spec=ChangesAPI) 252 | cs = ChangesScheduler(state_file=None, api=api, 253 | blacklist=_noop_blacklist()) 254 | cs.taskJobStepMapping = {'taskid': '1'} 255 | driver = mock.Mock() 256 | 257 | status = self._make_task_status(id='taskid', jobstep_id='1') 258 | 259 | cs.statusUpdate(driver, status) 260 | 261 | assert cs.tasksFinished == 1 262 | assert 
len(cs.taskJobStepMapping) == 0 263 | 264 | api.update_jobstep.assert_called_once_with('1', status='finished', hostname=None) 265 | 266 | def test_needs_abort_api_error(self): 267 | api = mock.Mock(spec=ChangesAPI) 268 | api.jobstep_needs_abort.side_effect = APIError("Failure") 269 | cs = ChangesScheduler(state_file=None, api=api, 270 | blacklist=_noop_blacklist()) 271 | cs.taskJobStepMapping = {'task1': '1'} 272 | driver = mock.Mock() 273 | 274 | cs.poll_and_abort(driver) 275 | 276 | api.jobstep_needs_abort.assert_called_once_with(['1']) 277 | assert driver.killTask.call_count == 0 278 | 279 | def test_no_needs_abort(self): 280 | api = mock.Mock(spec=ChangesAPI) 281 | api.jobstep_needs_abort.return_value = [] 282 | cs = ChangesScheduler(state_file=None, api=api, 283 | blacklist=_noop_blacklist()) 284 | cs.taskJobStepMapping = {'task1': '1'} 285 | driver = mock.Mock() 286 | 287 | cs.poll_and_abort(driver) 288 | 289 | api.jobstep_needs_abort.assert_called_once_with(['1']) 290 | assert driver.killTask.call_count == 0 291 | 292 | def test_jobsteps_needs_abort(self): 293 | api = mock.Mock(spec=ChangesAPI) 294 | api.jobstep_needs_abort.return_value = ['1', '2'] 295 | cs = ChangesScheduler(state_file=None, api=api, 296 | blacklist=_noop_blacklist()) 297 | cs.taskJobStepMapping = {'task1': '1', 'task2': '2', 'task3': '3'} 298 | driver = mock.Mock() 299 | killed_tasks = [] 300 | driver.killTask.side_effect = lambda task: killed_tasks.append(task.value) 301 | 302 | with mock.patch('time.time') as t: 303 | t.return_value = 1000.0 304 | cs.poll_and_abort(driver) 305 | 306 | api.jobstep_needs_abort.assert_called_once_with(['1', '2', '3']) 307 | 308 | # task3 isn't marked aborted by Changes so we don't abort it 309 | assert sorted(killed_tasks) == ['task1', 'task2'] 310 | assert driver.killTask.call_count == 2 311 | assert cs.tasksPendingKill == {'task1': 1000, 'task2': 1000} 312 | 313 | def test_aborted_task_wont_die(self): 314 | api = mock.Mock(spec=ChangesAPI) 315 | api.jobstep_needs_abort.return_value = ['1', '2'] 316 | stats = mock.Mock() 317 | cs = ChangesScheduler(state_file=None, api=api, stats=stats, 318 | blacklist=_noop_blacklist()) 319 | cs.taskJobStepMapping = {'task1': '1', 'task2': '2', 'task3': '3'} 320 | task2_time = 1000.0 + TASK_KILL_THRESHOLD + 1 321 | cs.tasksPendingKill = {'task1': 1000.0, 'task2': task2_time} 322 | driver = mock.Mock() 323 | killed_tasks = [] 324 | driver.killTask.side_effect = lambda task: killed_tasks.append(task.value) 325 | 326 | with mock.patch('time.time') as t: 327 | t.return_value = task2_time + 1 328 | cs.poll_and_abort(driver) 329 | 330 | api.jobstep_needs_abort.assert_called_once_with(['1', '2', '3']) 331 | 332 | assert sorted(killed_tasks) == ['task2'] 333 | assert driver.killTask.call_count == 1 334 | assert cs.taskJobStepMapping == {'task2': '2', 'task3': '3'} 335 | assert cs.tasksPendingKill == {'task2': task2_time} 336 | stats.incr.assert_called_once_with('couldnt_abort_task') 337 | 338 | def test_blacklist(self): 339 | blpath = self.test_dir + '/blacklist' 340 | # Ensure we have an empty blacklist file. 
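# The blacklist file is plain text with one hostname per line, e.g.
# (hypothetical contents; per main.py, EC2 hostnames look like
# ip-*-*-*-*.region.compute.internal):
#   some_hostname.com
#   ip-10-0-0-1.us-west-1.compute.internal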
341 | open(blpath, 'w+').close() 342 | 343 | api = mock.MagicMock() 344 | stats = mock.Mock() 345 | cs = ChangesScheduler(state_file=None, api=api, stats=stats, 346 | blacklist=FileBlacklist(blpath)) 347 | offer = self._make_offer(hostname = 'some_hostname.com') 348 | 349 | blacklist = open(blpath, 'w+') 350 | blacklist.write('some_hostname.com\n') 351 | blacklist.close() 352 | 353 | driver = mock.Mock() 354 | # We have to fake the mtime despite the file legitimately having been modified 355 | # later because some filesystems (HFS+, for example) don't have enough precision 356 | # for this to pass reliably. 357 | with mock.patch('os.path.getmtime', return_value=time.time()+1) as getmtime: 358 | help_resource_offers_and_poll_changes(cs, driver, [offer]) 359 | getmtime.assert_called_with(blpath) 360 | assert api.declineOffer.call_count == 0 361 | assert api.allocate_jobsteps.call_count == 0 362 | 363 | assert stats.incr.call_count == 3 364 | stats.incr.assert_any_call('ignore_for_blacklist', 1) 365 | stats.incr.assert_any_call('ignore_for_maintenance', 0) 366 | 367 | # Decline any unused offers. Expect the blacklisted offer. 368 | cs.decline_open_offers(driver) 369 | driver.declineOffer.assert_called_once_with(offer.id) 370 | 371 | def test_blacklist_maintenance(self): 372 | api = mock.Mock(spec=ChangesAPI) 373 | now = time.time() 374 | memlimit = 8192 375 | 376 | # Test no unavailability scheduled - ACCEPT 377 | offer1 = self._make_offer(hostname='hostname_1.com', 378 | id="offer_1", 379 | mem=memlimit) 380 | 381 | # Test unavailability scheduled right now - DECLINE 382 | offer2 = self._make_offer(hostname='hostname_2.com', 383 | id="offer_2", 384 | mem=memlimit, 385 | unavailability_start_secs=now, 386 | unavailability_duration_secs=10) 387 | 388 | # Test unavailability scheduled in a few seconds - ACCEPT 389 | offer3 = self._make_offer(hostname='hostname_3.com', 390 | id="offer_3", 391 | mem=memlimit, 392 | unavailability_start_secs=now + 5, 393 | unavailability_duration_secs=10) 394 | 395 | # Test unavailability scheduled in the past, ending in the past - ACCEPT 396 | offer4 = self._make_offer(hostname='hostname_4.com', 397 | id="offer_4", 398 | mem=memlimit, 399 | unavailability_start_secs=now - 20, 400 | unavailability_duration_secs=10) 401 | 402 | # Test unavailability in progress - DECLINE 403 | offer5 = self._make_offer(hostname='hostname_5.com', 404 | id="offer_5", 405 | mem=memlimit, 406 | unavailability_start_secs=now - 5, 407 | unavailability_duration_secs=10) 408 | 409 | # Test past unavailability with no duration - DECLINE 410 | offer6 = self._make_offer(hostname='hostname_6.com', 411 | id="offer_6", 412 | mem=memlimit, 413 | unavailability_start_secs=now - 5, 414 | unavailability_duration_secs=None) 415 | 416 | # Test future unavailability with no duration - ACCEPT 417 | offer7 = self._make_offer(hostname='hostname_7.com', 418 | id="offer_7", 419 | mem=memlimit, 420 | unavailability_start_secs=now + 5, 421 | unavailability_duration_secs=None) 422 | 423 | # Test unavailability with zero duration - ACCEPT 424 | offer8 = self._make_offer(hostname='hostname_8.com', 425 | id="offer_8", 426 | mem=memlimit, 427 | unavailability_start_secs=now - 5, 428 | unavailability_duration_secs=0) 429 | 430 | all_offers = [offer1, offer2, offer3, offer4, offer5, offer6, offer7, offer8] 431 | expected_launches = [offer1, offer3, offer4, offer7, offer8] 432 | expected_ignores = [offer2, offer5, offer6] 433 | 434 | # To ensure that offers aren't accidentally declined due to a shortage 435 | # of 
tasks, ensure tasks > offers so there's at least one task per
436 | # machine, plus an extra. (each offer has memory for one task)
437 | num_tasks = len(all_offers) + 1
438 | tasks = []
439 | for i in xrange(num_tasks):
440 | tasks.append(self._make_changes_task(str(i), mem=memlimit))
441 | api.get_allocate_jobsteps.return_value = tasks
442 | 
443 | # Each usable offer has memory for exactly one task: one allocation per expected launch.
444 | post_allocate_jobsteps_return = []
445 | for i in xrange(len(expected_launches)):
446 | post_allocate_jobsteps_return.append(str(i))
447 | api.post_allocate_jobsteps.return_value = post_allocate_jobsteps_return
448 | 
449 | # Actually run the test logic.
450 | stats = mock.MagicMock(spec=statsreporter.Stats)
451 | cs = ChangesScheduler(state_file=None, api=api, stats=stats,
452 | blacklist=_noop_blacklist())
453 | driver = mock.Mock()
454 | help_resource_offers_and_poll_changes(cs, driver, all_offers)
455 | 
456 | # Maintenanced offers are not declined inline; they are returned at shutdown below.
457 | assert driver.declineOffer.call_count == 0
458 | 
459 | # Check the stats reporting.
460 | assert stats.incr.call_count == 3
461 | stats.incr.assert_any_call('offers', len(all_offers))
462 | stats.incr.assert_any_call('ignore_for_blacklist', 0)
463 | stats.incr.assert_any_call('ignore_for_maintenance', len(expected_ignores))
464 | 
465 | # Check that the non-maintenanced tasks are launched.
466 | assert driver.launchTasks.call_count == len(expected_launches)
467 | actual_launch_set = set()
468 | expected_launch_set = set()
469 | for launch_offer, args in zip(expected_launches,
470 | driver.launchTasks.call_args_list):
471 | expected_launch_set.add(launch_offer.id.value)
472 | assert len(args[0][0]) == 1 # only one OfferId in the launch args.
473 | actual_launch_set.add(args[0][0][0].value)
474 | assert actual_launch_set == expected_launch_set
475 | 
476 | # Decline any unused offers. Expect all maintenanced offers.
477 | cs.decline_open_offers(driver)
478 | for offer in expected_ignores:
479 | driver.declineOffer.assert_any_call(offer.id)
480 | assert driver.declineOffer.call_count == len(expected_ignores)
481 | 
482 | def test_error_stats(self):
483 | stats = mock.Mock()
484 | cs = ChangesScheduler(state_file=None, api=mock.Mock(), stats=stats,
485 | blacklist=_noop_blacklist())
486 | driver = mock.Mock()
487 | cs.error(driver, 'message')
488 | stats.incr.assert_called_once_with('errors')
489 | 
490 | def test_slaveLost(self):
491 | stats = mock.Mock()
492 | cs = ChangesScheduler(state_file=None, api=mock.Mock(), stats=stats,
493 | blacklist=_noop_blacklist())
494 | driver = mock.Mock()
495 | 
496 | pb_offer = self._make_offer(hostname="hostname")
497 | 
498 | # Check removing an unrecognized slave.
499 | assert len(cs._cached_slaves) == 0
500 | cs.slaveLost(driver, pb_offer.slave_id)
501 | stats.incr.assert_called_once_with('slave_lost')
502 | 
503 | # Check removing a recognized slave.
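# For a recognized slave, slaveLost() (shown earlier) pops the slave from
# _cached_slaves and declines its open offers, which is why we expect both
# a 'slave_lost' and a 'decline_for_slave_lost' stat below.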
504 | cs.resourceOffers(driver, [pb_offer]) 505 | stats.reset_mock() 506 | assert len(cs._cached_slaves) == 1 507 | 508 | cs.slaveLost(driver, pb_offer.slave_id) 509 | assert stats.incr.call_count == 2 510 | stats.incr.assert_any_call('decline_for_slave_lost', 1) 511 | stats.incr.assert_any_call('slave_lost') 512 | assert len(cs._cached_slaves) == 0 513 | 514 | def test_disconnected(self): 515 | stats = mock.Mock() 516 | cs = ChangesScheduler(state_file=None, api=mock.Mock(), stats=stats, 517 | blacklist=_noop_blacklist()) 518 | driver = mock.Mock() 519 | 520 | pb_offer = self._make_offer(hostname="hostname") 521 | cs.resourceOffers(driver, [pb_offer]) 522 | assert len(cs._cached_slaves) == 1 523 | 524 | cs.disconnected(driver) 525 | assert len(cs._cached_slaves) == 0 526 | 527 | def test_api_error(self): 528 | api = mock.Mock(spec=ChangesAPI) 529 | api.get_allocate_jobsteps.side_effect = APIError("Failure") 530 | cs = ChangesScheduler(state_file=None, api=api, 531 | blacklist=_noop_blacklist()) 532 | driver = mock.Mock() 533 | 534 | offer = self._make_offer() 535 | 536 | help_resource_offers_and_poll_changes(cs, driver, [offer]) 537 | 538 | api.get_allocate_jobsteps.assert_called_once_with(limit=200, cluster=None) 539 | assert driver.declineOffer.call_count == 0 540 | 541 | # Decline any unused offers. Expect the errored offer. 542 | cs.decline_open_offers(driver) 543 | driver.declineOffer.assert_called_once_with(offer.id) 544 | 545 | def test_api_no_tasks(self): 546 | api = mock.Mock(spec=ChangesAPI) 547 | api.get_allocate_jobsteps.return_value = [] 548 | cs = ChangesScheduler(state_file=None, api=api, 549 | blacklist=_noop_blacklist()) 550 | driver = mock.Mock() 551 | 552 | offer = self._make_offer(cluster="foo_cluster") 553 | 554 | help_resource_offers_and_poll_changes(cs, driver, [offer]) 555 | 556 | api.get_allocate_jobsteps.assert_called_once_with(limit=200, cluster="foo_cluster") 557 | assert driver.declineOffer.call_count == 0 558 | 559 | # Decline any unused offers. Expect the only existing offer. 
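# resourceOffers() never declines inline; it only caches offers (see the
# scheduler code earlier), so the unused offer is only returned to Mesos
# here via decline_open_offers().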
560 | cs.decline_open_offers(driver) 561 | driver.declineOffer.assert_called_once_with(offer.id) 562 | 563 | def test_api_one_task(self): 564 | api = mock.Mock(spec=ChangesAPI) 565 | api.get_allocate_jobsteps.return_value = [self._make_changes_task('1')] 566 | api.post_allocate_jobsteps.return_value = ['1'] 567 | cs = ChangesScheduler(state_file=None, api=api, 568 | blacklist=_noop_blacklist()) 569 | driver = mock.Mock() 570 | 571 | offer = self._make_offer(hostname='aHostname', cluster="foo_cluster") 572 | 573 | def check_tasks(offer_ids, tasks, filters): 574 | assert len(offer_ids) == 1 575 | assert offer_ids[0] == offer.id 576 | assert len(tasks) == 1 577 | assert tasks[0].name == 'foo 1' 578 | assert tasks[0].slave_id.value == offer.slave_id.value 579 | assert tasks[0].command.value == 'ls' 580 | assert tasks[0].resources[0].name == "cpus" 581 | assert tasks[0].resources[0].scalar.value == 2 582 | assert tasks[0].resources[1].name == "mem" 583 | assert tasks[0].resources[1].scalar.value == 4096 584 | assert filters.refuse_seconds == 1.0 585 | driver.launchTasks.side_effect = check_tasks 586 | 587 | help_resource_offers_and_poll_changes(cs, driver, [offer]) 588 | 589 | api.get_allocate_jobsteps.assert_called_once_with(limit=200, cluster="foo_cluster") 590 | api.post_allocate_jobsteps.assert_called_once_with(['1'], cluster="foo_cluster") 591 | assert driver.launchTasks.call_count == 1 592 | assert cs.tasksLaunched == 1 593 | 594 | # Decline any unused offers (should be none) 595 | cs.decline_open_offers(driver) 596 | assert driver.declineOffer.call_count == 0 597 | 598 | def test_not_enough_resources(self): 599 | api = mock.Mock(spec=ChangesAPI) 600 | api.get_allocate_jobsteps.return_value = [self._make_changes_task('1', cpus=8)] 601 | api.post_allocate_jobsteps.return_value = ['1'] 602 | cs = ChangesScheduler(state_file=None, api=api, 603 | blacklist=_noop_blacklist()) 604 | driver = mock.Mock() 605 | 606 | offer = self._make_offer(cluster="foo_cluster", cpus=4) 607 | 608 | help_resource_offers_and_poll_changes(cs, driver, [offer]) 609 | 610 | api.get_allocate_jobsteps.assert_called_once_with(limit=200, cluster="foo_cluster") 611 | assert api.post_allocate_jobsteps.call_count == 0 612 | assert driver.launchTasks.call_count == 0 613 | assert driver.declineOffer.call_count == 0 614 | assert cs.tasksLaunched == 0 615 | 616 | # Decline any unused offers. Expect the offer with insufficient 617 | # resources to schedule the jobstep. 
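# A sketch of the assumed fit check (the real logic lives in
# changes_scheduler.py): a jobstep fits an offer only if
#   task.cpus <= offer.cpu and task.mem <= offer.mem
# Here the task wants 8 cpus but the offer has 4, so nothing launches and
# the offer is returned below.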
618 | cs.decline_open_offers(driver) 619 | driver.declineOffer.assert_called_once_with(offer.id) 620 | 621 | def test_tries_all_offers(self): 622 | api = mock.Mock(spec=ChangesAPI) 623 | api.get_allocate_jobsteps.return_value = [self._make_changes_task('1', cpus=8)] 624 | api.post_allocate_jobsteps.return_value = ['1'] 625 | cs = ChangesScheduler(state_file=None, api=api, 626 | blacklist=_noop_blacklist()) 627 | driver = mock.Mock() 628 | 629 | offer1 = self._make_offer(hostname="host1", cluster="foo_cluster", cpus=4) 630 | offer2 = self._make_offer(hostname="host2", cluster="foo_cluster", cpus=8) 631 | 632 | def check_tasks(offer_ids, tasks, filters): 633 | assert offer_ids == [offer2.id] 634 | assert len(tasks) == 1 635 | assert tasks[0].name == 'foo 1' 636 | assert tasks[0].slave_id.value == offer2.slave_id.value 637 | assert filters.refuse_seconds == 1.0 638 | driver.launchTasks.side_effect = check_tasks 639 | 640 | help_resource_offers_and_poll_changes(cs, driver, [offer1, offer2]) 641 | 642 | api.get_allocate_jobsteps.assert_called_once_with(limit=200, cluster="foo_cluster") 643 | api.post_allocate_jobsteps.assert_called_once_with(['1'], cluster="foo_cluster") 644 | assert driver.launchTasks.call_count == 1 645 | assert cs.tasksLaunched == 1 646 | 647 | # Decline any unused offers (should be one) 648 | cs.decline_open_offers(driver) 649 | assert driver.declineOffer.call_count == 1 650 | 651 | def test_least_loaded(self): 652 | api = mock.Mock(spec=ChangesAPI) 653 | # task 4 won't be allocated if we schedule tasks in the order they're 654 | # returned 655 | api.get_allocate_jobsteps.return_value = [ 656 | self._make_changes_task('1'), self._make_changes_task('2'), 657 | self._make_changes_task('3'), self._make_changes_task('4', cpus=3), 658 | ] 659 | api.post_allocate_jobsteps.return_value = ['1', '2', '3'] 660 | cs = ChangesScheduler(state_file=None, api=api, 661 | blacklist=_noop_blacklist()) 662 | driver = mock.Mock() 663 | 664 | offer1 = self._make_offer(id='offer1', cpus=4, mem=8192) 665 | # should get loaded first 666 | offer2 = self._make_offer(id='offer2', cpus=4, mem=8193) 667 | 668 | def check_tasks(offer_ids, tasks, filters): 669 | assert len(offer_ids) == 1 670 | offer_id = offer_ids[0] 671 | assert offer_id in (offer1.id, offer2.id) 672 | if offer_id == offer1.id: 673 | assert len(tasks) == 1 674 | # after task 1 is allocated, this slave is least loaded, so 675 | # second task should go to it. 
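# Walking through the assumed least-loaded order (each task wants
# 2 cpus / 4096 mem):
#   offer2 (4 cpus / 8193 mem) is least loaded  -> task 1 (2 / 4097 left)
#   offer1 (4 cpus / 8192 mem) is least loaded  -> task 2 (2 / 4096 left)
#   offer2 (2 cpus / 4097 mem) is least loaded  -> task 3 (0 / 1 left)
#   task 4 needs 3 cpus; no slave has that many -> not allocated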
676 | assert tasks[0].name == 'foo 2' 677 | assert tasks[0].slave_id.value == offer1.slave_id.value 678 | elif offer_id == offer2.id: 679 | assert len(tasks) == 2 680 | assert tasks[0].name == 'foo 1' 681 | assert tasks[0].slave_id.value == offer2.slave_id.value 682 | # for task 3 this slave is least loaded again 683 | assert tasks[1].name == 'foo 3' 684 | assert tasks[1].slave_id.value == offer2.slave_id.value 685 | assert filters.refuse_seconds == 1.0 686 | 687 | driver.launchTasks.side_effect = check_tasks 688 | 689 | help_resource_offers_and_poll_changes(cs, driver, [offer1, offer2]) 690 | 691 | api.get_allocate_jobsteps.assert_called_once_with(limit=200, cluster=None) 692 | api.post_allocate_jobsteps.assert_called_once_with(['1', '2', '3'], cluster=None) 693 | assert driver.launchTasks.call_count == 2 694 | assert cs.tasksLaunched == 3 695 | 696 | # Decline any unused offers (should be none) 697 | cs.decline_open_offers(driver) 698 | assert driver.declineOffer.call_count == 0 699 | 700 | def test_alloc_failed(self): 701 | api = mock.Mock(spec=ChangesAPI) 702 | api.get_allocate_jobsteps.side_effect = lambda limit, cluster: [self._make_changes_task(id=cluster)] 703 | def post_allocate_jobsteps(ids, cluster): 704 | if cluster == '1': 705 | return ['1'] 706 | else: 707 | raise APIError('Failure') 708 | api.post_allocate_jobsteps.side_effect = post_allocate_jobsteps 709 | cs = ChangesScheduler(state_file=None, api=api, 710 | blacklist=_noop_blacklist()) 711 | driver = mock.Mock() 712 | 713 | offer1 = self._make_offer(id="offer1", cluster="1") 714 | offer2 = self._make_offer(id="offer2", cluster="2") 715 | 716 | def check_tasks(offer_ids, tasks, filters): 717 | assert len(offer_ids) == 1 718 | assert offer_ids[0] == offer1.id 719 | # other task should still get launched if second one failed. 720 | assert len(tasks) == 1 721 | assert tasks[0].name == 'foo 1' 722 | assert tasks[0].slave_id.value == offer1.slave_id.value 723 | assert filters.refuse_seconds == 1.0 724 | driver.launchTasks.side_effect = check_tasks 725 | 726 | help_resource_offers_and_poll_changes(cs, driver, [offer1, offer2]) 727 | 728 | api.get_allocate_jobsteps.assert_has_calls([mock.call(limit=200, cluster='1'), mock.call(limit=200, cluster='2')], 729 | any_order=True) 730 | assert api.get_allocate_jobsteps.call_count == 2 731 | api.post_allocate_jobsteps.assert_has_calls([mock.call(['1'], cluster='1'), mock.call(['2'], cluster='2')], 732 | any_order=True) 733 | assert api.post_allocate_jobsteps.call_count == 2 734 | assert driver.launchTasks.call_count == 1 735 | assert cs.tasksLaunched == 1 736 | assert driver.declineOffer.call_count == 0 737 | 738 | # Decline any unused offers (offer2 should be open, since it failed 739 | # to schedule.) 740 | cs.decline_open_offers(driver) 741 | driver.declineOffer.assert_called_once_with(offer2.id) 742 | 743 | def test_group_snapshots_on_same_machine(self): 744 | # Create 2 tasks with same snapshot and assert they both go to the 745 | # same slave. 
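# Snapshot affinity is tracked in _snapshot_slave_map (see restore_state()
# earlier), shaped as {snapshot_id: {hostname: last_seen_timestamp}}, so
# jobsteps that share a snapshot are steered to a slave that already has it.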
746 | api = mock.Mock(spec=ChangesAPI)
747 | api.get_allocate_jobsteps.return_value = [
748 | self._make_changes_task('1', cpus=2, snapshot='snapfoo'),
749 | self._make_changes_task('2', cpus=2, snapshot='snapfoo')
750 | ]
751 | api.post_allocate_jobsteps.return_value = ['1', '2']
752 | 
753 | cs = ChangesScheduler(state_file=None, api=api,
754 | blacklist=_noop_blacklist())
755 | driver = mock.Mock()
756 | 
757 | launched_offer_id = None
758 | def launchTasks(offers, tasks, filters=None):
759 | # Assert all launched tasks go to offer1 (host1)
760 | # host1 is assured to be picked first because it has slightly more
761 | # resources at first.
762 | assert len(offers) == 1
763 | assert offers == [mesos_pb2.OfferID(value="offer1")]
764 | 
765 | driver.launchTasks.side_effect = launchTasks
766 | 
767 | offer1 = self._make_offer(id='offer1', hostname='host1', cpus=5)
768 | offer2 = self._make_offer(id='offer2', hostname='host2', cpus=4)
769 | 
770 | help_resource_offers_and_poll_changes(cs, driver, [offer1, offer2])
771 | 
772 | api.get_allocate_jobsteps.assert_called_once_with(limit=200,
773 | cluster=None)
774 | assert api.post_allocate_jobsteps.call_count == 1
775 | assert driver.launchTasks.call_count == 1
776 | assert driver.declineOffer.call_count == 0
777 | assert cs.tasksLaunched == 2
778 | 
779 | # Decline any unused offers. Expect offer2 remains, since both tasks
780 | # were scheduled on offer1.
781 | cs.decline_open_offers(driver)
782 | driver.declineOffer.assert_called_once_with(offer2.id)
783 | 
784 | def test_fall_back_to_least_loaded(self):
785 | # Fall back to least-loaded assignment if the snapshot for a task is
786 | # not found on any slave.
787 | api = mock.Mock(spec=ChangesAPI)
788 | api.get_allocate_jobsteps.return_value = [
789 | self._make_changes_task('1', cpus=2, snapshot='snapfoo'),
790 | self._make_changes_task('2', cpus=2, snapshot='snapbar')
791 | ]
792 | api.post_allocate_jobsteps.return_value = ['1', '2']
793 | 
794 | cs = ChangesScheduler(state_file=None, api=api,
795 | blacklist=_noop_blacklist())
796 | driver = mock.Mock()
797 | 
798 | offer1 = self._make_offer(id='offer1', hostname='host1', cpus=4)
799 | offer2 = self._make_offer(id='offer2', hostname='host2', cpus=4)
800 | 
801 | help_resource_offers_and_poll_changes(cs, driver, [offer1, offer2])
802 | 
803 | api.get_allocate_jobsteps.assert_called_once_with(limit=200,
804 | cluster=None)
805 | assert api.post_allocate_jobsteps.call_count == 1
806 | assert driver.launchTasks.call_count == 2 # Jobs are sent to separate slaves
807 | assert driver.declineOffer.call_count == 0
808 | assert cs.tasksLaunched == 2
809 | 
810 | # Decline any unused offers (should be none)
811 | cs.decline_open_offers(driver)
812 | assert driver.declineOffer.call_count == 0
813 | 
814 | def test_prefer_loaded_slave_with_snapshot(self):
815 | # Prefer the slave that already has the snapshot, even when another
816 | # slave is less loaded.
817 | api = mock.Mock(spec=ChangesAPI)
818 | api.get_allocate_jobsteps.return_value = [
819 | self._make_changes_task('1', cpus=2, snapshot='snapfoo')
820 | ]
821 | api.post_allocate_jobsteps.return_value = ['1']
822 | 
823 | cs = ChangesScheduler(state_file=None, api=api,
824 | blacklist=_noop_blacklist())
825 | driver = mock.Mock()
826 | 
827 | offer1 = self._make_offer(id='offer1', hostname='host1', cpus=4)
828 | help_resource_offers_and_poll_changes(cs, driver, [offer1])
829 | 
830 | api.get_allocate_jobsteps.assert_called_once_with(limit=200,
831 | cluster=None)
832 | 
833 | cs.decline_open_offers(driver)
834 | api.reset_mock()
835 | driver.reset_mock()
836 | 
837 | api.get_allocate_jobsteps.return_value = [
838 | self._make_changes_task('2', cpus=2, snapshot='snapfoo')
839 | ]
840 | api.post_allocate_jobsteps.return_value = ['2']
841 | 
842 | def launchTasks(offers, tasks, filters=None):
843 | # Assert launched task goes to offer1 (host1)
844 | # although it has lesser resources than host2
845 | assert offers == [mesos_pb2.OfferID(value="offer1")]
846 | 
847 | driver.launchTasks.side_effect = launchTasks
848 | 
849 | offer1 = self._make_offer(id='offer1', hostname='host1', cpus=2)
850 | offer2 = self._make_offer(id='offer2', hostname='host2', cpus=4)
851 | help_resource_offers_and_poll_changes(cs, driver, [offer1, offer2])
852 | 
853 | api.get_allocate_jobsteps.assert_called_once_with(limit=200,
854 | cluster=None)
855 | assert api.post_allocate_jobsteps.call_count == 1
856 | assert driver.launchTasks.call_count == 1
857 | assert driver.declineOffer.call_count == 0
858 | assert cs.tasksLaunched == 2
859 | 
860 | # Decline any unused offers. Expect offer2 remains since offer1 was
861 | # preferred.
862 | cs.decline_open_offers(driver)
863 | driver.declineOffer.assert_called_once_with(offer2.id)
864 | 
865 | def test_slave_with_snapshot_unavailable(self):
866 | # Fall back to least-loaded assignment if the slave that has the
867 | # snapshot currently has no open offers.
868 | api = mock.Mock(spec=ChangesAPI)
869 | api.get_allocate_jobsteps.return_value = [
870 | self._make_changes_task('1', cpus=2, snapshot='snapfoo')
871 | ]
872 | api.post_allocate_jobsteps.return_value = ['1']
873 | 
874 | cs = ChangesScheduler(state_file=None, api=api,
875 | blacklist=_noop_blacklist())
876 | driver = mock.Mock()
877 | 
878 | offer1 = self._make_offer(id='offer1', hostname='host1', cpus=4)
879 | help_resource_offers_and_poll_changes(cs, driver, [offer1])
880 | 
881 | api.get_allocate_jobsteps.assert_called_once_with(limit=200,
882 | cluster=None)
883 | 
884 | cs.decline_open_offers(driver)
885 | api.reset_mock()
886 | driver.reset_mock()
887 | 
888 | api.get_allocate_jobsteps.return_value = [
889 | self._make_changes_task('2', cpus=2, snapshot='snapfoo')
890 | ]
891 | api.post_allocate_jobsteps.return_value = ['2']
892 | 
893 | # Use this slightly roundabout way of verifying launchTasks in order to
894 | # avoid hanging the changes-polling thread. Otherwise the test will
895 | # hang when it fails.
896 | expected_launched_tasks = [mesos_pb2.OfferID(value="offer2").value]
897 | launched_tasks = []
898 | def launchTasks(offers, tasks, filters=None):
899 | # Assert offer is accepted although slave doesn't have snapshot.
900 | assert len(offers) == 1
901 | launched_tasks.append(offers[0].value)
902 | 
903 | driver.launchTasks.side_effect = launchTasks
904 | 
905 | offer2 = self._make_offer(id='offer2', hostname='host2', cpus=4)
906 | help_resource_offers_and_poll_changes(cs, driver, [offer2])
907 | assert launched_tasks == expected_launched_tasks
908 | 
909 | api.get_allocate_jobsteps.assert_called_once_with(limit=200,
910 | cluster=None)
911 | assert api.post_allocate_jobsteps.call_count == 1
912 | assert driver.launchTasks.call_count == 1
913 | assert driver.declineOffer.call_count == 0
914 | assert cs.tasksLaunched == 2
915 | 
916 | # Decline any unused offers (there should be none).
917 | cs.decline_open_offers(driver)
918 | assert driver.declineOffer.call_count == 0
919 | 
920 | @mock.patch('time.time')
921 | def test_slave_with_stale_snapshot(self, time_mock):
922 | # Ignore a stale snapshot-to-slave association and fall back to
923 | # least-loaded assignment.
924 | api = mock.Mock(spec=ChangesAPI)
925 | time_mock.return_value = 1
926 | api.get_allocate_jobsteps.return_value = [
927 | self._make_changes_task('1', cpus=2, snapshot='snapfoo')
928 | ]
929 | api.post_allocate_jobsteps.return_value = ['1']
930 | 
931 | cs = ChangesScheduler(state_file=None, api=api,
932 | blacklist=_noop_blacklist())
933 | driver = mock.Mock()
934 | 
935 | offer1 = self._make_offer(id='offer1', hostname='host1', cpus=4)
936 | help_resource_offers_and_poll_changes(cs, driver, [offer1])
937 | 
938 | api.get_allocate_jobsteps.assert_called_once_with(limit=200,
939 | cluster=None)
940 | assert api.post_allocate_jobsteps.call_count == 1
941 | 
942 | cs.decline_open_offers(driver)
943 | api.reset_mock()
944 | driver.reset_mock()
945 | time_mock.return_value = 1000000
946 | 
947 | api.get_allocate_jobsteps.return_value = [
948 | self._make_changes_task('2', cpus=2, snapshot='snapfoo')
949 | ]
950 | api.post_allocate_jobsteps.return_value = ['2']
951 | 
952 | def launchTasks(offers, tasks, filters=None):
953 | # Ignore the stale snapshot association and select the least-loaded slave.
954 | assert offers == [mesos_pb2.OfferID(value="offer2")]
955 | 
956 | driver.launchTasks.side_effect = launchTasks
957 | 
958 | offer1 = self._make_offer(id='offer1', hostname='host1', cpus=2)
959 | offer2 = self._make_offer(id='offer2', hostname='host2', cpus=4)
960 | help_resource_offers_and_poll_changes(cs, driver, [offer1, offer2])
961 | 
962 | api.get_allocate_jobsteps.assert_called_once_with(limit=200,
963 | cluster=None)
964 | assert api.post_allocate_jobsteps.call_count == 1
965 | assert driver.launchTasks.call_count == 1
966 | assert driver.declineOffer.call_count == 0
967 | assert cs.tasksLaunched == 2
968 | 
969 | # Decline any unused offers. Expect offer1 to remain, since offer2 was
970 | # least-loaded.
971 | cs.decline_open_offers(driver)
972 | driver.declineOffer.assert_called_once_with(offer1.id)
973 | 
974 | def test_cached_offer_is_used(self):
975 | api = mock.Mock(spec=ChangesAPI)
976 | cs = ChangesScheduler(state_file=None, api=api,
977 | blacklist=_noop_blacklist())
978 | driver = mock.Mock()
979 | 
980 | # Scheduler has an offer, but no tasks.
981 | api.get_allocate_jobsteps.return_value = []
982 | offer1 = self._make_offer(id='offer1', hostname='host1', cpus=4)
983 | help_resource_offers_and_poll_changes(cs, driver, [offer1])
984 | assert api.get_allocate_jobsteps.call_count == 1
985 | assert api.post_allocate_jobsteps.call_count == 0
986 | 
987 | # Don't decline offers here the way other tests do when resetting:
988 | # we need to leave the offer cache intact.
989 | api.reset_mock()
990 | driver.reset_mock()
991 | 
992 | # When a task arrives, the scheduler uses the cached offer.
993 | api.get_allocate_jobsteps.return_value = [
994 | self._make_changes_task('2', cpus=2, snapshot='snapfoo')
995 | ]
996 | api.post_allocate_jobsteps.return_value = ['2']
997 | help_resource_offers_and_poll_changes(cs, driver, [])
998 | assert api.get_allocate_jobsteps.call_count == 1
999 | assert api.post_allocate_jobsteps.call_count == 1
1000 | 
1001 | cs.decline_open_offers(driver)
1002 | driver.declineOffer.assert_not_called()
1003 | 
1004 | def test_offer_rescinded(self):
1005 | api = mock.Mock(spec=ChangesAPI)
1006 | cs = ChangesScheduler(state_file=None, api=api,
1007 | blacklist=_noop_blacklist())
1008 | driver = mock.Mock()
1009 | 
1010 | # Scheduler has an offer, but no tasks are available.
1011 | api.get_allocate_jobsteps.return_value = []
1012 | offer1 = self._make_offer(id='offer1', hostname='host1', cpus=4)
1013 | help_resource_offers_and_poll_changes(cs, driver, [offer1])
1014 | assert api.get_allocate_jobsteps.call_count == 1
1015 | assert api.post_allocate_jobsteps.call_count == 0
1016 | 
1017 | cs.decline_open_offers(driver)
1018 | api.reset_mock()
1019 | driver.reset_mock()
1020 | 
1021 | # Offer gets rescinded by the Mesos master.
1022 | cs.offerRescinded(driver, offer1.id)
1023 | api.get_allocate_jobsteps.assert_not_called()
1024 | api.reset_mock()
1025 | driver.reset_mock()
1026 | 
1027 | # Now the scheduler has no offers, so the task can't be scheduled.
1028 | api.get_allocate_jobsteps.return_value = [
1029 | self._make_changes_task('2', cpus=2, snapshot='snapfoo')
1030 | ]
1031 | help_resource_offers_and_poll_changes(cs, driver, [])
1032 | # No offers -> no clusters to query for -> no get_allocate_jobsteps calls.
1033 | assert api.get_allocate_jobsteps.call_count == 0
1034 | assert api.post_allocate_jobsteps.call_count == 0
1035 | 
1036 | cs.decline_open_offers(driver)
1037 | driver.declineOffer.assert_not_called()
1038 | 
1039 | def test_combine_offer_fragments(self):
1040 | api = mock.Mock(spec=ChangesAPI)
1041 | cs = ChangesScheduler(state_file=None, api=api,
1042 | blacklist=_noop_blacklist())
1043 | driver = mock.Mock()
1044 | 
1045 | api.get_allocate_jobsteps.return_value = [
1046 | self._make_changes_task('1', cpus=2, mem=2048),
1047 | ]
1048 | api.post_allocate_jobsteps.return_value = ['1']
1049 | 
1050 | # Add an offer for a different host, to make sure offers for different
1051 | # hosts aren't being merged/defragmented.
1052 | host2_offer = self._make_offer(id='host2_offer', hostname='host2',
1053 | cpus=1, mem=1024)
1054 | cs.resourceOffers(driver, [host2_offer])
1055 | 
1056 | # Add a set of small, fragmented offers one at a time. The task can
1057 | # only be scheduled once all offers have arrived. By our powers
1058 | # combined!
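# The four host1 fragments below total 2 cpus and 2048 mem -- exactly the
# task's requirements -- so only the combined offers can satisfy it.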
1059 | offers = ((1, 0), (1, 0), (0, 1024), (0, 1024))
1060 | expected_offer_ids = []
1061 | for i, (cpu, mem) in enumerate(offers):
1062 | offer_id = 'host1_offer{}'.format(i)
1063 | new_offer = self._make_offer(id=offer_id, hostname='host1',
1064 | cpus=cpu, mem=mem)
1065 | expected_offer_ids.append(new_offer.id)
1066 | 
1067 | if i < len(offers) - 1:  # Not the last iteration.
1068 | help_resource_offers_and_poll_changes(cs, driver, [new_offer])
1069 | assert api.get_allocate_jobsteps.call_count == 1
1070 | assert api.post_allocate_jobsteps.call_count == 0
1071 | else:  # On the final iteration only.
1072 | def check_tasks(offer_ids, tasks, filters):
1073 | assert offer_ids == expected_offer_ids
1074 | assert len(tasks) == 1
1075 | driver.launchTasks.side_effect = check_tasks
1076 | help_resource_offers_and_poll_changes(cs, driver, [new_offer])
1077 | assert api.get_allocate_jobsteps.call_count == 1
1078 | assert api.post_allocate_jobsteps.call_count == 1
1079 | api.reset_mock()
1080 | driver.reset_mock()
1081 | cs.decline_open_offers(driver)
1082 | driver.declineOffer.assert_not_called()
1083 | 
1084 | def test_full_thread_polling(self):
1085 | """This test simply runs through the startup/teardown machinery for the
1086 | Changes polling thread. We run the polling loop a few times to ensure
1087 | the looping works properly.
1088 | """
1089 | api = mock.Mock(spec=ChangesAPI)
1090 | cs = ChangesScheduler(state_file=None, api=api,
1091 | blacklist=_noop_blacklist())
1092 | driver = mock.Mock()
1093 | 
1094 | # Add an offer to ensure get_allocate_jobsteps() is polled.
1095 | offer1 = self._make_offer(id='offer1', hostname='host1', cpus=4)
1096 | cs.resourceOffers(driver, [offer1])
1097 | 
1098 | # After get_allocate_jobsteps() has been called a few times, shut down.
1099 | count = [0]
1100 | def get_allocate_jobsteps(limit, cluster):
1101 | if count[0] > 3:
1102 | cs.shuttingDown.set()
1103 | count[0] += 1
1104 | return []
1105 | api.get_allocate_jobsteps.side_effect = get_allocate_jobsteps
1106 | 
1107 | # Just make sure this executes with no exceptions.
1108 | cs.poll_changes_until_shutdown(driver, 0)
1109 | 
1110 | def test_full_thread_polling_with_exception(self):
1111 | """Test that exceptions in the polling thread are reported back to the
1112 | main thread correctly.
1113 | """
1114 | api = mock.Mock(spec=ChangesAPI)
1115 | cs = ChangesScheduler(state_file=None, api=api,
1116 | blacklist=_noop_blacklist())
1117 | driver = mock.Mock()
1118 | 
1119 | # Add an offer to ensure get_allocate_jobsteps() is polled.
1120 | offer1 = self._make_offer(id='offer1', hostname='host1', cpus=4)
1121 | cs.resourceOffers(driver, [offer1])
1122 | 
1123 | # Force an exception in get_allocate_jobsteps().
1124 | def get_allocate_jobsteps(limit, cluster):
1125 | assert False
1126 | api.get_allocate_jobsteps.side_effect = get_allocate_jobsteps
1127 | 
1128 | class Filter(logging.Filter):
1129 | def __init__(self):
1130 | super(Filter, self).__init__()
1131 | self.found_error = False
1132 | 
1133 | def filter(self, record):
1134 | if record.getMessage() == "Polling thread failed. 
Exiting.": 1135 | self.found_error = True 1136 | 1137 | f = Filter() 1138 | try: 1139 | logger.addFilter(f) 1140 | cs.poll_changes_until_shutdown(driver, 0) 1141 | assert f.found_error 1142 | finally: 1143 | logger.removeFilter(f) 1144 | 1145 | def test_state_json(self): 1146 | framework_id = 'frameworkid' 1147 | changes_request_limit = 53 1148 | 1149 | blpath = self.test_dir + '/blacklist' 1150 | blacklist = open(blpath, 'w+') 1151 | blacklist.write('hostname1\nhostname2\n') 1152 | blacklist.close() 1153 | 1154 | api = mock.Mock(spec=ChangesAPI) 1155 | cs = ChangesScheduler(state_file=None, 1156 | api=api, 1157 | blacklist=FileBlacklist(blpath), 1158 | changes_request_limit=changes_request_limit) 1159 | cs.framework_id = framework_id 1160 | driver = mock.Mock() 1161 | now = time.time() 1162 | 1163 | offer1 = self._make_offer(id='offer1', hostname='host1', cpus=1, mem=1024) 1164 | offer2 = self._make_offer(id='offer2', hostname='host1', cpus=2, mem=2048) 1165 | offer3 = self._make_offer(id='offer3', hostname='host3', cpus=4, mem=4096, 1166 | cluster='some_cluster') 1167 | offer4 = self._make_offer(hostname='host4', 1168 | id='offer4', 1169 | cpus=5, 1170 | mem=5000, 1171 | unavailability_start_secs=now - 5, 1172 | unavailability_duration_secs=100) 1173 | offer4.attributes.add(name="ranges_example", 1174 | type=mesos_pb2.Value.RANGES, 1175 | ranges=mesos_pb2.Value.Ranges(range=[ 1176 | mesos_pb2.Value.Range(begin=10, end=20), 1177 | mesos_pb2.Value.Range(begin=30, end=40), 1178 | ])) 1179 | offer4.attributes.add(name="set_example", 1180 | type=mesos_pb2.Value.SET, 1181 | set=mesos_pb2.Value.Set(item=[ 1182 | 'string_1', 1183 | 'string_2', 1184 | ])) 1185 | cs.resourceOffers(driver, [offer1, offer2, offer3, offer4]) 1186 | 1187 | expected_state = { 1188 | 'framework_id': framework_id, 1189 | 'taskJobStepMapping': {}, 1190 | 'tasksPendingKill': {}, 1191 | 'tasksLaunched': 0, 1192 | 'tasksFinished': 0, 1193 | 'shuttingDown': False, 1194 | 'blacklist': { 1195 | 'path': blpath, 1196 | 'entries': [ 1197 | 'hostname1', 1198 | 'hostname2', 1199 | ], 1200 | }, 1201 | 'snapshot_slave_map': {}, 1202 | 'changes_request_limit': changes_request_limit, 1203 | 'cached_slaves': [ 1204 | { 1205 | 'slave_id': 'slave_id_host1', 1206 | 'hostname': 'host1', 1207 | 'cluster': None, 1208 | 'offers': [ 1209 | { 1210 | 'offer_id': 'offer1', 1211 | 'framework_id': framework_id, 1212 | 'url': '', 1213 | 'cpu': 1.0, 1214 | 'mem': 1024, 1215 | 'attributes': [], 1216 | 'resources': [ 1217 | {'name': 'cpus', 'type': mesos_pb2.Value.SCALAR, 'value': 1}, 1218 | {'name': 'mem', 'type': mesos_pb2.Value.SCALAR, 'value': 1024}, 1219 | ], 1220 | }, 1221 | { 1222 | 'offer_id': 'offer2', 1223 | 'framework_id': framework_id, 1224 | 'url': '', 1225 | 'cpu': 2.0, 1226 | 'mem': 2048, 1227 | 'attributes': [], 1228 | 'resources': [ 1229 | {'name': 'cpus', 'type': mesos_pb2.Value.SCALAR, 'value': 2}, 1230 | {'name': 'mem', 'type': mesos_pb2.Value.SCALAR, 'value': 2048}, 1231 | ], 1232 | }, 1233 | ], 1234 | 'total_cpu': 3.0, 1235 | 'total_mem': 3072, 1236 | 'is_maintenanced': False, 1237 | }, 1238 | { 1239 | 'slave_id': 'slave_id_host3', 1240 | 'hostname': 'host3', 1241 | 'cluster': 'some_cluster', 1242 | 'offers': [ 1243 | { 1244 | 'offer_id': 'offer3', 1245 | 'framework_id': framework_id, 1246 | 'url': '', 1247 | 'cpu': 4.0, 1248 | 'mem': 4096, 1249 | 'attributes': [ 1250 | {'name': 'labels', 'type': mesos_pb2.Value.TEXT, 'value': 'some_cluster'}, 1251 | ], 1252 | 'resources': [ 1253 | {'name': 'cpus', 'type': mesos_pb2.Value.SCALAR, 
'value': 4},
1254 | {'name': 'mem', 'type': mesos_pb2.Value.SCALAR, 'value': 4096},
1255 | ],
1256 | },
1257 | ],
1258 | 'total_cpu': 4.0,
1259 | 'total_mem': 4096,
1260 | 'is_maintenanced': False,
1261 | },
1262 | {
1263 | 'slave_id': 'slave_id_host4',
1264 | 'hostname': 'host4',
1265 | 'cluster': None,
1266 | 'offers': [
1267 | {
1268 | 'offer_id': 'offer4',
1269 | 'framework_id': framework_id,
1270 | 'url': '',
1271 | 'cpu': 5.0,
1272 | 'mem': 5000,
1273 | 'attributes': [
1274 | {'name': 'ranges_example', 'type': mesos_pb2.Value.RANGES, 'value': '(10, 20), (30, 40)'},
1275 | {'name': 'set_example', 'type': mesos_pb2.Value.SET, 'value': 'string_1, string_2'},
1276 | ],
1277 | 'resources': [
1278 | {'name': 'cpus', 'type': mesos_pb2.Value.SCALAR, 'value': 5},
1279 | {'name': 'mem', 'type': mesos_pb2.Value.SCALAR, 'value': 5000},
1280 | ],
1281 | },
1282 | ],
1283 | 'total_cpu': 5.0,
1284 | 'total_mem': 5000,
1285 | 'is_maintenanced': True,
1286 | },
1287 | ],
1288 | 'build_state_json_secs': .5,
1289 | }
1290 | 
1291 | state = cs.state_json()
1292 | 
1293 | # Verify that the state can be json-converted cleanly.
1294 | json.dumps(state)
1295 | 
1296 | # Verify that we got a time-to-build with approximately the right order
1297 | # of magnitude, then replace the value with something predictable.
1298 | assert state['build_state_json_secs'] > 0
1299 | assert state['build_state_json_secs'] < 10
1300 | state['build_state_json_secs'] = expected_state['build_state_json_secs']
1301 | 
1302 | # Verify a bunch of state fields individually to make diffing easier
1303 | # when we find a problem.
1304 | for slave, expected_slave in (zip(state['cached_slaves'],
1305 | expected_state['cached_slaves'])):
1306 | for offer, expected_offer in zip(slave['offers'], expected_slave['offers']):
1307 | assert offer == expected_offer
1308 | 
1309 | # Compare all state keys individually.
1310 | for key in expected_state:
1311 | print 'Compare key {}: [{}] vs expected [{}]'.format(
1312 | key, state[key], expected_state[key])
1313 | assert state[key] == expected_state[key]
1314 | 
1315 | # Ensure both dicts have the same number of keys, which means the
1316 | # previous loop hit everything.
1317 | assert sorted(expected_state.keys()) == sorted(state.keys())
1318 | 
1319 | # Add some tasks to the scheduler and reschedule, to trigger some
1320 | # snapshot-slave mappings.
1321 | tasks = [
1322 | self._make_changes_task('1', mem=3072, snapshot='snap1'),
1323 | self._make_changes_task('2', mem=4096, snapshot='snap2'),
1324 | ]
1325 | api.get_allocate_jobsteps.return_value = tasks
1326 | api.post_allocate_jobsteps.return_value = ['1', '2']
1327 | 
1328 | api.get_allocate_jobsteps.reset_mock()
1329 | api.post_allocate_jobsteps.reset_mock()
1330 | assert not cs.poll_and_launch_once(driver)  # Get jobsteps and launch them.
1331 | assert api.get_allocate_jobsteps.call_count == 2
1332 | assert api.post_allocate_jobsteps.call_count == 2
1333 | 
1334 | state = cs.state_json()
1335 | assert len(state['snapshot_slave_map']) == 2
1336 | 
1337 | def test_state_json_performance(self):
1338 | """Verify that the /state_json handler can build its JSON payload in
1339 | less than .05 seconds, on average. 
1340 | """ 1341 | framework_id = 'frameworkid' 1342 | changes_request_limit = 53 1343 | 1344 | blpath = self.test_dir + '/blacklist' 1345 | blacklist = open(blpath, 'w+') 1346 | blacklist.write('hostname1\nhostname2\n') 1347 | blacklist.close() 1348 | 1349 | api = mock.Mock(spec=ChangesAPI) 1350 | cs = ChangesScheduler(state_file=None, 1351 | api=api, 1352 | blacklist=FileBlacklist(blpath), 1353 | changes_request_limit=changes_request_limit) 1354 | cs.framework_id = framework_id 1355 | driver = mock.Mock() 1356 | now = time.time() 1357 | 1358 | offer1 = self._make_offer(id='offer1', hostname='host1', cpus=1, mem=1024) 1359 | offer2 = self._make_offer(id='offer2', hostname='host1', cpus=2, mem=2048) 1360 | offer3 = self._make_offer(id='offer3', hostname='host3', cpus=4, mem=4096, 1361 | cluster='some_cluster') 1362 | offer4 = self._make_offer(hostname='host4', 1363 | id='offer4', 1364 | cpus=5, 1365 | mem=5000, 1366 | unavailability_start_secs=now - 5, 1367 | unavailability_duration_secs=100) 1368 | offer4.attributes.add(name="ranges_example", 1369 | type=mesos_pb2.Value.RANGES, 1370 | ranges=mesos_pb2.Value.Ranges(range=[ 1371 | mesos_pb2.Value.Range(begin=10, end=20), 1372 | mesos_pb2.Value.Range(begin=30, end=40), 1373 | ])) 1374 | offer4.attributes.add(name="set_example", 1375 | type=mesos_pb2.Value.SET, 1376 | set=mesos_pb2.Value.Set(item=[ 1377 | 'string_1', 1378 | 'string_2', 1379 | ])) 1380 | 1381 | cs.resourceOffers(driver, [offer1, offer2, offer3, offer4]) 1382 | 1383 | tasks = [ 1384 | self._make_changes_task('1', mem=3072, snapshot='snap1'), 1385 | self._make_changes_task('2', mem=4096, snapshot='snap2'), 1386 | ] 1387 | api.get_allocate_jobsteps.return_value = tasks 1388 | api.post_allocate_jobsteps.return_value = ['1', '2'] 1389 | assert not cs.poll_and_launch_once(driver) 1390 | 1391 | start_time = time.time() 1392 | loops = 1000 1393 | for i in xrange(loops): 1394 | state_json = cs.state_json() 1395 | total_time = time.time() - start_time 1396 | 1397 | max_avg_time_per_loop = .05 1398 | assert total_time < max_avg_time_per_loop * loops 1399 | -------------------------------------------------------------------------------- /changes_mesos_scheduler/tests/test_service.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import print_function 4 | 5 | import json 6 | import random 7 | from pprint import pprint 8 | 9 | from flask import jsonify, Flask, Response, request 10 | 11 | 12 | app = Flask(__name__) 13 | 14 | @app.route("/") 15 | def index(): 16 | return "Mesos HTTP Proxy test service." 
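# A minimal, hypothetical probe against a locally running instance (Flask
# defaults to port 5000) might look like:
#
#   curl -X POST http://localhost:5000/jobsteps/allocate/ \
#        -H 'Content-Type: application/json' \
#        -d '{"resources": {"cpus": 4, "mem": 8192}}'
#
# The allocate handler below replies with a JSON list of zero or one tasks,
# depending on whether the offered resources meet REQUIRED_CPU/REQUIRED_MEM.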
17 | 
18 | 
19 | @app.route("/jobsteps/allocate/", methods=['POST'])
20 | def offer():
21 | print("Received resource offer:")
22 | print(json.dumps(request.get_json(), sort_keys=True, indent=2, separators=(',', ': ')))
23 | 
24 | REQUIRED_MEM = 500
25 | REQUIRED_CPU = 0.5
26 | 
27 | tasks_to_run = []
28 | 
29 | info = request.get_json()
30 | 
31 | if info["resources"]["cpus"] >= REQUIRED_CPU \
32 | and info["resources"]["mem"] >= REQUIRED_MEM:
33 | 
34 | random_id = str(random.randint(0, 1000))
35 | tasks_to_run.append(
36 | {
37 | "id": "my_job_" + random_id,
38 | # "cmd": "pwd && /bin/sleep " + str(random.randint(10, 60)),
39 | "project": {
40 | "slug": random_id
41 | },
42 | "resources": {
43 | "cpus": REQUIRED_CPU,
44 | "mem": REQUIRED_MEM
45 | }
46 | }
47 | )
48 | 
49 | print("Responding with the following tasks:")
50 | print(json.dumps(tasks_to_run, sort_keys=True, indent=2, separators=(',', ': ')))
51 | return Response(json.dumps(tasks_to_run), mimetype='application/json')
52 | 
53 | 
54 | @app.route("/jobsteps/<job_id>/", methods=['POST'])
55 | def status(job_id):
56 | print("Received status update:")
57 | print(json.dumps(request.get_json(), sort_keys=True, indent=2, separators=(',', ': ')))
58 | return "OK"
59 | 
60 | @app.route("/jobsteps/<job_id>/deallocate/", methods=['POST'])
61 | def deallocate(job_id):
62 | print("Received deallocate request:")
63 | print(json.dumps(request.get_json(), sort_keys=True, indent=2, separators=(',', ': ')))
64 | return "OK"
65 | 
66 | 
67 | if __name__ == "__main__":
68 | app.debug = True
69 | app.run(host='0.0.0.0')
70 | 
--------------------------------------------------------------------------------
/ci/mypy-run:
--------------------------------------------------------------------------------
1 | #!/bin/bash -eux
2 | 
3 | # Allow the path to mypy to be specified in the MYPY environment variable, but default to "mypy".
4 | : ${MYPY=mypy}
5 | 
6 | # Any paths we need to include in typechecking that are not automatically found (that is, that
7 | # have no '# type:' annotation).
8 | EXTRA_FILES=""
9 | 
10 | # Any files with type annotations that should be excluded from typechecking. This is a regular
11 | # expression matched against the filenames.
12 | EXCLUDE=""
13 | 
14 | # Find all Python files that are not in the exclude list and which have a '# type:' annotation.
15 | FILES=`find . -type f -name \*.py -print0 \
16 | | xargs -0 grep -ls '# type:'`
17 | 
18 | if [ -n "$EXCLUDE" ]; then
19 | FILES=`echo "$FILES" | egrep -v "$EXCLUDE"`
20 | fi
21 | 
22 | ci/run_mypy.py $MYPY --silent-imports --py2 $FILES $EXTRA_FILES
23 | 
--------------------------------------------------------------------------------
/ci/mypy-setup:
--------------------------------------------------------------------------------
1 | #!/bin/bash -eux
2 | 
3 | # Heroic effort to install Python 3.2, setuptools, pip, and mypy (in that
4 | # order) on Ubuntu 12.04.
5 | 
6 | # This script needs to run as root.
7 | 
8 | case `whoami` in
9 | root) ;;
10 | *) echo "Please use sudo to run this script as root."; exit 1;;
11 | esac
12 | 
13 | apt-get install -y -q python3
14 | apt-get install -y -q python3-setuptools
15 | apt-get install -y -q python3-pip
16 | 
17 | # Sadly, setuptools and pip are installed in /usr/local/lib/python3.4/
18 | # (but there's no apt-get package that installs Python 3.4). 
19 | # Just add that directory to sys.path.
20 | 
21 | export PYTHONPATH=/usr/local/lib/python3.4/dist-packages
22 | 
23 | python3 -m pip install -q -U git+https://github.com/python/mypy
24 | #python3 -m pip install -q -U git+https://github.com/gvanrossum/pyxl3
25 | # Copied from pyxl3/finish_install.py
26 | #python3 <
--------------------------------------------------------------------------------
/ci/run_mypy.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """Run a mypy command and write a junit-style mypy.junit.xml file.
3 | 
4 | Usage: run_mypy.py <mypy-command> <mypy flags and files...>
5 | 
6 | Runs the given command, echoes its stdout and stderr, and records a
7 | pass, fail, or error result in a junit-style XML file that Changes
8 | can display.
9 | """
10 | 
11 | import subprocess
12 | import sys
13 | import time
14 | 
15 | from xml.sax.saxutils import escape
16 | 
17 | 
18 | # junit-style templates for the three possible outcomes. {time} is the
19 | # elapsed wall-clock time in seconds; {text} is the escaped command
20 | # output that explains a failure or an error.
21 | 
22 | 
23 | PASS_TEMPLATE = """<?xml version="1.0" encoding="utf-8"?>
24 | <testsuite errors="0" failures="0" name="mypy" tests="1" time="{time}">
25 |   <testcase classname="mypy" name="mypy" time="{time}">
26 |   </testcase>
27 | </testsuite>
28 | """
29 | 
30 | FAIL_TEMPLATE = """<?xml version="1.0" encoding="utf-8"?>
31 | <testsuite errors="0" failures="1" name="mypy" tests="1" time="{time}">
32 |   <testcase classname="mypy" name="mypy" time="{time}">
33 |     <failure>{text}</failure>
34 |   </testcase>
35 | </testsuite>
36 | """
37 | 
38 | ERROR_TEMPLATE = """<?xml version="1.0" encoding="utf-8"?>
39 | <testsuite errors="1" failures="0" name="mypy" tests="1" time="{time}">
40 |   <testcase classname="mypy" name="mypy" time="{time}">
41 |     <error>{text}</error>
42 |   </testcase>
43 | </testsuite>
44 | """
45 | 
46 | 
47 | def main():
48 | # TODO: parse flags and args.
49 | cmd = sys.argv[1:]
50 | if not cmd:
51 | sys.stderr.write("Usage: run_mypy mypy <args...>\n")
52 | return 2
53 | junit_file = 'mypy.junit.xml'
54 | 
55 | t0 = time.time()
56 | p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
57 | outb, errb = p.communicate()
58 | code = p.returncode
59 | t1 = time.time()
60 | dt = '%.3f' % (t1 - t0)
61 | 
62 | out = outb.decode('utf-8')
63 | err = errb.decode('utf-8')
64 | 
65 | if out:
66 | if not out.endswith("\n"):
67 | out += "\n"
68 | sys.stdout.write(out)
69 | if err:
70 | if not err.endswith("\n"):
71 | err += "\n"
72 | sys.stderr.write(err)
73 | 
74 | if code == 0:
75 | print("Pass")
76 | xml = PASS_TEMPLATE.format(time=dt)
77 | # TODO(guido): Remove the "mypy:" check once mypy writes to stderr.
78 | elif code == 1 and not err and out and not out.startswith("mypy:"):
79 | print("Fail")
80 | xml = FAIL_TEMPLATE.format(text=escape(out), time=dt)
81 | else:
82 | print("Error")
83 | texts = []
84 | # TODO(guido): Use <system-out> and <system-err> once Changes supports them.
85 | if out:
86 | texts.append("=== stdout ===\n")
87 | texts.append(out)
88 | if err:
89 | texts.append("=== stderr ===\n")
90 | texts.append(err)
91 | text = "".join(texts)
92 | xml = ERROR_TEMPLATE.format(text=escape(text), time=dt)
93 | 
94 | with open(junit_file, 'w') as f:
95 | f.write(xml)
96 | 
97 | return code
98 | 
99 | 
100 | if __name__ == '__main__':
101 | sys.exit(main())
102 | 
--------------------------------------------------------------------------------
/ci/run_tests.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -eux
2 | 
3 | ci/mypy-run
4 | 
5 | make virtualenv_coverage
6 | 
--------------------------------------------------------------------------------
/ci/setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -eux
2 | 
3 | export DEBIAN_FRONTEND=noninteractive
4 | 
5 | # Install git
6 | sudo apt-get install -y git
7 | 
8 | # Install fpm
9 | sudo apt-get install -y ruby-dev gcc
10 | fpm -h > /dev/null || sudo gem install fpm --no-ri --no-rdoc
11 | 
12 | # Install easy_install for fpm to use
13 | sudo apt-get install -y python-setuptools
14 | 
15 | sudo ci/mypy-setup
16 | 
--------------------------------------------------------------------------------
/make_virtualenv.sh:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env bash
2 | set -xe
3 | 
4 | # Based on: https://github.com/brutasse/graphite-api/blob/master/fpm/build-deb.sh
5 | # but adapted to use easy_install.
6 | 
7 | export PROJECT=$1
8 | 
9 | sudo apt-get -y install build-essential python-dev python-virtualenv
10 | 
11 | rm -rf build
12 | 
13 | mkdir -p build/usr/share/python
14 | virtualenv build/usr/share/python/$PROJECT
15 | 
16 | build/usr/share/python/$PROJECT/bin/easy_install virtualenv-tools
17 | # Actually install our project.
18 | build/usr/share/python/$PROJECT/bin/easy_install . 
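# Note: easy_install bakes the build/ prefix into the virtualenv's scripts;
# virtualenv-tools (installed above) rewrites those paths further down to
# the final /usr/share/python/$PROJECT install location.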
19 | 
20 | # Can't seem to do this with easy_install.
21 | # Ideally we wouldn't install test requirements for the deb we install.
22 | build/usr/share/python/$PROJECT/bin/pip install "file://`pwd`#egg=$PROJECT[tests]"
23 | 
24 | find build ! -perm -a+r -exec chmod a+r {} \;
25 | 
26 | cd build/usr/share/python/$PROJECT
27 | # Not sure if this is necessary.
28 | sed -i "s/'\/bin\/python'/\('\/bin\/python','\/bin\/python2'\)/g" lib/python2.7/site-packages/virtualenv_tools-*-py2.7.egg/virtualenv_tools.py
29 | ./bin/virtualenv-tools --update-path /usr/share/python/$PROJECT
30 | cd -
31 | 
32 | find build -iname '*.pyc' -exec rm {} \;
33 | find build -iname '*.pyo' -exec rm {} \;
34 | 
--------------------------------------------------------------------------------
/scripts/changes-mesos-scheduler:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | from changes_mesos_scheduler.main import main
4 | 
5 | main()
6 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from distutils.core import setup
2 | 
3 | MESOS_VERSION = '0.27.0'
4 | UBUNTU_VERSION = '14.04'
5 | 
6 | tests_require = ['pytest>=2.5.0,<2.6.0', 'pytest-cov>=1.6,<1.7',
7 | 'pytest-xdist>=1.9,<1.10', 'unittest2>=0.5.1,<0.6.0',
8 | 'mock>=1.0.1,<1.1.0', 'flask>=0.10.1,<0.11.0']
9 | 
10 | setup(name='changes-mesos-scheduler',
11 | scripts=['scripts/changes-mesos-scheduler'],
12 | packages=['changes_mesos_scheduler'],
13 | extras_require={'tests': tests_require},
14 | dependency_links=['http://downloads.mesosphere.io/master/ubuntu/%s/mesos-%s-py2.7-linux-x86_64.egg#egg=mesos'
15 | % (UBUNTU_VERSION, MESOS_VERSION)],
16 | install_requires=['futures==2.2', 'mesos', 'protobuf>=2.5.0,<3a0', 'raven', 'statsd', 'typing'],
17 | package_dir={'changes_mesos_scheduler': 'changes_mesos_scheduler'})
18 | 
19 | 
--------------------------------------------------------------------------------
/support/bootstrap-vagrant.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -eux
2 | 
3 | export DEBIAN_FRONTEND=noninteractive
4 | 
5 | sudo apt-get update -y
6 | 
7 | # Install git
8 | sudo apt-get install -y git
9 | 
10 | # Install fpm
11 | sudo apt-get install -y ruby-dev gcc
12 | sudo gem install fpm --no-ri --no-rdoc
13 | 
14 | # Install easy_install for fpm to use
15 | sudo apt-get install -y python-setuptools
16 | 
17 | # Install pytest and flask, required for tests.
18 | sudo apt-get install -y python-pip
19 | sudo pip install pytest
20 | sudo pip install flask
21 | 
--------------------------------------------------------------------------------