├── .arcconfig
├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── Vagrantfile
├── changes_mesos_scheduler
│   ├── __init__.py
│   ├── changes_scheduler.py
│   ├── main.py
│   ├── statsreporter.py
│   └── tests
│       ├── __init__.py
│       ├── test_changes_scheduler.py
│       └── test_service.py
├── ci
│   ├── mypy-run
│   ├── mypy-setup
│   ├── run_mypy.py
│   ├── run_tests.sh
│   └── setup.sh
├── make_virtualenv.sh
├── scripts
│   └── changes-mesos-scheduler
├── setup.py
└── support
    └── bootstrap-vagrant.sh

--------------------------------------------------------------------------------
/.arcconfig:
--------------------------------------------------------------------------------
1 | {
2 |   "conduit_uri" : "https://tails.corp.dropbox.com/api/",
3 |   "copyright_holder" : "Dropbox",
4 |   "repository.callsign": "CHANGESMESOSFWK"
5 | }
6 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | build/
3 | dist/
4 | *.egg-info/
5 | *.deb
6 | /.vagrant/
7 | .idea
8 | setup.cfg
9 | 
10 | .coverage
11 | coverage.xml
12 | *junit.xml
13 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |                               Apache License
2 |                         Version 2.0, January 2004
3 |                      http://www.apache.org/licenses/
4 | 
5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 |    1. Definitions.
8 | 
9 |       "License" shall mean the terms and conditions for use, reproduction,
10 |       and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 |       "Licensor" shall mean the copyright owner or entity authorized by
13 |       the copyright owner that is granting the License.
14 | 
15 |       "Legal Entity" shall mean the union of the acting entity and all
16 |       other entities that control, are controlled by, or are under common
17 |       control with that entity. For the purposes of this definition,
18 |       "control" means (i) the power, direct or indirect, to cause the
19 |       direction or management of such entity, whether by contract or
20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 |       outstanding shares, or (iii) beneficial ownership of such entity.
22 | 
23 |       "You" (or "Your") shall mean an individual or Legal Entity
24 |       exercising permissions granted by this License.
25 | 
26 |       "Source" form shall mean the preferred form for making modifications,
27 |       including but not limited to software source code, documentation
28 |       source, and configuration files.
29 | 
30 |       "Object" form shall mean any form resulting from mechanical
31 |       transformation or translation of a Source form, including but
32 |       not limited to compiled object code, generated documentation,
33 |       and conversions to other media types.
34 | 
35 |       "Work" shall mean the work of authorship, whether in Source or
36 |       Object form, made available under the License, as indicated by a
37 |       copyright notice that is included in or attached to the work
38 |       (an example is provided in the Appendix below).
39 | 
40 |       "Derivative Works" shall mean any work, whether in Source or Object
41 |       form, that is based on (or derived from) the Work and for which the
42 |       editorial revisions, annotations, elaborations, or other modifications
43 |       represent, as a whole, an original work of authorship. For the purposes
44 |       of this License, Derivative Works shall not include works that remain
45 |       separable from, or merely link (or bind by name) to the interfaces of,
46 |       the Work and Derivative Works thereof.
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2014 Dropbox, Inc. 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
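As a concrete illustration of the appendix above: in a Python project like this one, the boilerplate becomes a comment header at the top of each source file. This is a sketch with this repository's copyright holder and year filled in; the exact placement within a file is the author's choice:

```python
# Copyright 2014 Dropbox, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
```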
202 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | PKG_NAME = changes-mesos-scheduler
2 | VERSION = 0.0.2
3 | # Revision shows date of latest commit and abbreviated commit SHA
4 | # E.g., 1438708515-753e183
5 | REV=`git show -s --format=%ct-%h HEAD`
6 | 
7 | DEB_VERSION = "$(VERSION)-$(REV)"
8 | 
9 | test:
10 | 	py.test changes_mesos_scheduler/tests/
11 | 
12 | install-test-requirements:
13 | 	pip install "file://`pwd`#egg=changes-mesos-scheduler[tests]"
14 | 
15 | coverage:
16 | 	coverage run -m py.test --junitxml=python.junit.xml changes_mesos_scheduler/tests/
17 | 	coverage xml
18 | 
19 | virtualenv:
20 | 	./make_virtualenv.sh $(PKG_NAME)
21 | 
22 | deb: virtualenv
23 | 	fpm -f -t deb -s dir -C build -n $(PKG_NAME) -v $(DEB_VERSION) -d libcurl3 -d libsvn1 -d libsasl2-modules .
24 | 
25 | install_deb: deb
26 | 	sudo dpkg -i "$(PKG_NAME)_$(DEB_VERSION)_amd64.deb" || \
27 | 	sudo apt-get install -f -y --force-yes  # Sadly, this is necessary to install any missing deps
28 | 
29 | virtualenv_coverage: install_deb
30 | 	. /usr/share/python/$(PKG_NAME)/bin/activate; \
31 | 	make coverage
32 | 	# Sanity check installed binary
33 | 	/usr/share/python/$(PKG_NAME)/bin/$(PKG_NAME) --help
34 | 
35 | virtualenv_test: install_deb
36 | 	. /usr/share/python/$(PKG_NAME)/bin/activate; \
37 | 	make test
38 | 	# Sanity check installed binary
39 | 	/usr/share/python/$(PKG_NAME)/bin/$(PKG_NAME) --help
40 | 
41 | .PHONY: deb
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ***NOTICE: THIS REPO IS NO LONGER UPDATED***
2 | 
3 | Changes Mesos Scheduler
4 | =======================
5 | Setting up the Vagrant VM:
6 | 
7 | ```shell
8 | vagrant up
9 | vagrant ssh
10 | ```
11 | 
12 | Building a deb:
13 | 
14 | ```shell
15 | cd /vagrant
16 | make deb
17 | ```
18 | 
19 | `make install_deb` will build the deb and also install it on your machine.
20 | 
21 | Running tests:
22 | ```shell
23 | cd /vagrant
24 | make test
25 | ```
26 | 
27 | You can also run tests locally (on your host machine). You need to install
28 | Mesos (`brew install mesos` on a Mac), and may need to `sudo pip install mesos`
29 | as well. After that, `make test` should work (your mileage may vary; this is
30 | only really tested on a Mac).
31 | 
32 | 
33 | Running the scheduler requires a running Mesos master, but this Vagrant VM is not yet set up for that. You can instead use a different one:
34 | 
35 | ```shell
36 | git clone git@github.com:mesosphere/playa-mesos.git
37 | cp your-changes-mesos-scheduler.deb playa-mesos/
38 | cd playa-mesos
39 | 
40 | vagrant up
41 | vagrant ssh
42 | 
43 | sudo dpkg -i /vagrant/your-changes-mesos-scheduler.deb
44 | sudo mkdir -p /etc/changes-mesos-scheduler
45 | sudo touch /etc/changes-mesos-scheduler/blacklist
46 | 
47 | /usr/share/python/changes-mesos-scheduler/bin/changes-mesos-scheduler --help
48 | /usr/share/python/changes-mesos-scheduler/bin/changes-mesos-scheduler --api-url your-changes-endpoint
49 | ```
--------------------------------------------------------------------------------
/Vagrantfile:
--------------------------------------------------------------------------------
1 | # -*- mode: ruby -*-
2 | # vi: set ft=ruby :
3 | 
4 | # Vagrantfile API/syntax version. Don't touch unless you know what you're doing!
5 | VAGRANTFILE_API_VERSION = "2"
6 | 
7 | Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
8 |   config.vm.box = "ubuntu/trusty64"
9 | 
10 |   config.ssh.forward_agent = true
11 | 
12 |   config.vm.provision :shell, :path => "support/bootstrap-vagrant.sh"
13 | end
14 | 
--------------------------------------------------------------------------------
/changes_mesos_scheduler/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dropbox/changes-mesos-framework/cbb2351d45b4231286a18e70e5fea039b121d0a4/changes_mesos_scheduler/__init__.py
--------------------------------------------------------------------------------
/changes_mesos_scheduler/changes_scheduler.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, print_function
2 | 
3 | import bisect
4 | import concurrent.futures
5 | import json
6 | import logging
7 | import os
8 | import threading
9 | import time
10 | import urllib2  # type: ignore
11 | 
12 | from changes_mesos_scheduler import statsreporter
13 | 
14 | from typing import Any, Callable, Dict, List, NamedTuple, Optional, Set, Tuple
15 | 
16 | from collections import defaultdict
17 | from threading import Event
18 | from urllib import urlencode
19 | from uuid import uuid4
20 | 
21 | from google.protobuf import text_format as _text_format  # type: ignore
22 | 
23 | from mesos.interface import Scheduler, SchedulerDriver
24 | from mesos.interface import mesos_pb2
25 | 
26 | # how long (in seconds) we'll continue trying to kill a task. After that we give up.
27 | TASK_KILL_THRESHOLD = 3600
28 | 
29 | class FileBlacklist(object):
30 |     """ File-backed blacklist for slave hostnames.
31 |     Hosts are expected to be named in the file, one per line.
32 |     Blank lines and lines beginning with '#' are ignored.
33 |     """
34 |     def __init__(self, path):
35 |         # type: (str) -> None
36 |         self._path = path  # type: str
37 |         self._mtime = 0.0
38 |         self._blacklist = set()  # type: Set[str]
39 | 
40 |     def refresh(self):
41 |         # type: () -> None
42 |         """Refresh the blacklist if the file changed."""
43 |         if os.path.getmtime(self._path) > self._mtime:
44 |             self._refresh()
45 | 
46 |     def _refresh(self):
47 |         # type: () -> None
48 |         """Unconditionally refresh the blacklist from the file."""
49 |         logging.info('Refreshing blacklist')
50 |         self._mtime = os.path.getmtime(self._path)
51 |         with open(self._path) as file:
52 |             self._blacklist = set([s.strip() for s in file.readlines() if s.strip() and not s.startswith('#')])
53 | 
54 |     def contains(self, hostname):
55 |         # type: (str) -> bool
56 |         """Returns whether the provided hostname is present in the blacklist as of last reading."""
57 |         return hostname in self._blacklist
58 | 
59 | 
60 | class APIError(Exception):
61 |     """An Exception originating from ChangesAPI.
62 |     This mostly exists so that our uncertainty of the possible Exceptions
63 |     originating from API requests doesn't muddy the error handling in the Scheduler.
64 |     """
65 |     def __init__(self, msg, cause=None):
66 |         # type: (str, Any) -> None
67 |         super(APIError, self).__init__(msg)
68 |         self.cause = cause
69 | 
70 | 
71 | class ChangesAPI(object):
72 |     """Client for the Changes API, intended for Scheduler use.
73 |     Any exceptions resulting from runtime failures should be APIErrors.
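    Example (sketch only; the endpoint URL here is hypothetical):
        api = ChangesAPI('https://changes.example.com/api/0')
        jobsteps = api.get_allocate_jobsteps(limit=10, cluster=None)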
74 | """ 75 | 76 | def __init__(self, api_url): 77 | # type: (str) -> None 78 | self._api_url = api_url 79 | 80 | @staticmethod 81 | def make_url(base_url, path, get_params=None): 82 | # type: (str, str, Optional[Dict[str,str]]) -> str 83 | # Changes insists that paths end with a slash 84 | path = path if path.endswith('/') else path + '/' 85 | # Make sure there's exactly one slash between path and the API url 86 | path = path if path.startswith('/') else '/' + path 87 | base_url = base_url.rstrip('/') 88 | full_url = base_url + path 89 | if get_params: 90 | query_string = '?' + urlencode(get_params) 91 | full_url += query_string 92 | return full_url 93 | 94 | def _api_request(self, path, body=None, get_params=None): 95 | # type: (str, Optional[Dict[str, Any]], Optional[Dict[str, Any]]) -> Dict[str, Any] 96 | full_url = ChangesAPI.make_url(self._api_url, path, get_params) 97 | try: 98 | data = json.dumps(body) if body else None 99 | req = urllib2.Request( 100 | full_url, data, 101 | {'Content-Type': 'application/json'}) 102 | # Any connectivity issues will raise an exception, as will some error statuses. 103 | content = urllib2.urlopen(req).read() 104 | return json.loads(content) 105 | except Exception as exc: 106 | # Always log exceptions so callers don't have to. 107 | logging.exception("Error POSTing to Changes at %s", full_url) 108 | raise APIError("Error POSTing to Changes at %s" % full_url, exc) 109 | 110 | def get_allocate_jobsteps(self, limit=None, cluster=None): 111 | # type: (Optional[int], Optional[str]) -> List[Dict[str, Any]] 112 | """ Returns a list of up to `limit` pending allocation jobsteps in `cluster`. 113 | The scheduler may then allocate these as it sees fit. 114 | 115 | Args: 116 | limit: maximum jobsteps to return 117 | cluster: cluster to look in. The "default" cluster 118 | returns jobsteps with no cluster specified. 119 | 120 | Returns: 121 | list: List of JobSteps (in priority order) that are pending allocation 122 | """ 123 | data = {'limit': limit} if limit else {} # type: Dict[str, Any] 124 | if cluster: 125 | data['cluster'] = cluster 126 | return self._api_request("/jobsteps/allocate/", get_params=data)['jobsteps'] 127 | 128 | def post_allocate_jobsteps(self, jobstep_ids, cluster=None): 129 | # type: (List[str], Optional[str]) -> List[str] 130 | """ Attempt to allocate the given list of JobStep ids. 131 | 132 | Args: 133 | jobstep_ids: list of JobStep ID hexs to allocate. 134 | cluster: cluster to allocate in. 135 | 136 | Returns: 137 | list: list of jobstep ID hexs that were actually allocated. 138 | """ 139 | data = {'jobstep_ids': jobstep_ids} # type: Dict[str, Any] 140 | if cluster: 141 | data['cluster'] = cluster 142 | return self._api_request("/jobsteps/allocate/", data)['allocated'] 143 | 144 | def jobstep_needs_abort(self, jobstep_ids): 145 | # type: (List[str]) -> List[str] 146 | """ Query for which jobsteps in a given list should be aborted. 147 | 148 | Args: 149 | jobstep_ids: JobStep ID hexs we are asking about. 150 | Returns: 151 | list: subset of the jobstep_ids, which should be aborted. 
152 | """ 153 | # don't bother sending the request if there are no jobstep ids 154 | if len(jobstep_ids) == 0: 155 | return [] 156 | data = {'jobstep_ids': jobstep_ids} 157 | return self._api_request("/jobsteps/needs_abort/", data)['needs_abort'] 158 | 159 | def update_jobstep(self, jobstep_id, status, result=None, hostname=None): 160 | # type: (str, str, Optional[str], Optional[str]) -> None 161 | """ Update the recorded status and possibly result of a JobStep in Changes. 162 | 163 | Args: 164 | jobstep_id: JobStep ID. 165 | status: Status (one of "finished", "queued", "in_progress"). 166 | result: Optionally one of 'failed', 'passed', 'aborted', 'skipped', or 'infra_failed'. 167 | hostname: Optional hostname of slave we are running this jobstep on 168 | """ 169 | data = {"status": status} 170 | if result: 171 | data["result"] = result 172 | if hostname: 173 | data["node"] = hostname 174 | self._api_request("/jobsteps/{}/".format(jobstep_id), data) 175 | 176 | def jobstep_console_append(self, jobstep_id, text): 177 | # type: (str, str) -> None 178 | """ Append to the JobStep's console log. 179 | Args: 180 | jobstep_id: JobStep ID. 181 | text: Text to append. 182 | """ 183 | url = '/jobsteps/%s/logappend/' % jobstep_id 184 | self._api_request(url, {'source': 'console', 'text': text}) 185 | 186 | 187 | class SlaveInfo(object): 188 | def __init__(self, hostname): 189 | # type: (str) -> None 190 | self.hostname = hostname 191 | 192 | class ChangesScheduler(Scheduler): 193 | def __init__(self, state_file, api, blacklist, stats=None, 194 | changes_request_limit=200): 195 | # type: (str, ChangesAPI, FileBlacklist, Optional[Any], int) -> None 196 | """ 197 | Args: 198 | state_file (str): Path where serialized internal state will be 199 | stored. 200 | api (ChangesAPI): API to use for interacting with Changes. 201 | blacklist (FileBlacklist): Blacklist to use. 202 | stats (statsreporter.Stats): Optional Stats instance to use. 203 | """ 204 | self.framework_id = None # type: Optional[str] 205 | self._changes_api = api 206 | self.taskJobStepMappingLock = threading.Lock() 207 | self.taskJobStepMapping = {} # type: Dict[str, str] 208 | # maps from a slave_id to general info about that slave (currently only its hostname) 209 | self.slaveIdInfo = {} # type: Dict[str, SlaveInfo] 210 | # maps from a task id to a timestamp of when we first tried killing that task 211 | self.tasksPendingKill = {} # type: Dict[str, float] 212 | self.tasksLaunched = 0 213 | self.tasksFinished = 0 214 | self.shuttingDown = Event() 215 | # Use the provided Stats or create a no-op one. 216 | self._stats = stats or statsreporter.Stats(None) 217 | self._blacklist = blacklist 218 | # Refresh now so that if it fails, it fails at startup. 219 | self._blacklist.refresh() 220 | self.state_file = state_file 221 | self.changes_request_limit = changes_request_limit 222 | self._snapshot_slave_map = defaultdict(lambda: defaultdict(float)) # type: Dict[str, Dict[str, float]] 223 | 224 | # Variables to help with polling Changes for pending jobsteps in a 225 | # separate thread. _cached_slaves_lock protects _cached_slaves. 226 | self._cached_slaves_lock = threading.Lock() 227 | self._cached_slaves = {} # type: Dict[str, ChangesScheduler.Slave] 228 | 229 | # Restore state from a previous run 230 | if not self.state_file: 231 | logging.warning("State file location not set. Not restoring old state.") 232 | elif not os.path.exists(self.state_file): 233 | logging.warning("State file not found. 
Not restoring old state.") 234 | else: 235 | try: 236 | self.restore_state() 237 | except Exception: 238 | logging.exception("Failed to restore state. Continuing as a new scheduler.") 239 | else: 240 | # Delete the old file to prevent it from being used again on a restart 241 | # as it will likely be stale. 242 | os.remove(self.state_file) 243 | 244 | def poll_changes_until_shutdown(self, driver, interval): 245 | # type: (SchedulerDriver, int) -> None 246 | """In a separate thread, periodically poll Changes for jobsteps that 247 | need to be scheduled. This method will block, waiting indefinitely 248 | until shuttingDown() is set. Then the thread will terminate (finishing 249 | any current polling activity if necessary) and this method will return. 250 | Args: 251 | driver: the MesosSchedulerDriver object 252 | interval: number of seconds in each poll loop. 253 | """ 254 | with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor: 255 | future = executor.submit(self._polling_loop, driver, interval) 256 | logging.info("Started thread at %s. Now waiting...", time.ctime()) 257 | while not future.done(): 258 | time.sleep(.01) 259 | try: 260 | future.result() 261 | except Exception: 262 | logging.exception("Polling thread failed. Exiting.") 263 | self.decline_open_offers(driver) 264 | 265 | def _polling_loop(self, driver, interval): 266 | # type: (SchedulerDriver, int) -> None 267 | """Poll Changes for new jobsteps forever, until shuttingDown is set. 268 | Args: 269 | driver: the MesosSchedulerDriver object 270 | interval: number of seconds in each poll loop. 271 | """ 272 | try: 273 | next_wait_duration = 0.0 274 | while not self.shuttingDown.wait(next_wait_duration): 275 | start_time = time.time() 276 | # Loop as long as Changes continues providing tasks to schedule. 277 | while self.poll_and_launch_once(driver): 278 | pass 279 | 280 | # kill any aborted jobsteps too 281 | self.poll_and_abort(driver) 282 | 283 | # Schedule the delay for the next iteration of the loop, 284 | # attempting to compensate for scheduling skew caused by 285 | # polling/computation time. 286 | last_poll_duration = time.time() - start_time 287 | next_wait_duration = max(0, interval - last_poll_duration) 288 | finally: 289 | # In the event of an exception in the polling thread, shut 290 | # everything down clean(ish)ly. 291 | self.shuttingDown.set() 292 | 293 | def poll_and_launch_once(self, driver): 294 | # type: (SchedulerDriver) -> bool 295 | """Poll Changes once for all jobsteps matching all clusters for which 296 | we have offers. Then assign these jobsteps to offers. Then execute the 297 | assignments by launching tasks on Mesos and informing Changes about 298 | the assignments. 299 | This is also the entry point for most testing, since it skips the 300 | annoying threading and while-loop behavior that make synchronization 301 | difficult. 302 | Args: 303 | driver: the MesosSchedulerDriver object 304 | Returns: 305 | bool: True if there are more jobsteps to fetch from Changes, False 306 | otherwise. 307 | """ 308 | # TODO: There's presently a window between post_allocate_jobsteps() and 309 | # launchTasks() where Changes thinks tasks are scheduled on Mesos, but 310 | # the tasks haven't actually been scheduled yet. If there's a shutdown 311 | # or failure in this window, it can be a long time before Changes will 312 | # figure it out and re-submit the tasks to the scheduler. 
313 |         #
314 |         # Also note that until post_allocate_jobsteps() is called, Changes will
315 |         # just keep returning the same set of jobsteps to
316 |         # get_allocate_jobsteps(). Thus we call get- and post- in a 1:1
317 |         # ratio, otherwise we could have an infinite poll loop on Changes.
318 |         #
319 |         # To that end, consider implementing something like the following:
320 |         #   1) Query Changes for jobsteps
321 |         #   2) Internally assign jobsteps to offers
322 |         #   3) Store assignments in scheduler's state.pending_assignments
323 |         #   4) Write the state file each time the state changes, rather than
324 |         #      only on shutdown, such that we'd have everything in order in the
325 |         #      event of a problem.
326 |         #   5) post_allocate_jobsteps() the assignments
327 |         #   6) Goto 1 until no more jobsteps
328 |         #   7) Launch jobsteps on mesos
329 |         #   8) Clear state.pending_assignments and write state file.
330 |         #
331 |         #   9) On startup, jobstep_deallocate any state.pending_assignments
332 |         with self._cached_slaves_lock:
333 |             # Get all slaves (composites of individual offers on the same host)
334 |             all_slaves = self._cached_slaves.values()
335 |             filtered_slaves = self._filter_slaves(all_slaves)
336 |             logging.info("Scheduling cycle with %d available slaves (%d "
337 |                          "after filtering).",
338 |                          len(all_slaves), len(filtered_slaves))
339 |             slaves_by_cluster = self._slaves_by_cluster(filtered_slaves)
340 | 
341 |             # Get all jobsteps, organized by cluster.
342 |             jobsteps_by_cluster = self._query_changes_for_jobsteps(
343 |                 driver, slaves_by_cluster.keys())
344 | 
345 |             # For each cluster, assign jobsteps to slaves, then launch the
346 |             # jobsteps on those slaves, using multiple offers if necessary.
347 |             for cluster, jobsteps in jobsteps_by_cluster.iteritems():
348 |                 self._assign_jobsteps(cluster,
349 |                                       slaves_by_cluster[cluster],
350 |                                       jobsteps_by_cluster[cluster])
351 |                 self._launch_jobsteps(driver,
352 |                                       cluster,
353 |                                       slaves_by_cluster[cluster])
354 | 
355 |         # Guess whether or not there are more jobsteps waiting on Changes by
356 |         # comparing the number of jobsteps received vs. the number of jobsteps
357 |         # requested.
358 |         return any(len(jobsteps) == self.changes_request_limit for jobsteps in jobsteps_by_cluster.itervalues())
359 | 
360 |     def poll_and_abort(self, driver):
361 |         # type: (SchedulerDriver) -> None
362 |         """Poll Changes to see if any jobsteps we're responsible for should be aborted.
363 |         We ask the Mesos master to kill the tasks for these jobsteps.
364 |         """
365 |         jobStepTaskMapping = {}
366 |         with self.taskJobStepMappingLock:
367 |             for task_id, jobstep_id in self.taskJobStepMapping.iteritems():
368 |                 jobStepTaskMapping[jobstep_id] = task_id
369 |         try:
370 |             abort_jobstep_ids = self._changes_api.jobstep_needs_abort(sorted(jobStepTaskMapping.keys()))
371 |         except APIError:
372 |             logging.warning('/jobstep/needs_abort/ failed', exc_info=True)
373 |             abort_jobstep_ids = []
374 | 
375 |         now = time.time()
376 |         for jobstep_id in abort_jobstep_ids:
377 |             task_id = jobStepTaskMapping[jobstep_id]
378 |             with self.taskJobStepMappingLock:
379 |                 # add it to tasksPendingKill if it's not already there.
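                # (dict.setdefault returns the existing timestamp if one is
                # already recorded, so repeated polls don't reset the timer.)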
380 | first_tried_to_kill = self.tasksPendingKill.setdefault(task_id, now) 381 | if now - first_tried_to_kill > TASK_KILL_THRESHOLD: 382 | # giving up on this one 383 | logging.warning("Task %s (jobstep ID %s) still hasn't been successfully killed, giving up.", task_id, jobstep_id) 384 | self._stats.incr('couldnt_abort_task') 385 | del self.taskJobStepMapping[task_id] 386 | del self.tasksPendingKill[task_id] 387 | continue 388 | logging.info('Asking Mesos to kill task %s (jobstep ID %s)', task_id, jobstep_id) 389 | driver.killTask(mesos_pb2.TaskID(value=task_id)) 390 | 391 | def decline_open_offers(self, driver): 392 | # type: (SchedulerDriver) -> None 393 | """Decline all cached Mesos pb_offers. 394 | """ 395 | with self._cached_slaves_lock: 396 | slaves = self._cached_slaves.values() 397 | for slave in slaves: 398 | self._stat_and_log_list(slave.offers(), 'decline_for_shutdown', 399 | lambda offer: "Shutting down, declining offer: %s" % offer.offer.id) 400 | self._decline_list(driver, slave.offers()) 401 | self._cached_slaves = {} 402 | 403 | def registered(self, driver, frameworkId, masterInfo): 404 | """ 405 | Invoked when the scheduler successfully registers with a Mesos master. 406 | It is called with the frameworkId, a unique ID generated by the 407 | master, and the masterInfo which is information about the master 408 | itself. 409 | """ 410 | logging.info("Registered with framework ID %s", frameworkId.value) 411 | self.framework_id = frameworkId.value 412 | 413 | def reregistered(self, driver, masterInfo): 414 | """ 415 | Invoked when the scheduler re-registers with a newly elected Mesos 416 | master. This is only called when the scheduler has previously been 417 | registered. masterInfo contains information about the newly elected 418 | master. 419 | """ 420 | logging.info("Re-Registered with new master") 421 | 422 | def disconnected(self, driver): 423 | # type: (SchedulerDriver) -> None 424 | """ 425 | Invoked when the scheduler becomes disconnected from the master, e.g. 426 | the master fails and another is taking over. 427 | Abandon all open offers and slaves. We don't decline, since there's 428 | no master to report to. The new master should provide a new batch of 429 | offers soon enough. 430 | """ 431 | logging.info("Disconnected from master. 
Abandoning all cached offer and slave info without declining.")
432 |         with self._cached_slaves_lock:
433 |             self._cached_slaves = {}
434 | 
435 |     @staticmethod
436 |     def _decode_typed_field(pb):
437 |         field_type = pb.type
438 |         if field_type == mesos_pb2.Value.SCALAR:
439 |             return pb.scalar.value
440 |         elif field_type == mesos_pb2.Value.RANGES:
441 |             return [{"begin": ra.begin, "end": ra.end} for ra in pb.ranges.range]
442 |         elif field_type == mesos_pb2.Value.SET:
443 |             return pb.set.item
444 |         elif field_type == mesos_pb2.Value.TEXT:
445 |             return pb.text.value
446 |         else:
447 |             raise Exception("Unknown field type: %s" % field_type)
448 | 
449 |     @staticmethod
450 |     def _decode_attribute(attr_pb):
451 |         return (attr_pb.name, ChangesScheduler._decode_typed_field(attr_pb))
452 | 
453 |     @staticmethod
454 |     def _decode_resource(resource_pb):
455 |         return (resource_pb.name, ChangesScheduler._decode_typed_field(resource_pb))
456 | 
457 |     @property
458 |     def activeTasks(self):
459 |         return self.tasksLaunched - self.tasksFinished
460 | 
461 |     @staticmethod
462 |     def get_cluster(offer):
463 |         attributes = dict([ChangesScheduler._decode_attribute(a) for a in offer.attributes])
464 |         return attributes.get('labels')
465 | 
466 |     @staticmethod
467 |     def get_resources(offer):
468 |         return {name: value for (name, value) in
469 |                 [ChangesScheduler._decode_resource(r) for r in offer.resources]}
470 | 
471 |     class OfferWrapper(object):
472 |         """Precompute some commonly-used fields from a Mesos Offer proto.
473 |         """
474 |         def __init__(self, pb_offer):
475 |             # type: (Any) -> None
476 |             self.offer = pb_offer
477 |             self.cluster = ChangesScheduler.get_cluster(pb_offer)
478 | 
479 |             resources = ChangesScheduler.get_resources(pb_offer)
480 |             self.cpu = resources.get('cpus', 0.0)
481 |             self.mem = resources.get('mem', 0)
482 | 
483 |         def __cmp__(self, other):
484 |             # type: (ChangesScheduler.OfferWrapper) -> int
485 |             """Comparator for sorting offers by "least loaded".
486 |             """
487 |             # we prioritize first by cpu then memory.
488 |             # (values are negated so more resources sorts as "least loaded")
489 |             us = (-self.cpu, -self.mem)
490 |             them = (-other.cpu, -other.mem)
491 |             if us < them:
492 |                 return -1
493 |             return 0 if us == them else 1
494 | 
495 |         def __str__(self):
496 |             cpu = "?"
497 |             mem = "?"
498 |             for r in self.offer.resources:
499 |                 if r.name == 'cpus':
500 |                     cpu = str(r.scalar.value)
501 |                 if r.name == 'mem':
502 |                     mem = str(r.scalar.value)
503 |             return "Offer({} {} {} cpu: {} mem: {})".format(
504 |                 self.offer.id.value, self.offer.slave_id.value,
505 |                 self.offer.hostname, cpu, mem)
506 | 
507 |     class Slave(object):
508 |         """ Aggregates all outstanding offers on a single slave. Provides numerous
509 |         conveniences including comparison (we currently use a least loaded
510 |         approach), and being able to assign jobsteps to the slave.
511 |         """
512 |         def __init__(self, slave_id, hostname, cluster):
513 |             # type: (str, str, str) -> None
514 |             self.slave_id = slave_id
515 |             self.hostname = hostname
516 |             self.cluster = cluster
517 | 
518 |             self._offers = {}  # type: Dict[str, ChangesScheduler.OfferWrapper]
519 |             self.jobsteps_assigned = []  # type: List[Dict[str, Any]]
520 | 
521 |             # Sum of all Offer resources for this slave.
522 |             self.total_cpu = 0.0
523 |             self.total_mem = 0
524 | 
525 |             # Sum of all resources for jobsteps assigned to this slave.
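            # (Available capacity for new assignments is total_* minus
            # allocated_*; see has_resources_for() below.)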
526 |             self.allocated_cpu = 0.0
527 |             self.allocated_mem = 0
528 | 
529 |         def offers(self):
530 |             # type: () -> List[ChangesScheduler.OfferWrapper]
531 |             """Returns a list of available offers on the slave.
532 |             """
533 |             return self._offers.values()
534 | 
535 |         def has_offers(self):
536 |             # type: () -> bool
537 |             """Returns True if the slave has any available offers, False
538 |             otherwise.
539 |             """
540 |             return len(self._offers) > 0
541 | 
542 |         def is_maintenanced(self, now_nanos):
543 |             # type: (int) -> bool
544 |             """Determine if a Mesos offer indicates that a maintenance window is
545 |             in progress for the slave. Treat the slave as maintenanced if ANY
546 |             offer has an active maintenance window.
547 |             Args:
548 |                 now_nanos: Timestamp of right now in nanoseconds, for comparing
549 |                     to the offer's (optional) maintenance time window.
550 |             Returns:
551 |                 True if the offer is in the maintenance window, False otherwise.
552 |             """
553 |             is_maintenanced = False
554 |             for offer in self._offers.itervalues():
555 |                 if not offer.offer.HasField('unavailability'):
556 |                     continue
557 |                 start_time = offer.offer.unavailability.start.nanoseconds
558 | 
559 |                 # If "duration" is not present use a default value of anything
560 |                 # greater than Now, to represent an unbounded maintenance time.
561 |                 # Override this with an actual end time if the "duration" field
562 |                 # is present in the protobuf.
563 |                 end_time = now_nanos + 1
564 |                 if offer.offer.unavailability.HasField('duration'):
565 |                     end_time = start_time + offer.offer.unavailability.duration.nanoseconds
566 | 
567 |                 is_maintenanced = start_time < now_nanos < end_time
568 |                 if is_maintenanced:
569 |                     break
570 |             return is_maintenanced
571 | 
572 |         def add_offer(self, offer):
573 |             # type: (ChangesScheduler.OfferWrapper) -> None
574 |             """Add an offer to this slave, and add its resources to the slave's
575 |             total resources.
576 |             """
577 |             if (offer.offer.slave_id.value != self.slave_id or
578 |                     offer.offer.hostname != self.hostname or
579 |                     offer.cluster != self.cluster):
580 |                 logging.error("A mismatched offer got mixed in with the wrong "
581 |                               "slave. Skipping. (\n  Slave: %s\n  Offer: %s)",
582 |                               self, offer)
583 |                 return
584 | 
585 |             self.total_cpu += offer.cpu
586 |             self.total_mem += offer.mem
587 |             logging.info("Slave %s: Add new offer +%f cpu, +%d mem (-> %f %d)",
588 |                          self.hostname, offer.cpu, offer.mem, self.total_cpu,
589 |                          self.total_mem)
590 |             self._offers[offer.offer.id.value] = offer
591 | 
592 |         def remove_offer(self, offer_id):
593 |             # type: (Any) -> None
594 |             """Remove an offer and its resources from this slave.
595 |             Args:
596 |                 offer_id: mesos_pb2.OfferID
597 |             """
598 |             offer = self._offers.get(offer_id.value)
599 |             if offer:
600 |                 del self._offers[offer_id.value]
601 |                 self.total_cpu -= offer.cpu
602 |                 self.total_mem -= offer.mem
603 | 
604 |         def offers_to_launch(self):
605 |             # type: () -> List[Any]
606 |             """Based on the jobsteps previously assigned, select the offers on
607 |             which to allocate the jobsteps.
608 |             Also, remove from the Slave all offers which are about to be
609 |             launched, and decrement total resources appropriately.
610 |             Returns:
611 |                 A list of mesos_pb2.OfferID protobufs identifying the offers
612 |                 on which the tasks should be scheduled. All returned offer IDs
613 |                 belong to this slave.
614 | """ 615 | current_offers = sorted(self._offers.values()) 616 | 617 | offers_to_launch = [] 618 | for offer in current_offers: 619 | # Decrement the "remaining" resources fields as we choose 620 | # offers to allocate to the jobsteps. 621 | if (self.allocated_cpu > 0 and offer.cpu > 0 or 622 | self.allocated_mem > 0 and offer.mem > 0): 623 | offers_to_launch.append(offer.offer.id) 624 | self.allocated_cpu -= offer.cpu 625 | self.allocated_mem -= offer.mem 626 | self.remove_offer(offer.offer.id) 627 | return offers_to_launch 628 | 629 | def tasks_to_launch(self): 630 | # type: () -> Tuple[List[Any], List[str]] 631 | """Generate list of mesos_pb2.Task to launch, and a second list of 632 | jobstep IDs corresponding to each task. 633 | Also, reset/clear jobsteps_assigned on the Slave. 634 | Returns: 635 | (list of tasks, list of jobstep IDs) 636 | """ 637 | tasks = [] 638 | jobstep_ids = [] 639 | for jobstep in self.jobsteps_assigned: 640 | tasks.append(self._jobstep_to_task(jobstep)) 641 | jobstep_ids.append(jobstep['id']) 642 | 643 | self.unassign_jobsteps() 644 | return tasks, jobstep_ids 645 | 646 | def unassign_jobsteps(self): 647 | # type: () -> None 648 | """Clear all assigned jobsteps from the Slave and reset required 649 | resources. 650 | """ 651 | self.jobsteps_assigned = [] 652 | self.allocated_cpu = 0.0 653 | self.allocated_mem = 0 654 | 655 | def __cmp__(self, other): 656 | # type: (ChangesScheduler.Slave) -> int 657 | # we prioritize first by cpu then memory. 658 | # (values are negated so more resources sorts as "least loaded") 659 | us = (-(self.total_cpu - self.allocated_cpu), 660 | -(self.total_mem - self.allocated_mem)) 661 | them = (-(other.total_cpu - other.allocated_cpu), 662 | -(other.total_mem - other.allocated_mem)) 663 | if us < them: 664 | return -1 665 | return 0 if us == them else 1 666 | 667 | def __str__(self, slave): 668 | return "Slave({}: {} offers, {} acpu, {} amem)".format( 669 | slave.hostname, len(slave.offers()), slave.total_cpu, 670 | slave.total_mem) 671 | 672 | def has_resources_for(self, jobstep): 673 | # type: (Dict[str, Any]) -> bool 674 | """Returns true if the slave has sufficient available resources to 675 | execute a jobstep, false otherwise. 676 | Args: 677 | jobstep: The jobstep to execute. 678 | Returns: 679 | True if the slave can host the jobstep. 680 | """ 681 | return ((self.total_cpu - self.allocated_cpu) >= jobstep['resources']['cpus'] and 682 | (self.total_mem - self.allocated_mem) >= jobstep['resources']['mem']) 683 | 684 | def assign_jobstep(self, jobstep): 685 | # type: (Dict[str, Any]) -> None 686 | """Tentatively assign a jobstep to run on this slave. The actual 687 | launching occurs elsewhere. 688 | """ 689 | assert self.has_resources_for(jobstep) 690 | self.allocated_cpu += jobstep['resources']['cpus'] 691 | self.allocated_mem += jobstep['resources']['mem'] 692 | self.jobsteps_assigned.append(jobstep) 693 | 694 | def _jobstep_to_task(self, jobstep): 695 | # type: (Dict[str, Any]) -> Any 696 | """ Given a jobstep and an offer to assign it to, returns the TaskInfo 697 | protobuf for the jobstep and updates scheduler state accordingly. 698 | Args: 699 | jobstep: The jobstep to convert to a task. 
700 | Returns: 701 | mesos_pb2.Task 702 | """ 703 | tid = uuid4().hex 704 | logging.info("Accepting offer on %s to start task %s", self.hostname, tid) 705 | 706 | task = mesos_pb2.TaskInfo() 707 | task.name = "{} {}".format( 708 | jobstep['project']['slug'], 709 | jobstep['id'], 710 | ) 711 | task.task_id.value = str(tid) 712 | task.slave_id.value = self.slave_id 713 | 714 | cmd = jobstep["cmd"] 715 | 716 | task.command.value = cmd 717 | logging.debug("Scheduling cmd: %s", cmd) 718 | 719 | cpus = task.resources.add() 720 | cpus.name = "cpus" 721 | cpus.type = mesos_pb2.Value.SCALAR 722 | cpus.scalar.value = jobstep["resources"]["cpus"] 723 | 724 | mem = task.resources.add() 725 | mem.name = "mem" 726 | mem.type = mesos_pb2.Value.SCALAR 727 | mem.scalar.value = jobstep["resources"]["mem"] 728 | 729 | return task 730 | 731 | def _get_slaves_for_snapshot(self, snapshot_id, recency_threshold_hours=12): 732 | # type: (str, int) -> List[str] 733 | """ Returns list of hostnames which have run tasks with a given 734 | snapshot_id recently. 735 | """ 736 | latest_snapshot_use = time.time() - recency_threshold_hours * 3600 737 | return [k for k, v in self._snapshot_slave_map[snapshot_id].iteritems() 738 | if v >= latest_snapshot_use] 739 | 740 | def _associate_snapshot_with_slave(self, snapshot_id, slave): 741 | self._snapshot_slave_map[snapshot_id][slave] = time.time() 742 | 743 | @staticmethod 744 | def _jobstep_snapshot(jobstep): 745 | """ Given a jobstep, return its snapshot id if set, None otherwise. 746 | """ 747 | if 'image' in jobstep and jobstep['image']: 748 | if 'snapshot' in jobstep['image'] and jobstep['image']['snapshot']: 749 | return jobstep['image']['snapshot']['id'] 750 | 751 | return None 752 | 753 | def _fetch_jobsteps(self, cluster): 754 | # type: (str) -> List[Dict[str, Any]] 755 | """Query Changes for all allocatable jobsteps for the specified cluster. 756 | """ 757 | try: 758 | with self._stats.timer('poll_changes'): 759 | possible_jobsteps = self._changes_api.get_allocate_jobsteps(limit=self.changes_request_limit, 760 | cluster=cluster) 761 | except APIError: 762 | logging.warning('/jobstep/allocate/ GET failed for cluster: %s', cluster, exc_info=True) 763 | possible_jobsteps = [] 764 | return possible_jobsteps 765 | 766 | def _assign_jobsteps(self, cluster, slaves_for_cluster, jobsteps_for_cluster): 767 | # type: (str, List[ChangesScheduler.Slave], List[Dict[str, Any]]) -> None 768 | """Make assignments for jobsteps for a cluster to offers for a cluster. 769 | Assignments are stored in the OfferWrapper, to be launched later. 770 | Args: 771 | cluster: The cluster to make assignments for. 772 | slaves_for_cluster: A list of offers for the cluster. 773 | jobsteps_for_cluster: A list of jobsteps for the cluster. 774 | """ 775 | # Changes returns JobSteps in priority order, so for each one 776 | # we attempt to put it on the machine with the least current load that 777 | # still has sufficient resources for it. This is not necessarily an 778 | # optimal algorithm--it might allocate fewer jobsteps than is possible, 779 | # and it currently prioritizes cpu over memory. 
We don't believe this
780 |         # to be an issue currently, but it may be worth improving in the future.
781 |         if len(slaves_for_cluster) == 0 or len(jobsteps_for_cluster) == 0:
782 |             return
783 | 
784 |         logging.info("Assign %s jobsteps on cluster %s", len(jobsteps_for_cluster), cluster)
785 |         sorted_slaves = sorted(slaves_for_cluster)
786 | 
787 |         for jobstep in jobsteps_for_cluster:
788 |             slave_to_use = None
789 |             snapshot_id = self._jobstep_snapshot(jobstep)
790 |             # Disable proximity check if not using a snapshot or scheduling in an explicit cluster.
791 |             # Clusters are expected to pre-populate snapshots out of band and will not benefit
792 |             # from proximity checks.
793 |             if snapshot_id and not cluster:
794 |                 slaves_with_snapshot = self._get_slaves_for_snapshot(snapshot_id)
795 |                 logging.info('Found slaves with snapshot id %s: %s',
796 |                              snapshot_id, slaves_with_snapshot)
797 | 
798 |                 if len(slaves_with_snapshot) > 0:
799 |                     for slave in sorted_slaves:
800 |                         if (slave.hostname in slaves_with_snapshot and
801 |                                 slave.has_resources_for(jobstep)):
802 |                             slave_to_use = slave
803 |                             logging.info('Scheduling jobstep %s on slave %s which might have snapshot %s',
804 |                                          jobstep, slave.hostname, snapshot_id)
805 |                             break
806 | 
807 |             # If we couldn't find a slave which is likely to have the snapshot already,
808 |             # this gives us the least-loaded slave that we could actually use for this jobstep.
809 |             if not slave_to_use:
810 |                 for slave in sorted_slaves:
811 |                     if slave.has_resources_for(jobstep):
812 |                         slave_to_use = slave
813 |                         break
814 | 
815 |             # couldn't find any slaves that would support this jobstep, move on
816 |             if not slave_to_use:
817 |                 logging.warning("No slave found to run jobstep %s.", jobstep)
818 |                 continue
819 | 
820 |             sorted_slaves.remove(slave_to_use)
821 |             if snapshot_id:
822 |                 self._associate_snapshot_with_slave(snapshot_id, slave_to_use.hostname)
823 | 
824 |             slave_to_use.assign_jobstep(jobstep)
825 |             bisect.insort(sorted_slaves, slave_to_use)
826 | 
827 |     def _stat_and_log_list(self, to_decline, stats_counter_name, reason_func):
828 |         # type: (List[Any], str, Callable[[Any], str]) -> None
829 |         """Record a stats counter and a log line for a list of offers or slaves about to be declined or ignored.
830 |         Args:
831 |             to_decline: The list of items being declined or ignored.
832 |             stats_counter_name: A counter name to increment, to track stats for
833 |                 different decline reasons.
834 |             reason_func (function(item)): A function to generate a logging
835 |                 string, to explain why this item was declined.
836 |         """
837 |         self._stats.incr(stats_counter_name, len(to_decline))
838 |         for offer in to_decline:
839 |             if reason_func:
840 |                 logging.info(reason_func(offer))
841 | 
842 |     def _decline_list(self, driver, to_decline):
843 |         # type: (SchedulerDriver, List[Any]) -> None
844 |         """Inform the Mesos master that we're declining a list of offers.
845 |         Args:
846 |             driver: the MesosSchedulerDriver object
847 |             to_decline: The list of offers to decline
848 |         """
849 |         for offer in to_decline:
850 |             driver.declineOffer(offer.offer.id)
851 | 
852 |     def _filter_slaves(self, slaves):
853 |         # type: (List[Any]) -> List[Any]
854 |         """Given a list of Slaves, drop those that are blacklisted or under
855 |         maintenance. Return a list of usable Slaves.
856 |         Args:
857 |             slaves (list of ChangesScheduler.Slave): A list of slaves, some
858 |                 of which are usable and some of which might not be usable.
859 |         Returns:
860 |             list of usable ChangesScheduler.Slave objects
861 |         """
862 |         self._blacklist.refresh()
863 |         now_nanos = int(time.time() * 1000000000)
864 |         maintenanced, blacklisted, usable = [], [], []
865 |         for slave in slaves:
866 |             if slave.is_maintenanced(now_nanos):
867 |                 maintenanced.append(slave)
868 |             elif self._blacklist.contains(slave.hostname):
869 |                 blacklisted.append(slave)
870 |             else:
871 |                 usable.append(slave)
872 | 
873 |         self._stat_and_log_list(maintenanced, 'ignore_for_maintenance',
874 |                                 lambda slave: "Ignoring slave from maintenanced hostname: %s" % slave.hostname)
875 |         self._stat_and_log_list(blacklisted, 'ignore_for_blacklist',
876 |                                 lambda slave: "Ignoring slave from blacklisted hostname: %s" % slave.hostname)
877 |         return usable
878 | 
879 |     def _launch_jobsteps(self, driver, cluster, slaves_for_cluster):
880 |         # type: (SchedulerDriver, str, List[ChangesScheduler.Slave]) -> None
881 |         """Given a list of slaves, launch all jobsteps assigned to each slave.
882 |         Remove any used offers from the slaves.
883 |         Args:
884 |             driver: the MesosSchedulerDriver object
885 |             cluster: The cluster these slaves and jobsteps belong to.
886 |             slaves_for_cluster: A list of slaves with assigned jobsteps
887 |                 already embedded. Launch the jobsteps on these slaves.
888 |         """
889 |         if len(slaves_for_cluster) == 0:
890 |             return
891 | 
892 |         # Inform Changes of where the jobsteps are going.
893 |         jobsteps_to_allocate = []
894 |         for slave in slaves_for_cluster:
895 |             jobstep_ids = [jobstep['id'] for jobstep in slave.jobsteps_assigned]
896 |             jobsteps_to_allocate.extend(jobstep_ids)
897 | 
898 |         if len(jobsteps_to_allocate) == 0:
899 |             return
900 | 
901 |         try:
902 |             jobsteps_to_allocate.sort()  # Make testing deterministic.
903 |             allocated_jobstep_ids = self._changes_api.post_allocate_jobsteps(
904 |                 jobsteps_to_allocate, cluster=cluster)
905 |         except APIError:
906 |             allocated_jobstep_ids = []
907 |         if sorted(allocated_jobstep_ids) != sorted(jobsteps_to_allocate):
908 |             # NB: cluster could be None here
909 |             logging.warning("Could not successfully allocate for cluster: %s", cluster)
910 |             # for now we just give up on this cluster entirely
911 |             for slave in slaves_for_cluster:
912 |                 slave.unassign_jobsteps()
913 | 
914 |         # we've allocated all the jobsteps we can, now we launch them
915 |         for slave in slaves_for_cluster:
916 |             if len(slave.jobsteps_assigned) == 0:
917 |                 continue
918 |             filters = mesos_pb2.Filters()
919 |             filters.refuse_seconds = 1.0
920 | 
921 |             # Note: offers_to_launch() and tasks_to_launch() remove offers and
922 |             # tasks from the slave.
923 |             offers_to_launch = slave.offers_to_launch()
924 |             tasks_to_launch, jobstep_ids = slave.tasks_to_launch()
925 | 
926 |             with self.taskJobStepMappingLock:
927 |                 for task, jobstep_id in zip(tasks_to_launch, jobstep_ids):
928 |                     self.taskJobStepMapping[task.task_id.value] = jobstep_id
929 | 
930 |             self.tasksLaunched += len(tasks_to_launch)
931 |             logging.info("Launch tasks: %d offers, %d tasks", len(offers_to_launch), len(tasks_to_launch))
932 |             driver.launchTasks(offers_to_launch, tasks_to_launch, filters)
933 | 
934 |     def resourceOffers(self, driver, pb_offers):
935 |         # type: (SchedulerDriver, List[Any]) -> None
936 |         """
937 |         Invoked when resources have been offered to this framework. A single
938 |         offer will only contain resources from a single slave. Resources
939 |         associated with an offer will not be re-offered to _this_ framework
940 |         until either (a) this framework has rejected those resources (see
941 |         SchedulerDriver.launchTasks) or (b) those resources have been
942 |         rescinded (see Scheduler.offerRescinded). Note that resources may be
943 |         concurrently offered to more than one framework at a time (depending
944 |         on the allocator being used). In that case, the first framework to
945 |         launch tasks using those resources will be able to use them while the
946 |         other frameworks will have those resources rescinded (or if a
947 |         framework has already launched tasks with those resources then those
948 |         tasks will fail with a TASK_LOST status and a message saying as much).
949 |         """
950 |         logging.info("Got %d resource offers", len(pb_offers))
951 |         self._stats.incr('offers', len(pb_offers))
952 | 
953 |         # Simply add the offers to our local cache of available offers.
954 |         # Jobsteps are allocated asynchronously, driven by
955 |         # poll_changes_until_shutdown().
956 |         with self._cached_slaves_lock:
957 |             for pb_offer in pb_offers:
958 |                 offer = ChangesScheduler.OfferWrapper(pb_offer)
959 |                 if pb_offer.slave_id.value not in self._cached_slaves:
960 |                     slave = ChangesScheduler.Slave(pb_offer.slave_id.value,
961 |                                                    pb_offer.hostname,
962 |                                                    offer.cluster)
963 |                     self._cached_slaves[pb_offer.slave_id.value] = slave
964 |                 self._cached_slaves[pb_offer.slave_id.value].add_offer(offer)
965 |                 self.slaveIdInfo[pb_offer.slave_id.value] = SlaveInfo(hostname=pb_offer.hostname)
966 | 
967 |     def _slaves_by_cluster(self, slaves):
968 |         slaves_by_cluster = defaultdict(list)
969 |         for slave in slaves:
970 |             if slave.has_offers():
971 |                 slaves_by_cluster[slave.cluster].append(slave)
972 |         return slaves_by_cluster
973 | 
974 |     def _query_changes_for_jobsteps(self, driver, clusters):
975 |         # type: (SchedulerDriver, List[str]) -> Dict[str, List[Dict[str, Any]]]
976 |         """Query Changes for the pending jobsteps for each cluster for which we
977 |         have offers available.
978 |         """
979 |         jobsteps_by_cluster = defaultdict(list)  # type: Dict[str, List[Dict[str, Any]]]
980 |         for cluster in clusters:
981 |             jobsteps = self._fetch_jobsteps(cluster)
982 |             jobsteps_by_cluster[cluster] = jobsteps
983 |         return jobsteps_by_cluster
984 | 
985 |     def offerRescinded(self, driver, offerId):
986 |         # type: (SchedulerDriver, Any) -> None
987 |         """
988 |         Invoked when an offer is no longer valid (e.g., the slave was lost or
989 |         another framework used resources in the offer.) If for whatever reason
990 |         an offer is never rescinded (e.g., dropped message, failing over
991 |         framework, etc.), a framework that attempts to launch tasks using an
992 |         invalid offer will receive TASK_LOST status updates for those tasks
993 |         (see Scheduler.resourceOffers).
994 |         Args:
995 |             driver: the MesosSchedulerDriver object
996 |             offerId: a Mesos OfferID protobuf
997 |         """
998 |         logging.info("Offer rescinded: %s", offerId.value)
999 |         with self._cached_slaves_lock:
1000 |             for slave in self._cached_slaves.itervalues():
1001 |                 slave.remove_offer(offerId)
1002 | 
1003 |     def statusUpdate(self, driver, status):
1004 |         """
1005 |         Invoked when the status of a task has changed (e.g., a slave is lost
1006 |         and so the task is lost, a task finishes and an executor sends a
1007 |         status update saying so, etc.) Note that returning from this callback
1008 |         acknowledges receipt of this status update.
If for whatever reason 1008 | the scheduler aborts during this callback (or the process exits) 1009 | another status update will be delivered. Note, however, that this is 1010 | currently not true if the slave sending the status update is lost or 1011 | fails during that time. 1012 | """ 1013 | 1014 | states = { 1015 | 0: "starting", 1016 | 1: "running", 1017 | 2: "finished", # terminal 1018 | 3: "failed", # terminal 1019 | 4: "killed", # terminal 1020 | 5: "lost", # terminal 1021 | 6: "staging", 1022 | } 1023 | terminal_states = ["finished", "failed", "killed", "lost"] 1024 | 1025 | state = states[status.state] 1026 | logging.info("Task %s is in state %d", status.task_id.value, status.state) 1027 | 1028 | if status.state == mesos_pb2.TASK_FINISHED: 1029 | self.tasksFinished += 1 1030 | 1031 | aborted = False 1032 | with self.taskJobStepMappingLock: 1033 | jobstep_id = self.taskJobStepMapping.get(status.task_id.value) 1034 | 1035 | if state in terminal_states: 1036 | self.taskJobStepMapping.pop(status.task_id.value, None) 1037 | if status.task_id.value in self.tasksPendingKill: 1038 | kill_time = self.tasksPendingKill[status.task_id.value] 1039 | del self.tasksPendingKill[status.task_id.value] 1040 | aborted = True 1041 | elapsed = time.time() - kill_time 1042 | logging.info('Successfully aborted task %s (jobstep ID %s) after %.2f seconds', status.task_id.value, jobstep_id, elapsed) 1043 | self._stats.incr('task_aborted') 1044 | 1045 | hostname = None 1046 | if self.slaveIdInfo.get(status.slave_id.value): 1047 | hostname = self.slaveIdInfo[status.slave_id.value].hostname 1048 | if hostname is None: 1049 | logging.warning('No hostname associated with task: %s (slave_id %s)', status.task_id.value, status.slave_id.value) 1050 | 1051 | if jobstep_id is None: 1052 | # TODO(nate): how does this happen? 1053 | logging.error("Task %s missing JobStep ID (state %s, message %s)", 1054 | status.task_id.value, state, 1055 | _text_format.MessageToString(status)) 1056 | self._stats.incr('missing_jobstep_id_' + state) 1057 | return 1058 | 1059 | if state == 'finished': 1060 | try: 1061 | self._changes_api.update_jobstep(jobstep_id, status="finished", hostname=hostname) 1062 | except APIError: 1063 | pass 1064 | elif state in ('killed', 'lost', 'failed') and not aborted: 1065 | self._stats.incr('task_' + state) 1066 | # Jobsteps are only intended to be executed once and should only exit non-zero or be 1067 | # lost/killed by infrastructural issues, so we don't attempt to reschedule, and we mark 1068 | # this down as an infrastructural failure. Note that this state may not mean that the 1069 | # Jobstep will necessarily stop executing, but it means that the results will be 1070 | # considered immediately invalid. 1071 | logging.warn('Task %s %s: %s', jobstep_id, state, status.message) 1072 | msg = '==> Scheduler marked task as %s (will NOT be retried):\n\n%s' % (state, status.message) 1073 | try: 1074 | self._changes_api.jobstep_console_append(jobstep_id, text=msg) 1075 | except APIError: 1076 | pass 1077 | try: 1078 | self._changes_api.update_jobstep(jobstep_id, status="finished", result="infra_failed", hostname=hostname) 1079 | except APIError: 1080 | pass 1081 | 1082 | def frameworkMessage(self, driver, executorId, slaveId, message): 1083 | """ 1084 | Invoked when an executor sends a message. These messages are best 1085 | effort; do not expect a framework message to be retransmitted in any 1086 | reliable fashion. 
1087 | """ 1088 | logging.info("Received message: %s", repr(str(message))) 1089 | 1090 | def slaveLost(self, driver, slaveId): 1091 | """ 1092 | Invoked when a slave has been determined unreachable (e.g., machine 1093 | failure, network partition.) Most frameworks will need to reschedule 1094 | any tasks launched on this slave on a new slave. 1095 | """ 1096 | logging.warn("Slave lost: %s", slaveId.value) 1097 | self._stats.incr('slave_lost') 1098 | with self._cached_slaves_lock: 1099 | slave = self._cached_slaves.pop(slaveId.value, None) 1100 | if slave: 1101 | self._stat_and_log_list(slave.offers(), 'decline_for_slave_lost', 1102 | lambda offer: "Slave lost, declining offer: %s" % offer.offer.id) 1103 | self._decline_list(driver, slave.offers()) 1104 | 1105 | def executorLost(self, driver, executorId, slaveId, status): 1106 | """ 1107 | Invoked when an executor has exited/terminated. Note that any tasks 1108 | running will have TASK_LOST status updates automatically generated. 1109 | """ 1110 | logging.warn("Executor %s lost on slave %s", executorId.value, slaveId.value) 1111 | 1112 | def error(self, driver, message): 1113 | """ 1114 | Invoked when there is an unrecoverable error in the scheduler or 1115 | scheduler driver. The driver will be aborted BEFORE invoking this 1116 | callback. 1117 | """ 1118 | logging.error("Error from Mesos: %s", message) 1119 | self._stats.incr('errors') 1120 | 1121 | def save_state(self): 1122 | """ 1123 | Save current state to a file so that a restart of the scheduler can 1124 | restore the state. 1125 | """ 1126 | state = {} 1127 | state['framework_id'] = self.framework_id 1128 | state['taskJobStepMapping'] = self.taskJobStepMapping 1129 | state['tasksPendingKill'] = self.tasksPendingKill 1130 | state['slaveIdInfo'] = {} 1131 | for slave, info in self.slaveIdInfo.iteritems(): 1132 | state['slaveIdInfo'][slave] = {'hostname': info.hostname} 1133 | state['tasksLaunched'] = self.tasksLaunched 1134 | state['tasksFinished'] = self.tasksFinished 1135 | state['snapshot_slave_map'] = self._snapshot_slave_map 1136 | logging.info('Attempting to save state for framework %s with %d running tasks to %s', 1137 | self.framework_id, len(self.taskJobStepMapping), self.state_file) 1138 | 1139 | with open(self.state_file, 'w') as f: 1140 | f.write(json.dumps(state)) 1141 | 1142 | def restore_state(self): 1143 | """ 1144 | Restores state from the previous run of the scheduler. 
1145 | """ 1146 | with open(self.state_file) as f: 1147 | json_state = f.read() 1148 | state = json.loads(json_state) 1149 | 1150 | self.framework_id = state['framework_id'] 1151 | self.taskJobStepMapping = state['taskJobStepMapping'] 1152 | self.tasksPendingKill = state.get('tasksPendingKill', {}) 1153 | self.slaveIdInfo = {} 1154 | for slave, info in state.get('slaveIdInfo', {}).iteritems(): 1155 | self.slaveIdInfo[slave] = SlaveInfo(hostname=info.get('hostname')) 1156 | self.tasksLaunched = state['tasksLaunched'] 1157 | self.tasksFinished = state['tasksFinished'] 1158 | snapshot_slave_map = state['snapshot_slave_map'] 1159 | self._snapshot_slave_map = defaultdict(lambda: defaultdict(float)) 1160 | for snapshot, slave_map in snapshot_slave_map.iteritems(): 1161 | for slave, timestamp in slave_map.iteritems(): 1162 | self._snapshot_slave_map[snapshot][slave] = timestamp 1163 | 1164 | logging.info('Restored state for framework %s with %d running tasks from %s', 1165 | self.framework_id, len(self.taskJobStepMapping), self.state_file) 1166 | 1167 | def state_json(self): 1168 | # type: () -> Dict[str, Any] 1169 | """Produce a JSON dump of the scheduler's internal state. 1170 | Returns: 1171 | A JSON-encoded dict representing the scheduler's state. 1172 | """ 1173 | def convert_attrs(attrs): 1174 | # type: (List[Any]) -> List[Dict[str, Any]] 1175 | """Convert Attribute and Resource protobuf fields to dictionaries. 1176 | Args: 1177 | attrs: List of mesos_pb2.Attribute or mesos_pb2.Resource 1178 | Returns: 1179 | {'name': str, 'type': int, 'value': any simple Python type} 1180 | """ 1181 | accum = [] 1182 | for attr in attrs: 1183 | if attr.type == mesos_pb2.Value.SCALAR: 1184 | value = attr.scalar.value 1185 | elif attr.type == mesos_pb2.Value.RANGES: 1186 | value = ', '.join(map(lambda x: '(%d, %d)' % (x.begin, x.end), attr.ranges.range)) 1187 | elif attr.type == mesos_pb2.Value.SET: 1188 | value = ', '.join(attr.set.item) 1189 | elif attr.type == mesos_pb2.Value.TEXT: 1190 | value = attr.text.value 1191 | else: 1192 | value = 'Unknown Mesos value type {} on slave {} offer {}'.format( 1193 | attr.type, slave.hostname, offer.offer.id.value) 1194 | 1195 | attr_output = { 1196 | 'name': attr.name, 1197 | 'type': attr.type, 1198 | 'value': value, 1199 | } 1200 | accum.append(attr_output) 1201 | return accum 1202 | 1203 | start_time = time.time() 1204 | with self._cached_slaves_lock: 1205 | # Build JSON output for the blacklist. 1206 | blacklist_output = { 1207 | 'path': self._blacklist._path, 1208 | 'entries': sorted(list(self._blacklist._blacklist)), 1209 | } 1210 | 1211 | # Build JSON output for all slaves. 1212 | slaves = self._cached_slaves.values() 1213 | slaves.sort(key=lambda x: x.hostname) 1214 | slaves_output = [] 1215 | for slave in slaves: 1216 | # Build JSON output for all offers on the slave. 
offers = slave._offers.values()
1218 | offers.sort(key=lambda x: x.offer.id.value)
1219 | offers_output = []
1220 | for offer in offers:
1221 | if offer.offer.url.address.hostname:
1222 | base = offer.offer.url.address.hostname
1223 | else:
1224 | base = offer.offer.url.address.ip
1225 | query = '&'.join('%s=%s' % (q.key, q.value) for q in offer.offer.url.query)
1226 | url = (offer.offer.url.scheme + '://' + base +
1227 | offer.offer.url.path +
1228 | (('?' + query) if query else '') +
1229 | (('#' + offer.offer.url.fragment) if offer.offer.url.fragment else ''))
1230 | 
1231 | offer_output = {
1232 | 'offer_id': offer.offer.id.value,
1233 | 'framework_id': offer.offer.framework_id.value,
1234 | 'url': url,
1235 | 'cpu': offer.cpu,
1236 | 'mem': offer.mem,
1237 | 'attributes': convert_attrs(offer.offer.attributes),
1238 | 'resources': convert_attrs(offer.offer.resources),
1239 | }
1240 | json.dumps(offer_output)
1241 | offers_output.append(offer_output)
1242 | slave_output = {
1243 | 'slave_id': slave.slave_id,
1244 | 'hostname': slave.hostname,
1245 | 'cluster': slave.cluster,
1246 | 'offers': offers_output,
1247 | 'total_cpu': slave.total_cpu,
1248 | 'total_mem': slave.total_mem,
1249 | 'is_maintenanced': slave.is_maintenanced(
1250 | int(start_time * 1000000000)),
1251 | }
1252 | json.dumps(slave_output)
1253 | slaves_output.append(slave_output)
1254 | 
1255 | # Put it all together.
1256 | state = {
1257 | 'framework_id': self.framework_id,
1258 | 'taskJobStepMapping': self.taskJobStepMapping,
1259 | 'tasksPendingKill': self.tasksPendingKill,
1260 | 'tasksLaunched': self.tasksLaunched,
1261 | 'tasksFinished': self.tasksFinished,
1262 | 'shuttingDown': self.shuttingDown.is_set(),
1263 | 'blacklist': blacklist_output,
1264 | 'snapshot_slave_map': self._snapshot_slave_map,
1265 | 'changes_request_limit': self.changes_request_limit,
1266 | 'cached_slaves': slaves_output,
1267 | 'build_state_json_secs': time.time() - start_time,
1268 | }
1269 | 
1270 | return state
1271 | 
--------------------------------------------------------------------------------
/changes_mesos_scheduler/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | from __future__ import absolute_import, print_function
4 | 
5 | import argparse
6 | import json
7 | import logging
8 | import os
9 | import signal
10 | import sys
11 | import threading
12 | 
13 | from time import sleep
14 | 
15 | from flask import Flask
16 | from mesos.native import MesosSchedulerDriver
17 | from mesos.interface import mesos_pb2
18 | 
19 | from .changes_scheduler import ChangesScheduler, ChangesAPI, FileBlacklist
20 | from .statsreporter import StatsReporter
21 | 
22 | # Configuration should contain the file 'blacklist' which
23 | # is a line-separated list of hosts to blacklist.
24 | #
25 | # NOTE: inside ec2, hostnames look like
26 | # ip-*-*-*-*.region.compute.internal
27 | DEFAULT_CONFIG_DIR = '/etc/changes-mesos-scheduler'
28 | 
29 | 
30 | def install_sentry_logger():
31 | try:
32 | import raven
33 | except ImportError:
34 | logging.warning('Unable to find raven library. Sentry integration disabled.')
35 | return
36 | 
37 | from raven.conf import setup_logging
38 | from raven.handlers.logging import SentryHandler
39 | 
40 | client = raven.Client()
41 | handler = SentryHandler(client, level=logging.WARN)
42 | setup_logging(handler)
43 | 
44 | 
45 | def json_handler(func):
46 | """Produce an HTTP handler which JSON-encodes a Python object and sets
47 | the Content-Type to application/json."""
48 | def wrapped_func():
49 | """Returns
50 | (str output content, int status code, dict headers (content type))
51 | """
52 | return json.dumps(func()), 200, {'Content-Type': 'application/json; charset=utf-8'}
53 | return wrapped_func
54 | 
55 | 
56 | def run(api_url, mesos_master, user, config_dir, state_file,
57 | changes_request_limit, http_port, stats=None):
58 | scheduler = ChangesScheduler(state_file, api=ChangesAPI(api_url), stats=stats,
59 | blacklist=FileBlacklist(os.path.join(config_dir, 'blacklist')),
60 | changes_request_limit=changes_request_limit)
61 | 
62 | executor = mesos_pb2.ExecutorInfo()
63 | executor.executor_id.value = "default"
64 | executor.command.value = os.path.abspath("./executor.py")
65 | executor.name = "Changes Executor"
66 | executor.source = "changes"
67 | 
68 | framework = mesos_pb2.FrameworkInfo()
69 | framework.user = user
70 | framework.name = "Changes Scheduler"
71 | framework.principal = "changes"
72 | # Give the scheduler 1 week to restart before Mesos cancels the tasks.
73 | # This is the setting recommended by the docs.
74 | framework.failover_timeout = 3600 * 24 * 7
75 | 
76 | if scheduler.framework_id:
77 | framework.id.value = scheduler.framework_id
78 | executor.framework_id.value = scheduler.framework_id
79 | 
80 | driver = MesosSchedulerDriver(
81 | scheduler,
82 | framework,
83 | mesos_master)
84 | 
85 | stopped = threading.Event()
86 | 
87 | def handle_interrupt(signal, frame):
88 | stopped.set()
89 | logging.info("Received interrupt, shutting down")
90 | logging.warning("Not saving state. Will wait for running tasks to finish.")
91 | scheduler.shuttingDown.set()
92 | while scheduler.activeTasks > 0:
93 | logging.info("Waiting for %d tasks to finish running", scheduler.activeTasks)
94 | sleep(5)
95 | driver.stop()
96 | 
97 | def handle_sigterm(signal, frame):
98 | # TODO: Avoid save_state race conditions by having handle_sigterm()
99 | # only set shuttingDown, then do the actual save-state and driver.stop()
100 | # in the main thread after all other threads are join()ed.
101 | # Also, stopped doesn't appear to be used.
102 | stopped.set()
103 | logging.info("Received sigterm, shutting down")
104 | scheduler.shuttingDown.set()
105 | if scheduler.state_file:
106 | try:
107 | scheduler.save_state()
108 | logging.info("Successfully saved state to %s.", state_file)
109 | except Exception:
110 | logging.exception("Failed to save state")
111 | driver.stop()
112 | return
113 | # With `failover` set to true, we do not tell Mesos to stop the existing tasks
114 | # started by this framework. Instead, the tasks will keep running for up to
115 | # the `failover_timeout` set above, or until a new scheduler starts with
116 | # the same framework id.
117 | driver.stop(True)
118 | else:
119 | logging.warning("State file location not set. Not saving state. Existing builds will be cancelled.")
120 | driver.stop()
121 | 
122 | signal.signal(signal.SIGINT, handle_interrupt)
123 | signal.signal(signal.SIGTERM, handle_sigterm)
124 | 
125 | driver.start()
126 | logging.info("Driver started")
127 | 
128 | app = Flask("Changes Mesos Scheduler")
129 | app.add_url_rule(
130 | '/api/state_json', 'state_json', json_handler(scheduler.state_json))
131 | http_thread = threading.Thread(target=app.run, kwargs={'port': http_port})
132 | http_thread.daemon = True  # Must not block process exit; Thread has no terminate().
133 | http_thread.start()
134 | 
135 | scheduler.poll_changes_until_shutdown(driver, 5)
136 | status = 0
137 | if driver.join() == mesos_pb2.DRIVER_STOPPED:
138 | logging.info("Driver stopped cleanly.")
139 | else:
140 | # Ensure that the driver process terminates.
141 | status = 1
142 | logging.info("Stopping driver forcibly.")
143 | driver.stop()
144 | 
145 | logging.info("Stopping HTTP server.")
146 | # The HTTP thread is a daemon thread, so it is torn down with the process.
147 | 
148 | logging.info("Clean shutdown complete. Exiting status %d.", status)
149 | sys.exit(status)
150 | 
151 | 
152 | def main():
153 | parser = argparse.ArgumentParser(description='Changes Mesos Scheduler')
154 | 
155 | parser.add_argument('--api-url', required=True,
156 | help='URL root of Changes API, including scheme. (e.g. http://localhost:5000/api/0/)')
157 | parser.add_argument('--mesos-master', default='127.0.1.1:5050',
158 | help='Location of Mesos master server. (e.g. 127.0.1.1:5050)')
159 | parser.add_argument('--user', default='root', help="User to run tasks as")
160 | parser.add_argument('--log-level', default='info', help="Level to log at. (e.g. info)")
161 | parser.add_argument('--config-dir', default=DEFAULT_CONFIG_DIR, help='Configuration directory')
162 | parser.add_argument('--state-file', default=None, help='File path to preserve state across restarts')
163 | parser.add_argument('--statsd-host', default=None, help='Host to report stats to')
164 | parser.add_argument('--statsd-port', default=8125, type=int, help='Port on the statsd host to send to')
165 | parser.add_argument('--statsd-prefix', default='changes_scheduler', help='Prefix for stats keys')
166 | parser.add_argument('--changes-request-limit', default=200, type=int,
167 | help='Maximum number of JobSteps to ask Changes for per-request')
168 | parser.add_argument('--http_port', default=5888, type=int, help='Port for Flask to listen on and serve HTTP requests.')
169 | 
170 | args = parser.parse_args(sys.argv[1:])
171 | logging.basicConfig(level=getattr(logging, args.log_level.upper()),
172 | format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
173 | install_sentry_logger()
174 | 
175 | stats = None
176 | if args.statsd_host:
177 | stats = StatsReporter({
178 | 'STATSD_HOST': args.statsd_host,
179 | 'STATSD_PORT': args.statsd_port,
180 | 'STATSD_PREFIX': args.statsd_prefix,
181 | }).stats()
182 | 
183 | try:
184 | run(args.api_url, args.mesos_master, args.user, args.config_dir,
185 | args.state_file, args.changes_request_limit, args.http_port, stats)
186 | except Exception as e:
187 | logging.exception(unicode(e))
188 | raise
189 | 
190 | if __name__ == "__main__":
191 | main()
192 | 
--------------------------------------------------------------------------------
/changes_mesos_scheduler/statsreporter.py:
--------------------------------------------------------------------------------
1 | import re
2 | import time
3 | import logging
4 | from contextlib import contextmanager
5 | 
6 | import statsd
7 | 
8 | logger = logging.getLogger('statsreporter')
9 | 
10 | 
11 | def swallow_exceptions(exn_logger):
12 | """Decorator to
catch, log, and discard any Exceptions raised in a method. 13 | :param exn_logger: logging.Logger to use for logging any exceptions. 14 | """ 15 | def decor(func): 16 | def wrapper(*args, **kwargs): 17 | try: 18 | return func(*args, **kwargs) 19 | except Exception as e: 20 | exn_logger.exception(e) 21 | return wrapper 22 | return decor 23 | 24 | 25 | class StatsReporter(object): 26 | """StatsReporter is responsible for maintaining an app-specific Stats instance. 27 | The config should specify: 28 | STATSD_HOST (address of statsd host as a string) 29 | STATSD_PORT (port statsd is listening on as an int) 30 | STATSD_PREFIX (string to be automatically prepended to all reported stats for namespacing) 31 | 32 | If STATSD_HOST isn't specified, none of the others will be used and this app will 33 | get a no-op Stats instance. 34 | """ 35 | def __init__(self, config): 36 | host = config.get('STATSD_HOST') 37 | self._stats = None 38 | if host: 39 | sd = statsd.StatsClient(host=host, 40 | prefix=config.get('STATSD_PREFIX'), 41 | port=config.get('STATSD_PORT')) 42 | self._stats = Stats(client=sd) 43 | 44 | def stats(self): 45 | """Returns a Stats object. 46 | If no statsd config has been provided, 47 | the Stats won't do anything but validate.""" 48 | if self._stats: 49 | return self._stats 50 | return Stats(client=None) 51 | 52 | 53 | class Stats(object): 54 | """ Minimalistic class for sending stats/monitoring values.""" 55 | 56 | def __init__(self, client): 57 | """ 58 | @param client - A statsd.StatsClient instance, or None for a no-op Stats. 59 | """ 60 | # A thin wrapper around Statsd rather than just Statsd so we 61 | # can pick which features to support and how to encode the data. 62 | self._client = client 63 | 64 | @swallow_exceptions(logger) 65 | def set_gauge(self, key, value): 66 | """ Set a gauge, typically a sampled instantaneous value. 67 | @param key - the name of the gauge. 68 | @param value - current value of the gauge. 69 | """ 70 | assert isinstance(value, (int, float, long)) 71 | Stats._check_key(key) 72 | if self._client: 73 | self._client.gauge(key, value) 74 | 75 | @swallow_exceptions(logger) 76 | def incr(self, key, delta=1): 77 | """ Increment a count. 78 | @param key - the name of the stat. 79 | @param delta - amount to increment the stat by. Must be positive. 80 | """ 81 | assert isinstance(delta, (int, float, long)) 82 | assert delta >= 0 83 | Stats._check_key(key) 84 | if self._client: 85 | self._client.incr(key, delta) 86 | 87 | @swallow_exceptions(logger) 88 | def log_timing(self, key, duration_ms): 89 | """ Record a millisecond timing. 
""" 90 | assert isinstance(duration_ms, (int, float, long)) 91 | Stats._check_key(key) 92 | if self._client: 93 | self._client.timing(key, duration_ms) 94 | 95 | @contextmanager 96 | def timer(self, key): 97 | """A contextmanager that reports the duration in milliseconds on exit.""" 98 | t0 = time.time() 99 | try: 100 | yield 101 | finally: 102 | duration_ms = int(1000 * (time.time() - t0)) 103 | self.log_timing(key, duration_ms) 104 | 105 | _KEY_RE = re.compile(r'^[A-Za-z0-9_-]+$') 106 | 107 | @classmethod 108 | def _check_key(cls, key): 109 | """ This is probably overly strict, but we have little use for 110 | interestingly named keys and this avoids unintentionally using them.""" 111 | if not cls._KEY_RE.match(key): 112 | raise Exception("Invalid key: {}".format(repr(key))) 113 | -------------------------------------------------------------------------------- /changes_mesos_scheduler/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dropbox/changes-mesos-framework/cbb2351d45b4231286a18e70e5fea039b121d0a4/changes_mesos_scheduler/tests/__init__.py -------------------------------------------------------------------------------- /changes_mesos_scheduler/tests/test_changes_scheduler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import json 3 | import os 4 | import shutil 5 | import tempfile 6 | import threading 7 | import time 8 | 9 | from collections import defaultdict 10 | 11 | import mock 12 | from typing import Any 13 | from unittest import TestCase 14 | 15 | # Capture debug logging output on test failure 16 | logger = logging.getLogger() 17 | logger.level = logging.DEBUG 18 | 19 | from mesos.interface import mesos_pb2 20 | from mesos.interface import Scheduler 21 | 22 | from changes_mesos_scheduler.changes_scheduler import ChangesScheduler, APIError, FileBlacklist, ChangesAPI, SlaveInfo, TASK_KILL_THRESHOLD 23 | from changes_mesos_scheduler import statsreporter 24 | 25 | def _noop_blacklist(): 26 | """Returns a blacklist instance that behaves like an empty blacklist.""" 27 | m = mock.Mock(spec=FileBlacklist) 28 | m.contains.return_value = False 29 | return m 30 | 31 | 32 | def help_resource_offers_and_poll_changes(cs, driver, new_offers): 33 | # type: (ChangesScheduler, Scheduler, List[Any]) -> None 34 | """Receive offers from the Mesos master and poll Changes for new jobsteps 35 | in a synchronous manner to facilitate simpler, more straightforward 36 | testing. Normally these two tasks run in separate threads. 37 | Args: 38 | driver: the MesosSchedulerDriver object 39 | new_offers: A list of Mesos Offer protobufs that should be offered to 40 | the scheduler. 41 | """ 42 | cs.shuttingDown.clear() # reset shuttingDown if necessary. 43 | cs.resourceOffers(driver, new_offers) # Get offers from Mesos master. 44 | assert not cs.poll_and_launch_once(driver) # Get jobsteps and launch them. 
45 | 46 | 47 | class ChangesAPITest(TestCase): 48 | url = 'https://changes.com/api/0' 49 | 50 | def test_make_url_paths(self): 51 | desired = 'https://changes.com/api/0/jobsteps/allocate/' 52 | assert ChangesAPI.make_url(self.url, '/jobsteps/allocate/') == desired 53 | assert ChangesAPI.make_url(self.url, 'jobsteps/allocate') == desired 54 | assert ChangesAPI.make_url(self.url + '/', 'jobsteps/allocate') == desired 55 | assert ChangesAPI.make_url(self.url + '/', '/jobsteps/allocate') == desired 56 | assert ChangesAPI.make_url(self.url + '//', '/jobsteps/allocate') == desired 57 | 58 | def test_make_url_query(self): 59 | desired = ['https://changes.com/api/0/jobsteps/allocate/?foo=bar&baz=xyzz', 60 | 'https://changes.com/api/0/jobsteps/allocate/?baz=xyzz&foo=bar'] 61 | full_url = ChangesAPI.make_url(self.url, '/jobsteps/allocate/', {'foo': 'bar', 'baz': 'xyzz'}) 62 | assert full_url in desired 63 | 64 | 65 | class ChangesSchedulerTest(TestCase): 66 | 67 | def setUp(self): 68 | self.test_dir = tempfile.mkdtemp() 69 | super(ChangesSchedulerTest, self).setUp() 70 | 71 | def tearDown(self): 72 | shutil.rmtree(self.test_dir) 73 | super(ChangesSchedulerTest, self).tearDown() 74 | 75 | def _make_task_status(self, id='taskid', state=mesos_pb2.TASK_FINISHED, 76 | message="foo", slave_id='slaveid', jobstep_id='1'): 77 | status = mesos_pb2.TaskStatus( 78 | task_id=mesos_pb2.TaskID(value=id), 79 | slave_id=mesos_pb2.SlaveID(value=slave_id), 80 | state=state, 81 | message=message, 82 | ) 83 | return status 84 | 85 | def _make_offer(self, 86 | hostname=None, 87 | cpus=4, 88 | mem=8192, 89 | cluster=None, 90 | id='offerid', 91 | unavailability_start_secs=None, 92 | unavailability_duration_secs=None): 93 | # Offers with different IDs will have different hostnames, unless 94 | # otherwise explicitly specified. 
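# For example (hypothetical calls): _make_offer(id='offer_1') yields an
# offer with hostname 'offer_1', while
# _make_offer(id='offer_1', hostname='host_a') yields hostname 'host_a'.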
95 | if hostname is None: 96 | hostname = id 97 | offer = mesos_pb2.Offer( 98 | id=mesos_pb2.OfferID(value=id), 99 | framework_id=mesos_pb2.FrameworkID(value="frameworkid"), 100 | slave_id=mesos_pb2.SlaveID(value="slave_id_" + hostname), 101 | hostname=hostname, 102 | ) 103 | 104 | if unavailability_start_secs is not None: 105 | offer.unavailability.start.nanoseconds = int(unavailability_start_secs * 1000000000) 106 | if unavailability_duration_secs is not None: 107 | offer.unavailability.duration.nanoseconds = int(unavailability_duration_secs * 1000000000) 108 | 109 | offer.resources.add(name="cpus", 110 | type=mesos_pb2.Value.SCALAR, 111 | scalar=mesos_pb2.Value.Scalar(value=cpus)) 112 | offer.resources.add(name="mem", 113 | type=mesos_pb2.Value.SCALAR, 114 | scalar=mesos_pb2.Value.Scalar(value=mem)) 115 | if cluster: 116 | offer.attributes.add(name="labels", 117 | type=mesos_pb2.Value.TEXT, 118 | text=mesos_pb2.Value.Text(value=cluster)) 119 | return offer 120 | 121 | def _make_changes_task(self, id, cpus=2, mem=4096, slug='foo', cmd='ls', snapshot=None): 122 | image = None 123 | if snapshot: 124 | image = {'snapshot': {'id': snapshot}} 125 | 126 | return {'project': {'slug': slug}, 'id': id, 127 | 'cmd': cmd, 'resources': {'cpus': cpus, 'mem': mem}, 128 | 'image': image} 129 | 130 | def test_save_restore_state(self): 131 | state_file = self.test_dir + '/test.json' 132 | 133 | cs = ChangesScheduler(state_file, api=mock.Mock(), 134 | blacklist=_noop_blacklist()) 135 | cs.tasksLaunched = 5 136 | cs.tasksFinished = 3 137 | cs.taskJobStepMapping['task x'] = 'jobstep x' 138 | cs.tasksPendingKill = {'task y': 100.5} 139 | cs.slaveIdInfo['slaveid'] = SlaveInfo(hostname='aHostname') 140 | cs._snapshot_slave_map = defaultdict(lambda: defaultdict(float)) 141 | cs._snapshot_slave_map['snapid']['host1'] = 1234567.0 142 | cs._snapshot_slave_map['snapid']['host2'] = 1234569.0 143 | cs.save_state() 144 | cs = None 145 | 146 | cs2 = ChangesScheduler(state_file, api=mock.Mock(), 147 | blacklist=_noop_blacklist()) 148 | assert 5 == cs2.tasksLaunched 149 | assert 3 == cs2.tasksFinished 150 | assert {'task x': 'jobstep x'} == cs2.taskJobStepMapping 151 | assert {'task y': 100.5} == cs2.tasksPendingKill 152 | assert cs2.slaveIdInfo['slaveid'].hostname == 'aHostname' 153 | assert not os.path.exists(state_file) 154 | assert {'snapid': {'host1': 1234567.0, 'host2': 1234569.0}} == cs2._snapshot_slave_map 155 | 156 | def test_save_restore_state_missing(self): 157 | state_file = self.test_dir + '/test.json' 158 | 159 | # newly added fields shouldn't be added to this dict. This is so we 160 | # can test that newly added (aka initially missing) fields are 161 | # restored to a reasonable default. 
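# restore_state() tolerates the missing keys via dict.get() defaults,
# e.g. (from changes_scheduler.py, shown earlier):
#   self.tasksPendingKill = state.get('tasksPendingKill', {})
#   for slave, info in state.get('slaveIdInfo', {}).iteritems(): ...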
162 | state = {'framework_id': 1, 163 | 'tasksLaunched': 5, 164 | 'tasksFinished': 3, 165 | 'taskJobStepMapping': {'task x': 'jobstep x'}, 166 | 'snapshot_slave_map': {} 167 | } 168 | 169 | with open(state_file, 'w') as f: 170 | f.write(json.dumps(state)) 171 | 172 | cs2 = ChangesScheduler(state_file, api=mock.Mock(), 173 | blacklist=_noop_blacklist()) 174 | assert 5 == cs2.tasksLaunched 175 | assert 3 == cs2.tasksFinished 176 | assert {'task x': 'jobstep x'} == cs2.taskJobStepMapping 177 | assert cs2.tasksPendingKill == {} 178 | assert cs2.slaveIdInfo == {} 179 | assert not os.path.exists(state_file) 180 | assert {} == cs2._snapshot_slave_map 181 | 182 | def test_task_finished(self): 183 | api = mock.Mock(spec=ChangesAPI) 184 | cs = ChangesScheduler(state_file=None, api=api, 185 | blacklist=_noop_blacklist()) 186 | cs.taskJobStepMapping = {'taskid': '1'} 187 | cs.slaveIdInfo = {'slaveid': SlaveInfo(hostname='aHostname')} 188 | driver = mock.Mock() 189 | 190 | status = self._make_task_status(id='taskid', jobstep_id='1') 191 | 192 | cs.statusUpdate(driver, status) 193 | 194 | assert cs.tasksFinished == 1 195 | assert len(cs.taskJobStepMapping) == 0 196 | 197 | api.update_jobstep.assert_called_once_with('1', status='finished', hostname='aHostname') 198 | 199 | def test_task_failed(self): 200 | api = mock.Mock(spec=ChangesAPI) 201 | cs = ChangesScheduler(state_file=None, api=api, 202 | blacklist=_noop_blacklist()) 203 | cs.taskJobStepMapping = {'taskid': '1'} 204 | cs.slaveIdInfo = {'slaveid': SlaveInfo(hostname='aHostname')} 205 | driver = mock.Mock() 206 | 207 | status = self._make_task_status(id='taskid', jobstep_id='1', state=mesos_pb2.TASK_FAILED) 208 | 209 | cs.statusUpdate(driver, status) 210 | 211 | assert cs.tasksFinished == 0 212 | assert len(cs.taskJobStepMapping) == 0 213 | 214 | assert api.jobstep_console_append.call_count == 1 215 | api.update_jobstep.assert_called_once_with('1', status='finished', result='infra_failed', hostname='aHostname') 216 | 217 | def test_task_running(self): 218 | api = mock.Mock(spec=ChangesAPI) 219 | cs = ChangesScheduler(state_file=None, api=api, 220 | blacklist=_noop_blacklist()) 221 | cs.taskJobStepMapping = {'taskid': '1'} 222 | driver = mock.Mock() 223 | 224 | status = self._make_task_status(id='taskid', jobstep_id='1', state=mesos_pb2.TASK_RUNNING) 225 | 226 | cs.statusUpdate(driver, status) 227 | 228 | assert cs.tasksFinished == 0 229 | assert len(cs.taskJobStepMapping) == 1 230 | 231 | api.jobstep_console_append.assert_not_called() 232 | api.update_jobstep.assert_not_called() 233 | 234 | def test_missing_jobstep_mapping(self): 235 | api = mock.Mock(spec=ChangesAPI) 236 | stats = mock.Mock() 237 | cs = ChangesScheduler(state_file=None, api=api, stats=stats, 238 | blacklist=_noop_blacklist()) 239 | cs.taskJobStepMapping = {} 240 | driver = mock.Mock() 241 | 242 | status = self._make_task_status(id='taskid', jobstep_id='1', state=mesos_pb2.TASK_FINISHED) 243 | 244 | cs.statusUpdate(driver, status) 245 | 246 | assert cs.tasksFinished == 1 247 | 248 | stats.incr.assert_called_once_with('missing_jobstep_id_finished') 249 | 250 | def test_missing_hostname_mapping(self): 251 | api = mock.Mock(spec=ChangesAPI) 252 | cs = ChangesScheduler(state_file=None, api=api, 253 | blacklist=_noop_blacklist()) 254 | cs.taskJobStepMapping = {'taskid': '1'} 255 | driver = mock.Mock() 256 | 257 | status = self._make_task_status(id='taskid', jobstep_id='1') 258 | 259 | cs.statusUpdate(driver, status) 260 | 261 | assert cs.tasksFinished == 1 262 | assert 
len(cs.taskJobStepMapping) == 0 263 | 264 | api.update_jobstep.assert_called_once_with('1', status='finished', hostname=None) 265 | 266 | def test_needs_abort_api_error(self): 267 | api = mock.Mock(spec=ChangesAPI) 268 | api.jobstep_needs_abort.side_effect = APIError("Failure") 269 | cs = ChangesScheduler(state_file=None, api=api, 270 | blacklist=_noop_blacklist()) 271 | cs.taskJobStepMapping = {'task1': '1'} 272 | driver = mock.Mock() 273 | 274 | cs.poll_and_abort(driver) 275 | 276 | api.jobstep_needs_abort.assert_called_once_with(['1']) 277 | assert driver.killTask.call_count == 0 278 | 279 | def test_no_needs_abort(self): 280 | api = mock.Mock(spec=ChangesAPI) 281 | api.jobstep_needs_abort.return_value = [] 282 | cs = ChangesScheduler(state_file=None, api=api, 283 | blacklist=_noop_blacklist()) 284 | cs.taskJobStepMapping = {'task1': '1'} 285 | driver = mock.Mock() 286 | 287 | cs.poll_and_abort(driver) 288 | 289 | api.jobstep_needs_abort.assert_called_once_with(['1']) 290 | assert driver.killTask.call_count == 0 291 | 292 | def test_jobsteps_needs_abort(self): 293 | api = mock.Mock(spec=ChangesAPI) 294 | api.jobstep_needs_abort.return_value = ['1', '2'] 295 | cs = ChangesScheduler(state_file=None, api=api, 296 | blacklist=_noop_blacklist()) 297 | cs.taskJobStepMapping = {'task1': '1', 'task2': '2', 'task3': '3'} 298 | driver = mock.Mock() 299 | killed_tasks = [] 300 | driver.killTask.side_effect = lambda task: killed_tasks.append(task.value) 301 | 302 | with mock.patch('time.time') as t: 303 | t.return_value = 1000.0 304 | cs.poll_and_abort(driver) 305 | 306 | api.jobstep_needs_abort.assert_called_once_with(['1', '2', '3']) 307 | 308 | # task3 isn't marked aborted by Changes so we don't abort it 309 | assert sorted(killed_tasks) == ['task1', 'task2'] 310 | assert driver.killTask.call_count == 2 311 | assert cs.tasksPendingKill == {'task1': 1000, 'task2': 1000} 312 | 313 | def test_aborted_task_wont_die(self): 314 | api = mock.Mock(spec=ChangesAPI) 315 | api.jobstep_needs_abort.return_value = ['1', '2'] 316 | stats = mock.Mock() 317 | cs = ChangesScheduler(state_file=None, api=api, stats=stats, 318 | blacklist=_noop_blacklist()) 319 | cs.taskJobStepMapping = {'task1': '1', 'task2': '2', 'task3': '3'} 320 | task2_time = 1000.0 + TASK_KILL_THRESHOLD + 1 321 | cs.tasksPendingKill = {'task1': 1000.0, 'task2': task2_time} 322 | driver = mock.Mock() 323 | killed_tasks = [] 324 | driver.killTask.side_effect = lambda task: killed_tasks.append(task.value) 325 | 326 | with mock.patch('time.time') as t: 327 | t.return_value = task2_time + 1 328 | cs.poll_and_abort(driver) 329 | 330 | api.jobstep_needs_abort.assert_called_once_with(['1', '2', '3']) 331 | 332 | assert sorted(killed_tasks) == ['task2'] 333 | assert driver.killTask.call_count == 1 334 | assert cs.taskJobStepMapping == {'task2': '2', 'task3': '3'} 335 | assert cs.tasksPendingKill == {'task2': task2_time} 336 | stats.incr.assert_called_once_with('couldnt_abort_task') 337 | 338 | def test_blacklist(self): 339 | blpath = self.test_dir + '/blacklist' 340 | # Ensure we have an empty blacklist file. 
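# The blacklist file is plain text with one hostname per line, e.g.
# (hypothetical contents; per main.py, EC2 hostnames look like
# ip-*-*-*-*.region.compute.internal):
#   some_hostname.com
#   ip-10-0-0-1.us-west-1.compute.internal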
341 | open(blpath, 'w+').close() 342 | 343 | api = mock.MagicMock() 344 | stats = mock.Mock() 345 | cs = ChangesScheduler(state_file=None, api=api, stats=stats, 346 | blacklist=FileBlacklist(blpath)) 347 | offer = self._make_offer(hostname = 'some_hostname.com') 348 | 349 | blacklist = open(blpath, 'w+') 350 | blacklist.write('some_hostname.com\n') 351 | blacklist.close() 352 | 353 | driver = mock.Mock() 354 | # We have to fake the mtime despite the file legitimately having been modified 355 | # later because some filesystems (HFS+, for example) don't have enough precision 356 | # for this to pass reliably. 357 | with mock.patch('os.path.getmtime', return_value=time.time()+1) as getmtime: 358 | help_resource_offers_and_poll_changes(cs, driver, [offer]) 359 | getmtime.assert_called_with(blpath) 360 | assert api.declineOffer.call_count == 0 361 | assert api.allocate_jobsteps.call_count == 0 362 | 363 | assert stats.incr.call_count == 3 364 | stats.incr.assert_any_call('ignore_for_blacklist', 1) 365 | stats.incr.assert_any_call('ignore_for_maintenance', 0) 366 | 367 | # Decline any unused offers. Expect the blacklisted offer. 368 | cs.decline_open_offers(driver) 369 | driver.declineOffer.assert_called_once_with(offer.id) 370 | 371 | def test_blacklist_maintenance(self): 372 | api = mock.Mock(spec=ChangesAPI) 373 | now = time.time() 374 | memlimit = 8192 375 | 376 | # Test no unavailability scheduled - ACCEPT 377 | offer1 = self._make_offer(hostname='hostname_1.com', 378 | id="offer_1", 379 | mem=memlimit) 380 | 381 | # Test unavailability scheduled right now - DECLINE 382 | offer2 = self._make_offer(hostname='hostname_2.com', 383 | id="offer_2", 384 | mem=memlimit, 385 | unavailability_start_secs=now, 386 | unavailability_duration_secs=10) 387 | 388 | # Test unavailability scheduled in a few seconds - ACCEPT 389 | offer3 = self._make_offer(hostname='hostname_3.com', 390 | id="offer_3", 391 | mem=memlimit, 392 | unavailability_start_secs=now + 5, 393 | unavailability_duration_secs=10) 394 | 395 | # Test unavailability scheduled in the past, ending in the past - ACCEPT 396 | offer4 = self._make_offer(hostname='hostname_4.com', 397 | id="offer_4", 398 | mem=memlimit, 399 | unavailability_start_secs=now - 20, 400 | unavailability_duration_secs=10) 401 | 402 | # Test unavailability in progress - DECLINE 403 | offer5 = self._make_offer(hostname='hostname_5.com', 404 | id="offer_5", 405 | mem=memlimit, 406 | unavailability_start_secs=now - 5, 407 | unavailability_duration_secs=10) 408 | 409 | # Test past unavailability with no duration - DECLINE 410 | offer6 = self._make_offer(hostname='hostname_6.com', 411 | id="offer_6", 412 | mem=memlimit, 413 | unavailability_start_secs=now - 5, 414 | unavailability_duration_secs=None) 415 | 416 | # Test future unavailability with no duration - ACCEPT 417 | offer7 = self._make_offer(hostname='hostname_7.com', 418 | id="offer_7", 419 | mem=memlimit, 420 | unavailability_start_secs=now + 5, 421 | unavailability_duration_secs=None) 422 | 423 | # Test unavailability with zero duration - ACCEPT 424 | offer8 = self._make_offer(hostname='hostname_8.com', 425 | id="offer_8", 426 | mem=memlimit, 427 | unavailability_start_secs=now - 5, 428 | unavailability_duration_secs=0) 429 | 430 | all_offers = [offer1, offer2, offer3, offer4, offer5, offer6, offer7, offer8] 431 | expected_launches = [offer1, offer3, offer4, offer7, offer8] 432 | expected_ignores = [offer2, offer5, offer6] 433 | 434 | # To ensure that offers aren't accidentally declined due to a shortage 435 | # of 
tasks, ensure tasks > offers so there's at least one task per
436 | # machine, plus an extra. (each offer has memory for one task)
437 | num_tasks = len(all_offers) + 1
438 | tasks = []
439 | for i in xrange(num_tasks):
440 | tasks.append(self._make_changes_task(str(i), mem=memlimit))
441 | api.get_allocate_jobsteps.return_value = tasks
442 | 
443 | # Each usable offer has memory for exactly one task: one allocation per expected launch.
444 | post_allocate_jobsteps_return = []
445 | for i in xrange(len(expected_launches)):
446 | post_allocate_jobsteps_return.append(str(i))
447 | api.post_allocate_jobsteps.return_value = post_allocate_jobsteps_return
448 | 
449 | # Actually run the test logic.
450 | stats = mock.MagicMock(spec=statsreporter.Stats)
451 | cs = ChangesScheduler(state_file=None, api=api, stats=stats,
452 | blacklist=_noop_blacklist())
453 | driver = mock.Mock()
454 | help_resource_offers_and_poll_changes(cs, driver, all_offers)
455 | 
456 | # Maintenanced offers are not declined inline; they are returned at shutdown below.
457 | assert driver.declineOffer.call_count == 0
458 | 
459 | # Check the stats reporting.
460 | assert stats.incr.call_count == 3
461 | stats.incr.assert_any_call('offers', len(all_offers))
462 | stats.incr.assert_any_call('ignore_for_blacklist', 0)
463 | stats.incr.assert_any_call('ignore_for_maintenance', len(expected_ignores))
464 | 
465 | # Check that the non-maintenanced tasks are launched.
466 | assert driver.launchTasks.call_count == len(expected_launches)
467 | actual_launch_set = set()
468 | expected_launch_set = set()
469 | for launch_offer, args in zip(expected_launches,
470 | driver.launchTasks.call_args_list):
471 | expected_launch_set.add(launch_offer.id.value)
472 | assert len(args[0][0]) == 1 # only one OfferId in the launch args.
473 | actual_launch_set.add(args[0][0][0].value)
474 | assert actual_launch_set == expected_launch_set
475 | 
476 | # Decline any unused offers. Expect all maintenanced offers.
477 | cs.decline_open_offers(driver)
478 | for offer in expected_ignores:
479 | driver.declineOffer.assert_any_call(offer.id)
480 | assert driver.declineOffer.call_count == len(expected_ignores)
481 | 
482 | def test_error_stats(self):
483 | stats = mock.Mock()
484 | cs = ChangesScheduler(state_file=None, api=mock.Mock(), stats=stats,
485 | blacklist=_noop_blacklist())
486 | driver = mock.Mock()
487 | cs.error(driver, 'message')
488 | stats.incr.assert_called_once_with('errors')
489 | 
490 | def test_slaveLost(self):
491 | stats = mock.Mock()
492 | cs = ChangesScheduler(state_file=None, api=mock.Mock(), stats=stats,
493 | blacklist=_noop_blacklist())
494 | driver = mock.Mock()
495 | 
496 | pb_offer = self._make_offer(hostname="hostname")
497 | 
498 | # Check removing an unrecognized slave.
499 | assert len(cs._cached_slaves) == 0
500 | cs.slaveLost(driver, pb_offer.slave_id)
501 | stats.incr.assert_called_once_with('slave_lost')
502 | 
503 | # Check removing a recognized slave.
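# For a recognized slave, slaveLost() (shown earlier) pops the slave from
# _cached_slaves and declines its open offers, which is why we expect both
# a 'slave_lost' and a 'decline_for_slave_lost' stat below.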
504 | cs.resourceOffers(driver, [pb_offer]) 505 | stats.reset_mock() 506 | assert len(cs._cached_slaves) == 1 507 | 508 | cs.slaveLost(driver, pb_offer.slave_id) 509 | assert stats.incr.call_count == 2 510 | stats.incr.assert_any_call('decline_for_slave_lost', 1) 511 | stats.incr.assert_any_call('slave_lost') 512 | assert len(cs._cached_slaves) == 0 513 | 514 | def test_disconnected(self): 515 | stats = mock.Mock() 516 | cs = ChangesScheduler(state_file=None, api=mock.Mock(), stats=stats, 517 | blacklist=_noop_blacklist()) 518 | driver = mock.Mock() 519 | 520 | pb_offer = self._make_offer(hostname="hostname") 521 | cs.resourceOffers(driver, [pb_offer]) 522 | assert len(cs._cached_slaves) == 1 523 | 524 | cs.disconnected(driver) 525 | assert len(cs._cached_slaves) == 0 526 | 527 | def test_api_error(self): 528 | api = mock.Mock(spec=ChangesAPI) 529 | api.get_allocate_jobsteps.side_effect = APIError("Failure") 530 | cs = ChangesScheduler(state_file=None, api=api, 531 | blacklist=_noop_blacklist()) 532 | driver = mock.Mock() 533 | 534 | offer = self._make_offer() 535 | 536 | help_resource_offers_and_poll_changes(cs, driver, [offer]) 537 | 538 | api.get_allocate_jobsteps.assert_called_once_with(limit=200, cluster=None) 539 | assert driver.declineOffer.call_count == 0 540 | 541 | # Decline any unused offers. Expect the errored offer. 542 | cs.decline_open_offers(driver) 543 | driver.declineOffer.assert_called_once_with(offer.id) 544 | 545 | def test_api_no_tasks(self): 546 | api = mock.Mock(spec=ChangesAPI) 547 | api.get_allocate_jobsteps.return_value = [] 548 | cs = ChangesScheduler(state_file=None, api=api, 549 | blacklist=_noop_blacklist()) 550 | driver = mock.Mock() 551 | 552 | offer = self._make_offer(cluster="foo_cluster") 553 | 554 | help_resource_offers_and_poll_changes(cs, driver, [offer]) 555 | 556 | api.get_allocate_jobsteps.assert_called_once_with(limit=200, cluster="foo_cluster") 557 | assert driver.declineOffer.call_count == 0 558 | 559 | # Decline any unused offers. Expect the only existing offer. 
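# resourceOffers() never declines inline; it only caches offers (see the
# scheduler code earlier), so the unused offer is only returned to Mesos
# here via decline_open_offers().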
560 | cs.decline_open_offers(driver) 561 | driver.declineOffer.assert_called_once_with(offer.id) 562 | 563 | def test_api_one_task(self): 564 | api = mock.Mock(spec=ChangesAPI) 565 | api.get_allocate_jobsteps.return_value = [self._make_changes_task('1')] 566 | api.post_allocate_jobsteps.return_value = ['1'] 567 | cs = ChangesScheduler(state_file=None, api=api, 568 | blacklist=_noop_blacklist()) 569 | driver = mock.Mock() 570 | 571 | offer = self._make_offer(hostname='aHostname', cluster="foo_cluster") 572 | 573 | def check_tasks(offer_ids, tasks, filters): 574 | assert len(offer_ids) == 1 575 | assert offer_ids[0] == offer.id 576 | assert len(tasks) == 1 577 | assert tasks[0].name == 'foo 1' 578 | assert tasks[0].slave_id.value == offer.slave_id.value 579 | assert tasks[0].command.value == 'ls' 580 | assert tasks[0].resources[0].name == "cpus" 581 | assert tasks[0].resources[0].scalar.value == 2 582 | assert tasks[0].resources[1].name == "mem" 583 | assert tasks[0].resources[1].scalar.value == 4096 584 | assert filters.refuse_seconds == 1.0 585 | driver.launchTasks.side_effect = check_tasks 586 | 587 | help_resource_offers_and_poll_changes(cs, driver, [offer]) 588 | 589 | api.get_allocate_jobsteps.assert_called_once_with(limit=200, cluster="foo_cluster") 590 | api.post_allocate_jobsteps.assert_called_once_with(['1'], cluster="foo_cluster") 591 | assert driver.launchTasks.call_count == 1 592 | assert cs.tasksLaunched == 1 593 | 594 | # Decline any unused offers (should be none) 595 | cs.decline_open_offers(driver) 596 | assert driver.declineOffer.call_count == 0 597 | 598 | def test_not_enough_resources(self): 599 | api = mock.Mock(spec=ChangesAPI) 600 | api.get_allocate_jobsteps.return_value = [self._make_changes_task('1', cpus=8)] 601 | api.post_allocate_jobsteps.return_value = ['1'] 602 | cs = ChangesScheduler(state_file=None, api=api, 603 | blacklist=_noop_blacklist()) 604 | driver = mock.Mock() 605 | 606 | offer = self._make_offer(cluster="foo_cluster", cpus=4) 607 | 608 | help_resource_offers_and_poll_changes(cs, driver, [offer]) 609 | 610 | api.get_allocate_jobsteps.assert_called_once_with(limit=200, cluster="foo_cluster") 611 | assert api.post_allocate_jobsteps.call_count == 0 612 | assert driver.launchTasks.call_count == 0 613 | assert driver.declineOffer.call_count == 0 614 | assert cs.tasksLaunched == 0 615 | 616 | # Decline any unused offers. Expect the offer with insufficient 617 | # resources to schedule the jobstep. 
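# A sketch of the assumed fit check (the real logic lives in
# changes_scheduler.py): a jobstep fits an offer only if
#   task.cpus <= offer.cpu and task.mem <= offer.mem
# Here the task wants 8 cpus but the offer has 4, so nothing launches and
# the offer is returned below.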
618 | cs.decline_open_offers(driver) 619 | driver.declineOffer.assert_called_once_with(offer.id) 620 | 621 | def test_tries_all_offers(self): 622 | api = mock.Mock(spec=ChangesAPI) 623 | api.get_allocate_jobsteps.return_value = [self._make_changes_task('1', cpus=8)] 624 | api.post_allocate_jobsteps.return_value = ['1'] 625 | cs = ChangesScheduler(state_file=None, api=api, 626 | blacklist=_noop_blacklist()) 627 | driver = mock.Mock() 628 | 629 | offer1 = self._make_offer(hostname="host1", cluster="foo_cluster", cpus=4) 630 | offer2 = self._make_offer(hostname="host2", cluster="foo_cluster", cpus=8) 631 | 632 | def check_tasks(offer_ids, tasks, filters): 633 | assert offer_ids == [offer2.id] 634 | assert len(tasks) == 1 635 | assert tasks[0].name == 'foo 1' 636 | assert tasks[0].slave_id.value == offer2.slave_id.value 637 | assert filters.refuse_seconds == 1.0 638 | driver.launchTasks.side_effect = check_tasks 639 | 640 | help_resource_offers_and_poll_changes(cs, driver, [offer1, offer2]) 641 | 642 | api.get_allocate_jobsteps.assert_called_once_with(limit=200, cluster="foo_cluster") 643 | api.post_allocate_jobsteps.assert_called_once_with(['1'], cluster="foo_cluster") 644 | assert driver.launchTasks.call_count == 1 645 | assert cs.tasksLaunched == 1 646 | 647 | # Decline any unused offers (should be one) 648 | cs.decline_open_offers(driver) 649 | assert driver.declineOffer.call_count == 1 650 | 651 | def test_least_loaded(self): 652 | api = mock.Mock(spec=ChangesAPI) 653 | # task 4 won't be allocated if we schedule tasks in the order they're 654 | # returned 655 | api.get_allocate_jobsteps.return_value = [ 656 | self._make_changes_task('1'), self._make_changes_task('2'), 657 | self._make_changes_task('3'), self._make_changes_task('4', cpus=3), 658 | ] 659 | api.post_allocate_jobsteps.return_value = ['1', '2', '3'] 660 | cs = ChangesScheduler(state_file=None, api=api, 661 | blacklist=_noop_blacklist()) 662 | driver = mock.Mock() 663 | 664 | offer1 = self._make_offer(id='offer1', cpus=4, mem=8192) 665 | # should get loaded first 666 | offer2 = self._make_offer(id='offer2', cpus=4, mem=8193) 667 | 668 | def check_tasks(offer_ids, tasks, filters): 669 | assert len(offer_ids) == 1 670 | offer_id = offer_ids[0] 671 | assert offer_id in (offer1.id, offer2.id) 672 | if offer_id == offer1.id: 673 | assert len(tasks) == 1 674 | # after task 1 is allocated, this slave is least loaded, so 675 | # second task should go to it. 
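# Walking through the assumed least-loaded order (each task wants
# 2 cpus / 4096 mem):
#   offer2 (4 cpus / 8193 mem) is least loaded  -> task 1 (2 / 4097 left)
#   offer1 (4 cpus / 8192 mem) is least loaded  -> task 2 (2 / 4096 left)
#   offer2 (2 cpus / 4097 mem) is least loaded  -> task 3 (0 / 1 left)
#   task 4 needs 3 cpus; no slave has that many -> not allocated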
676 | assert tasks[0].name == 'foo 2' 677 | assert tasks[0].slave_id.value == offer1.slave_id.value 678 | elif offer_id == offer2.id: 679 | assert len(tasks) == 2 680 | assert tasks[0].name == 'foo 1' 681 | assert tasks[0].slave_id.value == offer2.slave_id.value 682 | # for task 3 this slave is least loaded again 683 | assert tasks[1].name == 'foo 3' 684 | assert tasks[1].slave_id.value == offer2.slave_id.value 685 | assert filters.refuse_seconds == 1.0 686 | 687 | driver.launchTasks.side_effect = check_tasks 688 | 689 | help_resource_offers_and_poll_changes(cs, driver, [offer1, offer2]) 690 | 691 | api.get_allocate_jobsteps.assert_called_once_with(limit=200, cluster=None) 692 | api.post_allocate_jobsteps.assert_called_once_with(['1', '2', '3'], cluster=None) 693 | assert driver.launchTasks.call_count == 2 694 | assert cs.tasksLaunched == 3 695 | 696 | # Decline any unused offers (should be none) 697 | cs.decline_open_offers(driver) 698 | assert driver.declineOffer.call_count == 0 699 | 700 | def test_alloc_failed(self): 701 | api = mock.Mock(spec=ChangesAPI) 702 | api.get_allocate_jobsteps.side_effect = lambda limit, cluster: [self._make_changes_task(id=cluster)] 703 | def post_allocate_jobsteps(ids, cluster): 704 | if cluster == '1': 705 | return ['1'] 706 | else: 707 | raise APIError('Failure') 708 | api.post_allocate_jobsteps.side_effect = post_allocate_jobsteps 709 | cs = ChangesScheduler(state_file=None, api=api, 710 | blacklist=_noop_blacklist()) 711 | driver = mock.Mock() 712 | 713 | offer1 = self._make_offer(id="offer1", cluster="1") 714 | offer2 = self._make_offer(id="offer2", cluster="2") 715 | 716 | def check_tasks(offer_ids, tasks, filters): 717 | assert len(offer_ids) == 1 718 | assert offer_ids[0] == offer1.id 719 | # other task should still get launched if second one failed. 720 | assert len(tasks) == 1 721 | assert tasks[0].name == 'foo 1' 722 | assert tasks[0].slave_id.value == offer1.slave_id.value 723 | assert filters.refuse_seconds == 1.0 724 | driver.launchTasks.side_effect = check_tasks 725 | 726 | help_resource_offers_and_poll_changes(cs, driver, [offer1, offer2]) 727 | 728 | api.get_allocate_jobsteps.assert_has_calls([mock.call(limit=200, cluster='1'), mock.call(limit=200, cluster='2')], 729 | any_order=True) 730 | assert api.get_allocate_jobsteps.call_count == 2 731 | api.post_allocate_jobsteps.assert_has_calls([mock.call(['1'], cluster='1'), mock.call(['2'], cluster='2')], 732 | any_order=True) 733 | assert api.post_allocate_jobsteps.call_count == 2 734 | assert driver.launchTasks.call_count == 1 735 | assert cs.tasksLaunched == 1 736 | assert driver.declineOffer.call_count == 0 737 | 738 | # Decline any unused offers (offer2 should be open, since it failed 739 | # to schedule.) 740 | cs.decline_open_offers(driver) 741 | driver.declineOffer.assert_called_once_with(offer2.id) 742 | 743 | def test_group_snapshots_on_same_machine(self): 744 | # Create 2 tasks with same snapshot and assert they both go to the 745 | # same slave. 
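# Snapshot affinity is tracked in _snapshot_slave_map (see restore_state()
# earlier), shaped as {snapshot_id: {hostname: last_seen_timestamp}}, so
# jobsteps that share a snapshot are steered to a slave that already has it.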
746 | api = mock.Mock(spec=ChangesAPI)
747 | api.get_allocate_jobsteps.return_value = [
748 | self._make_changes_task('1', cpus=2, snapshot='snapfoo'),
749 | self._make_changes_task('2', cpus=2, snapshot='snapfoo')
750 | ]
751 | api.post_allocate_jobsteps.return_value = ['1', '2']
752 | 
753 | cs = ChangesScheduler(state_file=None, api=api,
754 | blacklist=_noop_blacklist())
755 | driver = mock.Mock()
756 | 
757 | launched_offer_id = None
758 | def launchTasks(offers, tasks, filters=None):
759 | # Assert all launched tasks go to offer1 (host1)
760 | # host1 is assured to be picked first because it has slightly more
761 | # resources at first.
762 | assert len(offers) == 1
763 | assert offers == [mesos_pb2.OfferID(value="offer1")]
764 | 
765 | driver.launchTasks.side_effect = launchTasks
766 | 
767 | offer1 = self._make_offer(id='offer1', hostname='host1', cpus=5)
768 | offer2 = self._make_offer(id='offer2', hostname='host2', cpus=4)
769 | 
770 | help_resource_offers_and_poll_changes(cs, driver, [offer1, offer2])
771 | 
772 | api.get_allocate_jobsteps.assert_called_once_with(limit=200,
773 | cluster=None)
774 | assert api.post_allocate_jobsteps.call_count == 1
775 | assert driver.launchTasks.call_count == 1
776 | assert driver.declineOffer.call_count == 0
777 | assert cs.tasksLaunched == 2
778 | 
779 | # Decline any unused offers. Expect offer2 remains, since both tasks
780 | # were scheduled on offer1.
781 | cs.decline_open_offers(driver)
782 | driver.declineOffer.assert_called_once_with(offer2.id)
783 | 
784 | def test_fall_back_to_least_loaded(self):
785 | # Fall back to least-loaded assignment if the snapshot for a task is
786 | # not found on any slave.
787 | api = mock.Mock(spec=ChangesAPI)
788 | api.get_allocate_jobsteps.return_value = [
789 | self._make_changes_task('1', cpus=2, snapshot='snapfoo'),
790 | self._make_changes_task('2', cpus=2, snapshot='snapbar')
791 | ]
792 | api.post_allocate_jobsteps.return_value = ['1', '2']
793 | 
794 | cs = ChangesScheduler(state_file=None, api=api,
795 | blacklist=_noop_blacklist())
796 | driver = mock.Mock()
797 | 
798 | offer1 = self._make_offer(id='offer1', hostname='host1', cpus=4)
799 | offer2 = self._make_offer(id='offer2', hostname='host2', cpus=4)
800 | 
801 | help_resource_offers_and_poll_changes(cs, driver, [offer1, offer2])
802 | 
803 | api.get_allocate_jobsteps.assert_called_once_with(limit=200,
804 | cluster=None)
805 | assert api.post_allocate_jobsteps.call_count == 1
806 | assert driver.launchTasks.call_count == 2 # Jobs are sent to separate slaves
807 | assert driver.declineOffer.call_count == 0
808 | assert cs.tasksLaunched == 2
809 | 
810 | # Decline any unused offers (should be none)
811 | cs.decline_open_offers(driver)
812 | assert driver.declineOffer.call_count == 0
813 | 
814 | def test_prefer_loaded_slave_with_snapshot(self):
815 | # Prefer the slave that already has the snapshot, even when another
816 | # slave is less loaded.
817 | api = mock.Mock(spec=ChangesAPI)
818 | api.get_allocate_jobsteps.return_value = [
819 | self._make_changes_task('1', cpus=2, snapshot='snapfoo')
820 | ]
821 | api.post_allocate_jobsteps.return_value = ['1']
822 | 
823 | cs = ChangesScheduler(state_file=None, api=api,
824 | blacklist=_noop_blacklist())
825 | driver = mock.Mock()
826 | 
827 | offer1 = self._make_offer(id='offer1', hostname='host1', cpus=4)
828 | help_resource_offers_and_poll_changes(cs, driver, [offer1])
829 | 
830 | api.get_allocate_jobsteps.assert_called_once_with(limit=200,
831 | cluster=None)
832 | 
833 | cs.decline_open_offers(driver)
834 | api.reset_mock()
835 | driver.reset_mock()
836 | 
837 | api.get_allocate_jobsteps.return_value = [
838 | self._make_changes_task('2', cpus=2, snapshot='snapfoo')
839 | ]
840 | api.post_allocate_jobsteps.return_value = ['2']
841 | 
842 | def launchTasks(offers, tasks, filters=None):
843 | # Assert launched task goes to offer1 (host1)
844 | # although it has lesser resources than host2
845 | assert offers == [mesos_pb2.OfferID(value="offer1")]
846 | 
847 | driver.launchTasks.side_effect = launchTasks
848 | 
849 | offer1 = self._make_offer(id='offer1', hostname='host1', cpus=2)
850 | offer2 = self._make_offer(id='offer2', hostname='host2', cpus=4)
851 | help_resource_offers_and_poll_changes(cs, driver, [offer1, offer2])
852 | 
853 | api.get_allocate_jobsteps.assert_called_once_with(limit=200,
854 | cluster=None)
855 | assert api.post_allocate_jobsteps.call_count == 1
856 | assert driver.launchTasks.call_count == 1
857 | assert driver.declineOffer.call_count == 0
858 | assert cs.tasksLaunched == 2
859 | 
860 | # Decline any unused offers. Expect offer2 remains since offer1 was
861 | # preferred.
862 | cs.decline_open_offers(driver)
863 | driver.declineOffer.assert_called_once_with(offer2.id)
864 | 
865 | def test_slave_with_snapshot_unavailable(self):
866 | # Fall back to least-loaded assignment if the slave that has the
867 | # snapshot currently has no open offers.
868 | api = mock.Mock(spec=ChangesAPI)
869 | api.get_allocate_jobsteps.return_value = [
870 | self._make_changes_task('1', cpus=2, snapshot='snapfoo')
871 | ]
872 | api.post_allocate_jobsteps.return_value = ['1']
873 | 
874 | cs = ChangesScheduler(state_file=None, api=api,
875 | blacklist=_noop_blacklist())
876 | driver = mock.Mock()
877 | 
878 | offer1 = self._make_offer(id='offer1', hostname='host1', cpus=4)
879 | help_resource_offers_and_poll_changes(cs, driver, [offer1])
880 | 
881 | api.get_allocate_jobsteps.assert_called_once_with(limit=200,
882 | cluster=None)
883 | 
884 | cs.decline_open_offers(driver)
885 | api.reset_mock()
886 | driver.reset_mock()
887 | 
888 | api.get_allocate_jobsteps.return_value = [
889 | self._make_changes_task('2', cpus=2, snapshot='snapfoo')
890 | ]
891 | api.post_allocate_jobsteps.return_value = ['2']
892 | 
893 | # Use this slightly roundabout way of verifying launchTasks in order to
894 | # avoid hanging the changes-polling thread. Otherwise the test will
895 | # hang when it fails.
896 | expected_launched_tasks = [mesos_pb2.OfferID(value="offer2").value]
897 | launched_tasks = []
898 | def launchTasks(offers, tasks, filters=None):
899 | # Assert offer is accepted although slave doesn't have snapshot.
900 | assert len(offers) == 1
901 | launched_tasks.append(offers[0].value)
902 | 
903 | driver.launchTasks.side_effect = launchTasks
904 | 
905 | offer2 = self._make_offer(id='offer2', hostname='host2', cpus=4)
906 | help_resource_offers_and_poll_changes(cs, driver, [offer2])
907 | assert launched_tasks == expected_launched_tasks
908 | 
909 | api.get_allocate_jobsteps.assert_called_once_with(limit=200,
910 | cluster=None)
911 | assert api.post_allocate_jobsteps.call_count == 1
912 | assert driver.launchTasks.call_count == 1
913 | assert driver.declineOffer.call_count == 0
914 | assert cs.tasksLaunched == 2
915 | 
916 | # Decline any unused offers (there should be none).
917 | cs.decline_open_offers(driver)
918 | assert driver.declineOffer.call_count == 0
919 | 
920 | @mock.patch('time.time')
921 | def test_slave_with_stale_snapshot(self, time_mock):
922 | # Ignore a stale snapshot-to-slave association and fall back to
923 | # least-loaded assignment.
924 | api = mock.Mock(spec=ChangesAPI)
925 | time_mock.return_value = 1
926 | api.get_allocate_jobsteps.return_value = [
927 | self._make_changes_task('1', cpus=2, snapshot='snapfoo')
928 | ]
929 | api.post_allocate_jobsteps.return_value = ['1']
930 | 
931 | cs = ChangesScheduler(state_file=None, api=api,
932 | blacklist=_noop_blacklist())
933 | driver = mock.Mock()
934 | 
935 | offer1 = self._make_offer(id='offer1', hostname='host1', cpus=4)
936 | help_resource_offers_and_poll_changes(cs, driver, [offer1])
937 | 
938 | api.get_allocate_jobsteps.assert_called_once_with(limit=200,
939 | cluster=None)
940 | assert api.post_allocate_jobsteps.call_count == 1
941 | 
942 | cs.decline_open_offers(driver)
943 | api.reset_mock()
944 | driver.reset_mock()
945 | time_mock.return_value = 1000000
946 | 
947 | api.get_allocate_jobsteps.return_value = [
948 | self._make_changes_task('2', cpus=2, snapshot='snapfoo')
949 | ]
950 | api.post_allocate_jobsteps.return_value = ['2']
951 | 
952 | def launchTasks(offers, tasks, filters=None):
953 | # Ignore the stale snapshot association and select the least-loaded slave.
954 | assert offers == [mesos_pb2.OfferID(value="offer2")]
955 | 
956 | driver.launchTasks.side_effect = launchTasks
957 | 
958 | offer1 = self._make_offer(id='offer1', hostname='host1', cpus=2)
959 | offer2 = self._make_offer(id='offer2', hostname='host2', cpus=4)
960 | help_resource_offers_and_poll_changes(cs, driver, [offer1, offer2])
961 | 
962 | api.get_allocate_jobsteps.assert_called_once_with(limit=200,
963 | cluster=None)
964 | assert api.post_allocate_jobsteps.call_count == 1
965 | assert driver.launchTasks.call_count == 1
966 | assert driver.declineOffer.call_count == 0
967 | assert cs.tasksLaunched == 2
968 | 
969 | # Decline any unused offers. Expect offer1 to remain, since offer2 was
970 | # least-loaded.
971 | cs.decline_open_offers(driver)
972 | driver.declineOffer.assert_called_once_with(offer1.id)
973 | 
974 | def test_cached_offer_is_used(self):
975 | api = mock.Mock(spec=ChangesAPI)
976 | cs = ChangesScheduler(state_file=None, api=api,
977 | blacklist=_noop_blacklist())
978 | driver = mock.Mock()
979 | 
980 | # Scheduler has an offer, but no tasks.
981 | api.get_allocate_jobsteps.return_value = []
982 | offer1 = self._make_offer(id='offer1', hostname='host1', cpus=4)
983 | help_resource_offers_and_poll_changes(cs, driver, [offer1])
984 | assert api.get_allocate_jobsteps.call_count == 1
985 | assert api.post_allocate_jobsteps.call_count == 0
986 | 
987 | # Don't decline offers here the way other tests do when resetting:
988 | # we need to leave the offer cache intact.
989 | api.reset_mock()
990 | driver.reset_mock()
991 | 
992 | # When a task arrives, the scheduler uses the cached offer.
993 | api.get_allocate_jobsteps.return_value = [
994 | self._make_changes_task('2', cpus=2, snapshot='snapfoo')
995 | ]
996 | api.post_allocate_jobsteps.return_value = ['2']
997 | help_resource_offers_and_poll_changes(cs, driver, [])
998 | assert api.get_allocate_jobsteps.call_count == 1
999 | assert api.post_allocate_jobsteps.call_count == 1
1000 | 
1001 | cs.decline_open_offers(driver)
1002 | driver.declineOffer.assert_not_called()
1003 | 
1004 | def test_offer_rescinded(self):
1005 | api = mock.Mock(spec=ChangesAPI)
1006 | cs = ChangesScheduler(state_file=None, api=api,
1007 | blacklist=_noop_blacklist())
1008 | driver = mock.Mock()
1009 | 
1010 | # Scheduler has an offer, but no tasks are available.
1011 | api.get_allocate_jobsteps.return_value = []
1012 | offer1 = self._make_offer(id='offer1', hostname='host1', cpus=4)
1013 | help_resource_offers_and_poll_changes(cs, driver, [offer1])
1014 | assert api.get_allocate_jobsteps.call_count == 1
1015 | assert api.post_allocate_jobsteps.call_count == 0
1016 | 
1017 | cs.decline_open_offers(driver)
1018 | api.reset_mock()
1019 | driver.reset_mock()
1020 | 
1021 | # Offer gets rescinded by the Mesos master.
1022 | cs.offerRescinded(driver, offer1.id)
1023 | api.get_allocate_jobsteps.assert_not_called()
1024 | api.reset_mock()
1025 | driver.reset_mock()
1026 | 
1027 | # Now the scheduler has no offers, so the task can't be scheduled.
1028 | api.get_allocate_jobsteps.return_value = [
1029 | self._make_changes_task('2', cpus=2, snapshot='snapfoo')
1030 | ]
1031 | help_resource_offers_and_poll_changes(cs, driver, [])
1032 | # No offers -> no clusters to query for -> no get_allocate_jobsteps calls.
1033 | assert api.get_allocate_jobsteps.call_count == 0
1034 | assert api.post_allocate_jobsteps.call_count == 0
1035 | 
1036 | cs.decline_open_offers(driver)
1037 | driver.declineOffer.assert_not_called()
1038 | 
1039 | def test_combine_offer_fragments(self):
1040 | api = mock.Mock(spec=ChangesAPI)
1041 | cs = ChangesScheduler(state_file=None, api=api,
1042 | blacklist=_noop_blacklist())
1043 | driver = mock.Mock()
1044 | 
1045 | api.get_allocate_jobsteps.return_value = [
1046 | self._make_changes_task('1', cpus=2, mem=2048),
1047 | ]
1048 | api.post_allocate_jobsteps.return_value = ['1']
1049 | 
1050 | # Add an offer for a different host, to make sure offers for different
1051 | # hosts aren't being merged/defragmented.
1052 | host2_offer = self._make_offer(id='host2_offer', hostname='host2',
1053 | cpus=1, mem=1024)
1054 | cs.resourceOffers(driver, [host2_offer])
1055 | 
1056 | # Add a set of small, fragmented offers one at a time. The task can
1057 | # only be scheduled once all offers have arrived. By our powers
1058 | # combined!
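# The four host1 fragments below total 2 cpus and 2048 mem -- exactly the
# task's requirements -- so only the combined offers can satisfy it.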
1059 | offers = ((1, 0), (1, 0), (0, 1024), (0, 1024))
1060 | expected_offer_ids = []
1061 | for i, (cpu, mem) in enumerate(offers):
1062 | offer_id = 'host1_offer{}'.format(i)
1063 | new_offer = self._make_offer(id=offer_id, hostname='host1',
1064 | cpus=cpu, mem=mem)
1065 | expected_offer_ids.append(new_offer.id)
1066 | 
1067 | if i < len(offers) - 1:  # Not the last iteration.
1068 | help_resource_offers_and_poll_changes(cs, driver, [new_offer])
1069 | assert api.get_allocate_jobsteps.call_count == 1
1070 | assert api.post_allocate_jobsteps.call_count == 0
1071 | else:  # On the final iteration only.
1072 | def check_tasks(offer_ids, tasks, filters):
1073 | assert offer_ids == expected_offer_ids
1074 | assert len(tasks) == 1
1075 | driver.launchTasks.side_effect = check_tasks
1076 | help_resource_offers_and_poll_changes(cs, driver, [new_offer])
1077 | assert api.get_allocate_jobsteps.call_count == 1
1078 | assert api.post_allocate_jobsteps.call_count == 1
1079 | api.reset_mock()
1080 | driver.reset_mock()
1081 | cs.decline_open_offers(driver)
1082 | driver.declineOffer.assert_not_called()
1083 | 
1084 | def test_full_thread_polling(self):
1085 | """This test simply runs through the startup/teardown machinery for the
1086 | Changes polling thread. We run the polling loop a few times to ensure
1087 | the looping works properly.
1088 | """
1089 | api = mock.Mock(spec=ChangesAPI)
1090 | cs = ChangesScheduler(state_file=None, api=api,
1091 | blacklist=_noop_blacklist())
1092 | driver = mock.Mock()
1093 | 
1094 | # Add an offer to ensure get_allocate_jobsteps() is polled.
1095 | offer1 = self._make_offer(id='offer1', hostname='host1', cpus=4)
1096 | cs.resourceOffers(driver, [offer1])
1097 | 
1098 | # After get_allocate_jobsteps() has been called a few times, shut down.
1099 | count = [0]
1100 | def get_allocate_jobsteps(limit, cluster):
1101 | if count[0] > 3:
1102 | cs.shuttingDown.set()
1103 | count[0] += 1
1104 | return []
1105 | api.get_allocate_jobsteps.side_effect = get_allocate_jobsteps
1106 | 
1107 | # Just make sure this executes with no exceptions.
1108 | cs.poll_changes_until_shutdown(driver, 0)
1109 | 
1110 | def test_full_thread_polling_with_exception(self):
1111 | """Test that exceptions in the polling thread are reported back to the
1112 | main thread correctly.
1113 | """
1114 | api = mock.Mock(spec=ChangesAPI)
1115 | cs = ChangesScheduler(state_file=None, api=api,
1116 | blacklist=_noop_blacklist())
1117 | driver = mock.Mock()
1118 | 
1119 | # Add an offer to ensure get_allocate_jobsteps() is polled.
1120 | offer1 = self._make_offer(id='offer1', hostname='host1', cpus=4)
1121 | cs.resourceOffers(driver, [offer1])
1122 | 
1123 | # Force an exception in get_allocate_jobsteps().
1124 | def get_allocate_jobsteps(limit, cluster):
1125 | assert False
1126 | api.get_allocate_jobsteps.side_effect = get_allocate_jobsteps
1127 | 
1128 | class Filter(logging.Filter):
1129 | def __init__(self):
1130 | super(Filter, self).__init__()
1131 | self.found_error = False
1132 | 
1133 | def filter(self, record):
1134 | if record.getMessage() == "Polling thread failed. 
Exiting.": 1135 | self.found_error = True 1136 | 1137 | f = Filter() 1138 | try: 1139 | logger.addFilter(f) 1140 | cs.poll_changes_until_shutdown(driver, 0) 1141 | assert f.found_error 1142 | finally: 1143 | logger.removeFilter(f) 1144 | 1145 | def test_state_json(self): 1146 | framework_id = 'frameworkid' 1147 | changes_request_limit = 53 1148 | 1149 | blpath = self.test_dir + '/blacklist' 1150 | blacklist = open(blpath, 'w+') 1151 | blacklist.write('hostname1\nhostname2\n') 1152 | blacklist.close() 1153 | 1154 | api = mock.Mock(spec=ChangesAPI) 1155 | cs = ChangesScheduler(state_file=None, 1156 | api=api, 1157 | blacklist=FileBlacklist(blpath), 1158 | changes_request_limit=changes_request_limit) 1159 | cs.framework_id = framework_id 1160 | driver = mock.Mock() 1161 | now = time.time() 1162 | 1163 | offer1 = self._make_offer(id='offer1', hostname='host1', cpus=1, mem=1024) 1164 | offer2 = self._make_offer(id='offer2', hostname='host1', cpus=2, mem=2048) 1165 | offer3 = self._make_offer(id='offer3', hostname='host3', cpus=4, mem=4096, 1166 | cluster='some_cluster') 1167 | offer4 = self._make_offer(hostname='host4', 1168 | id='offer4', 1169 | cpus=5, 1170 | mem=5000, 1171 | unavailability_start_secs=now - 5, 1172 | unavailability_duration_secs=100) 1173 | offer4.attributes.add(name="ranges_example", 1174 | type=mesos_pb2.Value.RANGES, 1175 | ranges=mesos_pb2.Value.Ranges(range=[ 1176 | mesos_pb2.Value.Range(begin=10, end=20), 1177 | mesos_pb2.Value.Range(begin=30, end=40), 1178 | ])) 1179 | offer4.attributes.add(name="set_example", 1180 | type=mesos_pb2.Value.SET, 1181 | set=mesos_pb2.Value.Set(item=[ 1182 | 'string_1', 1183 | 'string_2', 1184 | ])) 1185 | cs.resourceOffers(driver, [offer1, offer2, offer3, offer4]) 1186 | 1187 | expected_state = { 1188 | 'framework_id': framework_id, 1189 | 'taskJobStepMapping': {}, 1190 | 'tasksPendingKill': {}, 1191 | 'tasksLaunched': 0, 1192 | 'tasksFinished': 0, 1193 | 'shuttingDown': False, 1194 | 'blacklist': { 1195 | 'path': blpath, 1196 | 'entries': [ 1197 | 'hostname1', 1198 | 'hostname2', 1199 | ], 1200 | }, 1201 | 'snapshot_slave_map': {}, 1202 | 'changes_request_limit': changes_request_limit, 1203 | 'cached_slaves': [ 1204 | { 1205 | 'slave_id': 'slave_id_host1', 1206 | 'hostname': 'host1', 1207 | 'cluster': None, 1208 | 'offers': [ 1209 | { 1210 | 'offer_id': 'offer1', 1211 | 'framework_id': framework_id, 1212 | 'url': '', 1213 | 'cpu': 1.0, 1214 | 'mem': 1024, 1215 | 'attributes': [], 1216 | 'resources': [ 1217 | {'name': 'cpus', 'type': mesos_pb2.Value.SCALAR, 'value': 1}, 1218 | {'name': 'mem', 'type': mesos_pb2.Value.SCALAR, 'value': 1024}, 1219 | ], 1220 | }, 1221 | { 1222 | 'offer_id': 'offer2', 1223 | 'framework_id': framework_id, 1224 | 'url': '', 1225 | 'cpu': 2.0, 1226 | 'mem': 2048, 1227 | 'attributes': [], 1228 | 'resources': [ 1229 | {'name': 'cpus', 'type': mesos_pb2.Value.SCALAR, 'value': 2}, 1230 | {'name': 'mem', 'type': mesos_pb2.Value.SCALAR, 'value': 2048}, 1231 | ], 1232 | }, 1233 | ], 1234 | 'total_cpu': 3.0, 1235 | 'total_mem': 3072, 1236 | 'is_maintenanced': False, 1237 | }, 1238 | { 1239 | 'slave_id': 'slave_id_host3', 1240 | 'hostname': 'host3', 1241 | 'cluster': 'some_cluster', 1242 | 'offers': [ 1243 | { 1244 | 'offer_id': 'offer3', 1245 | 'framework_id': framework_id, 1246 | 'url': '', 1247 | 'cpu': 4.0, 1248 | 'mem': 4096, 1249 | 'attributes': [ 1250 | {'name': 'labels', 'type': mesos_pb2.Value.TEXT, 'value': 'some_cluster'}, 1251 | ], 1252 | 'resources': [ 1253 | {'name': 'cpus', 'type': mesos_pb2.Value.SCALAR, 
'value': 4},
1254 | {'name': 'mem', 'type': mesos_pb2.Value.SCALAR, 'value': 4096},
1255 | ],
1256 | },
1257 | ],
1258 | 'total_cpu': 4.0,
1259 | 'total_mem': 4096,
1260 | 'is_maintenanced': False,
1261 | },
1262 | {
1263 | 'slave_id': 'slave_id_host4',
1264 | 'hostname': 'host4',
1265 | 'cluster': None,
1266 | 'offers': [
1267 | {
1268 | 'offer_id': 'offer4',
1269 | 'framework_id': framework_id,
1270 | 'url': '',
1271 | 'cpu': 5.0,
1272 | 'mem': 5000,
1273 | 'attributes': [
1274 | {'name': 'ranges_example', 'type': mesos_pb2.Value.RANGES, 'value': '(10, 20), (30, 40)'},
1275 | {'name': 'set_example', 'type': mesos_pb2.Value.SET, 'value': 'string_1, string_2'},
1276 | ],
1277 | 'resources': [
1278 | {'name': 'cpus', 'type': mesos_pb2.Value.SCALAR, 'value': 5},
1279 | {'name': 'mem', 'type': mesos_pb2.Value.SCALAR, 'value': 5000},
1280 | ],
1281 | },
1282 | ],
1283 | 'total_cpu': 5.0,
1284 | 'total_mem': 5000,
1285 | 'is_maintenanced': True,
1286 | },
1287 | ],
1288 | 'build_state_json_secs': .5,
1289 | }
1290 | 
1291 | state = cs.state_json()
1292 | 
1293 | # Verify that the state can be json-converted cleanly.
1294 | json.dumps(state)
1295 | 
1296 | # Verify that we got a time-to-build with approximately the right order
1297 | # of magnitude, then replace the value with something predictable.
1298 | assert state['build_state_json_secs'] > 0
1299 | assert state['build_state_json_secs'] < 10
1300 | state['build_state_json_secs'] = expected_state['build_state_json_secs']
1301 | 
1302 | # Verify a bunch of state fields individually to make diffing easier
1303 | # when we find a problem.
1304 | for slave, expected_slave in (zip(state['cached_slaves'],
1305 | expected_state['cached_slaves'])):
1306 | for offer, expected_offer in zip(slave['offers'], expected_slave['offers']):
1307 | assert offer == expected_offer
1308 | 
1309 | # Compare all state keys individually.
1310 | for key in expected_state:
1311 | print 'Compare key {}: [{}] vs expected [{}]'.format(
1312 | key, state[key], expected_state[key])
1313 | assert state[key] == expected_state[key]
1314 | 
1315 | # Ensure both dicts have the same number of keys, which means the
1316 | # previous loop hit everything.
1317 | assert sorted(expected_state.keys()) == sorted(state.keys())
1318 | 
1319 | # Add some tasks to the scheduler and reschedule, to trigger some
1320 | # snapshot-slave mappings.
1321 | tasks = [
1322 | self._make_changes_task('1', mem=3072, snapshot='snap1'),
1323 | self._make_changes_task('2', mem=4096, snapshot='snap2'),
1324 | ]
1325 | api.get_allocate_jobsteps.return_value = tasks
1326 | api.post_allocate_jobsteps.return_value = ['1', '2']
1327 | 
1328 | api.get_allocate_jobsteps.reset_mock()
1329 | api.post_allocate_jobsteps.reset_mock()
1330 | assert not cs.poll_and_launch_once(driver)  # Get jobsteps and launch them.
1331 | assert api.get_allocate_jobsteps.call_count == 2
1332 | assert api.post_allocate_jobsteps.call_count == 2
1333 | 
1334 | state = cs.state_json()
1335 | assert len(state['snapshot_slave_map']) == 2
1336 | 
1337 | def test_state_json_performance(self):
1338 | """Verify that the /state_json handler can build its JSON payload in
1339 | less than .05 seconds, on average. 
1340 | """ 1341 | framework_id = 'frameworkid' 1342 | changes_request_limit = 53 1343 | 1344 | blpath = self.test_dir + '/blacklist' 1345 | blacklist = open(blpath, 'w+') 1346 | blacklist.write('hostname1\nhostname2\n') 1347 | blacklist.close() 1348 | 1349 | api = mock.Mock(spec=ChangesAPI) 1350 | cs = ChangesScheduler(state_file=None, 1351 | api=api, 1352 | blacklist=FileBlacklist(blpath), 1353 | changes_request_limit=changes_request_limit) 1354 | cs.framework_id = framework_id 1355 | driver = mock.Mock() 1356 | now = time.time() 1357 | 1358 | offer1 = self._make_offer(id='offer1', hostname='host1', cpus=1, mem=1024) 1359 | offer2 = self._make_offer(id='offer2', hostname='host1', cpus=2, mem=2048) 1360 | offer3 = self._make_offer(id='offer3', hostname='host3', cpus=4, mem=4096, 1361 | cluster='some_cluster') 1362 | offer4 = self._make_offer(hostname='host4', 1363 | id='offer4', 1364 | cpus=5, 1365 | mem=5000, 1366 | unavailability_start_secs=now - 5, 1367 | unavailability_duration_secs=100) 1368 | offer4.attributes.add(name="ranges_example", 1369 | type=mesos_pb2.Value.RANGES, 1370 | ranges=mesos_pb2.Value.Ranges(range=[ 1371 | mesos_pb2.Value.Range(begin=10, end=20), 1372 | mesos_pb2.Value.Range(begin=30, end=40), 1373 | ])) 1374 | offer4.attributes.add(name="set_example", 1375 | type=mesos_pb2.Value.SET, 1376 | set=mesos_pb2.Value.Set(item=[ 1377 | 'string_1', 1378 | 'string_2', 1379 | ])) 1380 | 1381 | cs.resourceOffers(driver, [offer1, offer2, offer3, offer4]) 1382 | 1383 | tasks = [ 1384 | self._make_changes_task('1', mem=3072, snapshot='snap1'), 1385 | self._make_changes_task('2', mem=4096, snapshot='snap2'), 1386 | ] 1387 | api.get_allocate_jobsteps.return_value = tasks 1388 | api.post_allocate_jobsteps.return_value = ['1', '2'] 1389 | assert not cs.poll_and_launch_once(driver) 1390 | 1391 | start_time = time.time() 1392 | loops = 1000 1393 | for i in xrange(loops): 1394 | state_json = cs.state_json() 1395 | total_time = time.time() - start_time 1396 | 1397 | max_avg_time_per_loop = .05 1398 | assert total_time < max_avg_time_per_loop * loops 1399 | -------------------------------------------------------------------------------- /changes_mesos_scheduler/tests/test_service.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import print_function 4 | 5 | import json 6 | import random 7 | from pprint import pprint 8 | 9 | from flask import jsonify, Flask, Response, request 10 | 11 | 12 | app = Flask(__name__) 13 | 14 | @app.route("/") 15 | def index(): 16 | return "Mesos HTTP Proxy test service." 
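# A minimal, hypothetical probe against a locally running instance (Flask
# defaults to port 5000) might look like:
#
#   curl -X POST http://localhost:5000/jobsteps/allocate/ \
#        -H 'Content-Type: application/json' \
#        -d '{"resources": {"cpus": 4, "mem": 8192}}'
#
# The allocate handler below replies with a JSON list of zero or one tasks,
# depending on whether the offered resources meet REQUIRED_CPU/REQUIRED_MEM.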
17 | 
18 | 
19 | @app.route("/jobsteps/allocate/", methods=['POST'])
20 | def offer():
21 | print("Received resource offer:")
22 | print(json.dumps(request.get_json(), sort_keys=True, indent=2, separators=(',', ': ')))
23 | 
24 | REQUIRED_MEM = 500
25 | REQUIRED_CPU = 0.5
26 | 
27 | tasks_to_run = []
28 | 
29 | info = request.get_json()
30 | 
31 | if info["resources"]["cpus"] >= REQUIRED_CPU \
32 | and info["resources"]["mem"] >= REQUIRED_MEM:
33 | 
34 | random_id = str(random.randint(0, 1000))
35 | tasks_to_run.append(
36 | {
37 | "id": "my_job_" + random_id,
38 | # "cmd": "pwd && /bin/sleep " + str(random.randint(10, 60)),
39 | "project": {
40 | "slug": random_id
41 | },
42 | "resources": {
43 | "cpus": REQUIRED_CPU,
44 | "mem": REQUIRED_MEM
45 | }
46 | }
47 | )
48 | 
49 | print("Responding with the following tasks:")
50 | print(json.dumps(tasks_to_run, sort_keys=True, indent=2, separators=(',', ': ')))
51 | return Response(json.dumps(tasks_to_run), mimetype='application/json')
52 | 
53 | 
54 | @app.route("/jobsteps/<job_id>/", methods=['POST'])
55 | def status(job_id):
56 | print("Received status update:")
57 | print(json.dumps(request.get_json(), sort_keys=True, indent=2, separators=(',', ': ')))
58 | return "OK"
59 | 
60 | @app.route("/jobsteps/<job_id>/deallocate/", methods=['POST'])
61 | def deallocate(job_id):
62 | print("Received deallocate request:")
63 | print(json.dumps(request.get_json(), sort_keys=True, indent=2, separators=(',', ': ')))
64 | return "OK"
65 | 
66 | 
67 | if __name__ == "__main__":
68 | app.debug = True
69 | app.run(host='0.0.0.0')
70 | 
--------------------------------------------------------------------------------
/ci/mypy-run:
--------------------------------------------------------------------------------
1 | #!/bin/bash -eux
2 | 
3 | # Allow the path to mypy to be specified in the MYPY environment variable, but default to "mypy".
4 | : ${MYPY=mypy}
5 | 
6 | # Any paths we need to include in typechecking that are not automatically found (that is, that
7 | # have no '# type:' annotation).
8 | EXTRA_FILES=""
9 | 
10 | # Any files with type annotations that should be excluded from typechecking. This is a regular
11 | # expression matched against the filenames.
12 | EXCLUDE=""
13 | 
14 | # Find all Python files that are not in the exclude list and which have a '# type:' annotation.
15 | FILES=`find . -type f -name \*.py -print0 \
16 | | xargs -0 grep -ls '# type:'`
17 | 
18 | if [ -n "$EXCLUDE" ]; then
19 | FILES=`echo "$FILES" | egrep -v "$EXCLUDE"`
20 | fi
21 | 
22 | ci/run_mypy.py $MYPY --silent-imports --py2 $FILES $EXTRA_FILES
23 | 
--------------------------------------------------------------------------------
/ci/mypy-setup:
--------------------------------------------------------------------------------
1 | #!/bin/bash -eux
2 | 
3 | # Heroic effort to install Python 3.2, setuptools, pip, and mypy (in that
4 | # order) on Ubuntu 12.04.
5 | 
6 | # This script needs to run as root.
7 | 
8 | case `whoami` in
9 | root) ;;
10 | *) echo "Please use sudo to run this script as root."; exit 1;;
11 | esac
12 | 
13 | apt-get install -y -q python3
14 | apt-get install -y -q python3-setuptools
15 | apt-get install -y -q python3-pip
16 | 
17 | # Sadly, setuptools and pip are installed in /usr/local/lib/python3.4/
18 | # (but there's no apt-get package that installs Python 3.4). 
19 | # Just add that directory to sys.path.
20 | 
21 | export PYTHONPATH=/usr/local/lib/python3.4/dist-packages
22 | 
23 | python3 -m pip install -q -U git+https://github.com/python/mypy
24 | #python3 -m pip install -q -U git+https://github.com/gvanrossum/pyxl3
25 | # Copied from pyxl3/finish_install.py
26 | #python3 <
--------------------------------------------------------------------------------
/ci/run_mypy.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """Run a mypy command and write a junit-style mypy.junit.xml file.
3 | 
4 | Usage: run_mypy.py <mypy-command> <mypy flags and files...>
5 | 
6 | Runs the given command, echoes its stdout and stderr, and records a
7 | pass, fail, or error result in a junit-style XML file that Changes
8 | can display.
9 | """
10 | 
11 | import subprocess
12 | import sys
13 | import time
14 | 
15 | from xml.sax.saxutils import escape
16 | 
17 | 
18 | # junit-style templates for the three possible outcomes. {time} is the
19 | # elapsed wall-clock time in seconds; {text} is the escaped command
20 | # output that explains a failure or an error.
21 | 
22 | 
23 | PASS_TEMPLATE = """<?xml version="1.0" encoding="utf-8"?>
24 | <testsuite errors="0" failures="0" name="mypy" tests="1" time="{time}">
25 |   <testcase classname="mypy" name="mypy" time="{time}">
26 |   </testcase>
27 | </testsuite>
28 | """
29 | 
30 | FAIL_TEMPLATE = """<?xml version="1.0" encoding="utf-8"?>
31 | <testsuite errors="0" failures="1" name="mypy" tests="1" time="{time}">
32 |   <testcase classname="mypy" name="mypy" time="{time}">
33 |     <failure>{text}</failure>
34 |   </testcase>
35 | </testsuite>
36 | """
37 | 
38 | ERROR_TEMPLATE = """<?xml version="1.0" encoding="utf-8"?>
39 | <testsuite errors="1" failures="0" name="mypy" tests="1" time="{time}">
40 |   <testcase classname="mypy" name="mypy" time="{time}">
41 |     <error>{text}</error>
42 |   </testcase>
43 | </testsuite>
44 | """
45 | 
46 | 
47 | def main():
48 | # TODO: parse flags and args.
49 | cmd = sys.argv[1:]
50 | if not cmd:
51 | sys.stderr.write("Usage: run_mypy mypy <args...>\n")
52 | return 2
53 | junit_file = 'mypy.junit.xml'
54 | 
55 | t0 = time.time()
56 | p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
57 | outb, errb = p.communicate()
58 | code = p.returncode
59 | t1 = time.time()
60 | dt = '%.3f' % (t1 - t0)
61 | 
62 | out = outb.decode('utf-8')
63 | err = errb.decode('utf-8')
64 | 
65 | if out:
66 | if not out.endswith("\n"):
67 | out += "\n"
68 | sys.stdout.write(out)
69 | if err:
70 | if not err.endswith("\n"):
71 | err += "\n"
72 | sys.stderr.write(err)
73 | 
74 | if code == 0:
75 | print("Pass")
76 | xml = PASS_TEMPLATE.format(time=dt)
77 | # TODO(guido): Remove the "mypy:" check once mypy writes to stderr.
78 | elif code == 1 and not err and out and not out.startswith("mypy:"):
79 | print("Fail")
80 | xml = FAIL_TEMPLATE.format(text=escape(out), time=dt)
81 | else:
82 | print("Error")
83 | texts = []
84 | # TODO(guido): Use <system-out> and <system-err> once Changes supports them.
85 | if out:
86 | texts.append("=== stdout ===\n")
87 | texts.append(out)
88 | if err:
89 | texts.append("=== stderr ===\n")
90 | texts.append(err)
91 | text = "".join(texts)
92 | xml = ERROR_TEMPLATE.format(text=escape(text), time=dt)
93 | 
94 | with open(junit_file, 'w') as f:
95 | f.write(xml)
96 | 
97 | return code
98 | 
99 | 
100 | if __name__ == '__main__':
101 | sys.exit(main())
102 | 
--------------------------------------------------------------------------------
/ci/run_tests.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -eux
2 | 
3 | ci/mypy-run
4 | 
5 | make virtualenv_coverage
6 | 
--------------------------------------------------------------------------------
/ci/setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -eux
2 | 
3 | export DEBIAN_FRONTEND=noninteractive
4 | 
5 | # Install git
6 | sudo apt-get install -y git
7 | 
8 | # Install fpm
9 | sudo apt-get install -y ruby-dev gcc
10 | fpm -h > /dev/null || sudo gem install fpm --no-ri --no-rdoc
11 | 
12 | # Install easy_install for fpm to use
13 | sudo apt-get install -y python-setuptools
14 | 
15 | sudo ci/mypy-setup
16 | 
--------------------------------------------------------------------------------
/make_virtualenv.sh:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env bash
2 | set -xe
3 | 
4 | # Based on: https://github.com/brutasse/graphite-api/blob/master/fpm/build-deb.sh
5 | # but adapted to use easy_install.
6 | 
7 | export PROJECT=$1
8 | 
9 | sudo apt-get -y install build-essential python-dev python-virtualenv
10 | 
11 | rm -rf build
12 | 
13 | mkdir -p build/usr/share/python
14 | virtualenv build/usr/share/python/$PROJECT
15 | 
16 | build/usr/share/python/$PROJECT/bin/easy_install virtualenv-tools
17 | # Actually install our project.
18 | build/usr/share/python/$PROJECT/bin/easy_install . 
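# Note: easy_install bakes the build/ prefix into the virtualenv's scripts;
# virtualenv-tools (installed above) rewrites those paths further down to
# the final /usr/share/python/$PROJECT install location.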
19 | 
20 | # Can't seem to do this with easy_install.
21 | # Ideally we wouldn't install test requirements for the deb we install.
22 | build/usr/share/python/$PROJECT/bin/pip install "file://`pwd`#egg=$PROJECT[tests]"
23 | 
24 | find build ! -perm -a+r -exec chmod a+r {} \;
25 | 
26 | cd build/usr/share/python/$PROJECT
27 | # Not sure if this is necessary.
28 | sed -i "s/'\/bin\/python'/\('\/bin\/python','\/bin\/python2'\)/g" lib/python2.7/site-packages/virtualenv_tools-*-py2.7.egg/virtualenv_tools.py
29 | ./bin/virtualenv-tools --update-path /usr/share/python/$PROJECT
30 | cd -
31 | 
32 | find build -iname '*.pyc' -exec rm {} \;
33 | find build -iname '*.pyo' -exec rm {} \;
34 | 
--------------------------------------------------------------------------------
/scripts/changes-mesos-scheduler:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | from changes_mesos_scheduler.main import main
4 | 
5 | main()
6 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from distutils.core import setup
2 | 
3 | MESOS_VERSION = '0.27.0'
4 | UBUNTU_VERSION = '14.04'
5 | 
6 | tests_require = ['pytest>=2.5.0,<2.6.0', 'pytest-cov>=1.6,<1.7',
7 | 'pytest-xdist>=1.9,<1.10', 'unittest2>=0.5.1,<0.6.0',
8 | 'mock>=1.0.1,<1.1.0', 'flask>=0.10.1,<0.11.0']
9 | 
10 | setup(name='changes-mesos-scheduler',
11 | scripts=['scripts/changes-mesos-scheduler'],
12 | packages=['changes_mesos_scheduler'],
13 | extras_require={'tests': tests_require},
14 | dependency_links=['http://downloads.mesosphere.io/master/ubuntu/%s/mesos-%s-py2.7-linux-x86_64.egg#egg=mesos'
15 | % (UBUNTU_VERSION, MESOS_VERSION)],
16 | install_requires=['futures==2.2', 'mesos', 'protobuf>=2.5.0,<3a0', 'raven', 'statsd', 'typing'],
17 | package_dir={'changes_mesos_scheduler': 'changes_mesos_scheduler'})
18 | 
19 | 
--------------------------------------------------------------------------------
/support/bootstrap-vagrant.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -eux
2 | 
3 | export DEBIAN_FRONTEND=noninteractive
4 | 
5 | sudo apt-get update -y
6 | 
7 | # Install git
8 | sudo apt-get install -y git
9 | 
10 | # Install fpm
11 | sudo apt-get install -y ruby-dev gcc
12 | sudo gem install fpm --no-ri --no-rdoc
13 | 
14 | # Install easy_install for fpm to use
15 | sudo apt-get install -y python-setuptools
16 | 
17 | # Install pytest and flask, required for tests.
18 | sudo apt-get install -y python-pip
19 | sudo pip install pytest
20 | sudo pip install flask
21 | 
--------------------------------------------------------------------------------