├── .gitignore
├── LICENSE
├── README.md
├── actions.py
├── config
├── GantryConfig.py
├── __init__.py
└── object.py
├── containerutil.py
├── gantry.py
├── gantryd.py
├── gantryd
├── __init__.py
├── client.py
├── componentstate.py
├── componentwatcher.py
├── etcdpaths.py
├── etcdstate.py
└── machinestate.py
├── health
├── __init__.py
├── checks.py
├── healthcheck.py
├── networkcheck.py
└── termination.py
├── proxy
├── __init__.py
├── haproxy.tmpl
└── portproxy.py
├── requirements.system
├── requirements.system.rhel
├── requirements.txt
├── restart-haproxy.sh
├── runtime
├── __init__.py
├── component.py
├── manager.py
└── metadata.py
├── shutdown-haproxy.sh
└── util.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .devtable
2 | gantry.xml
3 | *.pyc
4 | venv
5 | haproxy.conf
6 | .gantry_metadata
7 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction, and
10 | distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by the copyright
13 | owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all other entities
16 | that control, are controlled by, or are under common control with that entity.
17 | For the purposes of this definition, "control" means (i) the power, direct or
18 | indirect, to cause the direction or management of such entity, whether by
19 | contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the
20 | outstanding shares, or (iii) beneficial ownership of such entity.
21 |
22 | "You" (or "Your") shall mean an individual or Legal Entity exercising
23 | permissions granted by this License.
24 |
25 | "Source" form shall mean the preferred form for making modifications, including
26 | but not limited to software source code, documentation source, and configuration
27 | files.
28 |
29 | "Object" form shall mean any form resulting from mechanical transformation or
30 | translation of a Source form, including but not limited to compiled object code,
31 | generated documentation, and conversions to other media types.
32 |
33 | "Work" shall mean the work of authorship, whether in Source or Object form, made
34 | available under the License, as indicated by a copyright notice that is included
35 | in or attached to the work (an example is provided in the Appendix below).
36 |
37 | "Derivative Works" shall mean any work, whether in Source or Object form, that
38 | is based on (or derived from) the Work and for which the editorial revisions,
39 | annotations, elaborations, or other modifications represent, as a whole, an
40 | original work of authorship. For the purposes of this License, Derivative Works
41 | shall not include works that remain separable from, or merely link (or bind by
42 | name) to the interfaces of, the Work and Derivative Works thereof.
43 |
44 | "Contribution" shall mean any work of authorship, including the original version
45 | of the Work and any modifications or additions to that Work or Derivative Works
46 | thereof, that is intentionally submitted to Licensor for inclusion in the Work
47 | by the copyright owner or by an individual or Legal Entity authorized to submit
48 | on behalf of the copyright owner. For the purposes of this definition,
49 | "submitted" means any form of electronic, verbal, or written communication sent
50 | to the Licensor or its representatives, including but not limited to
51 | communication on electronic mailing lists, source code control systems, and
52 | issue tracking systems that are managed by, or on behalf of, the Licensor for
53 | the purpose of discussing and improving the Work, but excluding communication
54 | that is conspicuously marked or otherwise designated in writing by the copyright
55 | owner as "Not a Contribution."
56 |
57 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf
58 | of whom a Contribution has been received by Licensor and subsequently
59 | incorporated within the Work.
60 |
61 | 2. Grant of Copyright License.
62 |
63 | Subject to the terms and conditions of this License, each Contributor hereby
64 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
65 | irrevocable copyright license to reproduce, prepare Derivative Works of,
66 | publicly display, publicly perform, sublicense, and distribute the Work and such
67 | Derivative Works in Source or Object form.
68 |
69 | 3. Grant of Patent License.
70 |
71 | Subject to the terms and conditions of this License, each Contributor hereby
72 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
73 | irrevocable (except as stated in this section) patent license to make, have
74 | made, use, offer to sell, sell, import, and otherwise transfer the Work, where
75 | such license applies only to those patent claims licensable by such Contributor
76 | that are necessarily infringed by their Contribution(s) alone or by combination
77 | of their Contribution(s) with the Work to which such Contribution(s) was
78 | submitted. If You institute patent litigation against any entity (including a
79 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a
80 | Contribution incorporated within the Work constitutes direct or contributory
81 | patent infringement, then any patent licenses granted to You under this License
82 | for that Work shall terminate as of the date such litigation is filed.
83 |
84 | 4. Redistribution.
85 |
86 | You may reproduce and distribute copies of the Work or Derivative Works thereof
87 | in any medium, with or without modifications, and in Source or Object form,
88 | provided that You meet the following conditions:
89 |
90 | You must give any other recipients of the Work or Derivative Works a copy of
91 | this License; and
92 | You must cause any modified files to carry prominent notices stating that You
93 | changed the files; and
94 | You must retain, in the Source form of any Derivative Works that You distribute,
95 | all copyright, patent, trademark, and attribution notices from the Source form
96 | of the Work, excluding those notices that do not pertain to any part of the
97 | Derivative Works; and
98 | If the Work includes a "NOTICE" text file as part of its distribution, then any
99 | Derivative Works that You distribute must include a readable copy of the
100 | attribution notices contained within such NOTICE file, excluding those notices
101 | that do not pertain to any part of the Derivative Works, in at least one of the
102 | following places: within a NOTICE text file distributed as part of the
103 | Derivative Works; within the Source form or documentation, if provided along
104 | with the Derivative Works; or, within a display generated by the Derivative
105 | Works, if and wherever such third-party notices normally appear. The contents of
106 | the NOTICE file are for informational purposes only and do not modify the
107 | License. You may add Your own attribution notices within Derivative Works that
108 | You distribute, alongside or as an addendum to the NOTICE text from the Work,
109 | provided that such additional attribution notices cannot be construed as
110 | modifying the License.
111 | You may add Your own copyright statement to Your modifications and may provide
112 | additional or different license terms and conditions for use, reproduction, or
113 | distribution of Your modifications, or for any such Derivative Works as a whole,
114 | provided Your use, reproduction, and distribution of the Work otherwise complies
115 | with the conditions stated in this License.
116 |
117 | 5. Submission of Contributions.
118 |
119 | Unless You explicitly state otherwise, any Contribution intentionally submitted
120 | for inclusion in the Work by You to the Licensor shall be under the terms and
121 | conditions of this License, without any additional terms or conditions.
122 | Notwithstanding the above, nothing herein shall supersede or modify the terms of
123 | any separate license agreement you may have executed with Licensor regarding
124 | such Contributions.
125 |
126 | 6. Trademarks.
127 |
128 | This License does not grant permission to use the trade names, trademarks,
129 | service marks, or product names of the Licensor, except as required for
130 | reasonable and customary use in describing the origin of the Work and
131 | reproducing the content of the NOTICE file.
132 |
133 | 7. Disclaimer of Warranty.
134 |
135 | Unless required by applicable law or agreed to in writing, Licensor provides the
136 | Work (and each Contributor provides its Contributions) on an "AS IS" BASIS,
137 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
138 | including, without limitation, any warranties or conditions of TITLE,
139 | NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are
140 | solely responsible for determining the appropriateness of using or
141 | redistributing the Work and assume any risks associated with Your exercise of
142 | permissions under this License.
143 |
144 | 8. Limitation of Liability.
145 |
146 | In no event and under no legal theory, whether in tort (including negligence),
147 | contract, or otherwise, unless required by applicable law (such as deliberate
148 | and grossly negligent acts) or agreed to in writing, shall any Contributor be
149 | liable to You for damages, including any direct, indirect, special, incidental,
150 | or consequential damages of any character arising as a result of this License or
151 | out of the use or inability to use the Work (including but not limited to
152 | damages for loss of goodwill, work stoppage, computer failure or malfunction, or
153 | any and all other commercial damages or losses), even if such Contributor has
154 | been advised of the possibility of such damages.
155 |
156 | 9. Accepting Warranty or Additional Liability.
157 |
158 | While redistributing the Work or Derivative Works thereof, You may choose to
159 | offer, and charge a fee for, acceptance of support, warranty, indemnity, or
160 | other liability obligations and/or rights consistent with this License. However,
161 | in accepting such obligations, You may act only on Your own behalf and on Your
162 | sole responsibility, not on behalf of any other Contributor, and only if You
163 | agree to indemnify, defend, and hold each Contributor harmless for any liability
164 | incurred by, or claims asserted against, such Contributor by reason of your
165 | accepting any such warranty or additional liability.
166 |
167 | END OF TERMS AND CONDITIONS
168 |
169 | APPENDIX: How to apply the Apache License to your work
170 |
171 | To apply the Apache License to your work, attach the following boilerplate
172 | notice, with the fields enclosed by brackets "[]" replaced with your own
173 | identifying information. (Don't include the brackets!) The text should be
174 | enclosed in the appropriate comment syntax for the file format. We also
175 | recommend that a file or class name and description of purpose be included on
176 | the same "printed page" as the copyright notice for easier identification within
177 | third-party archives.
178 |
179 | Copyright [yyyy] [name of copyright owner]
180 |
181 | Licensed under the Apache License, Version 2.0 (the "License");
182 | you may not use this file except in compliance with the License.
183 | You may obtain a copy of the License at
184 |
185 | http://www.apache.org/licenses/LICENSE-2.0
186 |
187 | Unless required by applicable law or agreed to in writing, software
188 | distributed under the License is distributed on an "AS IS" BASIS,
189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
190 | See the License for the specific language governing permissions and
191 | limitations under the License.
192 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # gantryd
2 |
3 | A framework built on top of [Docker](http://docker.io) that allows for easy deployment and management of project components, with a focus on:
4 |
5 | * Easy management of components of a project across multiple machines
6 | * Single command updating of components with **automatic draining** and **progressive rollout**
7 | * Ability to manage components locally, when necessary (see **gantry** below)
8 |
9 | ## Overview
10 |
11 | **gantryd** is a distributed, etcd-based system for running, updating, monitoring and managing various Docker images (known as "components") across
12 | multiple machines.
13 |
14 | 
15 |
16 | **gantryd** manages the running, monitoring and draining of containers, automatically updating machines *progressively* on update, and *draining* the old containers
17 | as it goes along. A container is only shut down when *all connections* to it have terminated (or it is manually killed). This, combined with progressive
18 | update, allows for *continuous deployment* by simply pushing a new docker image to a repository and running `update` via `gantryd.py`.
19 |
20 | **gantryd** also automatically monitors the containers of a component, running checks periodically to ensure they are healthy. If a container goes bad, a new one is automatically started in its place, with traffic being moved over.
21 |
22 | ## Getting Started
23 |
24 | ### Getting etcd
25 |
26 | The latest etcd release is available as a binary at [Github][github-release].
27 | Installation instructions can be found at [Etcd README][etcd-readme].
28 |
29 | [github-release]: https://github.com/coreos/etcd/releases/
30 | [etcd-readme]: https://github.com/coreos/etcd/blob/master/README.md
31 |
32 |
33 | ### Cloning the source
34 |
35 | ```sh
36 | git clone https://github.com/DevTable/gantryd.git
37 | ```
38 |
39 | ### Installing dependencies
40 |
41 | #### Debian or Ubuntu
42 | ```sh
43 | # Install apt-get dependencies.
44 | cat requirements.system | xargs sudo apt-get install -y
45 |
46 | # Install python dependencies.
47 | sudo pip install -r requirements.txt
48 | ```
49 |
50 | #### RHEL or Centos
51 | ```sh
52 | # Install yum dependencies.
53 | cat requirements.system.rhel | xargs sudo yum install -y
54 |
55 | # Install python dependencies.
56 | sudo pip install -r requirements.txt
57 | ```
58 |
59 | ### Setting up
60 |
61 | All settings for gantryd are defined in a JSON format. A project's configuration is stored in etcd but is set initially from a local file (see `setconfig` below).
62 |
63 | The configuration defines the various components of the project you want to manage:
64 | ```json
65 | {
66 | "components": [
67 | {
68 | "name": "someexamplecomponent",
69 | "repo": "my/localrepo",
70 | "tag": "latest",
71 | "command": ["/usr/bin/python", "/somedir/myapplication.py"],
72 | "ports": [
73 | {"external": 8888, "container": 8888}
74 | ],
75 | "readyChecks": [
76 | { "kind": "http", "port": 8888 }
77 | ],
78 | "healthChecks": [
79 | { "kind": "http", "port": 8888, "path": "/some/path" }
80 | ],
81 | "volumesFrom": [
82 | "somedatacontainer"
83 | ],
84 | "bindings": [
85 | { "external": "/an/external/path", "volume": "/some/container/path"}
86 | ],
87 | "defineComponentLinks": [
88 | { "port": 8888, "name": "mycoolserver", "kind": "tcp" }
89 | ],
90 | "requireComponentLinks": [
91 | { "name": "anotherserver", "alias": "serveralias" }
92 | ],
93 | "environmentVariables": [
94 | { "name": "FOO", "value": "somevalue" }
95 | ]
96 | }
97 | ]
98 | }
99 | ```
100 |
101 | | Field | Description | Default |
102 | | --------------------- | --------------------------------------------------------------------------------- | ----------- |
103 | | name | The name of the component | |
104 | | repo | The docker image to use for the component | |
105 | | tag | The tag of the docker image to use | latest |
106 | | user | The user under which to run the command in the container | (in image) |
107 | | command | The command to run inside the container | (in image) |
108 | | ports | Mappings of container ports to external ports | |
109 | | readyChecks | The various checks to run to ensure the container is ready (see below for list) | |
110 | | healthChecks | The various checks to run to ensure the container is healthy (see below for list) | |
111 | | terminationSignals | Signals which should be sent to a specific container when it should be shut down | |
112 | | terminationChecks | The various checks to run to ensure that the container is ready to be shut down | connections |
113 | | volumesFrom | Container(s), by name, whose volume(s) should be mounted into the container | |
114 | | bindings | Mapping between external hosts paths and the corresponding container volumes | |
115 | | defineComponentLinks | Defines the component links exported by this component | |
116 | | requireComponentLinks | Defines the component links imported/required by this component | |
117 | | readyTimeout | Timeout in milliseconds that we will wait for a container to pass a ready check | 10,000 |
118 | | environmentVariables | Environment variables to set when running the component's containers | |
119 | | privileged | Whether the container should run in privileged mode | False |
120 |
121 | ### Terminology
122 |
123 | **Project**: Namespace that contains configuration for a set of components, as well as any metadata associated
124 | when those components are running. For example: 'frontend', 'backend', 'someproduct'.
125 |
126 | **Component**: A named component that runs a specific docker image in a container. For example: 'elasticsearch', 'mongodb'.
127 |
128 | **Component Link**: Similar to a Docker link: An exposed port by one *component* that is imported by one or more other
129 | components. Unlike a Docker link, a component link is managed by gantry and automatically updated via the proxy just like
130 | normal exposed ports. When a component link is required/imported by a container, the following environment variables are
131 | added into the containers for that component:
132 |
133 | | Environment Variable | Example Name | Example Value |
134 | | --------------------------------- | ----------------------------------- | ------------------------------------------------- |
135 | | {ALIAS}_CLINK | SERVERALIAS_CLINK | tcp://172.17.42.1:53852 |
136 | | {ALIAS}\_CLINK\_{PORT}\_{KIND} | SERVERALIAS_CLINK_8888_TCP | tcp://172.17.42.1:53852 |
137 | | {ALIAS}\_CLINK\_{PORT}\_{KIND}\_PROTO | SERVERALIAS_CLINK_8888_TCP_PROTO | tcp |
138 | | {ALIAS}\_CLINK\_{PORT}\_{KIND}\_ADDR | SERVERALIAS_CLINK_8888_TCP_ADDR | 172.17.42.1 |
139 | | {ALIAS}\_CLINK\_{PORT}\_{KIND}\_PORT | SERVERALIAS_CLINK_8888_TCP_PORT | 53852 |
140 |
141 |
142 | ### Setting up a project
143 |
144 | - [Gantryd commands](#gantryd-commands) - distributed management
145 | - [Gantry commands](#gantry-commands) - local management
146 |
147 | ### Gantryd commands
148 |
149 | #### Creating/updating the project's configuration
150 |
151 | To set up a gantryd project, make sure that etcd is running, and gantry configuration is available in some file.
152 |
153 | Run the following to update the configuration for project `myprojectname` in gantryd:
154 | ```sh
155 | sudo ./gantryd.py setconfig myprojectname myconfigfile
156 | ```
157 |
158 | Response:
159 | ```sh
160 | Configuration updated
161 | ```
162 |
163 | #### Setup components by 'updating' them
164 |
165 | To mark one or more components as ready for deployment, execute the following from a machine with the latest images:
166 | ```sh
167 | sudo ./gantryd.py update myprojectname -c firstcomponent secondcomponent
168 | ```
169 |
170 | Response:
171 | ```sh
172 | Updating the image IDs on components
173 | Component firstcomponent -> 4ae76210a4fe
174 | Component secondcomponent -> 0cf0c034fc89
175 | ```
176 |
177 | This sets the status of the components to 'ready' and associates them with the image IDs listed. Once run, any followup
178 | `gantryd run` commands on this machine (or any other machines in the etcd cluster) will update and start those components
179 | with those images.
180 |
181 | #### Running components on machine(s)
182 |
183 | Once components have been marked as ready, they can be run by executing `gantryd run` on one or more machines:
184 |
185 | ```sh
186 | sudo ./gantryd.py run myprojectname -c firstcomponent secondcomponent
187 | ```
188 |
189 | This command will start a daemon (and block), starting the components and monitoring them, until it is shutdown.
190 |
191 | #### Updating a component across all listening machines
192 |
193 | To tell components to update themselves in response to an image change, execute:
194 |
195 | ```sh
196 | sudo ./gantryd.py update myprojectname -c firstcomponent secondcomponent
197 | ```
198 |
199 | Response:
200 | ```sh
201 | Updating the image IDs on components
202 | Component firstcomponent -> 4ae76210a4fe
203 | Component secondcomponent -> 0cf0c034fc89
204 | ```
205 |
206 | The first machine running the gantryd daemon will start the update within 30 seconds.
207 |
208 | ### Listing the status of all components
209 | ```sh
210 | sudo ./gantryd.py list myprojectname
211 | ```
212 |
213 | Response:
214 | ```sh
215 | COMPONENT STATUS IMAGE ID
216 | firstcomponent ready 4ae76210a4fe
217 | secondcomponent stopped 0cf0c034fc89
218 | ```
219 |
220 | #### Stopping a component on all machines
221 |
222 | To tell components to stop themselves on all machines, execute:
223 |
224 | ```sh
225 | sudo ./gantryd.py stop myprojectname -c firstcomponent secondcomponent
226 | ```
227 |
228 | Response:
229 | ```sh
230 | Marking components as stopped
231 | ```
232 |
233 | All components specified will start the shutdown process within 30 seconds.
234 |
235 | #### Killing a component on all machines
236 |
237 | To order components to kill themselves immediately on all machines, execute:
238 |
239 | ```sh
240 | sudo ./gantryd.py kill myprojectname -c firstcomponent secondcomponent
241 | ```
242 |
243 | Response:
244 | ```sh
245 | Marking components as killed
246 | ```
247 |
248 | All components specified will be killed within 30 seconds.
249 |
250 |
251 | ### Gantryd health checks
252 |
253 | Gantryd supports a number of built-in checks for verifying that a container is properly started, running and healthy.
254 |
255 | #### http Health Check
256 |
257 | ```json
258 | { "kind": "http", "port": 8888, "path": "/somepath" }
259 | ```
260 |
261 | Attempts to connect and download the HTTP page located at the given port and path. Fails if the HTTP response is not 2XX.
262 |
263 | Note that "path" is **optional**.
264 |
265 | #### tcp Health Check
266 |
267 | ```json
268 | { "kind": "tcp", "port": 8888 }
269 | ```
270 |
271 | Attempts to connect to the given port via TCP. Fails if the connection cannot be established.
272 |
273 |
274 | ### Gantry commands
275 |
276 | **gantry** is the **local** version of gantry, intended for starting, stopping and updating of components on a **single** machine. Please note that you don't need etcd to be installed (or running) to use **gantry**.
277 |
278 | #### Listing all containers running on a local machine for a component
279 | ```sh
280 | sudo ./gantry.py myconfigfile list firstcomponent
281 | ```
282 |
283 | Response:
284 | ```sh
285 | CONTAINER ID UPTIME IMAGE ID STATUS
286 | 39d59e26ee64 Up 17 seconds my/image:latest running
287 | 18182e07ade1 Up 2 minutes 0cf0c034fc89 draining
288 | 87b14f60b220 Up 4 minutes 26c8cb358b9d draining
289 | ```
290 |
291 | #### Performing a *local* update of a component
292 |
293 | *Note*: This will occur outside of the gantryd event loop, so this should *only* be used for **single machine** or **canary** images.
294 |
295 | ```sh
296 | sudo ./gantry.py myconfigfile update firstcomponent
297 | ```
298 |
299 | Response:
300 | ```sh
301 | Starting container 39d59e26ee64
302 | Waiting for health checks...
303 | Running health check: http
304 | Checking HTTP address: http://localhost:49320
305 | Redirecting traffic to new container
306 | Checking container statuses...
307 | Updating proxy...
308 | Starting monitoring...
309 | Monitor check started
310 | ```
311 |
312 | *Note*: If the `-m` flag is specified, then gantry will remain running and actively monitor the component's container, restarting it automatically if it becomes unhealthy.
313 |
314 | #### Stopping all containers running on a local machine for a component
315 |
316 | *Note*: This will *drain* containers in a safe way, so the process will block until all containers are free from incoming connections
317 |
318 | ```sh
319 | sudo ./gantry.py myconfigfile stop firstcomponent
320 | ```
321 |
322 | Response:
323 | ```sh
324 | Draining all containers...
325 | Checking container statuses...
326 | Updating proxy...
327 | Starting monitoring...
328 | Monitor check started
329 | Shutting down container: 39d59e26ee64
330 | Proxy updated
331 | ```
332 |
333 | #### Killing all containers running on a local machine for a component
334 | ```sh
335 | sudo ./gantry.py myconfigfile kill firstcomponent
336 | ```
337 |
338 | Response:
339 | ```sh
340 | Draining all containers...
341 | Killing container d05d73bc6c3
342 | Checking container statuses...
343 | Shutting down proxy...
344 | ```
345 |
--------------------------------------------------------------------------------
/actions.py:
--------------------------------------------------------------------------------
1 | def start_action(component):
2 | if component.isRunning():
3 | print 'Component ' + component.getName() + ' is already running'
4 | return False
5 |
6 | return component.update()
7 |
8 | def stop_action(component):
9 | if not component.isRunning():
10 | print 'Component ' + component.getName() + ' is not running'
11 | return False
12 |
13 | component.stop(kill=False)
14 | return False
15 |
16 | def kill_action(component):
17 | if not component.isRunning():
18 | print 'Component ' + component.getName() + ' is not running'
19 | return False
20 |
21 | component.stop(kill=True)
22 | return False
23 |
24 | def update_action(component):
25 | return component.update()
26 |
27 | def list_action(component):
28 | if not component.isRunning():
29 | print 'Component ' + component.getName() + ' is not running'
30 | return False
31 |
32 | print "%-20s %-20s %-20s %-20s" % ('CONTAINER ID', 'UPTIME', 'IMAGE ID', 'STATUS')
33 |
34 | for info in component.getContainerInformation():
35 | container = info[0]
36 | status = info[1]
37 |
38 | id = container['Id']
39 | uptime = container['Status']
40 | image = container['Image']
41 | i = (id[0:12], uptime, image, status)
42 | print "%-20s %-20s %-20s %-20s" % i
43 |
44 | return False
45 |
46 |
--------------------------------------------------------------------------------
/config/GantryConfig.py:
--------------------------------------------------------------------------------
1 | from object import CFObject, CFField
2 | from util import pickUnusedPort
3 | from runtime.metadata import getComponentField, setComponentField
4 |
5 | class _HealthCheck(CFObject):
6 | """ A single check to perform to verify that a component is ready to be
7 | pushed or is running properly.
8 | """
9 | id = CFField('id').default('').name_field()
10 | kind = CFField('kind').value_field()
11 | timeout = CFField('timeout').kind(int).default(3)
12 |
13 | def __init__(self):
14 | super(_HealthCheck, self).__init__('Health Check')
15 |
16 | def getTitle(self):
17 | """ Returns a descriptive title for the check. """
18 | if self.id != '':
19 | return self.id
20 |
21 | return self.kind
22 |
23 |
24 | class _TerminationSignal(CFObject):
25 | """ A single signal that is sent to a component when the component should shut
26 | itself down.
27 | """
28 | id = CFField('id').default('').name_field()
29 | kind = CFField('kind').value_field()
30 | timeout = CFField('timeout').kind(int).default(3)
31 | exec_command = CFField('exec_command').default('')
32 |
33 | def __init__(self):
34 | super(_TerminationSignal, self).__init__('Termination Signal')
35 |
36 | def getTitle(self):
37 | """ Returns a descriptive title for the check. """
38 | if self.id != '':
39 | return self.id
40 |
41 | return self.kind
42 |
43 |
44 | class _PortMapping(CFObject):
45 | """ A port mapping of an internal container port to the outside world. """
46 | external = CFField('external').kind(int).name_field()
47 | container = CFField('container').kind(int).value_field()
48 | kind = CFField('kind').default('tcp')
49 |
50 | def __init__(self):
51 | super(_PortMapping, self).__init__('Port Mapping')
52 |
53 |
54 | class _VolumeBinding(CFObject):
55 | """ A port mapping of an internal container port to the outside world. """
56 | external = CFField('external').name_field()
57 | volume = CFField('volume').value_field()
58 |
59 | def __init__(self):
60 | super(_VolumeBinding, self).__init__('Volume Binding')
61 |
62 |
63 | class _DefinedComponentLink(CFObject):
64 | """ A network link exported by a component. """
65 | name = CFField('name').name_field()
66 | port = CFField('port').kind(int).value_field()
67 | kind = CFField('kind').default('tcp')
68 |
69 | def __init__(self):
70 | super(_DefinedComponentLink, self).__init__('Component Link')
71 |
72 | def getHostPort(self):
73 | """ Returns the port used by the component link on the host. """
74 | key = 'link-' + self.name + '-port'
75 | port = getComponentField(self.parent.name, key, 0)
76 | if not port:
77 | port = pickUnusedPort()
78 | setComponentField(self.parent.name, key, port)
79 |
80 | return port
81 |
82 |
83 | class _RequiredComponentLink(CFObject):
84 | """ A network link required by a component. """
85 | name = CFField('name').name_field()
86 | alias = CFField('alias').value_field()
87 |
88 | def __init__(self):
89 | super(_RequiredComponentLink, self).__init__('Required Component Link')
90 |
91 |
92 | class _EnvironmentVariable(CFObject):
93 | """ An environment variable to set when running a component. """
94 | name = CFField('name').name_field()
95 | value = CFField('value').value_field()
96 |
97 | def __init__(self):
98 | super(_EnvironmentVariable, self).__init__('Environment Variable')
99 |
100 |
class _Component(CFObject):
  """ A single gantry component: one Docker image plus the settings that
      describe how to run, check and link it.
  """
  name = CFField('name')
  repo = CFField('repo')
  tag = CFField('tag').default('latest')
  command = CFField('command').list_of(str).default([])
  user = CFField('user').default('')
  ports = CFField('ports').list_of(_PortMapping).default([])
  bindings = CFField('bindings').list_of(_VolumeBinding).default([])
  volumes_from = CFField('volumesFrom').list_of(str).default([])
  ready_checks = CFField('readyChecks').list_of(_HealthCheck).default([])
  health_checks = CFField('healthChecks').list_of(_HealthCheck).default([])
  ready_timeout = CFField('readyTimeout').kind(int).default(10000)
  termination_signals = CFField('terminationSignals').list_of(_TerminationSignal).default([])
  privileged = CFField('privileged').kind(bool).default(False)
  defined_component_links = CFField('defineComponentLinks').list_of(_DefinedComponentLink).default([])
  required_component_links = CFField('requireComponentLinks').list_of(_RequiredComponentLink).default([])
  environment_variables = CFField('environmentVariables').list_of(_EnvironmentVariable).default([])

  connection_check = _HealthCheck().build({'kind': 'connection'})
  termination_checks = CFField('terminationChecks').list_of(_HealthCheck).default([connection_check])

  def __init__(self):
    super(_Component, self).__init__('Component')

  def getFullImage(self):
    """ Returns the full image ID for this component, of the form 'repo:tag'. """
    return '%s:%s' % (self.repo, self.tag)

  def getUser(self):
    """ Returns the user under which to run the container or None if none. """
    return self.user or None

  def getCommand(self):
    """ Returns the command string to run on component startup or None if none. """
    if self.command:
      return ' '.join(self.command)
    return None

  def getContainerPorts(self):
    """ Returns the full set of ports exposed by this component. """
    mapped = {mapping.container for mapping in self.ports}
    linked = {link.port for link in self.defined_component_links}
    return mapped | linked

  def getReadyCheckTimeout(self):
    """ Returns the maximum amount of time, in seconds, before ready checks time out. """
    return self.ready_timeout / 1000

  def getVolumes(self):
    """ Returns the volumes exposed by this component. """
    return [b.volume for b in self.bindings]

  def getBindings(self, container_id):
    """ Returns the map of external paths (with {container_id} substituted with the
        short container ID) to the container volumes they bind.
    """
    short_id = container_id[:12]
    bound = {}
    for b in self.bindings:
      bound[b.external.format(container_id=short_id)] = b.volume
    return bound

  def getDefinedComponentLinks(self):
    """ Returns the dict of defined component links, keyed by link name. """
    return dict((link.name, link) for link in self.defined_component_links)

  def getComponentLinks(self):
    """ Returns a dict of aliases for component links required, with the values being the links' names. """
    return dict((link.alias, link.name) for link in self.required_component_links)

  def getEnvironmentVariables(self):
    """ Returns a dict of the defined environment variables and their values. """
    return dict((var.name, var.value) for var in self.environment_variables)
174 |
175 |
class Configuration(CFObject):
  """ The overall gantry configuration: the full set of managed components. """
  components = CFField('components').list_of(_Component)

  def __init__(self):
    super(Configuration, self).__init__('Configuration')

  def lookupComponent(self, name):
    """ Looks up the component with the given name under this config, returning
        None when no component matches.
    """
    matches = (component for component in self.components if component.name == name)
    return next(matches, None)
--------------------------------------------------------------------------------
/config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DevTable/gantryd/eb348113f0f73a0be45a45f7a5626ad2b5dd30ba/config/__init__.py
--------------------------------------------------------------------------------
/config/object.py:
--------------------------------------------------------------------------------
1 | import json
2 | import string
3 | import copy
4 | import inspect
5 |
class ConfigParseException(Exception):
  """ Raised when configuration JSON cannot be parsed into config objects, or
      when a 'Name.SubName=Value' override cannot be applied. """
  pass
8 |
class CFObject(object):
  """
  Defines a class in which all fields marked with the CFField are automatically handled
  for configuration purposes.

  Subclasses declare CFField class attributes; parse()/build() populate them from
  JSON dictionaries, and applyOverride() applies 'Name.SubName=Value' overrides.
  """
  def __init__(self, name):
    # The parent of the object.
    self.parent = None

    # The name of the object.
    self.name = name

    # The cached fields for the object.
    self.fields = None

    # The "extra" fields on the object, if any.
    self.extra_fields = {}

  def applyOverride(self, override):
    """ Applies the given configuration override to this config object.

        Format: 'Name.SubName=Value'

        Raises ConfigParseException if the path does not match a declared field.
    """
    # Split on the FIRST '=' only so that override values may themselves
    # contain '=' characters. (maxsplit=2 would make 'a=b=c' unpack-fail.)
    (path, value) = override.split('=', 1)
    path_pieces = path.split('.')

    # Find the field with the associated name.
    field_name = path_pieces[0]
    field = self.get_fields().get(field_name)
    if not field:
      raise ConfigParseException('No field named %s found' % field_name)

    # If the field is a list, then we need a named key (at least).
    if field.get_kind() == list:
      list_item_kind = field.get_list_kind()
      current_list = field.get_value(self)

      # If the list item kind is not a config object, then we expect to just add the config
      # value.
      if not CFObject in inspect.getmro(list_item_kind):
        if len(path_pieces) > 1:
          raise ConfigParseException('Found list field %s, expected simply the list name' % field_name)

        new_list_item = list_item_kind(value)
        current_list.append(new_list_item)
        return

      # Otherwise, find the named field in the config object and set the override in there.
      name_field = list_item_kind().get_name_field()
      if not name_field:
        raise ConfigParseException('List field %s does not support overrides by name' % field_name)

      # Find the named value in the list that matches the name given. If none, we create a new
      # entry.
      entry_name = path_pieces[1]
      found_entry = None
      for entry in current_list:
        if name_field.get_value(entry) == name_field.get_kind()(entry_name):
          found_entry = entry
          break

      if not found_entry:
        # Create the new entry with the user specified name and add it to the list.
        found_entry = list_item_kind()
        found_entry.parent = self
        name_field.set_value(found_entry, entry_name)

        current_list.append(found_entry)

      # If there are no further path pieces, then we look for a value field to set.
      if len(path_pieces) == 2:
        value_field = found_entry.get_value_field()
        if not value_field:
          # Reference the entry we resolved/created; the loop variable 'entry'
          # is unbound here when the list started out empty.
          raise ConfigParseException('No default value field found for config %s' % found_entry.name)

        value_field.set_value(found_entry, value_field.get_kind()(value))
        return

      # Otherwise, we set the override under the entry.
      found_entry.applyOverride('%s=%s' % ('.'.join(path_pieces[2:]), value))

    else:
      if len(path_pieces) > 1:
        raise ConfigParseException('Cannot access named keys under %s' % field_name)

      # Update the field's value.
      field.set_value(self, field.get_kind()(value))

  def hasExtraField(self, name):
    """ Returns true if there is an 'extra' field with the given name. """
    return name in self.extra_fields

  def getExtraField(self, name):
    """ Returns the 'extra' field with this name. """
    return self.extra_fields[name]

  def getRootConfig(self):
    """ Returns the root configuration object.

        NOTE(review): this returns the immediate parent, not the transitive
        root; existing callers appear to rely on the one-level behavior, so it
        is preserved here.
    """
    if self.parent:
      return self.parent

    return self

  @classmethod
  def parse(cls, json_data):
    """ Parses the given JSON data into an instance of this config object. """
    dictionary = json.loads(json_data)
    return cls.build(dictionary)

  @classmethod
  def build(cls, dictionary):
    """ Builds an instance of this config object from the given dictionary.

        Keys without a matching declared field are retained as 'extra' fields.
        Raises ConfigParseException when a required field is missing.
    """
    instance = cls()
    instance.extra_fields = copy.copy(dictionary)
    for name in instance.get_fields():
      if name in instance.extra_fields:
        del instance.extra_fields[name]

      field = instance.get_fields()[name]
      if field.is_required() and not name in dictionary:
        raise ConfigParseException('Missing required property ' + name + ' under object ' + instance.name)

      if name in dictionary:
        field.populate(instance, dictionary[name])

    return instance

  def get_name_field(self):
    """ Returns the name field under this config object, if any. """
    for field in self.get_fields().values():
      if field.get_is_name_field():
        return field

    return None

  def get_value_field(self):
    """ Returns the value field under this config object, if any. """
    for field in self.get_fields().values():
      if field.get_is_value_field():
        return field

    return None

  def get_fields(self):
    """ Returns a dictionary of all CFField's defined in the CFObject """
    # Check the field cache first
    if self.fields:
      return self.fields

    fields = {}
    class_dict = self.__class__.__dict__
    for field_name in dir(self.__class__):
      # 'in' rather than dict.has_key (deprecated in Python 2.7, removed in 3).
      if field_name in class_dict:
        field = class_dict[field_name]
        if field.__class__ == CFField:
          name = CFField.get_name(field)
          fields[name] = field
    self.fields = fields
    return fields
169 |
170 |
class CFField(object):
  """ A field representing a property in the configuration object.

  CFField acts as a data descriptor on CFObject subclasses: the schema (name,
  kind, default, flags) lives on the shared field object, while each instance's
  value is stored in the instance's own __dict__ (see internal_data).
  """
  def __init__(self, name):
    # Whether the field is the name field for the parent object.
    self.is_name_field = False

    # Whether the field is the primary value field for the parent object.
    self.is_value_field = False

    # The name of the field in the config.
    self.name = name

    # Unused; retained for backward compatibility with any external accessors.
    self.value = None

    # The type of the field. Defaults to string.
    self.field_kind = str

    # If this field is a list, the kind of its elements.
    self.list_kind = None

    # The default value for the field. If none, the field is required.
    self.default_value = None

  def __get__(self, instance, owner):
    return self.get_value(instance)

  def __set__(self, instance, value):
    self.update(instance, value)

  def name_field(self):
    """ Marks a field as being the name field for the parent object. """
    self.is_name_field = True
    return self

  def value_field(self):
    """ Marks a field as being the value field for the parent object. """
    self.is_value_field = True
    return self

  def kind(self, kind):
    """ Sets the kind of the field. """
    self.field_kind = kind
    return self

  def default(self, value):
    """ Sets the default value for the field; a field with a default is optional. """
    self.default_value = value
    return self

  def list_of(self, kind):
    """ Sets that this field is a list of some kind of values. """
    self.field_kind = list
    self.list_kind = kind
    return self

  def get_kind(self):
    """ Returns the declared kind (type) of the field. """
    return self.field_kind

  def get_list_kind(self):
    """ Returns the element kind when this field is a list, or None. """
    return self.list_kind

  def is_required(self):
    """ Returns whether this field must be present in a parsed config. """
    return self.default_value is None

  def get_is_name_field(self):
    return self.is_name_field

  def get_is_value_field(self):
    return self.is_value_field

  def get_name(self):
    """ Returns the name of the field """
    return self.name

  def populate(self, instance, primitive):
    """ Attempts to populate this field on the instance from the given primitive value. """
    if self.field_kind == list:
      if not isinstance(primitive, list):
        raise ConfigParseException('Expected list for field ' + self.name)

      list_value = []
      for p in primitive:
        c_value = self.get_converted_value(instance, p, self.list_kind)
        if not isinstance(c_value, self.list_kind):
          raise ConfigParseException('Expected items of kind ' + str(self.list_kind) + ' in ' + self.name)
        list_value.append(c_value)

      self.update(instance, list_value)
      return

    self.update(instance, self.get_converted_value(instance, primitive, self.field_kind))

  def get_converted_value(self, instance, primitive, kind):
    """ Converts a primitive (JSON) value into the given kind, recursively
        building nested CFObjects from dictionaries.
    """
    # Class types.
    if issubclass(kind, CFObject):
      if not isinstance(primitive, dict):
        raise ConfigParseException('Expected dictionary for field ' + self.name)

      built = kind.build(primitive)
      built.parent = instance
      return built

    # Otherwise, convert from the primitive (usually a string).
    return kind(primitive)

  def internal_data(self, instance):
    """ Returns (creating if needed) the per-instance storage slot for this field. """
    internal_name = self.name + '_data'
    if internal_name not in instance.__dict__:
      instance.__dict__[internal_name] = {'data': None}
    return instance.__dict__[internal_name]

  def get_value(self, instance):
    """ Returns the value of the field for the given instance """
    value = self.internal_data(instance)['data']
    if value is None and self.default_value is not None:
      # Copy mutable defaults into the instance on first access; otherwise the
      # single class-level default list/dict would be shared and mutated by
      # every instance (e.g. applyOverride appending to a defaulted list).
      if isinstance(self.default_value, (list, dict)):
        value = copy.copy(self.default_value)
        self.internal_data(instance)['data'] = value
        return value
      return self.default_value

    return value

  def set_value(self, instance, value):
    """ Sets the value of the field for the given instance """
    self.__set__(instance, value)

  def update(self, instance, value):
    """ Updates the value of the field """
    self.internal_data(instance)['data'] = value
--------------------------------------------------------------------------------
/containerutil.py:
--------------------------------------------------------------------------------
def getContainerIPAddress(client, container):
  """ Returns the IP address on which the container is running. """
  info = client.inspect_container(container)
  network_settings = info['NetworkSettings']
  return network_settings['IPAddress']
5 |
--------------------------------------------------------------------------------
/gantry.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import argparse
4 | import signal
5 | import time
6 |
7 | from actions import start_action, update_action, list_action, stop_action, kill_action
8 | from config.GantryConfig import Configuration
9 | from runtime.manager import RuntimeManager
10 | from util import report, fail
11 |
12 |
# Maps each CLI action name to the handler invoked with the resolved component
# (see run() below); also supplies the argparse 'choices' for the action arg.
ACTIONS = {
  'start': start_action,
  'update': update_action,
  'list': list_action,
  'stop': stop_action,
  'kill': kill_action
}
20 |
21 |
def loadConfig(config_file):
  """ Attempts to load and parse the given config file.

      Returns the parsed Configuration, or None when the file cannot be read
      or does not contain a valid gantry config.
  """
  try:
    with open(config_file, 'r') as f:
      config_json = f.read()
  except (IOError, OSError):
    # Narrowed from a bare except: only I/O failures mean a missing or
    # unreadable file; anything else should propagate.
    print('Could not find config file: ' + config_file)
    return None

  try:
    return Configuration.parse(config_json)
  except Exception as e:
    print('Error parsing gantry config: ' + str(e))
    return None
36 |
37 |
def monitor(component):
  """ Blocks forever, health-checking the given component every 30 seconds.

      An unhealthy component is killed and restarted via update(); this
      function returns only when such a restart fails.
  """
  while True:
    # Sleep for 30 seconds.
    time.sleep(30)

    # Conduct the checks.
    report('Checking in on component ' + component.getName())
    if not component.isHealthy():
      report('Component ' + component.getName() + ' is not healthy. Killing and restarting')
      component.stop(kill=True)
      if not component.update():
        report('Could not restart component ' + component.getName())
        return
51 |
52 |
def run():
  """ CLI entry point: parses arguments, resolves the target component,
      dispatches the requested action and optionally monitors the component.
  """
  # Setup the gantry arguments
  parser = argparse.ArgumentParser(description='gantry continuous deployment system')
  parser.add_argument('config_file', help='The configuration file')
  parser.add_argument('action', help='The action to perform', choices=ACTIONS.keys())
  parser.add_argument('component_name', help='The name of the component to manage')
  parser.add_argument('-m', dest='monitor', action='store_true', help='If specified and the action is "start" or "update", gantry will remain running to monitor components, auto restarting them as necessary')
  parser.add_argument('--setconfig', dest='config_overrides', action='append', help='Configuration overrides for the component')

  args = parser.parse_args()
  component_name = args.component_name
  action = args.action
  should_monitor = args.monitor
  config_file = args.config_file
  config_overrides = args.config_overrides

  # Load the config.
  config = loadConfig(config_file)
  if not config:
    return

  # Create the manager.
  manager = RuntimeManager(config)

  # Find the component
  component = manager.getComponent(component_name)
  if not component:
    raise Exception('Unknown component: ' + component_name)

  # Apply the config overrides (if any).
  if config_overrides:
    component.applyConfigOverrides(config_overrides)

  # Run the action with the component and config.
  result = ACTIONS[action](component)
  if result and should_monitor:
    try:
      report('Starting monitoring of component: ' + component_name)
      monitor(component)  # blocks until the component can no longer be restarted
    except KeyboardInterrupt:
      report('Terminating monitoring of component: ' + component_name)

  def cleanup_monitor(signum, frame):
    # Joins the runtime manager's worker threads before exiting.
    manager.join()

  # Install the SIGINT handler so a Ctrl+C during shutdown still joins the
  # manager. (No alarm is scheduled here, despite what the old comment said.)
  signal.signal(signal.SIGINT, cleanup_monitor)

  # We may have to call cleanup manually if we weren't asked to monitor
  cleanup_monitor(None, None)

if __name__ == "__main__":
  run()
--------------------------------------------------------------------------------
/gantryd.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from gantryd.client import GantryDClient
4 | import argparse
5 | import json
6 |
# Default etcd endpoint, used when -etcd / -etcdport are not supplied.
ETCD_HOST = '127.0.0.1'
ETCD_PORT = 4001
9 |
def run(dclient, args):
  """ Runs gantryd. """
  # args.component is the list of component names supplied via the -c flag.
  dclient.run(args.component)
13 |
def getconfig(dclient, args):
  """ Prints out the current project configuration stored in etcd.

      Prints 'No config found' when the project has no stored configuration or
      the lookup fails.
  """
  config = None
  try:
    config = dclient.getConfigJSON()
  except Exception:
    # Narrowed from a bare except: a failed lookup is treated as "no config",
    # but KeyboardInterrupt/SystemExit are no longer swallowed.
    pass

  if not config:
    print('No config found')
    return

  # Pretty-print the stored JSON with stable key ordering.
  print(json.dumps(json.loads(config), sort_keys=True, indent=2, separators=(',', ': ')))
27 |
def setconfig(dclient, args):
  """ Sets the current project configuration stored in etcd.

      Requires args.configfile to name a readable JSON config file; prints an
      error and returns otherwise.
  """
  if not args.configfile:
    print('Missing configfile parameter')
    return

  with open(args.configfile, 'r') as f:
    dclient.setConfig(json.loads(f.read()))
  print('Configuration updated')
37 |
def list_status(dclient, args):
  """ Lists the status of all components in gantryd. """
  # 'args' is unused; kept for the uniform ACTIONS handler signature.
  dclient.listStatus()
41 |
def mark_updated(dclient, args):
  """ Marks a component to be updated. """
  # args.component is the list of component names supplied via the -c flag.
  dclient.markUpdated(args.component)
45 |
def stop(dclient, args):
  """ Marks a component to be stopped. """
  # args.component is the list of component names supplied via the -c flag.
  dclient.stopComponents(args.component)
49 |
def kill(dclient, args):
  """ Marks a component to be killed. """
  # args.component is the list of component names supplied via the -c flag.
  dclient.killComponents(args.component)
53 |
# Maps each daemon CLI action name to its handler; every handler takes
# (dclient, args). Also supplies the argparse 'choices' in start().
ACTIONS = {
  'run': run,
  'getconfig': getconfig,
  'setconfig': setconfig,
  'list': list_status,
  'update': mark_updated,
  'stop': stop,
  'kill': kill
}
63 |
def start():
  """ Daemon CLI entry point: parses arguments, builds the GantryDClient and
      dispatches to the selected action handler.
  """
  # Setup the gantryd arguments.
  parser = argparse.ArgumentParser(description='gantry continuous deployment system daemon')
  parser.add_argument('action', help='The action to perform', choices=ACTIONS.keys())
  parser.add_argument('project', help='The name of the project containing the components')
  parser.add_argument('configfile', help='The name of the config file. Only applies to setconfig.', nargs='?')
  parser.add_argument('-c', help='A component to watch and run', nargs='+', type=str, dest='component')
  # NOTE: with nargs='?', 'const' only applies when the flag is given without a
  # value; when the flag is omitted the attribute is None, hence the fallbacks below.
  parser.add_argument('-etcd', help='The etcd endpoint to which the client should connect. Defaults to 127.0.0.1', dest='etcd_host', nargs='?', const=ETCD_HOST)
  parser.add_argument('-etcdport', help='The client port of the etcd endpoint. Defaults to 4001.', dest='etcd_port', nargs='?', const=ETCD_PORT)

  # Parse the arguments.
  args = parser.parse_args()
  port = int(args.etcd_port) if args.etcd_port else ETCD_PORT

  # Initialize the gantryd client.
  dclient = GantryDClient(args.etcd_host or ETCD_HOST, args.project, port)

  # Run the action.
  action = ACTIONS[args.action]
  action(dclient, args)

if __name__ == "__main__":
  start()
87 |
--------------------------------------------------------------------------------
/gantryd/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DevTable/gantryd/eb348113f0f73a0be45a45f7a5626ad2b5dd30ba/gantryd/__init__.py
--------------------------------------------------------------------------------
/gantryd/client.py:
--------------------------------------------------------------------------------
1 | from runtime.manager import RuntimeManager
2 | from config.GantryConfig import Configuration
3 | from config.object import ConfigParseException
4 |
5 | from gantryd.componentwatcher import ComponentWatcher
6 | from gantryd.machinestate import MachineState
7 | from gantryd.componentstate import ComponentState, STOPPED_STATUS, KILLED_STATUS
8 | from gantryd.etcdpaths import getProjectConfigPath
9 |
10 | from util import report, fail, ReportLevels
11 |
12 | import etcd
13 | import uuid
14 | import atexit
15 | import threading
16 | import time
17 | import socket
18 | import json
19 | import logging
20 |
21 | REPORT_TTL = 60 # Report that this machine is running, every 60 seconds
22 |
class GantryDClient(object):
  """ A client in gantryd.

  Represents a single machine's participation in a project: it loads the
  project's configuration from etcd, reports this machine's liveness on a
  background thread, and (via run()) starts a ComponentWatcher per component.
  """
  def __init__(self, etcdHost, projectName, etcdPort):
    # The project whose state lives under etcd.
    self.project_name = projectName
    # Created lazily by initialize().
    self.runtime_manager = None
    # The runtime Component objects managed here (filled by initialize()).
    self.components = []
    # Read by the reporting thread; cleared in handleExit to stop reporting.
    self.is_running = False

    # Generate a unique ID for this machine/client.
    self.machine_id = str(uuid.uuid1())

    # Logging.
    self.logger = logging.getLogger(__name__)

    # Initialize the etcd client that we'll use.
    self.etcd_client = etcd.Client(host=etcdHost, port=etcdPort)

    # Initialize the thread used for reporting the status of this machine to etcd.
    self.reporting_thread = threading.Thread(target=self.reportMachineStatus, args=[])
    self.reporting_thread.daemon = True

  def getConfigJSON(self):
    """ Returns the project's config JSON or raises an exception if none. """
    # Lookup the project on etcd. If none, report an error.
    config_json = None
    try:
      self.logger.debug('Looking up configuration for project %s in etcd', self.project_name)
      config_json = self.etcd_client.get(getProjectConfigPath(self.project_name)).value
    except KeyError as k:
      # NOTE(review): older python-etcd releases raised KeyError for a missing
      # key; newer ones raise etcd.EtcdKeyNotFound, which this clause would
      # not catch — confirm against the pinned python-etcd version.
      self.logger.exception(k)
      fail('Unknown project ' + self.project_name, project=self.project_name)

    return config_json

  def getConfig(self):
    """ Returns the project's config or raises an exception if none. """
    config_json = self.getConfigJSON()

    # Parse the project's configuration and save it.
    try:
      self.config = Configuration.parse(config_json)
    except ConfigParseException as cpe:
      fail('Error parsing gantry config', project=self.project_name, exception=cpe)
    except Exception as e:
      # NOTE(review): if this path is hit on the very first call, self.config
      # was never assigned and the return below raises AttributeError.
      self.logger.exception(e)

    return self.config

  def setConfig(self, config):
    """ Sets the project's config in etcd.

        config: a JSON-serializable dict describing the project.
    """
    config_json = json.dumps(config)
    self.logger.debug('Updating configuration for project %s', self.project_name)
    self.etcd_client.set(getProjectConfigPath(self.project_name), config_json)

  def stopComponents(self, component_names):
    """ Tells all the given components on all systems to stop. """
    self.initialize(component_names)

    report('Marking components as stopped', project=self.project_name)
    for component in self.components:
      report('Marking component as stopped', project=self.project_name, component=component,
             level = ReportLevels.EXTRA)
      # Writing the status into etcd notifies the watchers on every machine.
      state = ComponentState(self.project_name, component, self.etcd_client)
      state.setStatus(STOPPED_STATUS)

  def killComponents(self, component_names):
    """ Tells all the given components on all systems to die. """
    self.initialize(component_names)

    report('Marking components as killed', project=self.project_name)
    for component in self.components:
      report('Marking component as killed', project=self.project_name, component=component,
             level = ReportLevels.EXTRA)
      state = ComponentState(self.project_name, component, self.etcd_client)
      state.setStatus(KILLED_STATUS)

  def markUpdated(self, component_names):
    """ Tells all the given components to update themselves. """
    self.initialize(component_names)

    report('Updating the image IDs on components', project=self.project_name)
    for component in self.components:
      # Publish the local image ID so watchers elsewhere pick up the change.
      image_id = component.getImageId()
      state = ComponentState(self.project_name, component, self.etcd_client)

      report('Component %s->%s' % (component.getName(), image_id[0:12]), project=self.project_name,
             component = component)
      state.setReadyStatus(image_id)

  def listStatus(self):
    """ Lists the status of all components in this project. """
    self.getConfig()
    self.initialize([c.name for c in self.config.components])

    print "%-20s %-20s %-20s" % ('COMPONENT', 'STATUS', 'IMAGE ID')
    for component in self.components:
      state = ComponentState(self.project_name, component, self.etcd_client).getState()
      status = ComponentState.getStatusOf(state)
      imageid = ComponentState.getImageIdOf(state)
      print "%-20s %-20s %-20s" % (component.getName(), status, imageid)


  def run(self, component_names):
    """ Runs the given components on this machine. Blocks forever once the
        watcher threads have been started.
    """
    self.initialize(component_names)

    # Register a handler to remove this machine from the list when the daemon is
    # shutdown. The controller will also occasionally ping a machine to verify it
    # is present.
    self.logger.debug('Registering exit listener')
    atexit.register(self.handleExit)

    # Start the thread to register this machine as being part of the project.
    self.startReporter()

    # Start watcher thread(s), one for each component, to see when to update them.
    report('Gantryd running', project=self.project_name)
    for component in self.components:
      self.logger.debug('Starting component watcher for component: %s', component.getName())
      watcher = ComponentWatcher(component, self.project_name, self.machine_id, self.etcd_client)
      watcher.start()

    # And sleep until new stuff comes in.
    while True:
      time.sleep(1)


  ########################################################################

  def initialize(self, component_names):
    """ Initializes this client for working with the components given. """
    # Load the project configuration.
    self.getConfig()

    # Initialize the runtime manager.
    self.runtime_manager = RuntimeManager(self.config)

    # Find all the components for this machine.
    for component_name in component_names:
      component = self.runtime_manager.getComponent(component_name)
      if not component:
        fail('Unknown component named ' + component_name, project=self.project_name)

      self.components.append(component)

  def handleExit(self):
    """ Function executed when the Python system exits. This unregisters the machine in etcd. """
    self.is_running = False
    try:
      machine_state = MachineState(self.project_name, self.machine_id, self.etcd_client)
      machine_state.removeMachine()

      # Shut down the runtime manager if we have one
      if self.runtime_manager is not None:
        self.runtime_manager.join()

    except Exception as e:
      # Best-effort cleanup: an exit handler must never raise.
      self.logger.exception(e)
      pass

  def startReporter(self):
    """ Starts reporting that this machine is running. """
    self.is_running = True
    self.reporting_thread.start()

  def reportMachineStatus(self):
    """ Reports that this machine has running components.

        Runs on the daemon reporting thread: re-registers the machine with a
        TTL of REPORT_TTL seconds, then sleeps slightly less than the TTL so
        the registration does not lapse while the daemon is alive.
    """
    while self.is_running:
      # Perform the update.
      self.logger.debug('Reporting status for machine %s to etcd', self.machine_id)
      machine_state = MachineState(self.project_name, self.machine_id, self.etcd_client)
      machine_state.registerMachine([c.getName() for c in self.components], ttl=REPORT_TTL)

      # Sleep for the TTL minus a few seconds.
      time.sleep(REPORT_TTL - 5)
198 |
199 |
200 |
--------------------------------------------------------------------------------
/gantryd/componentstate.py:
--------------------------------------------------------------------------------
1 | import json
2 | from etcdstate import EtcdState
3 | from etcdpaths import getComponentStatePath
4 |
# Status values a component's etcd state may carry.
READY_STATUS = 'ready'
STOPPED_STATUS = 'stopped'
KILLED_STATUS = 'killed'
PULL_FAIL = 'pullfail'

# Key under which the component's current Docker image ID is stored.
IMAGE_ID = 'imageid'
11 |
class ComponentState(EtcdState):
  """ Helper class which allows easy getting and setting of the etcd distributed
      state of a component. The state is stored as a plain dict with a 'status'
      key and, when ready, an 'imageid' key.
  """
  def __init__(self, project_name, component, etcd_client):
    path = getComponentStatePath(project_name, component)
    super(ComponentState, self).__init__(path, etcd_client)

  @staticmethod
  def getStatusOf(state):
    """ Returns the status field in the given state object. """
    return state['status'] if 'status' in state else 'unknown'

  @staticmethod
  def getImageIdOf(state):
    """ Returns the image ID field in the given state object or empty string if None. """
    return state[IMAGE_ID] if IMAGE_ID in state else ''

  def getStatus(self):
    """ Returns the status of the component. """
    # The state is a dict (see getStatusOf's subscripting), so index it via
    # the helper rather than attribute access, which raised AttributeError.
    return ComponentState.getStatusOf(self.getState(default={'status': 'unknown'}))

  def setStatus(self, status, **kwargs):
    """ Sets the status of the component; extra keyword args are stored alongside. """
    state = dict(kwargs)
    state['status'] = status
    self.setState(state)

  def setReadyStatus(self, imageid):
    """ Sets the status of the component to 'ready', with the given imageid. """
    self.setStatus(READY_STATUS, imageid=imageid)

  def setUpdatingStatus(self, status, machine_id, original_state):
    """ Attempts to set the status of the component to being updated by the given machine.
        Returns the updated state on success and None otherwise.
    """
    state = {}
    state['status'] = status
    state['machine'] = machine_id
    # replaceState performs a compare-and-swap against original_state.
    return self.replaceState(original_state, state)
52 |
--------------------------------------------------------------------------------
/gantryd/componentwatcher.py:
--------------------------------------------------------------------------------
1 | import time
2 | import threading
3 | import json
4 | import logging
5 |
6 | from gantryd.componentstate import ComponentState, STOPPED_STATUS, KILLED_STATUS, READY_STATUS, PULL_FAIL
7 | from util import report, fail, getDockerClient, ReportLevels
8 |
9 | CHECK_SLEEP_TIME = 30 # 30 seconds
10 | CHECK_SHORT_SLEEP_TIME = 10 # 10 seconds
11 | MONITOR_SLEEP_TIME = 30 # 30 seconds
12 |
13 | class ComponentWatcher(object):
14 | """ Helper class which watches a specific component's status in etcd and
15 | manages the update/stop/kill process (if necessary). Also watches the
16 | component itself once started, and ensures that it remains running (restarting
17 | it if it failed).
18 | """
  def __init__(self, component, project_name, machine_id, etcd_client):
    """ Creates the watcher; threads are constructed here but only started by start(). """
    self.component = component
    self.project_name = project_name
    self.machine_id = machine_id
    # Set once the component is known to be running; checked under update_lock
    # by the monitor thread before attempting a restart.
    self.is_running = False

    # Logging.
    self.logger = logging.getLogger(__name__)

    # Setup the state helper for the component.
    self.state = ComponentState(project_name, component, etcd_client)

    # Setup the watcher thread.
    self.watcher_thread = threading.Thread(target=self.waitForCommand, args=[])
    self.watcher_thread.daemon = True

    # Setup the monitor thread.
    self.monitor_thread = threading.Thread(target=self.monitorComponent, args=[])
    self.monitor_thread.daemon = True

    # Setup an event to ping the monitor thread when it should restart checking in
    # on the component.
    self.monitor_event = threading.Event()

    # Setup a lock to prevent multiple threads from trying to (re)start a container.
    self.update_lock = threading.Lock()
45 |
46 | def start(self):
47 | """ Starts the watcher. """
48 | self.watcher_thread.start()
49 | self.monitor_thread.start()
50 |
51 | def monitorComponent(self):
52 | """ Monitors a component by pinging it every MONITOR_SLEEP_TIME seconds or so. If a component
53 | fails, then the system will try to restart it. If that fails, the component is marked
54 | as dead.
55 | """
56 | while True:
57 | # Wait for the component to be running.
58 | self.monitor_event.wait()
59 |
60 | # Sleep MONITOR_SLEEP_TIME seconds.
61 | time.sleep(MONITOR_SLEEP_TIME)
62 |
63 | # Check the component.
64 | report('Checking in on component', project=self.project_name, component=self.component,
65 | level=ReportLevels.BACKGROUND)
66 |
67 | if not self.component.isHealthy():
68 | self.logger.debug('Component %s is not healty', self.component.getName())
69 | with self.update_lock:
70 | # Just to be sure...
71 | if not self.is_running:
72 | continue
73 |
74 | # Ensure that the component is still ready.
75 | state = self.state.getState()
76 | current_status = ComponentState.getStatusOf(state)
77 | if current_status == READY_STATUS:
78 | report('Component ' + self.component.getName() + ' is not healthy. Restarting...',
79 | project=self.project_name, component=self.component)
80 |
81 | if not self.component.update():
82 | report('Could not restart component ' + self.component.getName(),
83 | project=self.project_name, component=self.component,
84 | level=ReportLevels.IMPORTANT)
85 | self.monitor_event.clear()
86 | continue
87 |
88 | def waitForCommand(self):
89 | """ Waits for an command notification on the component in etcd. If one is received,
90 | processes it by attempting to update the component.
91 | """
92 | is_initial_loop = True
93 | sleep_time = 0
94 | while True:
95 | # Sleep and then check again.
96 | time.sleep(sleep_time)
97 | sleep_time = CHECK_SLEEP_TIME
98 |
99 | # Check the component's status.
100 | self.logger.debug('Checking state for component %s', self.component.getName())
101 | state = self.state.getState()
102 | self.logger.debug('Found state %s for component %s', state, self.component.getName())
103 |
104 | # Determine whether we should give initial status messages.
105 | was_initial_loop = is_initial_loop
106 | is_initial_loop = False
107 |
108 | # Take actions based on the status requested.
109 | current_status = ComponentState.getStatusOf(state)
110 | sleep_time = self.handleStatus(current_status, state, was_initial_loop)
111 |
112 | def handleStatus(self, current_status, state, was_initial_check):
113 | """ Handles the various status states for the component, returning the
114 | amount of time after which to retry lookup up the state or -1 for
115 | terminated.
116 | """
117 | if current_status == STOPPED_STATUS:
118 | return self.handleStopped(was_initial_check)
119 | elif current_status == KILLED_STATUS:
120 | return self.handleKilled(was_initial_check)
121 | elif current_status == READY_STATUS or current_status == PULL_FAIL:
122 | with self.update_lock:
123 | return self.handleReady(state, was_initial_check)
124 |
125 | return CHECK_SLEEP_TIME
126 |
127 | def handleStopped(self, was_initial_check):
128 | """ Handles when the component has been marked to be stopped. """
129 | self.monitor_event.clear()
130 |
131 | if was_initial_check:
132 | report('Component %s is marked as stopped' % self.component.getName(),
133 | project=self.project_name, component=self.component)
134 |
135 | self.is_running = False
136 | self.component.stop(kill=False)
137 | return CHECK_SLEEP_TIME
138 |
139 | def handleKilled(self, was_initial_check):
140 | """ Handles when the component has been marked to be killed. """
141 | self.monitor_event.clear()
142 |
143 | if was_initial_check:
144 | report('Component %s is marked as killed' % self.component.getName(),
145 | project=self.project_name, component=self.component)
146 |
147 | self.is_running = False
148 | self.component.stop(kill=True)
149 | return CHECK_SLEEP_TIME
150 |
151 | def handleReady(self, state, was_initial_check):
152 | """ Handles when the component has been marked as ready. """
153 |
154 | # If the status is ready, we update the component if:
155 | # - The ID of the component's image does not match that found in the status.
156 | # - The process is not running.
157 | imageid = ComponentState.getImageIdOf(state)
158 | imageid_different = imageid != self.component.getImageId()
159 | should_update = not self.is_running or imageid_different
160 |
161 | if should_update:
162 | self.is_running = False
163 | self.monitor_event.clear()
164 |
165 | # We need to update this machine's copy. First, do a test and set to ensure that
166 | # we are the only machine allowed to update. If the test and set fails, we'll
167 | # try again in 10s.
168 | if imageid_different:
169 | report('Detected pushed update for component ' + self.component.getName(),
170 | project=self.project_name, component=self.component)
171 | else:
172 | report('Component %s is not running; starting' % self.component.getName(),
173 | project=self.project_name, component=self.component)
174 |
175 | result = self.state.setUpdatingStatus('updating', self.machine_id, state)
176 | if not result:
177 | # The exchange failed. Sleep CHECK_SHORT_SLEEP_TIME seconds and try again.
178 | report('Could not grab update lock. Will try again in %s seconds' % CHECK_SHORT_SLEEP_TIME,
179 | project=self.project_name, component=self.component)
180 | return CHECK_SHORT_SLEEP_TIME
181 |
182 | # Start the update by pulling the repo for the component.
183 | if imageid_different:
184 | report('Pulling the image for component ' + self.component.getName())
185 | if not self.component.pullRepo():
186 | # The pull failed.
187 | report('Pull failed of image %s for component %s' % (imageid[0:12],
188 | self.component.getName()),
189 | project=self.project_name, component=self.component, level=ReportLevels.IMPORTANT)
190 | self.state.setUpdatingStatus('pullfail', self.machine_id, result)
191 | return CHECK_SLEEP_TIME
192 |
193 | # Run the update on the component and wait for it to finish.
194 | if imageid_different:
195 | report('Starting update for component ' + self.component.getName(),
196 | project=self.project_name, component=self.component)
197 |
198 | if not self.component.update():
199 | # The update failed.
200 | self.state.setUpdatingStatus('updatefail', self.machine_id, result)
201 | return CHECK_SLEEP_TIME
202 |
203 | # Otherwise, the update has succeeded. Mark the component as ready, so another
204 | # gantryd can start its update.
205 | if imageid_different:
206 | report('Update completed for component ' + self.component.getName(),
207 | project=self.project_name, component=self.component)
208 | else:
209 | report('Component ' + self.component.getName() + ' is now running',
210 | project=self.project_name, component=self.component)
211 |
212 | self.state.setReadyStatus(self.component.getImageId())
213 | self.is_running = True
214 | self.monitor_event.set()
215 |
216 | return CHECK_SLEEP_TIME
217 |
--------------------------------------------------------------------------------
/gantryd/etcdpaths.py:
--------------------------------------------------------------------------------
# Root key under which all gantryd state lives in etcd.
GANTRYD_NAMESPACE = 'gantryd'
# Sub-namespace for per-project state.
PROJECT_NAMESPACE = 'projects'
# Sub-namespace (under a project) for per-component state.
COMPONENT_NAMESPACE = 'components'
# Sub-namespace (under a project) for per-machine state.
MACHINES_NAMESPACE = 'machines'

# Leaf key names for the state and config documents.
STATE_FILE = 'state'
CONFIG_FILE = 'config'
8 |
def buildPath(*args):
  """ Joins the given path segments under the gantryd root namespace. """
  suffix = '/'.join(args)
  return '/%s/%s' % (GANTRYD_NAMESPACE, suffix)
11 |
def getMachineStatePath(projectName, machineId):
  """ Returns the path for this machine in the etcd config for the project. """
  # gantryd/projects/{project}/machines/{machineid}/state
  segments = (PROJECT_NAMESPACE, projectName, MACHINES_NAMESPACE, machineId, STATE_FILE)
  return buildPath(*segments)
16 |
def getProjectConfigPath(projectName):
  """ Returns the path for this project's config in the etcd config. """
  # gantryd/projects/{project}/config
  segments = (PROJECT_NAMESPACE, projectName, CONFIG_FILE)
  return buildPath(*segments)
21 |
def getComponentStatePath(projectName, component):
  """ Returns the path for the given component under this project in the etcd config. """
  # gantryd/projects/{project}/components/{componentname}/state
  segments = (PROJECT_NAMESPACE, projectName, COMPONENT_NAMESPACE, component.getName(), STATE_FILE)
  return buildPath(*segments)
26 |
--------------------------------------------------------------------------------
/gantryd/etcdstate.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 |
4 | class EtcdState(object):
5 | """ Base class for all helper classes which get and set state in etcd for objects.
6 | """
7 | def __init__(self, state_path, etcd_client):
8 | self.etcd_client = etcd_client
9 | self.state_path = state_path
10 |
11 | # Logging.
12 | self.logger = logging.getLogger(__name__)
13 |
14 | def getState(self, default={}):
15 | """ Gets the state. """
16 | try:
17 | self.logger.debug('Looking up etcd path: %s', self.state_path)
18 | return json.loads(self.etcd_client.get(self.state_path).value)
19 | except KeyError as k:
20 | pass
21 | except ValueError as v:
22 | self.logger.exception(v)
23 | pass
24 |
25 | return default
26 |
27 | def replaceState(self, previous_state, new_state):
28 | """ Attempts to atomically replace the given previous state with a new state.
29 | On success, returns the new state object. On failure, returns None.
30 | """
31 | try:
32 | self.logger.debug('Test and set replacing etcd path: %s', self.state_path)
33 | original_contents_json = json.dumps(previous_state, separators=(',', ':'))
34 | new_contents_json = json.dumps(new_state, separators=(',', ':'))
35 | self.etcd_client.test_and_set(self.state_path, new_contents_json, original_contents_json)
36 | except ValueError as e:
37 | self.logger.debug('Test and set replacment for etcd path %s failed', self.state_path)
38 | return None
39 |
40 | return new_state
41 |
42 | def buildAndSetState(self, **kwargs):
43 | """ Builds state from the given args and sets the state. """
44 | state_obj = dict(kwargs)
45 | self.setState(state_obj)
46 |
47 | def setState(self, state_obj={}, ttl=None):
48 | """ Sets the state to the given object. """
49 | self.etcd_client.set(self.state_path, json.dumps(state_obj, separators=(',', ':')), ttl=ttl)
50 |
51 | def deleteState(self):
52 | """ Deletes the state. """
53 | self.etcd_client.delete()
54 |
55 |
--------------------------------------------------------------------------------
/gantryd/machinestate.py:
--------------------------------------------------------------------------------
1 | import json
2 | import socket
3 |
4 | from etcdstate import EtcdState
5 | from etcdpaths import getMachineStatePath
6 |
7 | STATUS_RUNNING = 'running'
8 |
class MachineState(EtcdState):
  """ Helper class which allows easy getting and setting of the etcd distributed
      state of a machine.
  """
  def __init__(self, project_name, machine_id, etcd_client):
    super(MachineState, self).__init__(getMachineStatePath(project_name, machine_id), etcd_client)

  def registerMachine(self, component_names, ttl=60):
    """ Registers this machine with etcd. """
    self.setState({
      'status': STATUS_RUNNING,
      'components': component_names,
      'ip': socket.gethostbyname(socket.gethostname()),
    }, ttl=ttl)

  def getStatus(self):
    """ Returns the status of this machine. """
    return self.getState({'status': 'unknown'})

  def removeMachine(self):
    """ Removes this machine from etcd. """
    self.deleteState()
--------------------------------------------------------------------------------
/health/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DevTable/gantryd/eb348113f0f73a0be45a45f7a5626ad2b5dd30ba/health/__init__.py
--------------------------------------------------------------------------------
/health/checks.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 |
3 | from networkcheck import TcpCheck, HttpRequestCheck, IncomingConnectionCheck
4 | from termination import HttpTerminationSignal, ExecTerminationSignal
5 | from util import report, fail, getDockerClient
6 |
# The registered health checks, keyed by the `kind` field of a check's config.
# The http/https entries share one implementation, bound to the appropriate
# protocol via functools.partial.
HEALTH_CHECKS = {
  'tcp': TcpCheck,
  'http': partial(HttpRequestCheck, 'http'),
  'https': partial(HttpRequestCheck, 'https'),
  'connection': IncomingConnectionCheck,
}
14 |
def buildHealthCheck(check_config):
  """ Builds a health check to run and returns it.

      check_config: The config object for the check; its `kind` field selects
      which registered health check class is constructed.
  """
  kind = check_config.kind
  # Idiom fix: `kind not in` instead of `not kind in`.
  if kind not in HEALTH_CHECKS:
    fail('Unknown health check: ' + kind)

  return HEALTH_CHECKS[kind](check_config)
22 |
# The registered termination signals, keyed by the `kind` field of a signal's
# config. The http/https entries share one implementation, bound to the
# appropriate protocol via functools.partial.
TERMINATION_SIGNALS = {
  'http': partial(HttpTerminationSignal, 'http'),
  'https': partial(HttpTerminationSignal, 'https'),
  'exec': ExecTerminationSignal,
}
28 |
def buildTerminationSignal(check_config):
  """ Builds a termination signal and returns it.

      check_config: The config object for the signal; its `kind` field selects
      which registered termination signal class is constructed.
  """
  kind = check_config.kind
  # Idiom fix: `kind not in` instead of `not kind in`.
  if kind not in TERMINATION_SIGNALS:
    fail('Unknown termination signal kind: ' + kind)

  return TERMINATION_SIGNALS[kind](check_config)
36 |
--------------------------------------------------------------------------------
/health/healthcheck.py:
--------------------------------------------------------------------------------
1 | import containerutil
2 |
3 | from util import getDockerClient
4 |
5 | import logging
6 |
7 |
class ContainerSignal(object):
  """ Base class for objects which interact with a running container (health
      checks and termination signals).
  """
  def __init__(self):
    # Logging.
    self.logger = logging.getLogger(__name__)

  def getContainerIPAddress(self, container):
    """ Returns the IP address on which the container is running. """
    docker_client = getDockerClient()
    return containerutil.getContainerIPAddress(docker_client, container)
17 |
18 |
class TerminationSignal(ContainerSignal):
  """ Base class for all termination signals. """
  def run(self, container, report):
    """ Sends the termination signal to the given container, returning True if it succeeds.
    """
    # Subclasses override this; the base signal always reports failure.
    return False
25 |
26 |
class HealthCheck(ContainerSignal):
  """ Base class for all health checks. """
  def run(self, container, report):
    """ Runs the given health check on the given container, returning True if it succeeds and
        false otherwise.
    """
    # Subclasses override this; the base check always reports failure.
    return False
34 |
--------------------------------------------------------------------------------
/health/networkcheck.py:
--------------------------------------------------------------------------------
1 | import socket
2 | import urllib2
3 |
4 | from health.healthcheck import HealthCheck
5 | from util import ReportLevels
6 | from proxy.portproxy import Proxy
7 |
class TcpCheck(HealthCheck):
  """ A health check which tries to connect to a port via TCP. """
  def __init__(self, config):
    super(TcpCheck, self).__init__()
    self.config = config

  def run(self, container, report):
    """ Attempts a TCP connection to the configured port on the container,
        returning True if the connection succeeds and False otherwise.
    """
    container_port = self.config.getExtraField('port')
    container_ip = self.getContainerIPAddress(container)

    report('Checking TCP port in container ' + container['Id'][0:12] + ': ' + str(container_port),
           level=ReportLevels.EXTRA)

    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
      sock.connect((container_ip, container_port))
    except Exception as e:
      # Consistency fix: log the failure (like the other checks) instead of a
      # bare `print`.
      self.logger.exception(e)
      return False
    finally:
      # Resource fix: always release the socket, even when the connect fails.
      sock.close()

    return True
29 |
30 |
class HttpRequestCheck(HealthCheck):
  """ A health check which tries to connect to an HTTP server on a known port. """
  def __init__(self, protocol, config):
    super(HttpRequestCheck, self).__init__()
    self.protocol = protocol
    self.config = config

  def run(self, container, report):
    """ Issues a GET request to the configured port (and optional path) on the
        container, returning True if a response is received and False otherwise.
    """
    container_port = self.config.getExtraField('port')
    container_ip = self.getContainerIPAddress(container)

    address = '%s://%s:%s' % (self.protocol, container_ip, container_port)
    if self.config.hasExtraField('path'):
      address += self.config.getExtraField('path')

    report('Checking HTTP address in container ' + container['Id'][0:12] + ': ' + address,
           level=ReportLevels.EXTRA)
    try:
      response = urllib2.urlopen(address, timeout=2)
      try:
        response.read()
      finally:
        # Resource fix: close the response explicitly to avoid leaking the
        # underlying socket.
        response.close()
    except Exception as exc:
      self.logger.exception(exc)
      return False

    return True
56 |
57 |
class IncomingConnectionCheck(HealthCheck):
  """ A health check which will succeed only if there are NO incoming connections to a container.
  """
  def __init__(self, config):
    super(IncomingConnectionCheck, self).__init__()
    self.config = config

  def run(self, container, report):
    """ Succeeds only when no active proxy connection has the container as its remote peer. """
    container_ip = self.getContainerIPAddress(container)

    # Only fully-established connections (both endpoints known) are considered.
    active = [conn for conn in Proxy.get_connections() if conn.laddr and conn.raddr]
    for conn in active:
      if conn.raddr[0] == container_ip:
        report('Container still has existing connections: %s' % container['Id'][0:12],
               level=ReportLevels.EXTRA)
        return False

    report('Container has no remaining connections: %s' % container['Id'][0:12],
           level=ReportLevels.EXTRA)
    return True
80 |
--------------------------------------------------------------------------------
/health/termination.py:
--------------------------------------------------------------------------------
1 | import urllib2
2 |
3 | from health.healthcheck import TerminationSignal
4 | from util import ReportLevels, getDockerClient
5 |
class HttpTerminationSignal(TerminationSignal):
  """ A termination signal which tries to POST to an HTTP server on a known port. """
  def __init__(self, protocol, config):
    # Bug fix: super() must name this class, not its parent, so the MRO is
    # walked from the correct starting point.
    super(HttpTerminationSignal, self).__init__()
    self.protocol = protocol
    self.config = config

  def run(self, container, report):
    """ POSTs an empty body to the configured port (and optional path) on the
        container, returning True if the server responds and False otherwise.
    """
    container_port = self.config.getExtraField('port')
    container_ip = self.getContainerIPAddress(container)

    address = '%s://%s:%s' % (self.protocol, container_ip, container_port)
    if self.config.hasExtraField('path'):
      address += self.config.getExtraField('path')

    # A non-None (empty) body makes urllib2 issue a POST rather than a GET.
    data = ''

    report('Posting to HTTP address in container ' + container['Id'][0:12] + ': ' + address,
           level=ReportLevels.EXTRA)
    try:
      req = urllib2.Request(address, data)
      response = urllib2.urlopen(req, timeout=2)
      try:
        response.read()
      finally:
        # Resource fix: close the response explicitly to avoid leaking the
        # underlying socket.
        response.close()
    except Exception as exc:
      self.logger.exception(exc)
      return False

    return True
34 |
class ExecTerminationSignal(TerminationSignal):
  """ A termination signal which tries to EXEC a command on a running container """
  def __init__(self, config):
    # Bug fix: super() must name this class, not its parent, so the MRO is
    # walked from the correct starting point.
    super(ExecTerminationSignal, self).__init__()
    self.config = config

  def run(self, container, report):
    """ Runs the configured exec command inside the container, returning True
        on success and False if the exec could not be created or started.
    """
    report('ExecTerminationSignal in container %s: %s' % (container['Id'][0:12], self.config.exec_command),
           level=ReportLevels.EXTRA)

    try:
      client = getDockerClient()
      response = client.exec_create(container, self.config.exec_command)
      client.exec_start(response['Id'])
    except Exception as exc:
      self.logger.exception(exc)
      return False

    return True
54 |
--------------------------------------------------------------------------------
/proxy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DevTable/gantryd/eb348113f0f73a0be45a45f7a5626ad2b5dd30ba/proxy/__init__.py
--------------------------------------------------------------------------------
/proxy/haproxy.tmpl:
--------------------------------------------------------------------------------
1 | global
2 | daemon
3 | maxconn 4096
4 | user haproxy
5 | group haproxy
6 | quiet
7 | nbproc 1
8 | pidfile /var/run/haproxy-private.pid
9 | log 127.0.0.1 local1 notice
10 | stats socket /var/run/haproxy.sock mode 0600 level admin
11 |
12 | defaults
13 | option abortonclose
14 | option forwardfor
15 | option httpclose
16 |
17 | log global
18 | {% for port, route in port_routes.items() %}
19 | frontend port_{{ port }}
20 | bind 0.0.0.0:{{ port }}
21 | timeout client 86400000
22 | {% if route.is_http -%}
23 | mode http
24 | {%- else -%}
25 | mode tcp
26 | {%- endif %}
27 |
28 | default_backend {{ route.id }}-backend
29 | {% endfor %}
30 |
31 | {% for route in port_routes.values() %}
32 | backend {{ route.id }}-backend
33 | {%- if route.is_http %}
34 | mode http
35 | {%- else %}
36 | mode tcp
37 | {%- endif %}
38 | balance roundrobin
39 | timeout server 86400000
40 | timeout connect 5000
41 | server {{ route.id }}-backend-0 {{ route.container_ip }}:{{ route.container_port }}
42 |
43 | {% endfor %}
44 |
--------------------------------------------------------------------------------
/proxy/portproxy.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | import logging
3 | import psutil
4 |
5 | from uuid import uuid4
6 | from jinja2 import Environment, FileSystemLoader
7 |
# Folder (relative to the working directory) containing the jinja templates.
TEMPLATE_FOLDER = 'proxy'

# Process name of the haproxy daemon, used to find it via psutil.
HAPROXY = 'haproxy'
# Template used to render the haproxy configuration.
HAPROXY_TEMPLATE = 'haproxy.tmpl'
# PID file written by haproxy (matches the `pidfile` setting in the template).
HAPROXY_PID_FILE = '/var/run/haproxy-private.pid'
# Path to which the rendered haproxy configuration is written.
HAPROXY_CONFIG_FILE = 'haproxy.conf'

# Connection status excluded when collecting active proxy connections.
CLOSE_WAIT = 'CLOSE_WAIT'


logger = logging.getLogger(__name__)
19 |
20 |
class Proxy(object):
  """ Wrapper around an haproxy process which routes external host ports to
      container ports. Routes are registered via add_route and take effect on
      commit, which rewrites the haproxy config and (re)starts the daemon.
  """
  def __init__(self):
    # The registered routes, by external port number.
    self._port_routes = {}

    jinja_options = {
      "loader": FileSystemLoader(TEMPLATE_FOLDER),
    }

    env = Environment(**jinja_options)
    self._template = env.get_template(HAPROXY_TEMPLATE)

  @staticmethod
  def get_connections():
    """ Returns the connection information for all proxy processes. """
    logger.debug('Getting proxy connections')
    connections = []
    for proc in psutil.process_iter():
      if proc.is_running() and proc.name() == HAPROXY:
        # NOTE(review): get_connections() is the psutil <3.0 API, matching the
        # version pin in requirements.txt.
        connections.extend([conn for conn in proc.get_connections() if conn.status != CLOSE_WAIT])

    return connections

  def clear_routes(self):
    """ Clears all routes found in the proxy. """
    self._port_routes = {}

  def add_route(self, route):
    """ Adds a route to the proxy (but does not commit the changes). """
    self._port_routes[route.host_port] = route

  def shutdown(self):
    """ Shuts down the proxy entirely. """
    subprocess.call('./shutdown-haproxy.sh', shell=True, close_fds=True)

  def commit(self):
    """ Commits the changes made to the proxy by rewriting the haproxy config
        and restarting the daemon.
    """
    logger.debug("Restarting haproxy with new rules.")

    # If the port routes are empty, add a dummy mapping to the proxy.
    # Idiom fix: test the dict's truthiness instead of len(values()) == 0.
    if not self._port_routes:
      self.add_route(Route(False, 65535, '127.0.0.2', 65534, is_fake=True))

    # Write out the config.
    rendered = self._template.render({'port_routes': self._port_routes})
    with open(HAPROXY_CONFIG_FILE, 'w') as config_file:
      config_file.write(rendered)

    # Restart haproxy
    subprocess.call('./restart-haproxy.sh', shell=True, close_fds=True)
71 |
72 |
class Route(object):
  """ A single proxied route: maps an external host port to a container ip:port. """
  def __init__(self, is_http, host_port, container_ip, container_port, is_fake=False):
    # Unique identifier used to name this route's haproxy frontend/backend pair.
    self.id = str(uuid4())

    # Whether this is the placeholder route used when no real routes exist.
    self.is_fake = is_fake

    # Protocol mode and endpoint information for the route.
    self.is_http = is_http
    self.host_port = host_port
    self.container_ip = container_ip
    self.container_port = container_port
82 |
--------------------------------------------------------------------------------
/requirements.system:
--------------------------------------------------------------------------------
1 | python-dev
2 | haproxy
3 | python-virtualenv
4 | libssl-dev
5 | libffi-dev
--------------------------------------------------------------------------------
/requirements.system.rhel:
--------------------------------------------------------------------------------
1 | python-devel
2 | haproxy
3 | python-virtualenv
4 | python-pip
5 | openssl-devel
6 | libffi-devel
7 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | termcolor
2 | docker-py
3 | psutil >=2.2.1,<3.0
4 | jinja2
5 | python-etcd
6 | peewee
--------------------------------------------------------------------------------
/restart-haproxy.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Restarts haproxy with the generated config. When a previous instance is
# already running (pid file present), -sf performs a graceful reload by
# signaling the old process to finish its connections and exit.
pidfile="/var/run/haproxy-private.pid"
if [ -f "$pidfile" ]
then
  haproxy -f haproxy.conf -sf $(cat "$pidfile")
else
  haproxy -f haproxy.conf
fi
--------------------------------------------------------------------------------
/runtime/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DevTable/gantryd/eb348113f0f73a0be45a45f7a5626ad2b5dd30ba/runtime/__init__.py
--------------------------------------------------------------------------------
/runtime/component.py:
--------------------------------------------------------------------------------
1 | from threading import Thread, Event
2 |
3 | from health.checks import buildHealthCheck
4 | from metadata import (getContainerStatus, setContainerStatus, removeContainerMetadata,
5 | getContainerComponent, setContainerComponent)
6 | from util import report, fail, getDockerClient, ReportLevels
7 |
8 | import time
9 | import logging
10 |
11 | class Component(object):
12 | """ A component that can be/is running. Tracks all the runtime information
13 | for a component.
14 | """
15 | def __init__(self, manager, config):
16 | # Logging.
17 | self.logger = logging.getLogger(__name__)
18 |
19 | # The overall manager for components, which tracks global state.
20 | self.manager = manager
21 |
22 | # The underlying config for the component.
23 | self.config = config
24 |
25 | def applyConfigOverrides(self, config_overrides):
26 | """ Applies the list of configuration overrides to this component's config.
27 |
28 | Format: ['Name=Value', 'Name.SubName=Value']
29 | """
30 | for override in config_overrides:
31 | self.config.applyOverride(override)
32 |
33 | def getName(self):
34 | """ Returns the name of the component. """
35 | return self.config.name
36 |
37 | def lookupExportedComponentLink(self, link_name):
38 | """ Looks up the exported component link with the given name and returns it or None if none. """
39 | pass
40 |
41 | def isRunning(self):
42 | """ Returns whether this component has at least one running container. Note that
43 | this will return True for ALL possible containers of the component, including
44 | deprecated ones.
45 | """
46 | self.logger.debug('Checking if component %s is running', self.getName())
47 | client = getDockerClient()
48 | return len(self.getAllContainers(client)) > 0
49 |
50 | def getPrimaryContainer(self):
51 | """ Returns the container for this component that is not marked as draining or None if
52 | none.
53 | """
54 | client = getDockerClient()
55 | for container in self.getAllContainers(client):
56 | if getContainerStatus(container) != 'draining':
57 | return container
58 |
59 | return None
60 |
61 | def getImageId(self):
62 | """ Returns the docker ID of the image used for this component. Note that this
63 | will *not* return the *named* image, but rather the full UUID-like ID.
64 | """
65 | client = getDockerClient()
66 | named_image = self.config.getFullImage()
67 | self.logger.debug('Finding image ID for component %s with named image %s', self.getName(), named_image)
68 | result = client.inspect_image(named_image)
69 | return result['Id']
70 |
71 | def pullRepo(self):
72 | """ Attempts to pull the repo for this component. On failure, returns False. """
73 | try:
74 | self.logger.debug('Attempting to pull repo for component %s: %s:%s', self.getName(), self.config.repo, self.config.tag)
75 | client = getDockerClient()
76 | client.pull(self.config.repo, tag=self.config.tag)
77 | return True
78 | except Exception as e:
79 | self.logger.exception(e)
80 | return False
81 |
82 | def update(self):
83 | """ Updates a running instance of the component. Returns True on success and False
84 | otherwise.
85 | """
86 | self.logger.debug('Updating component %s', self.getName())
87 | client = getDockerClient()
88 |
89 | # Get the list of currently running container(s).
90 | existing_containers = self.getAllContainers(client)
91 | existing_primary = self.getPrimaryContainer()
92 |
93 | # Start the new instance.
94 | container = self.start()
95 | if not container:
96 | return False
97 |
98 | # Mark all the existing containers as draining.
99 | for existing in existing_containers:
100 | setContainerStatus(existing, 'draining')
101 |
102 | # Update the port proxy to redirect the external ports to the new
103 | # container.
104 | report('Redirecting traffic to new container', component=self)
105 | self.manager.adjustForUpdatingComponent(self, container)
106 |
107 | # Signal the existing primary container to terminate
108 | if existing_primary is not None:
109 | self.manager.terminateContainer(existing_primary, self)
110 |
111 | return True
112 |
113 | def stop(self, kill=False):
114 | """ Stops all containers for this component. """
115 | if not self.isRunning():
116 | return
117 |
118 | self.logger.debug('Stopping component %s', self.getName())
119 | client = getDockerClient()
120 |
121 | # Mark all the containers as draining.
122 | report('Draining all containers...', component=self)
123 | for container in self.getAllContainers(client):
124 | setContainerStatus(container, 'draining')
125 | self.manager.terminateContainer(container, self)
126 |
127 | # Kill any associated containers if asked.
128 | if kill:
129 | for container in self.getAllContainers(client):
130 | report('Killing container ' + container['Id'][:12], component=self)
131 | client.kill(container)
132 | removeContainerMetadata(container)
133 |
134 | # Clear the proxy and rebuild its routes for the running components.
135 | self.manager.adjustForStoppingComponent(self)
136 |
137 | def getContainerInformation(self):
138 | """ Returns the container status information for all containers. """
139 | client = getDockerClient()
140 | information = []
141 |
142 | for container in self.getAllContainers(client):
143 | information.append((container, getContainerStatus(container)))
144 |
145 | return information
146 |
147 | def isHealthy(self):
148 | """ Runs the health checks on this component's container, ensuring that it is healthy.
149 | Returns True if healthy and False otherwise.
150 | """
151 | self.logger.debug('Checking if component %s is healthy...', self.getName())
152 | container = self.getPrimaryContainer()
153 | if not container:
154 | self.logger.debug('No container running for component %s', self.getName())
155 | return False
156 |
157 | checks = []
158 | for check in self.config.health_checks:
159 | checks.append((check, buildHealthCheck(check)))
160 |
161 | for (config, check) in checks:
162 | report('Running health check: ' + config.getTitle(), component=self)
163 | result = check.run(container, report)
164 | if not result:
165 | report('Health check failed', component=self)
166 | return False
167 |
168 | self.logger.debug('Component %s is healthy', self.getName())
169 | return True
170 |
171 | ######################################################################
172 |
173 | def readyCheck(self, container, timeout):
174 | """ Method which performs ready health check(s) on a container, returning whether
175 | they succeeded or not.
176 |
177 | container: The container running the component that will be checked.
178 | timeout: The amount of time after which the checks have timed out.
179 | """
180 | self.logger.debug('Checking if component %s is ready...', self.getName())
181 | checks = []
182 | for check in self.config.ready_checks:
183 | checks.append((check, buildHealthCheck(check)))
184 |
185 | start = time.time()
186 | while True:
187 | now = time.time()
188 | if now - start > timeout:
189 | # Timed out completely.
190 | self.logger.debug('Component %s ready checks have timed out')
191 | return False
192 |
193 | # Try each check. If any fail, we'll sleep and try again.
194 | check_failed = None
195 | for (config, check) in checks:
196 | report('Running health check: ' + config.getTitle(), component=self)
197 | result = check.run(container, report)
198 | if not result:
199 | report('Health check failed', component=self)
200 | check_failed = config
201 | break
202 |
203 | if check_failed:
204 | report('Sleeping ' + str(check_failed.timeout) + ' second(s)...', component=self)
205 | time.sleep(check_failed.timeout)
206 | else:
207 | break
208 |
209 | return True
210 |
211 | def start(self):
212 | """ Starts a new instance of the component. Note that this does *not* update the proxy. """
213 | client = getDockerClient()
214 | self.logger.debug('Starting container for component %s', self.getName())
215 |
216 | # Ensure that we have the image. If not, we try to download it.
217 | self.ensureImage(client)
218 |
219 | # Start the instance with the proper image ID.
220 | container = self.createContainer(client)
221 | report('Starting container ' + container['Id'][:12], component=self)
222 |
223 | if self.config.privileged:
224 | report('Container will be run in privileged mode', component=self)
225 |
226 | client.start(container, binds=self.config.getBindings(container['Id']),
227 | volumes_from=self.config.volumes_from,
228 | privileged=self.config.privileged)
229 |
230 | # Health check until the instance is ready.
231 | report('Waiting for health checks...', component=self)
232 |
233 | # Start a health check thread to determine when the component is ready.
234 | timeout = self.config.getReadyCheckTimeout()
235 | readycheck_thread = Thread(target=self.readyCheck, args=[container, timeout])
236 | readycheck_thread.daemon = True
237 | readycheck_thread.start()
238 |
239 | # Wait for the health thread to finish.
240 | readycheck_thread.join(self.config.getReadyCheckTimeout())
241 |
242 | # If the thread is still alived, then our join timed out.
243 | if readycheck_thread.isAlive():
244 | report('Timed out waiting for health checks. Stopping container...', component=self)
245 | client.stop(container)
246 | report('Container stopped', component=self)
247 | return None
248 |
249 | # Otherwise, the container is ready. Set it as starting.
250 | setContainerComponent(container, self.getName())
251 | setContainerStatus(container, 'starting')
252 | return container
253 |
254 | def getAllContainers(self, client):
255 | """ Returns all the matching containers for this component. """
256 | containers = []
257 | for container in client.containers():
258 | containerName = getContainerComponent(container)
259 | if ((not containerName and container['Image'] == self.config.getFullImage()) or
260 | containerName == self.getName()):
261 | containers.append(container)
262 |
263 | return containers
264 |
265 | def calculateEnvForComponent(self):
266 | """ Calculates the dict of environment variables for this component. """
267 | links = self.config.getComponentLinks()
268 | environment = self.config.getEnvironmentVariables()
269 |
270 | for link_alias, link_name in links.items():
271 | component_link_info = self.manager.lookupComponentLink(link_name)
272 | if not component_link_info:
273 | fail('Component link %s not defined on any component' % link_name, component=self)
274 | return None
275 |
276 | if not component_link_info.running:
277 | info = (link_name, component_link_info.component.getName())
278 | fail('Component link "%s" cannot be setup: Component "%s" is not running' % info,
279 | component=self)
280 | return None
281 |
282 | # Component link env var format:
283 | # THEALIAS_CLINK=tcp://{hostip}:{hostport}
284 | # THEALIAS_CLINK_6379_TCP=tcp://{hostip}:{hostport}
285 | # THEALIAS_CLINK_6379_TCP_PROTO=tcp
286 | # THEALIAS_CLINK_6379_TCP_ADDR={hostip}
287 | # THEALIAS_CLINK_6379_TCP_PORT={hostport}
288 |
289 | prefix = link_alias.upper() + '_CLINK'
290 | prefix_with_port = prefix + '_' + str(component_link_info.container_port)
291 | full_prefix = prefix_with_port + ('_HTTP' if component_link_info.kind == 'http' else '_TCP')
292 | full_uri = '%s://%s:%s' % (component_link_info.kind, component_link_info.address,
293 | component_link_info.exposed_port)
294 |
295 | environment[prefix] = full_uri
296 | environment[full_prefix] = full_uri
297 | environment[full_prefix + '_PROTO'] = component_link_info.kind
298 | environment[full_prefix + '_ADDR'] = component_link_info.address
299 | environment[full_prefix + '_PORT'] = component_link_info.exposed_port
300 |
301 | return environment
302 |
303 | def createContainer(self, client):
304 | """ Creates a docker container for this component and returns it. """
305 | command = self.getCommand()
306 | if not command:
307 | fail('No command defined in either gantry config or docker image for component ' +
308 | self.getName(), component=self)
309 |
310 | self.logger.debug('Starting container for component %s with command %s', self.getName(),
311 | command)
312 |
313 | container = client.create_container(self.config.getFullImage(), command,
314 | user=self.config.getUser(),
315 | volumes=self.config.getVolumes(),
316 | ports=[str(p) for p in self.config.getContainerPorts()],
317 | environment=self.calculateEnvForComponent())
318 |
319 | return container
320 |
321 | def getCommand(self):
322 | """ Returns the command to run or None if none found. """
323 | config_command = self.config.getCommand()
324 | if config_command:
325 | return config_command
326 |
327 | client = getDockerClient()
328 | named_image = self.config.getFullImage()
329 | result = client.inspect_image(named_image)
330 | container_cfg = result['Config']
331 | if not 'Cmd' in container_cfg:
332 | return None
333 |
334 | return ' '.join(container_cfg['Cmd'])
335 |
336 | def ensureImage(self, client):
337 | """ Ensures that the image for this component is present locally. If not,
338 | we attempt to pull the image.
339 | """
340 | images = client.images(name=self.config.repo)
341 | if images:
342 | for image in images:
343 | if 'RepoTags' in image.keys() and self.config.getFullImage() in image['RepoTags']:
344 | return
345 |
346 | try:
347 | client.pull(self.config.repo, tag=self.config.tag)
348 | except Exception as e:
349 | fail('Could not pull repo ' + self.config.repo, component=self, exception=str(e))
350 |
--------------------------------------------------------------------------------
/runtime/manager.py:
--------------------------------------------------------------------------------
1 | from component import Component
2 | from metadata import getContainerStatus, setContainerStatus, removeContainerMetadata
3 | from proxy.portproxy import Proxy, Route
4 | from util import report, fail, getDockerClient, ReportLevels
5 | from health.checks import buildTerminationSignal, buildHealthCheck
6 |
7 | from collections import defaultdict
8 | from Queue import Queue
9 | from multiprocessing.pool import ThreadPool
10 |
11 | import docker
12 | import psutil
13 | import threading
14 | import time
15 | import logging
16 | import containerutil
17 |
class ComponentLinkInformation(object):
  """ Helper class which contains all runtime information about a component link. """
  def __init__(self, manager, component, link_config):
    # The component that exports the link.
    self.component = component

    # The configuration for the component link.
    self.link_config = link_config

    # The kind of the link: anything other than 'http' is treated as raw 'tcp'.
    self.kind = 'http' if link_config.kind.lower() == 'http' else 'tcp'

    # The port of the link inside the running container.
    self.container_port = link_config.port

    # The address of the link under the proxy (None if the link is not running).
    self.address = None

    # The port of the link under the proxy (None if the link is not running).
    self.exposed_port = None

    # Whether the link is currently running.
    self.running = False

    # Lookup the runtime information for the link. The link is exposed via the
    # proxy on the host, so its address is the host's IP (the container's
    # gateway), not the container's own IP. (The original code also computed
    # the container's own IP here but never used it; that dead lookup has been
    # removed.)
    client = getDockerClient()
    container = component.getPrimaryContainer()
    if container:
      self.address = client.inspect_container(container)['NetworkSettings']['Gateway']
      self.exposed_port = link_config.getHostPort()
      self.running = True
51 |
52 |
class RuntimeManager(object):
  """ Manager class which handles tracking of all the components and other runtime
      information, including the HAProxy routes and the background termination
      of drained containers.
  """
  def __init__(self, config):
    # Logging.
    self.logger = logging.getLogger(__name__)

    # The overall configuration.
    self.config = config

    # The proxy being used to talk to HAProxy.
    self.proxy = Proxy()

    # The components, by name.
    self.components = {}

    # Build the components map.
    for component_config in config.components:
      self.components[component_config.name] = Component(self, component_config)

    # Create the lock for the watcher thread and the notification event.
    self.watcher_lock = threading.Lock()
    self.watcher_event = threading.Event()

    # The set of containers which should be terminated by the terminating workers.
    # NOTE(review): appears unused in this class; terminateContainer goes
    # through the thread pool instead — confirm before removing.
    self.containers_to_terminate = Queue()

    # Start the thread pool used to watch and stop containers that are no longer needed.
    self.pool = ThreadPool()

    # Place to collect the async results of the termination monitors; drained
    # (and re-raised on error) in join().
    self.monitor_futures = Queue()

  def getComponent(self, name):
    """ Returns the component with the given name defined or None if none. """
    if not name in self.components:
      return None

    return self.components[name]

  def lookupComponentLink(self, link_name):
    """ Looks up the component link with the given name defined or None if none. """
    for component_name, component in self.components.items():
      defined_links = component.config.getDefinedComponentLinks()
      if link_name in defined_links:
        return ComponentLinkInformation(self, component, defined_links[link_name])

    return None

  def adjustForUpdatingComponent(self, component, started_container):
    """ Adjusts the runtime for a component which has been started in the given
        container.

        NOTE(review): started_container is currently unused; updateProxy
        rediscovers all running containers itself.
    """
    self.logger.debug('Adjusting runtime for updating component: %s', component.getName())
    self.updateProxy()

  def adjustForStoppingComponent(self, component):
    """ Adjusts the runtime for a component which has been stopped.
    """
    self.logger.debug('Adjusting runtime for stopped component: %s', component.getName())
    self.updateProxy()


  def watchTermination(self, container, component):
    """ Worker task which gracefully shuts down the given container: sends the
        component's configured termination signal(s), waits until every
        termination check passes, then stops the container and removes its
        metadata. Runs on the thread pool (see terminateContainer).
    """
    report('Monitor check started', level=ReportLevels.BACKGROUND)

    client = getDockerClient()

    # Send the termination signal(s) to the container
    signals = []

    for signal in component.config.termination_signals:
      signals.append((signal, buildTerminationSignal(signal)))

    report('Sending %s termination signals' % len(signals), component=component)

    for (config, signal) in signals:
      report('Sending termination signal: ' + config.getTitle(), component=component)
      result = signal.run(container, report)
      if not result:
        report('Termination signal failed', component=component)

    # Now wait until all of the termination conditions are met
    checks = []
    for check in component.config.termination_checks:
      checks.append((check, buildHealthCheck(check)))

    report('Waiting for %s termination checks' % len(checks), component=component)

    for (config, check) in checks:
      check_passed = False

      # Retry each check until it passes, sleeping its configured timeout
      # between attempts.
      while not check_passed:
        report('Running termination check: ' + config.getTitle(), component=component)
        result = check.run(container, report)
        if not result:
          report('Termination check failed', component=component)

          report('Sleeping ' + str(config.timeout) + ' second(s)...', component=component)
          time.sleep(config.timeout)
        else:
          check_passed = True

    report('Monitor check finished', level=ReportLevels.BACKGROUND)

    setContainerStatus(container, 'shutting-down')
    report('Shutting down container: ' + container['Id'][0:12], level=ReportLevels.BACKGROUND)
    client.stop(container)
    removeContainerMetadata(container)


  def terminateContainer(self, container, component):
    """ Schedules the given container for graceful termination on the worker
        pool; the async result is queued so join() can surface any exception.
    """
    report('Terminating container: %s' % container['Id'][:12], component=component)
    self.monitor_futures.put(self.pool.apply_async(self.watchTermination, (container, component)))


  def updateProxy(self):
    """ Updates the proxy used for port mapping to conform to the current running container
        list.
    """
    client = getDockerClient()

    # Clear all routes in the proxy.
    # TODO: When this is in daemon mode, don't need do this. We could selectively
    # edit it instead.
    self.proxy.clear_routes()

    # Add routes for the non-draining containers and collect the draining containers to
    # watch.
    report('Finding running containers...', level=ReportLevels.EXTRA)
    draining_containers = []
    starting_containers = []

    for component in self.components.values():
      for container in component.getAllContainers(client):
        if getContainerStatus(container) != 'draining':
          container_ip = containerutil.getContainerIPAddress(client, container)
          starting_containers.append(container)

          # Add the normal exposed ports.
          for mapping in component.config.ports:
            route = Route(mapping.kind == 'http', mapping.external, container_ip,
                          mapping.container)
            self.proxy.add_route(route)

          # Add the container link ports.
          for link in component.config.defined_component_links:
            route = Route(link.kind == 'http', link.getHostPort(), container_ip, link.port)
            self.proxy.add_route(route)
        else:
          draining_containers.append(container)

    # Commit the changes to the proxy. If nothing at all is running, shut the
    # proxy down instead.
    if draining_containers or starting_containers:
      report('Updating proxy...', level=ReportLevels.EXTRA)
      self.proxy.commit()
    else:
      report('Shutting down proxy...', level=ReportLevels.EXTRA)
      self.proxy.shutdown()

    # Mark the starting containers as running.
    for container in starting_containers:
      setContainerStatus(container, 'running')

  def join(self):
    """ Closes the worker pool, re-raises any exception thrown by a termination
        monitor, and blocks until all workers have finished.
    """
    self.pool.close()

    while not self.monitor_futures.empty():
      # If any of the futures threw an exception we'll get it now
      self.monitor_futures.get().get()

    self.pool.join()
228 |
--------------------------------------------------------------------------------
/runtime/metadata.py:
--------------------------------------------------------------------------------
1 | import docker
2 | import json
3 |
4 | from peewee import (Model, SqliteDatabase, ForeignKeyField, CharField, OperationalError,
5 | sort_models_topologically, DoesNotExist)
6 | from functools import wraps
7 |
# File used by peewee/sqlite to persist gantry's container/component metadata.
GANTRY_METADATA_FILE = '.gantry_metadata'
# NOTE(review): appears unused in this module; possibly a leftover.
cached_metadata = None


# The sqlite database connection shared by all models below.
db = SqliteDatabase(GANTRY_METADATA_FILE)
13 |
14 |
class BaseModel(Model):
  """ Base peewee model which binds all metadata models to the shared sqlite db. """
  class Meta:
    database = db
18 |
19 |
class Component(BaseModel):
  """ A gantry component, identified by its name. """
  name = CharField(index=True)
22 |
23 |
class ComponentField(BaseModel):
  """ A key/value metadata entry attached to a Component. """
  component = ForeignKeyField(Component)
  key = CharField(index=True)
  value = CharField()
28 |
29 |
class Container(BaseModel):
  """ A docker container known to gantry, optionally owned by a Component. """
  docker_id = CharField(index=True)
  component = ForeignKeyField(Component, null=True)
33 |
34 |
class ContainerField(BaseModel):
  """ A key/value metadata entry attached to a Container. """
  container = ForeignKeyField(Container)
  key = CharField(index=True)
  value = CharField()

  class Meta:
    database = db
    indexes = (
      # Each metadata key must be unique within a container.
      (('container', 'key'), True),
    )
46 |
47 |
# All metadata models; topologically sorted at init time for table creation.
all_models = [Component, ComponentField, Container, ContainerField]
49 |
50 |
def _initialze_db():
  """ Lazily creates the table for any model whose table does not exist yet.

      NOTE: the name is misspelled ('initialze') but is kept as-is for
      compatibility with existing callers.
  """
  for model in sort_models_topologically(all_models):
    try:
      # Probe with a select; a missing table raises OperationalError.
      model.select().get()
    except OperationalError:
      model.create_table()
    except DoesNotExist:
      # Table exists but is empty - nothing to do.
      pass
59 |
60 |
def db_access(to_wrap):
  """ Decorator which ensures the metadata tables exist before the wrapped
      function runs and that the database connection is closed afterwards.
  """
  @wraps(to_wrap)
  def guarded(*args, **kwargs):
    _initialze_db()
    try:
      return to_wrap(*args, **kwargs)
    finally:
      db.is_closed() or db.close()

  return guarded
73 |
74 |
def getContainerStatus(container):
  """ Returns the status code recorded for the given container, or 'unknown'
      when none has been set.
  """
  return _getContainerField(container, 'status', 'unknown')
78 |
79 |
def setContainerStatus(container, status):
  """ Records the given status code against the container. """
  _setContainerField(container, 'status', status)
83 |
84 |
@db_access
def getContainerComponent(container):
  """ Returns the name of the component that owns the given container, or a
      falsy value if the container is unowned.
  """
  record = _upsertContainerRecord(container)
  if record.component:
    return record.component.name
  return record.component
90 |
91 |
@db_access
def setContainerComponent(container, component_name):
  """ Records the named component as the owner of the given container,
      creating records for both as needed.
  """
  owner = _upsertComponentRecord(component_name)
  record = _upsertContainerRecord(container)
  record.component = owner
  record.save()
99 |
100 |
101 | def _getContainerId(container_or_id):
102 | return container_or_id['Id'] if isinstance(container_or_id, dict) else container_or_id
103 |
104 |
@db_access
def removeContainerMetadata(container):
  """ Deletes the container's record along with all of its metadata fields. """
  record = _upsertContainerRecord(container)
  record.delete_instance(recursive=True)
109 |
110 |
def _getContainerFieldRecord(container, field):
  """ Returns the ContainerField row for the given container and key, or None
      if no such field has been recorded.

      container: a docker container id *string* (callers normalize dicts via
                 _getContainerId before calling this).
      field: the metadata key to look up.
  """
  try:
    return (ContainerField
            .select()
            .join(Container)
            .where(Container.docker_id == container, ContainerField.key == field)
            .get())
  except ContainerField.DoesNotExist:
    return None
120 |
121 |
def _upsertContainerRecord(container):
  """ Fetches the Container row for the given container (dict or id string),
      creating it if it does not already exist.
  """
  docker_id = _getContainerId(container)
  try:
    return Container.get(Container.docker_id == docker_id)
  except Container.DoesNotExist:
    return Container.create(docker_id=docker_id)
131 |
132 |
@db_access
def _getContainerField(container, field, default):
  """ Returns the value of the named metadata field for the container, or the
      supplied default when it has never been set.
  """
  record = _getContainerFieldRecord(_getContainerId(container), field)
  if record is None:
    return default
  return record.value
139 |
140 |
@db_access
def _setContainerField(container, field, value):
  """ Sets the named metadata field for the container, updating the existing
      row if present or creating a fresh one otherwise.
  """
  container_id = _getContainerId(container)
  existing = _getContainerFieldRecord(container_id, field)
  if existing is None:
    record = _upsertContainerRecord(container_id)
    ContainerField.create(container=record, key=field, value=value)
  else:
    existing.value = value
    existing.save()
152 |
153 |
def _upsertComponentRecord(component):
  """ Fetches the Component row with the given name, creating it if missing. """
  try:
    return Component.get(Component.name == component)
  except Component.DoesNotExist:
    return Component.create(name=component)
162 |
163 |
def _getComponentFieldRecord(component_name, field):
  """ Returns the ComponentField row for the named component and key, or None
      if no such field has been recorded.
  """
  try:
    return (ComponentField
            .select()
            .join(Component)
            .where(Component.name == component_name, ComponentField.key == field)
            .get())
  except ComponentField.DoesNotExist:
    return None
173 |
174 |
@db_access
def getComponentField(component_name, field, default):
  """ Returns the metadata value stored for the named component under the given
      key, falling back to the supplied default when unset.
  """
  record = _getComponentFieldRecord(component_name, field)
  if record is None:
    return default
  return record.value
180 |
181 |
@db_access
def setComponentField(component_name, field, value):
  """ Stores a metadata value for the named component under the given key,
      overwriting any previously stored value.
  """
  existing = _getComponentFieldRecord(component_name, field)
  if existing is None:
    owner = _upsertComponentRecord(component_name)
    ComponentField.create(component=owner, key=field, value=value)
  else:
    existing.value = value
    existing.save()
192 |
--------------------------------------------------------------------------------
/shutdown-haproxy.sh:
--------------------------------------------------------------------------------
1 | running="/var/run/haproxy-private.pid"
2 | if [ -f "$running" ]
3 | then
4 | kill $(cat