├── .gitignore ├── LICENSE ├── README.md ├── actions.py ├── config ├── GantryConfig.py ├── __init__.py └── object.py ├── containerutil.py ├── gantry.py ├── gantryd.py ├── gantryd ├── __init__.py ├── client.py ├── componentstate.py ├── componentwatcher.py ├── etcdpaths.py ├── etcdstate.py └── machinestate.py ├── health ├── __init__.py ├── checks.py ├── healthcheck.py ├── networkcheck.py └── termination.py ├── proxy ├── __init__.py ├── haproxy.tmpl └── portproxy.py ├── requirements.system ├── requirements.system.rhel ├── requirements.txt ├── restart-haproxy.sh ├── runtime ├── __init__.py ├── component.py ├── manager.py └── metadata.py ├── shutdown-haproxy.sh └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | .devtable 2 | gantry.xml 3 | *.pyc 4 | venv 5 | haproxy.conf 6 | .gantry_metadata 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, and 10 | distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by the copyright 13 | owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all other entities 16 | that control, are controlled by, or are under common control with that entity. 17 | For the purposes of this definition, "control" means (i) the power, direct or 18 | indirect, to cause the direction or management of such entity, whether by 19 | contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the 20 | outstanding shares, or (iii) beneficial ownership of such entity. 
21 | 22 | "You" (or "Your") shall mean an individual or Legal Entity exercising 23 | permissions granted by this License. 24 | 25 | "Source" form shall mean the preferred form for making modifications, including 26 | but not limited to software source code, documentation source, and configuration 27 | files. 28 | 29 | "Object" form shall mean any form resulting from mechanical transformation or 30 | translation of a Source form, including but not limited to compiled object code, 31 | generated documentation, and conversions to other media types. 32 | 33 | "Work" shall mean the work of authorship, whether in Source or Object form, made 34 | available under the License, as indicated by a copyright notice that is included 35 | in or attached to the work (an example is provided in the Appendix below). 36 | 37 | "Derivative Works" shall mean any work, whether in Source or Object form, that 38 | is based on (or derived from) the Work and for which the editorial revisions, 39 | annotations, elaborations, or other modifications represent, as a whole, an 40 | original work of authorship. For the purposes of this License, Derivative Works 41 | shall not include works that remain separable from, or merely link (or bind by 42 | name) to the interfaces of, the Work and Derivative Works thereof. 43 | 44 | "Contribution" shall mean any work of authorship, including the original version 45 | of the Work and any modifications or additions to that Work or Derivative Works 46 | thereof, that is intentionally submitted to Licensor for inclusion in the Work 47 | by the copyright owner or by an individual or Legal Entity authorized to submit 48 | on behalf of the copyright owner. 
For the purposes of this definition, 49 | "submitted" means any form of electronic, verbal, or written communication sent 50 | to the Licensor or its representatives, including but not limited to 51 | communication on electronic mailing lists, source code control systems, and 52 | issue tracking systems that are managed by, or on behalf of, the Licensor for 53 | the purpose of discussing and improving the Work, but excluding communication 54 | that is conspicuously marked or otherwise designated in writing by the copyright 55 | owner as "Not a Contribution." 56 | 57 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf 58 | of whom a Contribution has been received by Licensor and subsequently 59 | incorporated within the Work. 60 | 61 | 2. Grant of Copyright License. 62 | 63 | Subject to the terms and conditions of this License, each Contributor hereby 64 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 65 | irrevocable copyright license to reproduce, prepare Derivative Works of, 66 | publicly display, publicly perform, sublicense, and distribute the Work and such 67 | Derivative Works in Source or Object form. 68 | 69 | 3. Grant of Patent License. 70 | 71 | Subject to the terms and conditions of this License, each Contributor hereby 72 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 73 | irrevocable (except as stated in this section) patent license to make, have 74 | made, use, offer to sell, sell, import, and otherwise transfer the Work, where 75 | such license applies only to those patent claims licensable by such Contributor 76 | that are necessarily infringed by their Contribution(s) alone or by combination 77 | of their Contribution(s) with the Work to which such Contribution(s) was 78 | submitted. 
If You institute patent litigation against any entity (including a 79 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a 80 | Contribution incorporated within the Work constitutes direct or contributory 81 | patent infringement, then any patent licenses granted to You under this License 82 | for that Work shall terminate as of the date such litigation is filed. 83 | 84 | 4. Redistribution. 85 | 86 | You may reproduce and distribute copies of the Work or Derivative Works thereof 87 | in any medium, with or without modifications, and in Source or Object form, 88 | provided that You meet the following conditions: 89 | 90 | You must give any other recipients of the Work or Derivative Works a copy of 91 | this License; and 92 | You must cause any modified files to carry prominent notices stating that You 93 | changed the files; and 94 | You must retain, in the Source form of any Derivative Works that You distribute, 95 | all copyright, patent, trademark, and attribution notices from the Source form 96 | of the Work, excluding those notices that do not pertain to any part of the 97 | Derivative Works; and 98 | If the Work includes a "NOTICE" text file as part of its distribution, then any 99 | Derivative Works that You distribute must include a readable copy of the 100 | attribution notices contained within such NOTICE file, excluding those notices 101 | that do not pertain to any part of the Derivative Works, in at least one of the 102 | following places: within a NOTICE text file distributed as part of the 103 | Derivative Works; within the Source form or documentation, if provided along 104 | with the Derivative Works; or, within a display generated by the Derivative 105 | Works, if and wherever such third-party notices normally appear. The contents of 106 | the NOTICE file are for informational purposes only and do not modify the 107 | License. 
You may add Your own attribution notices within Derivative Works that 108 | You distribute, alongside or as an addendum to the NOTICE text from the Work, 109 | provided that such additional attribution notices cannot be construed as 110 | modifying the License. 111 | You may add Your own copyright statement to Your modifications and may provide 112 | additional or different license terms and conditions for use, reproduction, or 113 | distribution of Your modifications, or for any such Derivative Works as a whole, 114 | provided Your use, reproduction, and distribution of the Work otherwise complies 115 | with the conditions stated in this License. 116 | 117 | 5. Submission of Contributions. 118 | 119 | Unless You explicitly state otherwise, any Contribution intentionally submitted 120 | for inclusion in the Work by You to the Licensor shall be under the terms and 121 | conditions of this License, without any additional terms or conditions. 122 | Notwithstanding the above, nothing herein shall supersede or modify the terms of 123 | any separate license agreement you may have executed with Licensor regarding 124 | such Contributions. 125 | 126 | 6. Trademarks. 127 | 128 | This License does not grant permission to use the trade names, trademarks, 129 | service marks, or product names of the Licensor, except as required for 130 | reasonable and customary use in describing the origin of the Work and 131 | reproducing the content of the NOTICE file. 132 | 133 | 7. Disclaimer of Warranty. 134 | 135 | Unless required by applicable law or agreed to in writing, Licensor provides the 136 | Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, 137 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, 138 | including, without limitation, any warranties or conditions of TITLE, 139 | NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. 
You are 140 | solely responsible for determining the appropriateness of using or 141 | redistributing the Work and assume any risks associated with Your exercise of 142 | permissions under this License. 143 | 144 | 8. Limitation of Liability. 145 | 146 | In no event and under no legal theory, whether in tort (including negligence), 147 | contract, or otherwise, unless required by applicable law (such as deliberate 148 | and grossly negligent acts) or agreed to in writing, shall any Contributor be 149 | liable to You for damages, including any direct, indirect, special, incidental, 150 | or consequential damages of any character arising as a result of this License or 151 | out of the use or inability to use the Work (including but not limited to 152 | damages for loss of goodwill, work stoppage, computer failure or malfunction, or 153 | any and all other commercial damages or losses), even if such Contributor has 154 | been advised of the possibility of such damages. 155 | 156 | 9. Accepting Warranty or Additional Liability. 157 | 158 | While redistributing the Work or Derivative Works thereof, You may choose to 159 | offer, and charge a fee for, acceptance of support, warranty, indemnity, or 160 | other liability obligations and/or rights consistent with this License. However, 161 | in accepting such obligations, You may act only on Your own behalf and on Your 162 | sole responsibility, not on behalf of any other Contributor, and only if You 163 | agree to indemnify, defend, and hold each Contributor harmless for any liability 164 | incurred by, or claims asserted against, such Contributor by reason of your 165 | accepting any such warranty or additional liability. 166 | 167 | END OF TERMS AND CONDITIONS 168 | 169 | APPENDIX: How to apply the Apache License to your work 170 | 171 | To apply the Apache License to your work, attach the following boilerplate 172 | notice, with the fields enclosed by brackets "[]" replaced with your own 173 | identifying information. 
(Don't include the brackets!) The text should be 174 | enclosed in the appropriate comment syntax for the file format. We also 175 | recommend that a file or class name and description of purpose be included on 176 | the same "printed page" as the copyright notice for easier identification within 177 | third-party archives. 178 | 179 | Copyright [yyyy] [name of copyright owner] 180 | 181 | Licensed under the Apache License, Version 2.0 (the "License"); 182 | you may not use this file except in compliance with the License. 183 | You may obtain a copy of the License at 184 | 185 | http://www.apache.org/licenses/LICENSE-2.0 186 | 187 | Unless required by applicable law or agreed to in writing, software 188 | distributed under the License is distributed on an "AS IS" BASIS, 189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 190 | See the License for the specific language governing permissions and 191 | limitations under the License. 192 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # gantryd 2 | 3 | A framework built on top of [Docker](http://docker.io) that allows for easy deployment and management of project components, with a focus on: 4 | 5 | * Easy management of components of a project across multiple machines 6 | * Single command updating of components with **automatic draining** and **progressive rollout** 7 | * Ability to manage components locally, when necessary (see **gantry** below) 8 | 9 | ## Overview 10 | 11 | **gantryd** is a distributed, etcd-based system for running, updating, monitoring and managing various Docker images (known as "components") across 12 | multiple machines. 
13 | 14 | ![gantryd overview](https://docs.google.com/drawings/d/1S0P8XE9H6lxUZNyQkfAXW9uYfKnXxUrzwA23oihwXlQ/pub?w=596&h=349) 15 | 16 | **gantryd** manages the running, monitoring and draining of containers, automatically updating machines *progressively* on update, and *draining* the old containers 17 | as it goes along. A container is only shutdown when *all connections* to it have terminated (or it is manually killed). This, combined with progressive 18 | update, allows for *continuous deployment* by simply pushing a new docker image to a repository and running `update` via `gantryd.py`. 19 | 20 | **gantryd** also automatically monitors the containers of a component, running checks periodically to ensure they are healthy. If a container goes bad, a new one is automatically started in its place, with traffic being moved over. 21 | 22 | ## Getting Started 23 | 24 | ### Getting etcd 25 | 26 | The latest etcd release is available as a binary at [Github][github-release]. 27 | Installation instructions can be found at [Etcd README][etcd-readme]. 28 | 29 | [github-release]: https://github.com/coreos/etcd/releases/ 30 | [etcd-readme]: https://github.com/coreos/etcd/blob/master/README.md 31 | 32 | 33 | ### Cloning the source 34 | 35 | ```sh 36 | git clone https://github.com/DevTable/gantryd.git 37 | ``` 38 | 39 | ### Installing dependencies 40 | 41 | #### Debian or Ubuntu 42 | ```sh 43 | # Install apt-get dependencies. 44 | cat requirements.system | xargs sudo apt-get install -y 45 | 46 | # Install python dependencies. 47 | sudo pip install -r requirements.txt 48 | ``` 49 | 50 | #### RHEL or Centos 51 | ```sh 52 | # Install yum dependencies. 53 | cat requirements.system.rhel | xargs sudo yum install -y 54 | 55 | # Install python dependencies. 56 | sudo pip install -r requirements.txt 57 | ``` 58 | 59 | ### Setting up 60 | 61 | All settings for gantryd are defined in a JSON format. 
A project's configuration is stored in etcd but is set initially from a local file (see `setconfig` below). 62 | 63 | The configuration defines the various components of the project you want to manage: 64 | ```json 65 | { 66 | "components": [ 67 | { 68 | "name": "someexamplecomponent", 69 | "repo": "my/localrepo", 70 | "tag": "latest", 71 | "command": ["/usr/bin/python", "/somedir/myapplication.py"], 72 | "ports": [ 73 | {"external": 8888, "container": 8888} 74 | ], 75 | "readyChecks": [ 76 | { "kind": "http", "port": 8888 } 77 | ], 78 | "healthChecks": [ 79 | { "kind": "http", "port": 8888, "path": "/some/path" } 80 | ], 81 | "volumesFrom": [ 82 | "somedatacontainer" 83 | ], 84 | "bindings": [ 85 | { "external": "/an/external/path", "volume": "/some/container/path"} 86 | ], 87 | "defineComponentLinks": [ 88 | { "port": 8888, "name": "mycoolserver", "kind": "tcp" } 89 | ], 90 | "requireComponentLinks": [ 91 | { "name": "anotherserver", "alias": "serveralias" } 92 | ], 93 | "environmentVariables": [ 94 | { "name": "FOO", "value": "somevalue" } 95 | ] 96 | } 97 | ] 98 | } 99 | ``` 100 | 101 | | Field | Description | Default | 102 | | --------------------- | --------------------------------------------------------------------------------- | ----------- | 103 | | name | The name of the component | | 104 | | repo | The docker image to use for the component | | 105 | | tag | The tag of the docker image to use | latest | 106 | | user | The user under which to run the command in the container | (in image) | 107 | | command | The command to run inside the container | (in image) | 108 | | ports | Mappings of container ports to external ports | | 109 | | readyChecks | The various checks to run to ensure the container is ready (see below for list) | | 110 | | healthChecks | The various checks to run to ensure the container is healthy (see below for list) | | 111 | | terminationSignals | Signals which should be sent to a specific container when it should be shut down | | 112 | 
| terminationChecks | The various checks to run to ensure that the container is ready to be shut down | connections | 113 | | volumesFrom | Container(s), by name, whose volume(s) should be mounted into the container | | 114 | | bindings | Mapping between external host paths and the corresponding container volumes | | 115 | | defineComponentLinks | Defines the component links exported by this component | | 116 | | requireComponentLinks | Defines the component links imported/required by this component | | 117 | | readyTimeout | Timeout in milliseconds that we will wait for a container to pass a ready check | 10,000 | 118 | | environmentVariables | Environment variables to set when running the component's containers | | 119 | | privileged | Whether the container should run in privileged mode | False | 120 | 121 | ### Terminology 122 | 123 | **Project**: Namespace that contains configuration for a set of components, as well as any metadata associated 124 | when those components are running. For example: 'frontend', 'backend', 'someproduct'. 125 | 126 | **Component**: A named component that runs a specific docker image in a container. For example: 'elasticsearch', 'mongodb'. 127 | 128 | **Component Link**: Similar to a Docker link: An exposed port by one *component* that is imported by one or more other 129 | components. Unlike a Docker link, a component link is managed by gantry and automatically updated via the proxy just like 130 | normal exposed ports.
When a component link is required/imported by a container, the following environment variables are 131 | added into the containers for that component: 132 | 133 | | Environment Variable | Example Name | Example Value | 134 | | --------------------------------- | ----------------------------------- | ------------------------------------------------- | 135 | | {ALIAS}_CLINK | SERVERALIAS_CLINK | tcp://172.17.42.1:53852 | 136 | | {ALIAS}\_CLINK\_{PORT}\_{KIND} | SERVERALIAS_CLINK_8888_TCP | tcp://172.17.42.1:53852 | 137 | | {ALIAS}\_CLINK\_{PORT}\_{KIND}\_PROTO | SERVERALIAS_CLINK_8888_TCP_PROTO | tcp | 138 | | {ALIAS}\_CLINK\_{PORT}\_{KIND}\_ADDR | SERVERALIAS_CLINK_8888_TCP_ADDR | 172.17.42.1 | 139 | | {ALIAS}\_CLINK\_{PORT}\_{KIND}\_PORT | SERVERALIAS_CLINK_8888_TCP_PORT | 53852 | 140 | 141 | 142 | ### Setting up a project 143 | 144 | - [Gantryd commands](#gantryd-commands) - distributed management 145 | - [Gantry commands](#gantry-commands) - local management 146 | 147 | ### Gantryd commands 148 | 149 | #### Creating/updating the project's configuration 150 | 151 | To set up a gantryd project, make sure that etcd is running, and gantry configuration is available in some file.
152 | 153 | Run the following to update the configuration for project `myprojectname` in gantryd: 154 | ```sh 155 | sudo ./gantryd.py setconfig myprojectname myconfigfile 156 | ``` 157 | 158 | Response: 159 | ```sh 160 | Configuration updated 161 | ``` 162 | 163 | #### Setup components by 'updating' them 164 | 165 | To mark one or more components as ready for deployment, execute the following from a machine with the latest images: 166 | ```sh 167 | sudo ./gantryd.py update myprojectname -c firstcomponent secondcomponent 168 | ``` 169 | 170 | Response: 171 | ```sh 172 | Updating the image IDs on components 173 | Component firstcomponent -> 4ae76210a4fe 174 | Component secondcomponent -> 0cf0c034fc89 175 | ``` 176 | 177 | This sets the status of the components to 'ready' and associates them with the image IDs listed. Once run, any followup 178 | `gantryd run` commands on this machine (or any other machines in the etcd cluster) will update and start those components 179 | with those images. 180 | 181 | #### Running components on machine(s) 182 | 183 | Once components have been marked as ready, they can be run by executing `gantryd run` on one or more machines: 184 | 185 | ```sh 186 | sudo ./gantryd.py run myprojectname -c firstcomponent secondcomponent 187 | ``` 188 | 189 | This command will start a daemon (and block), starting the components and monitoring them, until it is shutdown. 190 | 191 | #### Updating a component across all listening machines 192 | 193 | To tell components to update themselves in response to an image change, execute: 194 | 195 | ```sh 196 | sudo ./gantryd.py update myprojectname -c firstcomponent secondcomponent 197 | ``` 198 | 199 | Response: 200 | ```sh 201 | Updating the image IDs on components 202 | Component firstcomponent -> 4ae76210a4fe 203 | Component secondcomponent -> 0cf0c034fc89 204 | ``` 205 | 206 | The first machine running the gantryd daemon will start the update within 30 seconds. 
207 | 208 | #### Listing the status of all components 209 | ```sh 210 | sudo ./gantryd.py list myprojectname 211 | ``` 212 | 213 | Response: 214 | ```sh 215 | COMPONENT STATUS IMAGE ID 216 | firstcomponent ready 4ae76210a4fe 217 | secondcomponent stopped 0cf0c034fc89 218 | ``` 219 | 220 | #### Stopping a component on all machines 221 | 222 | To tell components to stop themselves on all machines, execute: 223 | 224 | ```sh 225 | sudo ./gantryd.py stop myprojectname -c firstcomponent secondcomponent 226 | ``` 227 | 228 | Response: 229 | ```sh 230 | Marking components as stopped 231 | ``` 232 | 233 | All components specified will start the shutdown process within 30 seconds. 234 | 235 | #### Killing a component on all machines 236 | 237 | To order components to kill themselves immediately on all machines, execute: 238 | 239 | ```sh 240 | sudo ./gantryd.py kill myprojectname -c firstcomponent secondcomponent 241 | ``` 242 | 243 | Response: 244 | ```sh 245 | Marking components as killed 246 | ``` 247 | 248 | All components specified will be killed within 30 seconds. 249 | 250 | 251 | ### Gantryd health checks 252 | 253 | Gantryd supports a number of built-in checks for verifying that a container is properly started, running and healthy. 254 | 255 | #### http Health Check 256 | 257 | ```json 258 | { "kind": "http", "port": 8888, "path": "/somepath" } 259 | ``` 260 | 261 | Attempts to connect and download the HTTP page located at the given port and path. Fails if the HTTP response is not 2XX. 262 | 263 | Note that "path" is **optional**. 264 | 265 | #### tcp Health Check 266 | 267 | ```json 268 | { "kind": "tcp", "port": 8888 } 269 | ``` 270 | 271 | Attempts to connect to the given port via TCP. Fails if the connection cannot be established. 272 | 273 | 274 | ### Gantry commands 275 | 276 | **gantry** is the **local** version of gantry, intended for starting, stopping and updating of components on a **single** machine.
Please note that you don't need etcd to be installed (or running) to use **gantry**. 277 | 278 | #### Listing all containers running on a local machine for a component 279 | ```sh 280 | sudo ./gantry.py myconfigfile list firstcomponent 281 | ``` 282 | 283 | Response: 284 | ```sh 285 | CONTAINER ID UPTIME IMAGE ID STATUS 286 | 39d59e26ee64 Up 17 seconds my/image:latest running 287 | 18182e07ade1 Up 2 minutes 0cf0c034fc89 draining 288 | 87b14f60b220 Up 4 minutes 26c8cb358b9d draining 289 | ``` 290 | 291 | #### Performing a *local* update of a component 292 | 293 | *Note*: This will occur outside of the gantryd event loop, so this should *only* be used for **single machine** or **canary** images. 294 | 295 | ```sh 296 | sudo ./gantry.py myconfigfile update firstcomponent 297 | ``` 298 | 299 | Response: 300 | ```sh 301 | Starting container 39d59e26ee64 302 | Waiting for health checks... 303 | Running health check: http 304 | Checking HTTP address: http://localhost:49320 305 | Redirecting traffic to new container 306 | Checking container statuses... 307 | Updating proxy... 308 | Starting monitoring... 309 | Monitor check started 310 | ``` 311 | 312 | *Note*: If the `-m` flag is specified, then gantry will remain running and actively monitor the component's container, restarting it automatically if it becomes unhealthy. 313 | 314 | #### Stopping all containers running on a local machine for a component 315 | 316 | *Note*: This will *drain* containers in a safe way, so the process will block until all containers are free from incoming connections 317 | 318 | ```sh 319 | sudo ./gantry.py myconfigfile stop firstcomponent 320 | ``` 321 | 322 | Response: 323 | ```sh 324 | Draining all containers... 325 | Checking container statuses... 326 | Updating proxy... 327 | Starting monitoring... 
328 | Monitor check started 329 | Shutting down container: 39d59e26ee64 330 | Proxy updated 331 | ``` 332 | 333 | #### Killing all containers running on a local machine for a component 334 | ```sh 335 | sudo ./gantry.py myconfigfile kill firstcomponent 336 | ``` 337 | 338 | Response: 339 | ```sh 340 | Draining all containers... 341 | Killing container d05d73bc6c3 342 | Checking container statuses... 343 | Shutting down proxy... 344 | ``` 345 | -------------------------------------------------------------------------------- /actions.py: -------------------------------------------------------------------------------- 1 | def start_action(component): 2 | if component.isRunning(): 3 | print 'Component ' + component.getName() + ' is already running' 4 | return False 5 | 6 | return component.update() 7 | 8 | def stop_action(component): 9 | if not component.isRunning(): 10 | print 'Component ' + component.getName() + ' is not running' 11 | return False 12 | 13 | component.stop(kill=False) 14 | return False 15 | 16 | def kill_action(component): 17 | if not component.isRunning(): 18 | print 'Component ' + component.getName() + ' is not running' 19 | return False 20 | 21 | component.stop(kill=True) 22 | return False 23 | 24 | def update_action(component): 25 | return component.update() 26 | 27 | def list_action(component): 28 | if not component.isRunning(): 29 | print 'Component ' + component.getName() + ' is not running' 30 | return False 31 | 32 | print "%-20s %-20s %-20s %-20s" % ('CONTAINER ID', 'UPTIME', 'IMAGE ID', 'STATUS') 33 | 34 | for info in component.getContainerInformation(): 35 | container = info[0] 36 | status = info[1] 37 | 38 | id = container['Id'] 39 | uptime = container['Status'] 40 | image = container['Image'] 41 | i = (id[0:12], uptime, image, status) 42 | print "%-20s %-20s %-20s %-20s" % i 43 | 44 | return False 45 | 46 | -------------------------------------------------------------------------------- /config/GantryConfig.py: 
-------------------------------------------------------------------------------- 1 | from object import CFObject, CFField 2 | from util import pickUnusedPort 3 | from runtime.metadata import getComponentField, setComponentField 4 | 5 | class _HealthCheck(CFObject): 6 | """ A single check to perform to verify that a component is ready to be 7 | pushed or is running properly. 8 | """ 9 | id = CFField('id').default('').name_field() 10 | kind = CFField('kind').value_field() 11 | timeout = CFField('timeout').kind(int).default(3) 12 | 13 | def __init__(self): 14 | super(_HealthCheck, self).__init__('Health Check') 15 | 16 | def getTitle(self): 17 | """ Returns a descriptive title for the check. """ 18 | if self.id != '': 19 | return self.id 20 | 21 | return self.kind 22 | 23 | 24 | class _TerminationSignal(CFObject): 25 | """ A single signal that is sent to a component when the component should shut 26 | itself down. 27 | """ 28 | id = CFField('id').default('').name_field() 29 | kind = CFField('kind').value_field() 30 | timeout = CFField('timeout').kind(int).default(3) 31 | exec_command = CFField('exec_command').default('') 32 | 33 | def __init__(self): 34 | super(_TerminationSignal, self).__init__('Termination Signal') 35 | 36 | def getTitle(self): 37 | """ Returns a descriptive title for the check. """ 38 | if self.id != '': 39 | return self.id 40 | 41 | return self.kind 42 | 43 | 44 | class _PortMapping(CFObject): 45 | """ A port mapping of an internal container port to the outside world. """ 46 | external = CFField('external').kind(int).name_field() 47 | container = CFField('container').kind(int).value_field() 48 | kind = CFField('kind').default('tcp') 49 | 50 | def __init__(self): 51 | super(_PortMapping, self).__init__('Port Mapping') 52 | 53 | 54 | class _VolumeBinding(CFObject): 55 | """ A port mapping of an internal container port to the outside world. 
""" 56 | external = CFField('external').name_field() 57 | volume = CFField('volume').value_field() 58 | 59 | def __init__(self): 60 | super(_VolumeBinding, self).__init__('Volume Binding') 61 | 62 | 63 | class _DefinedComponentLink(CFObject): 64 | """ A network link exported by a component. """ 65 | name = CFField('name').name_field() 66 | port = CFField('port').kind(int).value_field() 67 | kind = CFField('kind').default('tcp') 68 | 69 | def __init__(self): 70 | super(_DefinedComponentLink, self).__init__('Component Link') 71 | 72 | def getHostPort(self): 73 | """ Returns the port used by the component link on the host. """ 74 | key = 'link-' + self.name + '-port' 75 | port = getComponentField(self.parent.name, key, 0) 76 | if not port: 77 | port = pickUnusedPort() 78 | setComponentField(self.parent.name, key, port) 79 | 80 | return port 81 | 82 | 83 | class _RequiredComponentLink(CFObject): 84 | """ A network link required by a component. """ 85 | name = CFField('name').name_field() 86 | alias = CFField('alias').value_field() 87 | 88 | def __init__(self): 89 | super(_RequiredComponentLink, self).__init__('Required Component Link') 90 | 91 | 92 | class _EnvironmentVariable(CFObject): 93 | """ An environment variable to set when running a component. """ 94 | name = CFField('name').name_field() 95 | value = CFField('value').value_field() 96 | 97 | def __init__(self): 98 | super(_EnvironmentVariable, self).__init__('Environment Variable') 99 | 100 | 101 | class _Component(CFObject): 102 | """ A single gantry component. 
""" 103 | name = CFField('name') 104 | repo = CFField('repo') 105 | tag = CFField('tag').default('latest') 106 | command = CFField('command').list_of(str).default([]) 107 | user = CFField('user').default('') 108 | ports = CFField('ports').list_of(_PortMapping).default([]) 109 | bindings = CFField('bindings').list_of(_VolumeBinding).default([]) 110 | volumes_from = CFField('volumesFrom').list_of(str).default([]) 111 | ready_checks = CFField('readyChecks').list_of(_HealthCheck).default([]) 112 | health_checks = CFField('healthChecks').list_of(_HealthCheck).default([]) 113 | ready_timeout = CFField('readyTimeout').kind(int).default(10000) 114 | termination_signals = CFField('terminationSignals').list_of(_TerminationSignal).default([]) 115 | privileged = CFField('privileged').kind(bool).default(False) 116 | defined_component_links = CFField('defineComponentLinks').list_of(_DefinedComponentLink).default([]) 117 | required_component_links = CFField('requireComponentLinks').list_of(_RequiredComponentLink).default([]) 118 | environment_variables = CFField('environmentVariables').list_of(_EnvironmentVariable).default([]) 119 | 120 | connection_check = _HealthCheck().build({'kind': 'connection'}) 121 | termination_checks = CFField('terminationChecks').list_of(_HealthCheck).default([connection_check]) 122 | 123 | def __init__(self): 124 | super(_Component, self).__init__('Component') 125 | 126 | def getFullImage(self): 127 | """ Returns the full image ID for this component, of the form 'repo:tag' """ 128 | return self.repo + ':' + self.tag 129 | 130 | def getUser(self): 131 | """ Returns the user under which to run the container or None if none. """ 132 | if not self.user: 133 | return None 134 | 135 | return self.user 136 | 137 | def getCommand(self): 138 | """ Returns the command string to run on component startup or None if none. 
""" 139 | if not self.command: 140 | return None 141 | 142 | return ' '.join(self.command) 143 | 144 | def getContainerPorts(self): 145 | """ Returns the full set of ports exposed by this component. """ 146 | return set([p.container for p in self.ports] + [l.port for l in self.defined_component_links]) 147 | 148 | def getReadyCheckTimeout(self): 149 | """ Returns the maximum amount of time, in seconds, before ready checks time out. """ 150 | return self.ready_timeout / 1000 151 | 152 | def getVolumes(self): 153 | """ Returns the volumes exposed by this component. """ 154 | return [binding.volume for binding in self.bindings] 155 | 156 | def getBindings(self, container_id): 157 | """ Returns the volumes exposed by this component. """ 158 | def substitute_id(external_dir): 159 | return external_dir.format(container_id=container_id[:12]) 160 | 161 | return {substitute_id(binding.external): binding.volume for binding in self.bindings} 162 | 163 | def getDefinedComponentLinks(self): 164 | """ Returns the dict of defined components links. """ 165 | return {l.name: l for l in self.defined_component_links} 166 | 167 | def getComponentLinks(self): 168 | """ Returns a dict of aliases for component links required, with the values being the links' names. """ 169 | return {l.alias: l.name for l in self.required_component_links} 170 | 171 | def getEnvironmentVariables(self): 172 | """ Returns a dict of the defined environments variables and their values. """ 173 | return {v.name: v.value for v in self.environment_variables} 174 | 175 | 176 | class Configuration(CFObject): 177 | """ The overall gantry configuration. """ 178 | components = CFField('components').list_of(_Component) 179 | 180 | def __init__(self): 181 | super(Configuration, self).__init__('Configuration') 182 | 183 | def lookupComponent(self, name): 184 | """ Looks up the component with the given name under this config. 
""" 185 | for component in self.components: 186 | if component.name == name: 187 | return component 188 | 189 | return None -------------------------------------------------------------------------------- /config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DevTable/gantryd/eb348113f0f73a0be45a45f7a5626ad2b5dd30ba/config/__init__.py -------------------------------------------------------------------------------- /config/object.py: -------------------------------------------------------------------------------- 1 | import json 2 | import string 3 | import copy 4 | import inspect 5 | 6 | class ConfigParseException(Exception): 7 | pass 8 | 9 | class CFObject(object): 10 | """ 11 | Defines a class in which all fields marked with the CFField are automatically handled 12 | for configuration purposes. 13 | """ 14 | def __init__(self, name): 15 | # The parent of the object. 16 | self.parent = None 17 | 18 | # The name of the object. 19 | self.name = name 20 | 21 | # The cached fields for the object. 22 | self.fields = None 23 | 24 | # The "extra" fields on the object, if any. 25 | self.extra_fields = {} 26 | 27 | def applyOverride(self, override): 28 | """ Applies the given configuration override to this config object. 29 | 30 | Format: 'Name.SubName=Value' 31 | """ 32 | (path, value) = override.split('=', 2) 33 | path_pieces = path.split('.') 34 | 35 | # Find the field with the associated name. 36 | field_name = path_pieces[0] 37 | field = self.get_fields().get(field_name) 38 | if not field: 39 | raise ConfigParseException('No field named %s found' % field_name) 40 | 41 | # If the field is a list, then we need a named key (at least). 42 | if field.get_kind() == list: 43 | list_item_kind = field.get_list_kind() 44 | current_list = field.get_value(self) 45 | 46 | # If the list item kind is not a config object, then we expect to just add the config 47 | # value. 
48 | if not CFObject in inspect.getmro(list_item_kind): 49 | if len(path_pieces) > 1: 50 | raise ConfigParseException('Found list field %s, expected simply the list name' % field_name) 51 | 52 | new_list_item = list_item_kind(value) 53 | current_list.append(new_list_item) 54 | return 55 | 56 | # Otherwise, find the named field in the config object and set the override in there. 57 | name_field = list_item_kind().get_name_field() 58 | if not name_field: 59 | raise ConfigParseException('List field %s does not support overrides by name' % field_name) 60 | 61 | # Find the named value in the list that matches the name given. If none, we create a new 62 | # entry. 63 | entry_name = path_pieces[1] 64 | found_entry = None 65 | for entry in current_list: 66 | if name_field.get_value(entry) == name_field.get_kind()(entry_name): 67 | found_entry = entry 68 | break 69 | 70 | if not found_entry: 71 | # Create the new entry with the user specified name and add it to the list. 72 | found_entry = list_item_kind() 73 | found_entry.parent = self 74 | name_field.set_value(found_entry, entry_name) 75 | 76 | current_list.append(found_entry) 77 | 78 | # If there are no further path pieces, then we look for a value field to set. 79 | if len(path_pieces) == 2: 80 | value_field = found_entry.get_value_field() 81 | if not value_field: 82 | raise ConfigParseException('No default value field found for config %s' % entry.name) 83 | 84 | value_field.set_value(found_entry, value_field.get_kind()(value)) 85 | return 86 | 87 | # Otherwise, we set the override under the entry. 88 | found_entry.applyOverride('%s=%s' % ('.'.join(path_pieces[2:]), value)) 89 | 90 | else: 91 | if len(path_pieces) > 1: 92 | raise ConfigParseException('Cannot access named keys under %s' % field_name) 93 | 94 | # Update the field's value. 95 | field.set_value(self, field.get_kind()(value)) 96 | 97 | def hasExtraField(self, name): 98 | """ Returns true if there is an 'extra' field with the given name. 
""" 99 | return name in self.extra_fields 100 | 101 | def getExtraField(self, name): 102 | """ Returns the 'extra' field with this name. """ 103 | return self.extra_fields[name] 104 | 105 | def getRootConfig(self): 106 | """ Returns the root configuration object. """ 107 | if self.parent: 108 | return self.parent 109 | 110 | return self 111 | 112 | @classmethod 113 | def parse(cls, json_data): 114 | """ Parses the given JSON data into an instance of this config object. """ 115 | dictionary = json.loads(json_data) 116 | return cls.build(dictionary) 117 | 118 | @classmethod 119 | def build(cls, dictionary): 120 | """ Builds an instance of this config object from the given dictionary. """ 121 | instance = cls() 122 | instance.extra_fields = copy.copy(dictionary) 123 | for name in instance.get_fields(): 124 | if name in instance.extra_fields: 125 | del instance.extra_fields[name] 126 | 127 | field = instance.get_fields()[name] 128 | if field.is_required() and not name in dictionary: 129 | raise ConfigParseException('Missing required property ' + name + ' under object ' + instance.name) 130 | 131 | if name in dictionary: 132 | field.populate(instance, dictionary[name]) 133 | 134 | return instance 135 | 136 | def get_name_field(self): 137 | """ Returns the name field under this config object, if any. """ 138 | for field in self.get_fields().values(): 139 | if field.get_is_name_field(): 140 | return field 141 | 142 | return None 143 | 144 | def get_value_field(self): 145 | """ Returns the value field under this config object, if any. 
""" 146 | for field in self.get_fields().values(): 147 | if field.get_is_value_field(): 148 | return field 149 | 150 | return None 151 | 152 | def get_fields(self): 153 | """ Returns a dictionary of all CFField's defined in the CFObject """ 154 | # Check the field cache first 155 | if self.fields: 156 | return self.fields 157 | 158 | fields = {} 159 | class_fields = dir(self.__class__) 160 | class_dict = self.__class__.__dict__ 161 | for field_name in class_fields: 162 | if class_dict.has_key(field_name): 163 | field = class_dict[field_name] 164 | if field.__class__ == CFField: 165 | name = CFField.get_name(field) 166 | fields[name] = field 167 | self.fields = fields 168 | return fields 169 | 170 | 171 | class CFField(object): 172 | """ A field representing a property in the configuration object """ 173 | def __init__(self, name): 174 | # Whether the field is the name field for the parent object. 175 | self.is_name_field = False 176 | 177 | # Whether the field is the primary value field for the parent object. 178 | self.is_value_field = False 179 | 180 | # The name of the field in the config. 181 | self.name = name 182 | 183 | # The current value of the field. 184 | self.value = None 185 | 186 | # The type of the field. Defaults to string. 187 | self.field_kind = str 188 | 189 | # If this field is a list, the kind of its elements. 190 | self.list_kind = None 191 | 192 | # The default value for the field. If none, the field is required. 193 | self.default_value = None 194 | 195 | def __get__(self, instance, owner): 196 | return self.get_value(instance) 197 | 198 | def __set__(self, instance, value): 199 | self.update(instance, value) 200 | 201 | def name_field(self): 202 | """ Marks a field as being the name field for the parent object. """ 203 | self.is_name_field = True 204 | return self 205 | 206 | def value_field(self): 207 | """ Marks a field as being the value field for the parent object. 
""" 208 | self.is_value_field = True 209 | return self 210 | 211 | def kind(self, kind): 212 | """ Sets the kind of the field. """ 213 | self.field_kind = kind 214 | return self 215 | 216 | def default(self, value): 217 | """ Sets the default value for the field. """ 218 | self.default_value = value 219 | return self 220 | 221 | def list_of(self, kind): 222 | """ Sets that this field is a list of some kind of values. """ 223 | self.field_kind = list 224 | self.list_kind = kind 225 | return self 226 | 227 | def get_kind(self): 228 | return self.field_kind 229 | 230 | def get_list_kind(self): 231 | return self.list_kind 232 | 233 | def is_required(self): 234 | return self.default_value is None 235 | 236 | def get_is_name_field(self): 237 | return self.is_name_field 238 | 239 | def get_is_value_field(self): 240 | return self.is_value_field 241 | 242 | def get_name(self): 243 | """ Returns the name of the field """ 244 | return self.name 245 | 246 | def populate(self, instance, primitive): 247 | """ Attempts to populate this list from the given primitive value. """ 248 | if self.field_kind == list: 249 | if not isinstance(primitive, list): 250 | raise ConfigParseException('Expected list for field ' + self.name) 251 | 252 | list_value = [] 253 | for p in primitive: 254 | c_value = self.get_converted_value(instance, p, self.list_kind) 255 | if not isinstance(c_value, self.list_kind): 256 | raise ConfigParseException('Expected items of kind ' + str(self.list_kind) + ' in ' + self.name) 257 | list_value.append(c_value) 258 | 259 | self.update(instance, list_value) 260 | return 261 | 262 | self.update(instance, self.get_converted_value(instance, primitive, self.field_kind)) 263 | 264 | def get_converted_value(self, instance, primitive, kind): 265 | # Class types. 
266 | if issubclass(kind, CFObject): 267 | if not isinstance(primitive, dict): 268 | raise ConfigParseException('Expected dictionary for field ' + self.name) 269 | 270 | built = kind.build(primitive) 271 | built.parent = instance; 272 | return built 273 | 274 | # Otherwise, convert the primitive to the given kind. 275 | return kind(primitive) 276 | 277 | def internal_data(self, instance): 278 | internal_name = self.name + '_data' 279 | if internal_name not in instance.__dict__: 280 | instance.__dict__[internal_name] = {'data': None} 281 | return instance.__dict__[internal_name] 282 | 283 | def get_value(self, instance): 284 | """ Returns the value of the field for the given instance """ 285 | value = self.internal_data(instance)['data']; 286 | if value is None and self.default_value is not None: 287 | return self.default_value 288 | 289 | return value 290 | 291 | def set_value(self, instance, value): 292 | """ Sets the value of the field for the given instance """ 293 | self.__set__(instance, value) 294 | 295 | def update(self, instance, value): 296 | """ Updates the value of the field """ 297 | self.internal_data(instance)['data'] = value -------------------------------------------------------------------------------- /containerutil.py: -------------------------------------------------------------------------------- 1 | def getContainerIPAddress(client, container): 2 | """ Returns the IP address on which the container is running.
""" 3 | container_info = client.inspect_container(container) 4 | return container_info['NetworkSettings']['IPAddress'] 5 | -------------------------------------------------------------------------------- /gantry.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import signal 5 | import time 6 | 7 | from actions import start_action, update_action, list_action, stop_action, kill_action 8 | from config.GantryConfig import Configuration 9 | from runtime.manager import RuntimeManager 10 | from util import report, fail 11 | 12 | 13 | ACTIONS = { 14 | 'start': start_action, 15 | 'update': update_action, 16 | 'list': list_action, 17 | 'stop': stop_action, 18 | 'kill': kill_action 19 | } 20 | 21 | 22 | def loadConfig(config_file): 23 | """ Attempts to load and parse the given config file. """ 24 | try: 25 | with open(config_file, 'r') as f: 26 | config_json = f.read() 27 | except: 28 | print 'Could not find config file: ' + config_file 29 | return None 30 | 31 | try: 32 | return Configuration.parse(config_json) 33 | except Exception as e: 34 | print 'Error parsing gantry config: ' + str(e) 35 | return None 36 | 37 | 38 | def monitor(component): 39 | while True: 40 | # Sleep for 30 seconds. 41 | time.sleep(30) 42 | 43 | # Conduct the checks. 44 | report('Checking in on component ' + component.getName()) 45 | if not component.isHealthy(): 46 | report('Component ' + component.getName() + ' is not healthy. 
Killing and restarting') 47 | component.stop(kill=True) 48 | if not component.update(): 49 | report('Could not restart component ' + component.getName()) 50 | return 51 | 52 | 53 | def run(): 54 | # Setup the gantry arguments 55 | parser = argparse.ArgumentParser(description='gantry continuous deployment system') 56 | parser.add_argument('config_file', help='The configuration file') 57 | parser.add_argument('action', help='The action to perform', choices=ACTIONS.keys()) 58 | parser.add_argument('component_name', help='The name of the component to manage') 59 | parser.add_argument('-m', dest='monitor', action='store_true', help='If specified and the action is "start" or "update", gantry will remain running to monitor components, auto restarting them as necessary') 60 | parser.add_argument('--setconfig', dest='config_overrides', action='append', help='Configuration overrides for the component') 61 | 62 | args = parser.parse_args() 63 | component_name = args.component_name 64 | action = args.action 65 | should_monitor = args.monitor 66 | config_file = args.config_file 67 | config_overrides = args.config_overrides 68 | 69 | # Load the config. 70 | config = loadConfig(config_file) 71 | if not config: 72 | return 73 | 74 | # Create the manager. 75 | manager = RuntimeManager(config) 76 | 77 | # Find the component 78 | component = manager.getComponent(component_name) 79 | if not component: 80 | raise Exception('Unknown component: ' + component_name) 81 | 82 | # Apply the config overrides (if any). 83 | if config_overrides: 84 | component.applyConfigOverrides(config_overrides) 85 | 86 | # Run the action with the component and config. 
87 | result = ACTIONS[action](component) 88 | if result and should_monitor: 89 | try: 90 | report('Starting monitoring of component: ' + component_name) 91 | monitor(component) 92 | except KeyboardInterrupt: 93 | report('Terminating monitoring of component: ' + component_name) 94 | 95 | def cleanup_monitor(signum, frame): 96 | manager.join() 97 | 98 | # Set the SIGINT signal handler so the manager is joined on interrupt 99 | signal.signal(signal.SIGINT, cleanup_monitor) 100 | 101 | # We may have to call cleanup manually if we weren't asked to monitor 102 | cleanup_monitor(None, None) 103 | 104 | if __name__ == "__main__": 105 | run() -------------------------------------------------------------------------------- /gantryd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from gantryd.client import GantryDClient 4 | import argparse 5 | import json 6 | 7 | ETCD_HOST = '127.0.0.1' 8 | ETCD_PORT = 4001 9 | 10 | def run(dclient, args): 11 | """ Runs gantryd. """ 12 | dclient.run(args.component) 13 | 14 | def getconfig(dclient, args): 15 | """ Prints out the current project configuration stored in etcd. """ 16 | config = None 17 | try: 18 | config = dclient.getConfigJSON() 19 | except: 20 | pass 21 | 22 | if not config: 23 | print 'No config found' 24 | return 25 | 26 | print json.dumps(json.loads(config), sort_keys=True, indent=2, separators=(',', ': ')) 27 | 28 | def setconfig(dclient, args): 29 | """ Sets the current project configuration stored in etcd. """ 30 | if not args.configfile: 31 | print 'Missing configfile parameter' 32 | return 33 | 34 | with open(args.configfile, 'r') as f: 35 | dclient.setConfig(json.loads(f.read())) 36 | print 'Configuration updated' 37 | 38 | def list_status(dclient, args): 39 | """ Lists the status of all components in gantryd. """ 40 | dclient.listStatus() 41 | 42 | def mark_updated(dclient, args): 43 | """ Marks a component to be updated.
""" 44 | dclient.markUpdated(args.component) 45 | 46 | def stop(dclient, args): 47 | """ Marks a component to be stopped. """ 48 | dclient.stopComponents(args.component) 49 | 50 | def kill(dclient, args): 51 | """ Marks a component to be killed. """ 52 | dclient.killComponents(args.component) 53 | 54 | ACTIONS = { 55 | 'run': run, 56 | 'getconfig': getconfig, 57 | 'setconfig': setconfig, 58 | 'list': list_status, 59 | 'update': mark_updated, 60 | 'stop': stop, 61 | 'kill': kill 62 | } 63 | 64 | def start(): 65 | # Setup the gantryd arguments. 66 | parser = argparse.ArgumentParser(description='gantry continuous deployment system daemon') 67 | parser.add_argument('action', help='The action to perform', choices=ACTIONS.keys()) 68 | parser.add_argument('project', help='The name of the project containing the components') 69 | parser.add_argument('configfile', help='The name of the config file. Only applies to setconfig.', nargs='?') 70 | parser.add_argument('-c', help='A component to watch and run', nargs='+', type=str, dest='component') 71 | parser.add_argument('-etcd', help='The etcd endpoint to which the client should connect. Defaults to 127.0.0.1', dest='etcd_host', nargs='?', const=ETCD_HOST) 72 | parser.add_argument('-etcdport', help='The client port of the etcd endpoint. Defaults to 4001.', dest='etcd_port', nargs='?', const=ETCD_PORT) 73 | 74 | # Parse the arguments. 75 | args = parser.parse_args() 76 | port = int(args.etcd_port) if args.etcd_port else ETCD_PORT 77 | 78 | # Initialize the gantryd client. 79 | dclient = GantryDClient(args.etcd_host or ETCD_HOST, args.project, port) 80 | 81 | # Run the action. 
82 | action = ACTIONS[args.action] 83 | action(dclient, args) 84 | 85 | if __name__ == "__main__": 86 | start() 87 | -------------------------------------------------------------------------------- /gantryd/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DevTable/gantryd/eb348113f0f73a0be45a45f7a5626ad2b5dd30ba/gantryd/__init__.py -------------------------------------------------------------------------------- /gantryd/client.py: -------------------------------------------------------------------------------- 1 | from runtime.manager import RuntimeManager 2 | from config.GantryConfig import Configuration 3 | from config.object import ConfigParseException 4 | 5 | from gantryd.componentwatcher import ComponentWatcher 6 | from gantryd.machinestate import MachineState 7 | from gantryd.componentstate import ComponentState, STOPPED_STATUS, KILLED_STATUS 8 | from gantryd.etcdpaths import getProjectConfigPath 9 | 10 | from util import report, fail, ReportLevels 11 | 12 | import etcd 13 | import uuid 14 | import atexit 15 | import threading 16 | import time 17 | import socket 18 | import json 19 | import logging 20 | 21 | REPORT_TTL = 60 # Report that this machine is running, every 60 seconds 22 | 23 | class GantryDClient(object): 24 | """ A client in gantryd. """ 25 | def __init__(self, etcdHost, projectName, etcdPort): 26 | self.project_name = projectName 27 | self.runtime_manager = None 28 | self.components = [] 29 | self.is_running = False 30 | 31 | # Generate a unique ID for this machine/client. 32 | self.machine_id = str(uuid.uuid1()) 33 | 34 | # Logging. 35 | self.logger = logging.getLogger(__name__) 36 | 37 | # Initialize the etcd client that we'll use. 38 | self.etcd_client = etcd.Client(host=etcdHost, port=etcdPort) 39 | 40 | # Initialize the thread used for reporting the status of this machine to etcd. 
41 | self.reporting_thread = threading.Thread(target=self.reportMachineStatus, args=[]) 42 | self.reporting_thread.daemon = True 43 | 44 | def getConfigJSON(self): 45 | """ Returns the project's config JSON or raises an exception if none. """ 46 | # Lookup the project on etcd. If none, report an error. 47 | config_json = None 48 | try: 49 | self.logger.debug('Looking up configuration for project %s in etcd', self.project_name) 50 | config_json = self.etcd_client.get(getProjectConfigPath(self.project_name)).value 51 | except KeyError as k: 52 | self.logger.exception(k) 53 | fail('Unknown project ' + self.project_name, project=self.project_name) 54 | 55 | return config_json 56 | 57 | def getConfig(self): 58 | """ Returns the project's config or raises an exception if none. """ 59 | config_json = self.getConfigJSON() 60 | 61 | # Parse the project's configuration and save it. 62 | try: 63 | self.config = Configuration.parse(config_json) 64 | except ConfigParseException as cpe: 65 | fail('Error parsing gantry config', project=self.project_name, exception=cpe) 66 | except Exception as e: 67 | self.logger.exception(e) 68 | 69 | return self.config 70 | 71 | def setConfig(self, config): 72 | """ Sets the project's config in etcd. """ 73 | config_json = json.dumps(config) 74 | self.logger.debug('Updating configuration for project %s', self.project_name) 75 | self.etcd_client.set(getProjectConfigPath(self.project_name), config_json) 76 | 77 | def stopComponents(self, component_names): 78 | """ Tells all the given components on all systems to stop. 
""" 79 | self.initialize(component_names) 80 | 81 | report('Marking components as stopped', project=self.project_name) 82 | for component in self.components: 83 | report('Marking component as stopped', project=self.project_name, component=component, 84 | level = ReportLevels.EXTRA) 85 | state = ComponentState(self.project_name, component, self.etcd_client) 86 | state.setStatus(STOPPED_STATUS) 87 | 88 | def killComponents(self, component_names): 89 | """ Tells all the given components on all systems to die. """ 90 | self.initialize(component_names) 91 | 92 | report('Marking components as killed', project=self.project_name) 93 | for component in self.components: 94 | report('Marking component as killed', project=self.project_name, component=component, 95 | level = ReportLevels.EXTRA) 96 | state = ComponentState(self.project_name, component, self.etcd_client) 97 | state.setStatus(KILLED_STATUS) 98 | 99 | def markUpdated(self, component_names): 100 | """ Tells all the given components to update themselves. """ 101 | self.initialize(component_names) 102 | 103 | report('Updating the image IDs on components', project=self.project_name) 104 | for component in self.components: 105 | image_id = component.getImageId() 106 | state = ComponentState(self.project_name, component, self.etcd_client) 107 | 108 | report('Component %s->%s' % (component.getName(), image_id[0:12]), project=self.project_name, 109 | component = component) 110 | state.setReadyStatus(image_id) 111 | 112 | def listStatus(self): 113 | """ Lists the status of all components in this project. 
""" 114 | self.getConfig() 115 | self.initialize([c.name for c in self.config.components]) 116 | 117 | print "%-20s %-20s %-20s" % ('COMPONENT', 'STATUS', 'IMAGE ID') 118 | for component in self.components: 119 | state = ComponentState(self.project_name, component, self.etcd_client).getState() 120 | status = ComponentState.getStatusOf(state) 121 | imageid = ComponentState.getImageIdOf(state) 122 | print "%-20s %-20s %-20s" % (component.getName(), status, imageid) 123 | 124 | 125 | def run(self, component_names): 126 | """ Runs the given components on this machine. """ 127 | self.initialize(component_names) 128 | 129 | # Register a handler to remove this machine from the list when the daemon is 130 | # shutdown. The controller will also occasionally ping a machine to verify it 131 | # is present. 132 | self.logger.debug('Registering exit listener') 133 | atexit.register(self.handleExit) 134 | 135 | # Start the thread to register this machine as being part of the project. 136 | self.startReporter() 137 | 138 | # Start watcher thread(s), one for each component, to see when to update them. 139 | report('Gantryd running', project=self.project_name) 140 | for component in self.components: 141 | self.logger.debug('Starting component watcher for component: %s', component.getName()) 142 | watcher = ComponentWatcher(component, self.project_name, self.machine_id, self.etcd_client) 143 | watcher.start() 144 | 145 | # And sleep until new stuff comes in. 146 | while True: 147 | time.sleep(1) 148 | 149 | 150 | ######################################################################## 151 | 152 | def initialize(self, component_names): 153 | """ Initializes this client for working with the components given. """ 154 | # Load the project configuration. 155 | self.getConfig() 156 | 157 | # Initialize the runtime manager. 158 | self.runtime_manager = RuntimeManager(self.config) 159 | 160 | # Find all the components for this machine. 
161 | for component_name in component_names: 162 | component = self.runtime_manager.getComponent(component_name) 163 | if not component: 164 | fail('Unknown component named ' + component_name, project=self.project_name) 165 | 166 | self.components.append(component) 167 | 168 | def handleExit(self): 169 | """ Function executed when the Python system exits. This unregisters the machine in etcd. """ 170 | self.is_running = False 171 | try: 172 | machine_state = MachineState(self.project_name, self.machine_id, self.etcd_client) 173 | machine_state.removeMachine() 174 | 175 | # Shut down the runtime manager if we have one 176 | if self.runtime_manager is not None: 177 | self.runtime_manager.join() 178 | 179 | except Exception as e: 180 | self.logger.exception(e) 181 | pass 182 | 183 | def startReporter(self): 184 | """ Starts reporting that this machine is running. """ 185 | self.is_running = True 186 | self.reporting_thread.start() 187 | 188 | def reportMachineStatus(self): 189 | """ Reports that this machine has running components. """ 190 | while self.is_running: 191 | # Perform the update. 192 | self.logger.debug('Reporting status for machine %s to etcd', self.machine_id) 193 | machine_state = MachineState(self.project_name, self.machine_id, self.etcd_client) 194 | machine_state.registerMachine([c.getName() for c in self.components], ttl=REPORT_TTL) 195 | 196 | # Sleep for the TTL minus a few seconds. 
197 | time.sleep(REPORT_TTL - 5) 198 | 199 | 200 | -------------------------------------------------------------------------------- /gantryd/componentstate.py: -------------------------------------------------------------------------------- 1 | import json 2 | from etcdstate import EtcdState 3 | from etcdpaths import getComponentStatePath 4 | 5 | READY_STATUS = 'ready' 6 | STOPPED_STATUS = 'stopped' 7 | KILLED_STATUS = 'killed' 8 | PULL_FAIL = 'pullfail' 9 | 10 | IMAGE_ID = 'imageid' 11 | 12 | class ComponentState(EtcdState): 13 | """ Helper class which allows easy getting and setting of the etcd distributed 14 | state of a component. 15 | """ 16 | def __init__(self, project_name, component, etcd_client): 17 | path = getComponentStatePath(project_name, component) 18 | super(ComponentState, self).__init__(path, etcd_client) 19 | 20 | @staticmethod 21 | def getStatusOf(state): 22 | """ Returns the status field in the given state object. """ 23 | return state['status'] if 'status' in state else 'unknown' 24 | 25 | @staticmethod 26 | def getImageIdOf(state): 27 | """ Returns the image ID field in the given state object or empty string if None. """ 28 | return state[IMAGE_ID] if IMAGE_ID in state else '' 29 | 30 | def getStatus(self): 31 | """ Returns the status of the component. """ 32 | return self.getState(default={'status': 'unknown'}).status 33 | 34 | def setStatus(self, status, **kwargs): 35 | """ Sets the status of the component. """ 36 | state = dict(kwargs) 37 | state['status'] = status 38 | self.setState(state) 39 | 40 | def setReadyStatus(self, imageid): 41 | """ Sets the status of the component to 'ready', with the given imageid. """ 42 | self.setStatus(READY_STATUS, imageid=imageid) 43 | 44 | def setUpdatingStatus(self, status, machine_id, original_state): 45 | """ Attempts to set the status of the component to being updated by the given machine. 46 | Returns the updated state on success and None otherwise. 
class ComponentWatcher(object):
    """ Helper class which watches a specific component's status in etcd and
        manages the update/stop/kill process (if necessary). Also watches the
        component itself once started, and ensures that it remains running
        (restarting it if it failed).
    """
    def __init__(self, component, project_name, machine_id, etcd_client):
        """ Creates (but does not start) the watcher and monitor threads for the component. """
        self.component = component
        self.project_name = project_name
        self.machine_id = machine_id
        self.is_running = False

        # Logging.
        self.logger = logging.getLogger(__name__)

        # Setup the state helper for the component.
        self.state = ComponentState(project_name, component, etcd_client)

        # Setup the watcher thread.
        self.watcher_thread = threading.Thread(target=self.waitForCommand, args=[])
        self.watcher_thread.daemon = True

        # Setup the monitor thread.
        self.monitor_thread = threading.Thread(target=self.monitorComponent, args=[])
        self.monitor_thread.daemon = True

        # Setup an event to ping the monitor thread when it should restart checking in
        # on the component.
        self.monitor_event = threading.Event()

        # Setup a lock to prevent multiple threads from trying to (re)start a container.
        self.update_lock = threading.Lock()

    def start(self):
        """ Starts the watcher. """
        self.watcher_thread.start()
        self.monitor_thread.start()

    def monitorComponent(self):
        """ Monitors a component by pinging it every MONITOR_SLEEP_TIME seconds or so. If a
            component fails, then the system will try to restart it. If that fails, monitoring
            is paused until the watcher thread sets the monitor event again.
        """
        while True:
            # Wait for the component to be running.
            self.monitor_event.wait()

            # Sleep MONITOR_SLEEP_TIME seconds.
            time.sleep(MONITOR_SLEEP_TIME)

            # Check the component.
            report('Checking in on component', project=self.project_name, component=self.component,
                   level=ReportLevels.BACKGROUND)

            if self.component.isHealthy():
                continue

            self.logger.debug('Component %s is not healthy', self.component.getName())
            with self.update_lock:
                # Just to be sure: the watcher thread may have stopped the component while
                # this thread was sleeping or waiting for the lock.
                if not self.is_running:
                    continue

                # Ensure that the component is still ready.
                state = self.state.getState()
                current_status = ComponentState.getStatusOf(state)
                if current_status == READY_STATUS:
                    report('Component ' + self.component.getName() + ' is not healthy. Restarting...',
                           project=self.project_name, component=self.component)

                    if not self.component.update():
                        report('Could not restart component ' + self.component.getName(),
                               project=self.project_name, component=self.component,
                               level=ReportLevels.IMPORTANT)

                        # Stop monitoring until the component is started again.
                        self.monitor_event.clear()

    def waitForCommand(self):
        """ Waits for a command notification on the component in etcd. If one is received,
            processes it by attempting to update the component.
        """
        is_initial_loop = True
        sleep_time = 0
        while True:
            # Sleep and then check again. The first iteration does not sleep.
            time.sleep(sleep_time)
            sleep_time = CHECK_SLEEP_TIME

            # Check the component's status.
            self.logger.debug('Checking state for component %s', self.component.getName())
            state = self.state.getState()
            self.logger.debug('Found state %s for component %s', state, self.component.getName())

            # Determine whether we should give initial status messages.
            was_initial_loop = is_initial_loop
            is_initial_loop = False

            # Take actions based on the status requested.
            current_status = ComponentState.getStatusOf(state)
            sleep_time = self.handleStatus(current_status, state, was_initial_loop)

    def handleStatus(self, current_status, state, was_initial_check):
        """ Handles the various status states for the component, returning the number of
            seconds to wait before looking up the state again. Statuses other than
            stopped, killed, ready and pull-failed are ignored and simply rechecked after
            CHECK_SLEEP_TIME seconds.
        """
        if current_status == STOPPED_STATUS:
            return self.handleStopped(was_initial_check)
        elif current_status == KILLED_STATUS:
            return self.handleKilled(was_initial_check)
        elif current_status == READY_STATUS or current_status == PULL_FAIL:
            # The monitor thread takes the same lock before restarting the component.
            with self.update_lock:
                return self.handleReady(state, was_initial_check)

        return CHECK_SLEEP_TIME

    def handleStopped(self, was_initial_check):
        """ Handles when the component has been marked to be stopped. """
        self.monitor_event.clear()

        if was_initial_check:
            report('Component %s is marked as stopped' % self.component.getName(),
                   project=self.project_name, component=self.component)

        self.is_running = False
        self.component.stop(kill=False)
        return CHECK_SLEEP_TIME

    def handleKilled(self, was_initial_check):
        """ Handles when the component has been marked to be killed. """
        self.monitor_event.clear()

        if was_initial_check:
            report('Component %s is marked as killed' % self.component.getName(),
                   project=self.project_name, component=self.component)

        self.is_running = False
        self.component.stop(kill=True)
        return CHECK_SLEEP_TIME

    def handleReady(self, state, was_initial_check):
        """ Handles when the component has been marked as ready. Returns the number of
            seconds to wait before checking the state again.
        """
        # If the status is ready, we update the component if:
        #   - The ID of the component's image does not match that found in the status.
        #   - The process is not running.
        imageid = ComponentState.getImageIdOf(state)
        imageid_different = imageid != self.component.getImageId()
        should_update = not self.is_running or imageid_different

        if not should_update:
            return CHECK_SLEEP_TIME

        self.is_running = False
        self.monitor_event.clear()

        # We need to update this machine's copy. First, do a test and set to ensure that
        # we are the only machine allowed to update. If the test and set fails, we'll
        # try again in CHECK_SHORT_SLEEP_TIME seconds.
        if imageid_different:
            report('Detected pushed update for component ' + self.component.getName(),
                   project=self.project_name, component=self.component)
        else:
            report('Component %s is not running; starting' % self.component.getName(),
                   project=self.project_name, component=self.component)

        result = self.state.setUpdatingStatus('updating', self.machine_id, state)
        if not result:
            # The exchange failed. Sleep CHECK_SHORT_SLEEP_TIME seconds and try again.
            report('Could not grab update lock. Will try again in %s seconds' % CHECK_SHORT_SLEEP_TIME,
                   project=self.project_name, component=self.component)
            return CHECK_SHORT_SLEEP_TIME

        # Start the update by pulling the repo for the component.
        if imageid_different:
            report('Pulling the image for component ' + self.component.getName(),
                   project=self.project_name, component=self.component)
            if not self.component.pullRepo():
                # The pull failed.
                report('Pull failed of image %s for component %s' % (imageid[0:12],
                                                                     self.component.getName()),
                       project=self.project_name, component=self.component,
                       level=ReportLevels.IMPORTANT)
                self.state.setUpdatingStatus('pullfail', self.machine_id, result)
                return CHECK_SLEEP_TIME

        # Run the update on the component and wait for it to finish.
        if imageid_different:
            report('Starting update for component ' + self.component.getName(),
                   project=self.project_name, component=self.component)

        if not self.component.update():
            # The update failed.
            self.state.setUpdatingStatus('updatefail', self.machine_id, result)
            return CHECK_SLEEP_TIME

        # Otherwise, the update has succeeded. Mark the component as ready, so another
        # gantryd can start its update.
        if imageid_different:
            report('Update completed for component ' + self.component.getName(),
                   project=self.project_name, component=self.component)
        else:
            report('Component ' + self.component.getName() + ' is now running',
                   project=self.project_name, component=self.component)

        self.state.setReadyStatus(self.component.getImageId())
        self.is_running = True
        self.monitor_event.set()

        return CHECK_SLEEP_TIME
""" 24 | # gantryd/projects/{project}/components/{componentname}/state 25 | return buildPath(PROJECT_NAMESPACE, projectName, COMPONENT_NAMESPACE, component.getName(), STATE_FILE) 26 | -------------------------------------------------------------------------------- /gantryd/etcdstate.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | 4 | class EtcdState(object): 5 | """ Base class for all helper classes which get and set state in etcd for objects. 6 | """ 7 | def __init__(self, state_path, etcd_client): 8 | self.etcd_client = etcd_client 9 | self.state_path = state_path 10 | 11 | # Logging. 12 | self.logger = logging.getLogger(__name__) 13 | 14 | def getState(self, default={}): 15 | """ Gets the state. """ 16 | try: 17 | self.logger.debug('Looking up etcd path: %s', self.state_path) 18 | return json.loads(self.etcd_client.get(self.state_path).value) 19 | except KeyError as k: 20 | pass 21 | except ValueError as v: 22 | self.logger.exception(v) 23 | pass 24 | 25 | return default 26 | 27 | def replaceState(self, previous_state, new_state): 28 | """ Attempts to atomically replace the given previous state with a new state. 29 | On success, returns the new state object. On failure, returns None. 30 | """ 31 | try: 32 | self.logger.debug('Test and set replacing etcd path: %s', self.state_path) 33 | original_contents_json = json.dumps(previous_state, separators=(',', ':')) 34 | new_contents_json = json.dumps(new_state, separators=(',', ':')) 35 | self.etcd_client.test_and_set(self.state_path, new_contents_json, original_contents_json) 36 | except ValueError as e: 37 | self.logger.debug('Test and set replacment for etcd path %s failed', self.state_path) 38 | return None 39 | 40 | return new_state 41 | 42 | def buildAndSetState(self, **kwargs): 43 | """ Builds state from the given args and sets the state. 
""" 44 | state_obj = dict(kwargs) 45 | self.setState(state_obj) 46 | 47 | def setState(self, state_obj={}, ttl=None): 48 | """ Sets the state to the given object. """ 49 | self.etcd_client.set(self.state_path, json.dumps(state_obj, separators=(',', ':')), ttl=ttl) 50 | 51 | def deleteState(self): 52 | """ Deletes the state. """ 53 | self.etcd_client.delete() 54 | 55 | -------------------------------------------------------------------------------- /gantryd/machinestate.py: -------------------------------------------------------------------------------- 1 | import json 2 | import socket 3 | 4 | from etcdstate import EtcdState 5 | from etcdpaths import getMachineStatePath 6 | 7 | STATUS_RUNNING = 'running' 8 | 9 | class MachineState(EtcdState): 10 | """ Helper class which allows easy getting and setting of the etcd distributed 11 | state of a machine. 12 | """ 13 | def __init__(self, project_name, machine_id, etcd_client): 14 | path = getMachineStatePath(project_name, machine_id) 15 | super(MachineState, self).__init__(path, etcd_client) 16 | 17 | def registerMachine(self, component_names, ttl=60): 18 | """ Registers this machine with etcd. """ 19 | machine_state = { 20 | 'status': STATUS_RUNNING, 21 | 'components': component_names, 22 | 'ip': socket.gethostbyname(socket.gethostname()) 23 | } 24 | 25 | self.setState(machine_state, ttl=ttl) 26 | 27 | def getStatus(self): 28 | """ Returns the status of this machine. """ 29 | return self.getState({'status': 'unknown'}) 30 | 31 | def removeMachine(self): 32 | """ Removes this machine from etcd. 
def buildHealthCheck(check_config):
    """ Constructs the health check registered in HEALTH_CHECKS for the kind named by the
        given check config and returns it. An unregistered kind is reported through fail().
    """
    check_kind = check_config.kind
    if check_kind not in HEALTH_CHECKS:
        fail('Unknown health check: ' + check_kind)

    builder = HEALTH_CHECKS[check_kind]
    return builder(check_config)
class TcpCheck(HealthCheck):
    """ A health check which tries to connect to a port via TCP. """
    def __init__(self, config):
        super(TcpCheck, self).__init__()
        self.config = config

    def run(self, container, report):
        """ Opens a TCP connection to the configured 'port' extra field on the container's
            IP address. Returns True if the connection succeeds and False otherwise.
        """
        container_port = self.config.getExtraField('port')
        container_ip = self.getContainerIPAddress(container)

        report('Checking TCP port in container ' + container['Id'][0:12] + ': ' + str(container_port),
               level=ReportLevels.EXTRA)

        sock = None
        try:
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            sock.connect((container_ip, container_port))
        except Exception as exc:
            # Log through the logger (as the HTTP check does) rather than printing.
            self.logger.exception(exc)
            return False
        finally:
            # Always release the socket, including when the connect attempt failed.
            if sock is not None:
                sock.close()

        return True
class HttpTerminationSignal(TerminationSignal):
    """ A termination signal which tries to POST to an HTTP server on a known port. """
    def __init__(self, protocol, config):
        # Name this class (not its parent) in super() so that every class in the MRO,
        # including TerminationSignal itself, gets a chance to initialize.
        super(HttpTerminationSignal, self).__init__()
        self.protocol = protocol
        self.config = config

    def run(self, container, report):
        """ POSTs an empty body to the configured port (and optional 'path' extra field) on
            the container's IP address. Returns True if the request and the read of its
            response succeed, and False if either raises.
        """
        container_port = self.config.getExtraField('port')
        container_ip = self.getContainerIPAddress(container)

        address = '%s://%s:%s' % (self.protocol, container_ip, container_port)
        if self.config.hasExtraField('path'):
            address += self.config.getExtraField('path')

        # Supplying a data argument (even an empty one) makes urllib2 issue a POST.
        data = ''

        report('Posting to HTTP address in container ' + container['Id'][0:12] + ': ' + address,
               level=ReportLevels.EXTRA)
        try:
            req = urllib2.Request(address, data)
            response = urllib2.urlopen(req, timeout=2)
            response.read()
        except Exception as exc:
            self.logger.exception(exc)
            return False

        return True
class Proxy(object):
    """ Wrapper around an haproxy process: collects routes keyed by external port, renders
        them into the haproxy config file and runs the restart/shutdown scripts.
    """
    def __init__(self):
        # The registered routes, by external port number.
        self._port_routes = {}

        template_env = Environment(loader=FileSystemLoader(TEMPLATE_FOLDER))
        self._template = template_env.get_template(HAPROXY_TEMPLATE)

    @staticmethod
    def get_connections():
        """ Returns the connection information for all proxy processes, leaving out
            connections whose status is CLOSE_WAIT.
        """
        logger.debug('Getting proxy connections')
        found = []
        for proc in psutil.process_iter():
            if not proc.is_running() or proc.name() != HAPROXY:
                continue

            for conn in proc.get_connections():
                if conn.status != CLOSE_WAIT:
                    found.append(conn)

        return found

    def clear_routes(self):
        """ Clears all routes found in the proxy. """
        self._port_routes = {}

    def add_route(self, route):
        """ Adds a route to the proxy (but does not commit the changes). """
        self._port_routes[route.host_port] = route

    def shutdown(self):
        """ Shuts down the proxy entirely. """
        subprocess.call('./shutdown-haproxy.sh', shell=True, close_fds=True)

    def commit(self):
        """ Commits the changes made to the proxy: writes the rendered config file and
            runs the haproxy restart script.
        """
        logger.debug("Restarting haproxy with new rules.")

        # If the port routes are empty, add a dummy mapping to the proxy.
        if not self._port_routes:
            placeholder = Route(False, 65535, '127.0.0.2', 65534, is_fake=True)
            self.add_route(placeholder)

        # Write out the config.
        config_text = self._template.render({'port_routes': self._port_routes})
        with open(HAPROXY_CONFIG_FILE, 'w') as config_file:
            config_file.write(config_text)

        # Restart haproxy
        subprocess.call('./restart-haproxy.sh', shell=True, close_fds=True)
""" 75 | def __init__(self, is_http, host_port, container_ip, container_port, is_fake=False): 76 | self.id = str(uuid4()) 77 | self.is_fake = is_fake 78 | self.is_http = is_http 79 | self.host_port = host_port 80 | self.container_ip = container_ip 81 | self.container_port = container_port 82 | -------------------------------------------------------------------------------- /requirements.system: -------------------------------------------------------------------------------- 1 | python-dev 2 | haproxy 3 | python-virtualenv 4 | libssl-dev 5 | libffi-dev -------------------------------------------------------------------------------- /requirements.system.rhel: -------------------------------------------------------------------------------- 1 | python-devel 2 | haproxy 3 | python-virtualenv 4 | python-pip 5 | openssl-devel 6 | libffi-devel 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | termcolor 2 | docker-py 3 | psutil >=2.2.1,<3.0 4 | jinja2 5 | python-etcd 6 | peewee -------------------------------------------------------------------------------- /restart-haproxy.sh: -------------------------------------------------------------------------------- 1 | running="/var/run/haproxy-private.pid" 2 | if [ -f "$running" ] 3 | then 4 | haproxy -f haproxy.conf -sf $(cat /var/run/haproxy-private.pid) 5 | else 6 | haproxy -f haproxy.conf 7 | fi -------------------------------------------------------------------------------- /runtime/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DevTable/gantryd/eb348113f0f73a0be45a45f7a5626ad2b5dd30ba/runtime/__init__.py -------------------------------------------------------------------------------- /runtime/component.py: -------------------------------------------------------------------------------- 1 | from threading 
import Thread, Event 2 | 3 | from health.checks import buildHealthCheck 4 | from metadata import (getContainerStatus, setContainerStatus, removeContainerMetadata, 5 | getContainerComponent, setContainerComponent) 6 | from util import report, fail, getDockerClient, ReportLevels 7 | 8 | import time 9 | import logging 10 | 11 | class Component(object): 12 | """ A component that can be/is running. Tracks all the runtime information 13 | for a component. 14 | """ 15 | def __init__(self, manager, config): 16 | # Logging. 17 | self.logger = logging.getLogger(__name__) 18 | 19 | # The overall manager for components, which tracks global state. 20 | self.manager = manager 21 | 22 | # The underlying config for the component. 23 | self.config = config 24 | 25 | def applyConfigOverrides(self, config_overrides): 26 | """ Applies the list of configuration overrides to this component's config. 27 | 28 | Format: ['Name=Value', 'Name.SubName=Value'] 29 | """ 30 | for override in config_overrides: 31 | self.config.applyOverride(override) 32 | 33 | def getName(self): 34 | """ Returns the name of the component. """ 35 | return self.config.name 36 | 37 | def lookupExportedComponentLink(self, link_name): 38 | """ Looks up the exported component link with the given name and returns it or None if none. """ 39 | pass 40 | 41 | def isRunning(self): 42 | """ Returns whether this component has at least one running container. Note that 43 | this will return True for ALL possible containers of the component, including 44 | deprecated ones. 45 | """ 46 | self.logger.debug('Checking if component %s is running', self.getName()) 47 | client = getDockerClient() 48 | return len(self.getAllContainers(client)) > 0 49 | 50 | def getPrimaryContainer(self): 51 | """ Returns the container for this component that is not marked as draining or None if 52 | none. 
def pullRepo(self):
    """ Pulls the configured repo and tag for this component through the docker client.
        Returns True when the pull completes; any exception is logged and turned into False.
    """
    try:
        repo, tag = self.config.repo, self.config.tag
        self.logger.debug('Attempting to pull repo for component %s: %s:%s', self.getName(), repo, tag)
        getDockerClient().pull(repo, tag=tag)
    except Exception as pull_error:
        self.logger.exception(pull_error)
        return False

    return True
def stop(self, kill=False):
    """ Stops all containers for this component: marks each as draining and asks the
        manager to terminate it, optionally kills whatever containers are still listed,
        then lets the manager adjust for the stopped component. Does nothing if the
        component is not running.
    """
    if not self.isRunning():
        return

    self.logger.debug('Stopping component %s', self.getName())
    docker_client = getDockerClient()

    # Mark all the containers as draining.
    report('Draining all containers...', component=self)
    for draining in self.getAllContainers(docker_client):
        setContainerStatus(draining, 'draining')
        self.manager.terminateContainer(draining, self)

    # Kill any associated containers if asked.
    if kill:
        for doomed in self.getAllContainers(docker_client):
            report('Killing container ' + doomed['Id'][:12], component=self)
            docker_client.kill(doomed)
            removeContainerMetadata(doomed)

    # Clear the proxy and rebuild its routes for the running components.
    self.manager.adjustForStoppingComponent(self)
def readyCheck(self, container, timeout):
    """ Method which performs ready health check(s) on a container, returning whether
        they succeeded or not.

        container: The container running the component that will be checked.
        timeout: The amount of time (in seconds) after which the checks have timed out.

        Returns True once every configured ready check passes in a single round, or False
        if the timeout elapses first. The timeout is only examined at the start of each
        round, so a round in progress (and the sleep that follows a failure) is not
        interrupted.
    """
    self.logger.debug('Checking if component %s is ready...', self.getName())
    checks = [(check, buildHealthCheck(check)) for check in self.config.ready_checks]

    start = time.time()
    while True:
        if time.time() - start > timeout:
            # Timed out completely.
            self.logger.debug('Component %s ready checks have timed out', self.getName())
            return False

        # Try each check. If any fail, we'll sleep and try again.
        check_failed = None
        for (config, check) in checks:
            report('Running health check: ' + config.getTitle(), component=self)
            if not check.run(container, report):
                report('Health check failed', component=self)
                check_failed = config
                break

        # Compare against None explicitly so a falsy config object cannot be mistaken
        # for "no failure".
        if check_failed is None:
            return True

        report('Sleeping ' + str(check_failed.timeout) + ' second(s)...', component=self)
        time.sleep(check_failed.timeout)
def getAllContainers(self, client):
    """ Lists the containers reported by the given Docker client that belong to this
        component.

        A container belongs to the component when its recorded component name equals this
        component's name, or when it has no recorded component name and its image equals
        the component's full image.
    """
    def ownedByComponent(candidate):
        owner = getContainerComponent(candidate)
        if owner:
            return owner == self.getName()

        # No owner recorded for the container: fall back to matching on the image.
        return (candidate['Image'] == self.config.getFullImage() or
                owner == self.getName())

    return [candidate for candidate in client.containers() if ownedByComponent(candidate)]
def getCommand(self):
    """ Returns the command to run or None if none found.

        The command defined in the gantry config takes precedence. Otherwise the 'Cmd'
        entry of the inspected image's 'Config' is used, joined with single spaces.
    """
    config_command = self.config.getCommand()
    if config_command:
        return config_command

    client = getDockerClient()
    image_info = client.inspect_image(self.config.getFullImage())

    # The 'Cmd' key can be present with a null (or empty) value when the image defines no
    # default command; treat that the same as a missing key instead of letting
    # ' '.join(None) raise a TypeError. A null 'Config' is handled the same way.
    image_command = (image_info['Config'] or {}).get('Cmd')
    if not image_command:
        return None

    return ' '.join(image_command)
339 | """ 340 | images = client.images(name=self.config.repo) 341 | if images: 342 | for image in images: 343 | if 'RepoTags' in image.keys() and self.config.getFullImage() in image['RepoTags']: 344 | return 345 | 346 | try: 347 | client.pull(self.config.repo, tag=self.config.tag) 348 | except Exception as e: 349 | fail('Could not pull repo ' + self.config.repo, component=self, exception=str(e)) 350 | -------------------------------------------------------------------------------- /runtime/manager.py: -------------------------------------------------------------------------------- 1 | from component import Component 2 | from metadata import getContainerStatus, setContainerStatus, removeContainerMetadata 3 | from proxy.portproxy import Proxy, Route 4 | from util import report, fail, getDockerClient, ReportLevels 5 | from health.checks import buildTerminationSignal, buildHealthCheck 6 | 7 | from collections import defaultdict 8 | from Queue import Queue 9 | from multiprocessing.pool import ThreadPool 10 | 11 | import docker 12 | import psutil 13 | import threading 14 | import time 15 | import logging 16 | import containerutil 17 | 18 | class ComponentLinkInformation(object): 19 | """ Helper class which contains all runtime information about a component link. """ 20 | def __init__(self, manager, component, link_config): 21 | # The component that exports the link. 22 | self.component = component 23 | 24 | # The configuration for the component link. 25 | self.link_config = link_config 26 | 27 | # The kind of the link. 28 | self.kind = 'http' if link_config.kind.lower() == 'http' else 'tcp' 29 | 30 | # The port of the link inside the running container. 31 | self.container_port = link_config.port 32 | 33 | # The address of the link under the proxy (None if the link is not running). 34 | self.address = None 35 | 36 | # The port of the link under the proxy (None if the link is not running). 37 | self.exposed_port = None 38 | 39 | # Whether the link is currently running. 
class RuntimeManager(object):
    """ Manager class which handles tracking of all the components and other runtime
        information.
    """
    def __init__(self, config):
        # Logging.
        self.logger = logging.getLogger(__name__)

        # The overall configuration.
        self.config = config

        # The proxy being used to talk to HAProxy.
        self.proxy = Proxy()

        # The components, by name. The map is created before the components are built
        # because each component receives this manager in its constructor.
        self.components = {}
        for component_config in config.components:
            self.components[component_config.name] = Component(self, component_config)

        # Create the lock for the watcher thread and the notification event.
        self.watcher_lock = threading.Lock()
        self.watcher_event = threading.Event()

        # The set of containers which should be terminated by the terminating workers.
        self.containers_to_terminate = Queue()

        # The pool whose workers watch and stop containers that are no longer needed.
        self.pool = ThreadPool()

        # Place to collect the results of the monitor
        self.monitor_futures = Queue()

    def getComponent(self, name):
        """ Returns the component with the given name defined or None if none. """
        return self.components.get(name)

    def lookupComponentLink(self, link_name):
        """ Looks up the component link with the given name defined or None if none.

            Returns a ComponentLinkInformation for the first component found that
            defines the link.
        """
        for component in self.components.values():
            defined_links = component.config.getDefinedComponentLinks()
            if link_name in defined_links:
                return ComponentLinkInformation(self, component, defined_links[link_name])

        return None

    def adjustForUpdatingComponent(self, component, started_container):
        """ Adjusts the runtime for a component which has been started in the given
            container.
        """
        self.logger.debug('Adjusting runtime for updating component: %s', component.getName())
        self.updateProxy()

    def adjustForStoppingComponent(self, component):
        """ Adjusts the runtime for a component which has been stopped. """
        self.logger.debug('Adjusting runtime for stopped component: %s', component.getName())
        self.updateProxy()

    def watchTermination(self, container, component):
        """ Sends the component's termination signals to the container, waits until each
            termination check passes, then stops the container and removes its metadata.

            Each check is retried (sleeping for the check's own timeout between attempts)
            until it passes; there is no overall deadline.

            container: The container to shut down.
            component: The component that owns the container.
        """
        report('Monitor check started', level=ReportLevels.BACKGROUND)

        client = getDockerClient()

        # Send the termination signal(s) to the container
        signals = [(signal, buildTerminationSignal(signal))
                   for signal in component.config.termination_signals]

        report('Sending %s termination signals' % len(signals), component=component)

        for (config, signal) in signals:
            report('Sending termination signal: ' + config.getTitle(), component=component)
            if not signal.run(container, report):
                # A failed signal is only reported; the checks below still run.
                report('Termination signal failed', component=component)

        # Now wait until all of the termination conditions are met
        checks = [(check, buildHealthCheck(check))
                  for check in component.config.termination_checks]

        report('Waiting for %s termination checks' % len(checks), component=component)

        for (config, check) in checks:
            while True:
                report('Running termination check: ' + config.getTitle(), component=component)
                if check.run(container, report):
                    break

                report('Termination check failed', component=component)
                report('Sleeping ' + str(config.timeout) + ' second(s)...', component=component)
                time.sleep(config.timeout)

        report('Monitor check finished', level=ReportLevels.BACKGROUND)

        setContainerStatus(container, 'shutting-down')
        report('Shutting down container: ' + container['Id'][0:12], level=ReportLevels.BACKGROUND)
        client.stop(container)
        removeContainerMetadata(container)

    def terminateContainer(self, container, component):
        """ Schedules termination of the given container on the worker pool. The pending
            result is queued so that join() can wait on it.
        """
        report('Terminating container: %s' % container['Id'][:12], component=component)
        future = self.pool.apply_async(self.watchTermination, (container, component))
        self.monitor_futures.put(future)

    def updateProxy(self):
        """ Updates the proxy used for port mapping to conform to the current running
            container list.

            Routes are rebuilt from scratch for every container that is not draining;
            those containers are marked as running once the proxy has been committed. If
            no containers are found at all, the proxy is shut down instead.
        """
        client = getDockerClient()

        # Clear all routes in the proxy.
        # TODO: When this is in daemon mode, don't need do this. We could selectively
        # edit it instead.
        self.proxy.clear_routes()

        # Add routes for the non-draining containers and collect the draining containers.
        report('Finding running containers...', level=ReportLevels.EXTRA)
        draining_containers = []
        starting_containers = []

        for component in self.components.values():
            for container in component.getAllContainers(client):
                if getContainerStatus(container) == 'draining':
                    draining_containers.append(container)
                    continue

                container_ip = containerutil.getContainerIPAddress(client, container)
                starting_containers.append(container)

                # Add the normal exposed ports.
                for mapping in component.config.ports:
                    route = Route(mapping.kind == 'http', mapping.external, container_ip,
                                  mapping.container)
                    self.proxy.add_route(route)

                # Add the container link ports.
                for link in component.config.defined_component_links:
                    route = Route(link.kind == 'http', link.getHostPort(), container_ip,
                                  link.port)
                    self.proxy.add_route(route)

        # Commit the changes to the proxy.
        if draining_containers or starting_containers:
            report('Updating proxy...', level=ReportLevels.EXTRA)
            self.proxy.commit()
        else:
            report('Shutting down proxy...', level=ReportLevels.EXTRA)
            self.proxy.shutdown()

        # Mark the starting containers as running.
        for container in starting_containers:
            setContainerStatus(container, 'running')

    def join(self):
        """ Closes the worker pool and waits for all scheduled termination monitors to
            finish. An exception raised by a monitor is re-raised here.
        """
        self.pool.close()

        while not self.monitor_futures.empty():
            # If any of the futures threw an exception we'll get it now
            self.monitor_futures.get().get()

        self.pool.join()
def db_access(to_wrap):
    """ Decorator for functions that touch the metadata database.

        Before each call the database tables are initialized via _initialze_db(); after
        the call (whether it returned or raised) the database connection is closed if it
        is still open.
    """
    def _closeIfOpen():
        if db.is_closed():
            return

        db.close()

    @wraps(to_wrap)
    def wrapper(*args, **kwargs):
        _initialze_db()

        try:
            result = to_wrap(*args, **kwargs)
        finally:
            _closeIfOpen()

        return result

    return wrapper
@db_access
def removeContainerMetadata(container):
    """ Removes the metadata record stored for the given container (recursively, so rows
        that reference it are deleted as well). Does nothing if no record exists.

        container: A container dict (its 'Id' entry is used) or a container ID.
    """
    container_id = _getContainerId(container)

    try:
        found = (Container
                 .select()
                 .where(Container.docker_id == container_id)
                 .get())
    except Container.DoesNotExist:
        # Nothing was recorded for this container. Look the record up directly rather
        # than upserting it, so that a row is not inserted only to be deleted again.
        return

    found.delete_instance(recursive=True)
""" 144 | container_id = _getContainerId(container) 145 | found = _getContainerFieldRecord(container_id, field) 146 | if found is not None: 147 | found.value = value 148 | found.save() 149 | else: 150 | container_record = _upsertContainerRecord(container_id) 151 | ContainerField.create(container=container_record, key=field, value=value) 152 | 153 | 154 | def _upsertComponentRecord(component): 155 | try: 156 | return (Component 157 | .select() 158 | .where(Component.name == component) 159 | .get()) 160 | except Component.DoesNotExist: 161 | return Component.create(name=component) 162 | 163 | 164 | def _getComponentFieldRecord(component_name, field): 165 | try: 166 | return (ComponentField 167 | .select() 168 | .join(Component) 169 | .where(Component.name == component_name, ComponentField.key == field) 170 | .get()) 171 | except ComponentField.DoesNotExist: 172 | return None 173 | 174 | 175 | @db_access 176 | def getComponentField(component_name, field, default): 177 | """ Returns the metadata field for the given component or the default value. """ 178 | found = _getComponentFieldRecord(component_name, field) 179 | return found.value if found else default 180 | 181 | 182 | @db_access 183 | def setComponentField(component_name, field, value): 184 | """ Sets the metadata field for the given component. """ 185 | found = _getComponentFieldRecord(component_name, field) 186 | if found is not None: 187 | found.value = value 188 | found.save() 189 | else: 190 | component = _upsertComponentRecord(component_name) 191 | ComponentField.create(component=component, key=field, value=value) 192 | -------------------------------------------------------------------------------- /shutdown-haproxy.sh: -------------------------------------------------------------------------------- 1 | running="/var/run/haproxy-private.pid" 2 | if [ -f "$running" ] 3 | then 4 | kill $(cat