├── .clog.toml ├── .coveragerc ├── .flooignore ├── .gitignore ├── .travis.yml ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── ardere ├── __init__.py ├── aws.py ├── exceptions.py ├── scripts │ ├── __init__.py │ └── metric_creator.py └── step_functions.py ├── config.bash ├── default_dashboard.json ├── handler.py ├── package.json ├── requirements.txt ├── serverless.yml ├── setup.cfg ├── src └── shell │ ├── telegraf.toml │ └── waitforcluster.sh ├── test-requirements.txt └── tests ├── __init__.py ├── fixtures.py ├── test_aws.py ├── test_metric_creator.py └── test_step_functions.py /.clog.toml: -------------------------------------------------------------------------------- 1 | [clog] 2 | repository = "https://github.com/loads/ardere" 3 | changelog = "CHANGELOG.md" 4 | from-latest-tag = true 5 | link-style = "github" 6 | 7 | [sections] 8 | Refactor = ["refactor"] 9 | Test = ["test"] 10 | Doc = ["docs"] 11 | Chore = ["chore"] 12 | Features = ["feat", "feature"] 13 | "Bug Fixes" = ["fix", "bug"] 14 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | show_missing = True 3 | -------------------------------------------------------------------------------- /.flooignore: -------------------------------------------------------------------------------- 1 | # Distribution / packaging 2 | .Python 3 | env/ 4 | build/ 5 | develop-eggs/ 6 | dist/ 7 | downloads/ 8 | eggs/ 9 | .eggs/ 10 | lib/ 11 | lib64/ 12 | parts/ 13 | sdist/ 14 | var/ 15 | *.egg-info/ 16 | .installed.cfg 17 | *.egg 18 | node_modules/ 19 | 20 | # Serverless directories 21 | .serverless 22 | .requirements 23 | ardenv/ 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Distribution / packaging 2 | .coverage 3 | .floo 4 | .idea 5 | .Python 6 | .requirements 7 | .npmignore 8 | ardenv/ 9 | env/ 10 | build/ 11 | develop-eggs/ 12 | dist/ 13 | downloads/ 14 | eggs/ 15 | .eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.pyc 24 | *.egg 25 | node_modules/ 26 | 27 | # Serverless directories 28 | .serverless 29 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | cache: pip 3 | sudo: required 4 | dist: precise 5 | 6 | matrix: 7 | include: 8 | - python: 2.7 9 | env: CODECOV=true 10 | 11 | install: 12 | - pip install -r test-requirements.txt 13 | - pip install ${CODECOV:+codecov} 14 | script: 15 | - nosetests -d tests ${CODECOV:+--with-coverage --cover-xml --cover-package=ardere} 16 | after_success: 17 | - codecov 18 | notifications: 19 | slack: 20 | secure: vT9sWtUuxk28g6xYKAsQmiPZllErOYVfx5lcL+/jo1eRFrmbpYnyndT6s+FxGI1547oizZ0IqZbHVvB7BUoSJixXJyQJYXW2MchwN1UeHrey8mYpF1GNEaJT7FMfqSkxUU9gvAZ3IU7zstNeTLbfG1GkLuzybp0WAiHl/ocUTz8= 21 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | 2 | ## 0.1.1 (2017-05-15) 3 | 4 | 5 | #### Doc 6 | 7 | * update README with run steps ([3d6b5aa2](https://github.com/loads/ardere/commit/3d6b5aa2e6277a33e1a464d30168bbc2f406c512)) 8 | 9 | #### Bug Fixes 10 | 11 | * bump wait for cluster ready from 10 to 30 minutes ([a23115b8](https://github.com/loads/ardere/commit/a23115b8bc20f4e7b44ef4bf78b3687069ea1253)) 12 | 13 | 14 | 15 | 16 | ## 0.1 (2017-04-25) 17 | 18 | 19 | #### Features 20 | 21 | * secure metrics/load-test nodes from outside access ([3d08dccd](https://github.com/loads/ardere/commit/3d08dccd2376f85976b2f7bd026295c504560485), closes [#54](https://github.com/loads/ardere/issues/54)) 22 | * Check names for invalid characters and lengths ([c886f6a9](https://github.com/loads/ardere/commit/c886f6a9598badb084871720515ff1663e61c032)) 23 | * use security groups to restrict node access ([6395f9cd](https://github.com/loads/ardere/commit/6395f9cd52ab0c74a2735a8fecc2b30a217ddfda), closes [#48](https://github.com/loads/ardere/issues/48)) 24 | * add grafana dashboarding ([a7a30df8](https://github.com/loads/ardere/commit/a7a30df8210429341e711ad713510e00acdc80c1), closes [#40](https://github.com/loads/ardere/issues/40)) 25 | * add telegraf setup for per-container stat reporting ([7749e2eb](https://github.com/loads/ardere/commit/7749e2eb373a6f6afc49b2a7d03fcf5c4f9a18fb), closes [#33](https://github.com/loads/ardere/issues/33)) 26 | * start influxdb with test runs ([8ddc48b5](https://github.com/loads/ardere/commit/8ddc48b5d3d395d54a914166e9803a6ab41ecf3f), closes [#19](https://github.com/loads/ardere/issues/19)) 27 | * validate test plan before running ([0314fae7](https://github.com/loads/ardere/commit/0314fae70962f6281a261499e32500291ff764ab), closes [#21](https://github.com/loads/ardere/issues/21)) 28 | * remove need to specify cpu_units ([e99eddea](https://github.com/loads/ardere/commit/e99eddead4b4119e508546aa38dc34873efa9632), closes [#20](https://github.com/loads/ardere/issues/20)) 29 | * add port mapping for containers ([af054af1](https://github.com/loads/ardere/commit/af054af18e6ab5e4cd163c903867dc2cfe415168), closes [#24](https://github.com/loads/ardere/issues/24)) 30 | * add toml loading as a test plan ([8342cb11](https://github.com/loads/ardere/commit/8342cb11902f6a225925cd1f8fd430d31a614cf9), closes [#32](https://github.com/loads/ardere/issues/32)) 31 | * use cloudwatch logs for container output ([8bafa09f](https://github.com/loads/ardere/commit/8bafa09f82ad0116e31cc49849b7bd679219506c), closes [#27](https://github.com/loads/ardere/issues/27)) 32 | * setup environment data from the test plan ([7e2ad2da](https://github.com/loads/ardere/commit/7e2ad2dad361336a4d46166e6aec32cd80c15e03), closes [#25](https://github.com/loads/ardere/issues/25)) 33 | * fixup readme and test suite ([047a7fa6](https://github.com/loads/ardere/commit/047a7fa6381f4d034fd0c2955e90319a29730c76), closes [#22](https://github.com/loads/ardere/issues/22)) 34 | * create MVP using serverless w/python ([9aa80467](https://github.com/loads/ardere/commit/9aa80467ce86b95e330886c1dcf57e5d84004e83), closes [#17](https://github.com/loads/ardere/issues/17)) 35 | * add the lambda to start the run by writing to s3 ([e45a2789](https://github.com/loads/ardere/commit/e45a278930589b8dddbf88e3fe151f979d388edd)) 36 | * add lambda function and basic CF templates for use ([0cb63bff](https://github.com/loads/ardere/commit/0cb63bff8f1d7b2533ee40a81a932e3bb618236f), closes [#11](https://github.com/loads/ardere/issues/11)) 37 | * add an initial state machine impl ([2f571b0a](https://github.com/loads/ardere/commit/2f571b0aec7df9252c8d0fce44da252c17985fa2)) 38 | * initial waiter script (#9) ([c07749c0](https://github.com/loads/ardere/commit/c07749c06a97bba50fe1701a2896d9b5a11dd18e)) 39 | 40 | #### Doc 41 | 42 | * update for use of cloud formation in setup (#2) ([243a4a11](https://github.com/loads/ardere/commit/243a4a11da3343735815dd42a0c78bb6936adf56)) 43 | * initial design docs from autoconf ([eead6dd8](https://github.com/loads/ardere/commit/eead6dd80a43c24b40047fc5c22571122878ce05)) 44 | 45 | #### Bug Fixes 46 | 47 | * check service drained vs container draining ([fd4907e1](https://github.com/loads/ardere/commit/fd4907e10be9103cc9e20511c2e16c4ae906e469), closes [#62](https://github.com/loads/ardere/issues/62)) 48 | * Do not check 'metrics' instance for draining ([40e8cd01](https://github.com/loads/ardere/commit/40e8cd01fc996f1596370c5ddb6ff6998b04ffdc)) 49 | * Ensure all containers drained before exiting ([4cbea2fd](https://github.com/loads/ardere/commit/4cbea2fd0a280993d4312f82ba52354a0bf15f7f)) 50 | * add proper tagging and socket limits ([15dc023e](https://github.com/loads/ardere/commit/15dc023efc91a0b3b644084a71f3f6f46be77158), closes [#44](https://github.com/loads/ardere/issues/44)) 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribution Guidelines 2 | 3 | Anyone is welcome to contribute to this project. Feel free to get in touch with 4 | other community members on IRC, the mailing list or through issues here on 5 | GitHub. 6 | 7 | [See the README](/README.md) for contact information. 8 | 9 | ## Bug Reports 10 | 11 | You can file issues here on GitHub. Please try to include as much information as 12 | you can and under what conditions you saw the issue. 13 | 14 | ## Sending Pull Requests 15 | 16 | Patches should be submitted as pull requests (PR). 17 | 18 | Before submitting a PR: 19 | - Your code must run and pass all the automated tests before you submit your PR 20 | for review. "Work in progress" pull requests are allowed to be submitted, but 21 | should be clearly labeled as such and should not be merged until all tests 22 | pass and the code has been reviewed. 23 | - Your patch should include new tests that cover your changes. It is your and 24 | your reviewer's responsibility to ensure your patch includes adequate tests. 25 | 26 | When submitting a PR: 27 | - You agree to license your code under the project's open source license 28 | ([MPL 2.0](/LICENSE)). 29 | - Base your branch off the current `master` (see below for an example workflow). 30 | - Add both your code and new tests if relevant. 31 | - Run the test suite to make sure your code passes linting and tests. 32 | - Please do not include merge commits in pull requests; include only commits with the new relevant code. 33 | 34 | See the main [README.md](/README.md) for information on prerequisites, installing, running and testing. 35 | 36 | ## Code Review 37 | 38 | This project is production Mozilla code and subject to our [engineering practices and quality standards](https://developer.mozilla.org/en-US/docs/Mozilla/Developer_guide/Committing_Rules_and_Responsibilities). Every patch must be peer reviewed. 39 | 40 | ## Git Commit Guidelines 41 | 42 | We loosely follow the [Angular commit guidelines](https://github.com/angular/angular.js/blob/master/CONTRIBUTING.md#type) of `(): ` where `type` must be one of: 43 | 44 | * **feat**: A new feature 45 | * **fix**: A bug fix 46 | * **docs**: Documentation only changes 47 | * **style**: Changes that do not affect the meaning of the code (white-space, formatting, missing 48 | semi-colons, etc) 49 | * **refactor**: A code change that neither fixes a bug or adds a feature 50 | * **perf**: A code change that improves performance 51 | * **test**: Adding missing tests 52 | * **chore**: Changes to the build process or auxiliary tools and libraries such as documentation 53 | generation 54 | 55 | ### Scope 56 | The scope could be anything specifying place of the commit change. 57 | 58 | ### Subject 59 | The subject contains succinct description of the change: 60 | 61 | * use the imperative, present tense: "change" not "changed" nor "changes" 62 | * don't capitalize first letter 63 | * no dot (.) at the end 64 | 65 | ###Body 66 | In order to maintain a reference to the context of the commit, add 67 | `fixes #` if it closes a related issue or `issue #` 68 | if it's a partial fix. 69 | 70 | You can also write a detailed description of the commit: Just as in the 71 | **subject**, use the imperative, present tense: "change" not "changed" nor 72 | "changes" It should include the motivation for the change and contrast this with 73 | previous behavior. 74 | 75 | ###Footer 76 | The footer should contain any information about **Breaking Changes** and is also 77 | the place to reference GitHub issues that this commit **Closes**. 78 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Mozilla Public License Version 2.0 2 | ================================== 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor's Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Incompatible With Secondary Licenses" 25 | means 26 | 27 | (a) that the initial Contributor has attached the notice described 28 | in Exhibit B to the Covered Software; or 29 | 30 | (b) that the Covered Software was made available under the terms of 31 | version 1.1 or earlier of the License, but not also under the 32 | terms of a Secondary License. 33 | 34 | 1.6. "Executable Form" 35 | means any form of the work other than Source Code Form. 36 | 37 | 1.7. "Larger Work" 38 | means a work that combines Covered Software with other material, in 39 | a separate file or files, that is not Covered Software. 40 | 41 | 1.8. "License" 42 | means this document. 43 | 44 | 1.9. "Licensable" 45 | means having the right to grant, to the maximum extent possible, 46 | whether at the time of the initial grant or subsequently, any and 47 | all of the rights conveyed by this License. 48 | 49 | 1.10. "Modifications" 50 | means any of the following: 51 | 52 | (a) any file in Source Code Form that results from an addition to, 53 | deletion from, or modification of the contents of Covered 54 | Software; or 55 | 56 | (b) any new file in Source Code Form that contains any Covered 57 | Software. 58 | 59 | 1.11. "Patent Claims" of a Contributor 60 | means any patent claim(s), including without limitation, method, 61 | process, and apparatus claims, in any patent Licensable by such 62 | Contributor that would be infringed, but for the grant of the 63 | License, by the making, using, selling, offering for sale, having 64 | made, import, or transfer of either its Contributions or its 65 | Contributor Version. 66 | 67 | 1.12. "Secondary License" 68 | means either the GNU General Public License, Version 2.0, the GNU 69 | Lesser General Public License, Version 2.1, the GNU Affero General 70 | Public License, Version 3.0, or any later versions of those 71 | licenses. 72 | 73 | 1.13. "Source Code Form" 74 | means the form of the work preferred for making modifications. 75 | 76 | 1.14. "You" (or "Your") 77 | means an individual or a legal entity exercising rights under this 78 | License. For legal entities, "You" includes any entity that 79 | controls, is controlled by, or is under common control with You. For 80 | purposes of this definition, "control" means (a) the power, direct 81 | or indirect, to cause the direction or management of such entity, 82 | whether by contract or otherwise, or (b) ownership of more than 83 | fifty percent (50%) of the outstanding shares or beneficial 84 | ownership of such entity. 85 | 86 | 2. License Grants and Conditions 87 | -------------------------------- 88 | 89 | 2.1. Grants 90 | 91 | Each Contributor hereby grants You a world-wide, royalty-free, 92 | non-exclusive license: 93 | 94 | (a) under intellectual property rights (other than patent or trademark) 95 | Licensable by such Contributor to use, reproduce, make available, 96 | modify, display, perform, distribute, and otherwise exploit its 97 | Contributions, either on an unmodified basis, with Modifications, or 98 | as part of a Larger Work; and 99 | 100 | (b) under Patent Claims of such Contributor to make, use, sell, offer 101 | for sale, have made, import, and otherwise transfer either its 102 | Contributions or its Contributor Version. 103 | 104 | 2.2. Effective Date 105 | 106 | The licenses granted in Section 2.1 with respect to any Contribution 107 | become effective for each Contribution on the date the Contributor first 108 | distributes such Contribution. 109 | 110 | 2.3. Limitations on Grant Scope 111 | 112 | The licenses granted in this Section 2 are the only rights granted under 113 | this License. No additional rights or licenses will be implied from the 114 | distribution or licensing of Covered Software under this License. 115 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 116 | Contributor: 117 | 118 | (a) for any code that a Contributor has removed from Covered Software; 119 | or 120 | 121 | (b) for infringements caused by: (i) Your and any other third party's 122 | modifications of Covered Software, or (ii) the combination of its 123 | Contributions with other software (except as part of its Contributor 124 | Version); or 125 | 126 | (c) under Patent Claims infringed by Covered Software in the absence of 127 | its Contributions. 128 | 129 | This License does not grant any rights in the trademarks, service marks, 130 | or logos of any Contributor (except as may be necessary to comply with 131 | the notice requirements in Section 3.4). 132 | 133 | 2.4. Subsequent Licenses 134 | 135 | No Contributor makes additional grants as a result of Your choice to 136 | distribute the Covered Software under a subsequent version of this 137 | License (see Section 10.2) or under the terms of a Secondary License (if 138 | permitted under the terms of Section 3.3). 139 | 140 | 2.5. Representation 141 | 142 | Each Contributor represents that the Contributor believes its 143 | Contributions are its original creation(s) or it has sufficient rights 144 | to grant the rights to its Contributions conveyed by this License. 145 | 146 | 2.6. Fair Use 147 | 148 | This License is not intended to limit any rights You have under 149 | applicable copyright doctrines of fair use, fair dealing, or other 150 | equivalents. 151 | 152 | 2.7. Conditions 153 | 154 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 155 | in Section 2.1. 156 | 157 | 3. Responsibilities 158 | ------------------- 159 | 160 | 3.1. Distribution of Source Form 161 | 162 | All distribution of Covered Software in Source Code Form, including any 163 | Modifications that You create or to which You contribute, must be under 164 | the terms of this License. You must inform recipients that the Source 165 | Code Form of the Covered Software is governed by the terms of this 166 | License, and how they can obtain a copy of this License. You may not 167 | attempt to alter or restrict the recipients' rights in the Source Code 168 | Form. 169 | 170 | 3.2. Distribution of Executable Form 171 | 172 | If You distribute Covered Software in Executable Form then: 173 | 174 | (a) such Covered Software must also be made available in Source Code 175 | Form, as described in Section 3.1, and You must inform recipients of 176 | the Executable Form how they can obtain a copy of such Source Code 177 | Form by reasonable means in a timely manner, at a charge no more 178 | than the cost of distribution to the recipient; and 179 | 180 | (b) You may distribute such Executable Form under the terms of this 181 | License, or sublicense it under different terms, provided that the 182 | license for the Executable Form does not attempt to limit or alter 183 | the recipients' rights in the Source Code Form under this License. 184 | 185 | 3.3. Distribution of a Larger Work 186 | 187 | You may create and distribute a Larger Work under terms of Your choice, 188 | provided that You also comply with the requirements of this License for 189 | the Covered Software. If the Larger Work is a combination of Covered 190 | Software with a work governed by one or more Secondary Licenses, and the 191 | Covered Software is not Incompatible With Secondary Licenses, this 192 | License permits You to additionally distribute such Covered Software 193 | under the terms of such Secondary License(s), so that the recipient of 194 | the Larger Work may, at their option, further distribute the Covered 195 | Software under the terms of either this License or such Secondary 196 | License(s). 197 | 198 | 3.4. Notices 199 | 200 | You may not remove or alter the substance of any license notices 201 | (including copyright notices, patent notices, disclaimers of warranty, 202 | or limitations of liability) contained within the Source Code Form of 203 | the Covered Software, except that You may alter any license notices to 204 | the extent required to remedy known factual inaccuracies. 205 | 206 | 3.5. Application of Additional Terms 207 | 208 | You may choose to offer, and to charge a fee for, warranty, support, 209 | indemnity or liability obligations to one or more recipients of Covered 210 | Software. However, You may do so only on Your own behalf, and not on 211 | behalf of any Contributor. You must make it absolutely clear that any 212 | such warranty, support, indemnity, or liability obligation is offered by 213 | You alone, and You hereby agree to indemnify every Contributor for any 214 | liability incurred by such Contributor as a result of warranty, support, 215 | indemnity or liability terms You offer. You may include additional 216 | disclaimers of warranty and limitations of liability specific to any 217 | jurisdiction. 218 | 219 | 4. Inability to Comply Due to Statute or Regulation 220 | --------------------------------------------------- 221 | 222 | If it is impossible for You to comply with any of the terms of this 223 | License with respect to some or all of the Covered Software due to 224 | statute, judicial order, or regulation then You must: (a) comply with 225 | the terms of this License to the maximum extent possible; and (b) 226 | describe the limitations and the code they affect. Such description must 227 | be placed in a text file included with all distributions of the Covered 228 | Software under this License. Except to the extent prohibited by statute 229 | or regulation, such description must be sufficiently detailed for a 230 | recipient of ordinary skill to be able to understand it. 231 | 232 | 5. Termination 233 | -------------- 234 | 235 | 5.1. The rights granted under this License will terminate automatically 236 | if You fail to comply with any of its terms. However, if You become 237 | compliant, then the rights granted under this License from a particular 238 | Contributor are reinstated (a) provisionally, unless and until such 239 | Contributor explicitly and finally terminates Your grants, and (b) on an 240 | ongoing basis, if such Contributor fails to notify You of the 241 | non-compliance by some reasonable means prior to 60 days after You have 242 | come back into compliance. Moreover, Your grants from a particular 243 | Contributor are reinstated on an ongoing basis if such Contributor 244 | notifies You of the non-compliance by some reasonable means, this is the 245 | first time You have received notice of non-compliance with this License 246 | from such Contributor, and You become compliant prior to 30 days after 247 | Your receipt of the notice. 248 | 249 | 5.2. If You initiate litigation against any entity by asserting a patent 250 | infringement claim (excluding declaratory judgment actions, 251 | counter-claims, and cross-claims) alleging that a Contributor Version 252 | directly or indirectly infringes any patent, then the rights granted to 253 | You by any and all Contributors for the Covered Software under Section 254 | 2.1 of this License shall terminate. 255 | 256 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all 257 | end user license agreements (excluding distributors and resellers) which 258 | have been validly granted by You or Your distributors under this License 259 | prior to termination shall survive termination. 260 | 261 | ************************************************************************ 262 | * * 263 | * 6. Disclaimer of Warranty * 264 | * ------------------------- * 265 | * * 266 | * Covered Software is provided under this License on an "as is" * 267 | * basis, without warranty of any kind, either expressed, implied, or * 268 | * statutory, including, without limitation, warranties that the * 269 | * Covered Software is free of defects, merchantable, fit for a * 270 | * particular purpose or non-infringing. The entire risk as to the * 271 | * quality and performance of the Covered Software is with You. * 272 | * Should any Covered Software prove defective in any respect, You * 273 | * (not any Contributor) assume the cost of any necessary servicing, * 274 | * repair, or correction. This disclaimer of warranty constitutes an * 275 | * essential part of this License. No use of any Covered Software is * 276 | * authorized under this License except under this disclaimer. * 277 | * * 278 | ************************************************************************ 279 | 280 | ************************************************************************ 281 | * * 282 | * 7. Limitation of Liability * 283 | * -------------------------- * 284 | * * 285 | * Under no circumstances and under no legal theory, whether tort * 286 | * (including negligence), contract, or otherwise, shall any * 287 | * Contributor, or anyone who distributes Covered Software as * 288 | * permitted above, be liable to You for any direct, indirect, * 289 | * special, incidental, or consequential damages of any character * 290 | * including, without limitation, damages for lost profits, loss of * 291 | * goodwill, work stoppage, computer failure or malfunction, or any * 292 | * and all other commercial damages or losses, even if such party * 293 | * shall have been informed of the possibility of such damages. This * 294 | * limitation of liability shall not apply to liability for death or * 295 | * personal injury resulting from such party's negligence to the * 296 | * extent applicable law prohibits such limitation. Some * 297 | * jurisdictions do not allow the exclusion or limitation of * 298 | * incidental or consequential damages, so this exclusion and * 299 | * limitation may not apply to You. * 300 | * * 301 | ************************************************************************ 302 | 303 | 8. Litigation 304 | ------------- 305 | 306 | Any litigation relating to this License may be brought only in the 307 | courts of a jurisdiction where the defendant maintains its principal 308 | place of business and such litigation shall be governed by laws of that 309 | jurisdiction, without reference to its conflict-of-law provisions. 310 | Nothing in this Section shall prevent a party's ability to bring 311 | cross-claims or counter-claims. 312 | 313 | 9. Miscellaneous 314 | ---------------- 315 | 316 | This License represents the complete agreement concerning the subject 317 | matter hereof. If any provision of this License is held to be 318 | unenforceable, such provision shall be reformed only to the extent 319 | necessary to make it enforceable. Any law or regulation which provides 320 | that the language of a contract shall be construed against the drafter 321 | shall not be used to construe this License against a Contributor. 322 | 323 | 10. Versions of the License 324 | --------------------------- 325 | 326 | 10.1. New Versions 327 | 328 | Mozilla Foundation is the license steward. Except as provided in Section 329 | 10.3, no one other than the license steward has the right to modify or 330 | publish new versions of this License. Each version will be given a 331 | distinguishing version number. 332 | 333 | 10.2. Effect of New Versions 334 | 335 | You may distribute the Covered Software under the terms of the version 336 | of the License under which You originally received the Covered Software, 337 | or under the terms of any subsequent version published by the license 338 | steward. 339 | 340 | 10.3. Modified Versions 341 | 342 | If you create software not governed by this License, and you want to 343 | create a new license for such software, you may create and use a 344 | modified version of this License if you rename the license and remove 345 | any references to the name of the license steward (except to note that 346 | such modified license differs from this License). 347 | 348 | 10.4. Distributing Source Code Form that is Incompatible With Secondary 349 | Licenses 350 | 351 | If You choose to distribute Source Code Form that is Incompatible With 352 | Secondary Licenses under the terms of this version of the License, the 353 | notice described in Exhibit B of this License must be attached. 354 | 355 | Exhibit A - Source Code Form License Notice 356 | ------------------------------------------- 357 | 358 | This Source Code Form is subject to the terms of the Mozilla Public 359 | License, v. 2.0. If a copy of the MPL was not distributed with this 360 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 361 | 362 | If it is not possible or desirable to put the notice in a particular 363 | file, then You may include the notice in a location (such as a LICENSE 364 | file in a relevant directory) where a recipient would be likely to look 365 | for such a notice. 366 | 367 | You may add additional accurate notices of copyright ownership. 368 | 369 | Exhibit B - "Incompatible With Secondary Licenses" Notice 370 | --------------------------------------------------------- 371 | 372 | This Source Code Form is "Incompatible With Secondary Licenses", as 373 | defined by the Mozilla Public License, v. 2.0. 374 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ardere 2 | *AWS Serverless Service for Load-Testing* 3 | 4 | ardere runs as a serverless service using AWS to orchestrate 5 | load-tests consisting of docker container configurations arranged as 6 | test plans. 7 | 8 | ## Installation 9 | 10 | Pre-requisite: 11 | installation requires node > v6 12 | 13 | To deploy ardere to your AWS account, you will need a fairly recent 14 | install of Node, then install the Node packages required: 15 | 16 | $ npm install 17 | 18 | You will need to ensure your have AWS access and secret keys configured 19 | for serverless: 20 | 21 | $ sls config 22 | 23 | To deploy the ardere lambda's and required AWS stack: 24 | 25 | $ sls deploy 26 | 27 | Then you can deploy the ardere Step Function: 28 | 29 | $ sls deploy stepf 30 | 31 | 32 | ## Developing 33 | 34 | ardere is written in Python and deployed via serverless to AWS. To an 35 | extent testing it on AWS is the most reliable indicator it works as 36 | intended. However, there are sets of tests that ensure the Python code 37 | is valid and works with arguments as intended that may be run locally. 38 | 39 | Create a Python virtualenv, and install the test requirements: 40 | 41 | $ virtualenv ardenv 42 | $ source ardenv/bin/activate 43 | $ pip install -r test-requirements.txt 44 | 45 | The tests can now be run with nose: 46 | 47 | $ nosetests 48 | 49 | Note that **you cannot run the sls deploy while the virtualenv is active** 50 | due to how the serverless Python requirements plugin operates. 51 | 52 | ## Run Test 53 | 54 | 1. Login to AWS console 55 | (mozilla-services use: stage) 56 | 2. Go to Step Functions > Dashboard 57 | 3. Select your state machine 58 | (mozilla-services use: "ardere-dev-ardere") 59 | 4. Click on "New Execution" button 60 | 5. Paste your json config into text area 61 | (example: [**mozilla-services/screenshots-loadtests** /ardere.json](https://github.com/mozilla-services/screenshots-loadtests/blob/master/ardere.json)) 62 | 6. Optional: Assign a name to your execution 63 | 7. Click on "Start Execution" 64 | 8. Monitor execution in Dashboard 65 | 9. Test load should be visible in DataDog, NewRelic, etc. 66 | 67 | ## Monitoring 68 | 69 | ### Metrics Node Monitoring (Grafana) 70 | 71 | 1. ssh -L 3000:\:3000 \ 72 | 2. open local browser to http://localhost:3000 73 | 3. login using credentials specified in your ardere (JSON) config file 74 | -------------------------------------------------------------------------------- /ardere/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1.1' # pragma: nocover 2 | -------------------------------------------------------------------------------- /ardere/aws.py: -------------------------------------------------------------------------------- 1 | """AWS Helper Classes""" 2 | import logging 3 | import os 4 | import time 5 | import uuid 6 | from collections import defaultdict 7 | 8 | import boto3 9 | import botocore 10 | from concurrent.futures import ThreadPoolExecutor 11 | from typing import Any, Dict, List, Optional, Tuple # noqa 12 | 13 | logger = logging.getLogger() 14 | logger.setLevel(logging.INFO) 15 | 16 | # Setup script paths 17 | dir_path = os.path.dirname(os.path.realpath(__file__)) 18 | parent_dir_path = os.path.dirname(dir_path) 19 | wait_script_path = os.path.join(parent_dir_path, "src", "shell", 20 | "waitforcluster.sh") 21 | telegraf_script_path = os.path.join(parent_dir_path, "src", "shell", 22 | "telegraf.toml") 23 | metric_create_script = os.path.join(parent_dir_path, "ardere", "scripts", 24 | "metric_creator.py") 25 | 26 | # EC2 userdata to setup values on load 27 | # Settings for net.ipv4 settings based on: 28 | # http://stackoverflow.com/questions/410616/increasing-the-maximum-number-of-tcp-ip-connections-in-linux 29 | # Other settings are from operations on kernel tweaks they've done to handle 30 | # large socket conditions. 31 | EC2_USER_DATA = """#!/bin/bash 32 | echo ECS_CLUSTER='{ecs_name}' >> /etc/ecs/ecs.config 33 | sysctl net.core.rmem_default=8388608 34 | sysctl net.core.rmem_max=16777216 35 | sysctl net.core.wmem_max=16777216 36 | sysctl net.core.netdev_max_backlog=2500 37 | sysctl net.core.somaxconn=3240000 38 | sysctl net.netfilter.nf_conntrack_tcp_timeout_established=600 39 | sysctl net.nf_conntrack_max=1000000 40 | sysctl net.ipv4.ip_local_port_range="1024 65535" 41 | sysctl net.ipv4.netfilter.ip_conntrack_max=4999999 42 | sysctl net.ipv4.netfilter.ip_conntrack_tcp_timeout_time_wait=1 43 | sysctl net.ipv4.netfilter.ip_conntrack_tcp_timeout_established=54000 44 | sysctl net.ipv4.tcp_fin_timeout=5 45 | sysctl net.ipv4.tcp_keepalive_time=30 46 | sysctl net.ipv4.tcp_keepalive_intvl=15 47 | sysctl net.ipv4.tcp_keepalive_probes=6 48 | sysctl net.ipv4.tcp_window_scaling=1 49 | sysctl net.ipv4.tcp_rmem="4096 87380 16777216" 50 | sysctl net.ipv4.tcp_wmem="4096 65536 16777216" 51 | sysctl net.ipv4.tcp_mem="786432 1048576 26777216" 52 | sysctl net.ipv4.tcp_max_tw_buckets=360000 53 | sysctl net.ipv4.tcp_max_syn_backlog=3240000 54 | sysctl net.ipv4.tcp_max_tw_buckets=1440000 55 | sysctl net.ipv4.tcp_slow_start_after_idle=0 56 | sysctl net.ipv4.tcp_retries2=5 57 | sysctl net.ipv4.tcp_tw_recycle=1 58 | sysctl net.ipv4.tcp_tw_reuse=1 59 | sysctl vm.min_free_kbytes=65536 60 | sysctl -w fs.file-max=1000000 61 | ulimit -n 1000000 62 | """ 63 | 64 | # List tracking vcpu's of all instance types for cpu unit reservations 65 | # We are intentionally leaving out the following instance types as they're 66 | # considered overkill for load-testing purposes or any instance req's we have 67 | # experienced so far: 68 | # P2, G2, F1, I3, D2 69 | ec2_type_by_vcpu = { 70 | 1: ["t2.nano", "t2.micro", "t2.small", "m3.medium"], 71 | 2: ["t2.medium", "t2.large", "m3.large", "m4.large", "c3.large", 72 | "c4.large", "r3.large", "r4.large"], 73 | 4: ["t2.xlarge", "m3.xlarge", "m4.xlarge", "c3.xlarge", "c4.xlarge", 74 | "r3.xlarge", "r4.xlarge"], 75 | 8: ["t2.2xlarge", "m3.2xlarge", "m4.2xlarge", "c3.2xlarge", "c4.2xlarge", 76 | "r3.2xlarge", "r4.2xlarge"], 77 | 16: ["m4.4xlarge", "c3.4xlarge", "c4.4xlarge", "r3.4xlarge", "r4.4xlarge"], 78 | 32: ["c3.8xlarge", "r3.8xlarge", "r4.8xlarge"], 79 | 36: ["c4.8xlarge"], 80 | 40: ["m4.10xlarge"], 81 | 64: ["m4.16xlarge", "x1.16xlarge", "r4.16xlarge"], 82 | 128: ["x1.32xlarge"] 83 | } 84 | 85 | # Build a list of vcpu's by instance type 86 | ec2_vcpu_by_type = {} 87 | for vcpu, instance_types in ec2_type_by_vcpu.items(): 88 | for instance_type in instance_types: 89 | ec2_vcpu_by_type[instance_type] = vcpu 90 | 91 | 92 | def cpu_units_for_instance_type(instance_type): 93 | # type: (str) -> int 94 | """Calculate how many CPU units to allocate for an instance_type 95 | 96 | We calculate cpu_units as 1024 * vcpu's for each instance to allocate 97 | almost the entirety of the instance's cpu units to the load-testing 98 | container. We take out 512 to ensure some leftover capacity for other 99 | utility containers we run with the load-testing container. 100 | 101 | """ 102 | return (ec2_vcpu_by_type[instance_type] * 1024) - 512 103 | 104 | 105 | class ECSManager(object): 106 | """ECS Manager queries and manages an ECS cluster""" 107 | # For testing purposes 108 | boto = boto3 109 | 110 | # ECS optimized AMI id's 111 | ecs_ami_ids = { 112 | "us-east-1": "ami-275ffe31", 113 | "us-east-2": "ami-62745007", 114 | "us-west-1": "ami-689bc208", 115 | "us-west-2": "ami-62d35c02" 116 | } 117 | 118 | influxdb_container = "influxdb:1.2-alpine" 119 | telegraf_container = "telegraf:1.2-alpine" 120 | grafana_container = "grafana/grafana:4.1.2" 121 | python_container = "jfloff/alpine-python:2.7-slim" 122 | 123 | _wait_script = None 124 | _telegraf_script = None 125 | _metric_create_script = None 126 | 127 | def __init__(self, plan): 128 | # type: (Dict[str, Any]) -> None 129 | """Create and return a ECSManager for a cluster of the given name.""" 130 | self._ecs_client = self.boto.client('ecs') 131 | self._ec2_client = self.boto.client('ec2') 132 | self._ecs_name = plan["ecs_name"] 133 | self._plan = plan 134 | 135 | # Pull out the env vars 136 | self.s3_ready_bucket = os.environ["s3_ready_bucket"] 137 | self.container_log_group = os.environ["container_log_group"] 138 | self.ecs_profile = os.environ["ecs_profile"] 139 | 140 | if "plan_run_uuid" not in plan: 141 | plan["plan_run_uuid"] = uuid.uuid4().hex 142 | 143 | self._plan_uuid = plan["plan_run_uuid"] 144 | 145 | @property 146 | def wait_script(self): 147 | if not self._wait_script: 148 | with open(wait_script_path, 'r') as f: 149 | self._wait_script = f.read() 150 | return self._wait_script 151 | 152 | @property 153 | def telegraf_script(self): 154 | if not self._telegraf_script: 155 | with open(telegraf_script_path, 'r') as f: 156 | self._telegraf_script = f.read() 157 | return self._telegraf_script 158 | 159 | @property 160 | def metric_create_script(self): 161 | if not self._metric_create_script: 162 | with open(metric_create_script, 'r') as f: 163 | self._metric_create_script = f.read() 164 | return self._metric_create_script 165 | 166 | @property 167 | def plan_uuid(self): 168 | return self._plan_uuid 169 | 170 | @property 171 | def s3_ready_file(self): 172 | return "https://s3.amazonaws.com/{bucket}/{key}".format( 173 | bucket=self.s3_ready_bucket, 174 | key="{}.ready".format(self._plan_uuid) 175 | ) 176 | 177 | @property 178 | def log_config(self): 179 | return { 180 | "logDriver": "awslogs", 181 | "options": {"awslogs-group": self.container_log_group, 182 | "awslogs-region": "us-east-1", 183 | "awslogs-stream-prefix": 184 | "ardere-{}".format(self.plan_uuid) 185 | } 186 | } 187 | 188 | @property 189 | def influx_db_name(self): 190 | return "run-{}".format(self.plan_uuid) 191 | 192 | @property 193 | def grafana_admin_user(self): 194 | return self._plan["metrics_options"]["dashboard"]["admin_user"] 195 | 196 | @property 197 | def grafana_admin_password(self): 198 | return self._plan["metrics_options"]["dashboard"]["admin_password"] 199 | 200 | def family_name(self, step): 201 | # type: (Dict[str, Any]) -> str 202 | """Generate a consistent family name for a given step""" 203 | return step["name"] + "-" + self._plan_uuid 204 | 205 | def metrics_family_name(self): 206 | # type: () -> str 207 | """Generate a consistent metrics family name""" 208 | return "{}-metrics".format(self._ecs_name) 209 | 210 | def metrics_setup_family_name(self): 211 | # type: () -> str 212 | """Generate a consistent metric setup family name""" 213 | return "{}-metrics-setup".format(self._ecs_name) 214 | 215 | def query_active_instances(self, additional_tags=None): 216 | # type: (Optional[Dict[str, str]]) -> Dict[str, int] 217 | """Query EC2 for all the instances owned by ardere for this cluster.""" 218 | instance_dict = defaultdict(int) 219 | paginator = self._ec2_client.get_paginator('describe_instances') 220 | filters = {"Owner": "ardere", "ECSCluster": self._ecs_name} 221 | if additional_tags: 222 | filters.update(additional_tags) 223 | response_iterator = paginator.paginate( 224 | Filters=[ 225 | { 226 | "Name": "tag:{}".format(tag_name), 227 | "Values": [tag_value] 228 | } for tag_name, tag_value in filters.items() 229 | ] 230 | ) 231 | for page in response_iterator: 232 | for reservation in page["Reservations"]: 233 | for instance in reservation["Instances"]: 234 | # Determine if the instance is pending/running and count 235 | # 0 = Pending, 16 = Running, > is all shutting down, etc. 236 | if instance["State"]["Code"] <= 16: 237 | instance_dict[instance["InstanceType"]] += 1 238 | return instance_dict 239 | 240 | def calculate_missing_instances(self, desired, current): 241 | # type: (Dict[str, int], Dict[str, int]) -> Dict[str, int] 242 | """Determine how many of what instance types are needed to ensure 243 | the current instance dict has all the desired instance count/types.""" 244 | needed = {} 245 | for instance_type, instance_count in desired.items(): 246 | cur = current.get(instance_type, 0) 247 | if cur < instance_count: 248 | needed[instance_type] = instance_count - cur 249 | return needed 250 | 251 | def has_metrics_node(self, instance_type): 252 | # type: (str) -> bool 253 | """Return whether a metrics node with this instance type exists""" 254 | instances = self.query_active_instances( 255 | additional_tags=dict(Role="metrics") 256 | ) 257 | return instance_type in instances 258 | 259 | def has_started_metric_creation(self): 260 | # type: () -> bool 261 | """Return whether the metric creation container was started""" 262 | response = self._ecs_client.list_tasks( 263 | cluster=self._ecs_name, 264 | startedBy=self.plan_uuid 265 | ) 266 | return bool(response["taskArns"]) 267 | 268 | def has_finished_metric_creation(self): 269 | # type: () -> bool 270 | """Return whether the metric creation container has finished""" 271 | response = self._ecs_client.list_tasks( 272 | cluster=self._ecs_name, 273 | startedBy=self.plan_uuid, 274 | desiredStatus="STOPPED" 275 | ) 276 | return bool(response["taskArns"]) 277 | 278 | def request_instances(self, instances, security_group_ids, 279 | additional_tags=None): 280 | # type: (Dict[str, int], List[str], Optional[Dict[str, str]]) -> None 281 | """Create requested types/quantities of instances for this cluster""" 282 | ami_id = self.ecs_ami_ids["us-east-1"] 283 | tags = dict(Name=self._ecs_name, Owner="ardere", 284 | ECSCluster=self._ecs_name) 285 | if additional_tags: 286 | tags.update(additional_tags) 287 | for instance_type, instance_count in instances.items(): 288 | self._ec2_client.run_instances( 289 | ImageId=ami_id, 290 | MinCount=instance_count, 291 | MaxCount=instance_count, 292 | InstanceType=instance_type, 293 | UserData=EC2_USER_DATA.format(ecs_name=self._ecs_name), 294 | IamInstanceProfile={"Arn": self.ecs_profile}, 295 | SecurityGroupIds=security_group_ids, 296 | TagSpecifications=[ 297 | { 298 | "ResourceType": "instance", 299 | "Tags": [ 300 | dict(Key=tag_name, Value=tag_value) 301 | for tag_name, tag_value in tags.items() 302 | ] 303 | } 304 | ] 305 | ) 306 | 307 | def locate_metrics_container_ip(self): 308 | # type: () -> Tuple[Optional[str], Optional[str]] 309 | """Locates the metrics container IP and container instance arn 310 | 311 | Returns a tuple of (public_ip, container_arn) 312 | 313 | """ 314 | response = self._ecs_client.list_container_instances( 315 | cluster=self._ecs_name, 316 | filter="task:group == service:metrics" 317 | ) 318 | if not response["containerInstanceArns"]: 319 | return None, None 320 | 321 | container_arn = response["containerInstanceArns"][0] 322 | response = self._ecs_client.describe_container_instances( 323 | cluster=self._ecs_name, 324 | containerInstances=[container_arn] 325 | ) 326 | 327 | container_instance = response["containerInstances"][0] 328 | ec2_instance_id = container_instance["ec2InstanceId"] 329 | instance = self.boto.resource("ec2").Instance(ec2_instance_id) 330 | return instance.private_ip_address, container_arn 331 | 332 | def locate_metrics_service(self): 333 | # type: () -> Optional[str] 334 | """Locate and return the metrics service arn if any""" 335 | response = self._ecs_client.describe_services( 336 | cluster=self._ecs_name, 337 | services=["metrics"] 338 | ) 339 | if response["services"] and response["services"][0]["status"] == \ 340 | "ACTIVE": 341 | return response["services"][0] 342 | else: 343 | return None 344 | 345 | def create_metrics_service(self, options): 346 | # type: (Dict[str, Any]) -> Dict[str, Any] 347 | """Creates an ECS service to run InfluxDB and Grafana for metric 348 | reporting and returns its info""" 349 | logger.info("Creating InfluxDB service with options: {}".format( 350 | options)) 351 | 352 | cmd = """\ 353 | export GF_DEFAULT_INSTANCE_NAME=`wget -qO- http://169.254.169.254/latest/meta-data/instance-id` && \ 354 | export GF_SECURITY_ADMIN_USER=%s && \ 355 | export GF_SECURITY_ADMIN_PASSWORD=%s && \ 356 | export GF_USERS_ALLOW_SIGN_UP=false && \ 357 | mkdir "${GF_DASHBOARDS_JSON_PATH}" && \ 358 | ./run.sh 359 | """ % (self.grafana_admin_user, self.grafana_admin_password) # noqa 360 | cmd = ['sh', '-c', '{}'.format(cmd)] 361 | 362 | gf_env = { 363 | "GF_DASHBOARDS_JSON_ENABLED": "true", 364 | "GF_DASHBOARDS_JSON_PATH": "/var/lib/grafana/dashboards", 365 | "__ARDERE_GRAFANA_URL__": 366 | "http://admin:admin@localhost:3000/api/datasources" 367 | } 368 | 369 | # Setup the task definition for setting up influxdb/grafana instances 370 | # per run 371 | mc_cmd = """\ 372 | pip install influxdb requests boto3 && \ 373 | echo "${__ARDERE_PYTHON_SCRIPT__}" > setup_db.py && \ 374 | python setup_db.py 375 | """ 376 | mc_cmd = ['sh', '-c', '{}'.format(mc_cmd)] 377 | self._ecs_client.register_task_definition( 378 | family=self.metrics_setup_family_name(), 379 | containerDefinitions=[ 380 | { 381 | "name": "metricsetup", 382 | "image": self.python_container, 383 | "cpu": 128, 384 | "entryPoint": mc_cmd, 385 | "memoryReservation": 256, 386 | "privileged": True, 387 | "logConfiguration": self.log_config 388 | } 389 | ], 390 | networkMode="host" 391 | ) 392 | 393 | task_response = self._ecs_client.register_task_definition( 394 | family=self.metrics_family_name(), 395 | containerDefinitions=[ 396 | { 397 | "name": "influxdb", 398 | "image": self.influxdb_container, 399 | "cpu": cpu_units_for_instance_type( 400 | options["instance_type"]), 401 | "memoryReservation": 256, 402 | "privileged": True, 403 | "portMappings": [ 404 | {"containerPort": 8086}, 405 | {"containerPort": 8088} 406 | ], 407 | "logConfiguration": self.log_config 408 | }, 409 | { 410 | "name": "grafana", 411 | "image": self.grafana_container, 412 | "cpu": 256, 413 | "memoryReservation": 256, 414 | "entryPoint": cmd, 415 | "portMappings": [ 416 | {"containerPort": 3000} 417 | ], 418 | "privileged": True, 419 | "environment": [ 420 | {"name": key, "value": value} for key, value in 421 | gf_env.items() 422 | ], 423 | "logConfiguration": self.log_config 424 | } 425 | ], 426 | # use host network mode for optimal performance 427 | networkMode="host", 428 | 429 | placementConstraints=[ 430 | # Ensure the service is confined to the right instance type 431 | { 432 | "type": "memberOf", 433 | "expression": "attribute:ecs.instance-type == {}".format( 434 | options["instance_type"]), 435 | } 436 | ], 437 | ) 438 | task_arn = task_response["taskDefinition"]["taskDefinitionArn"] 439 | service_result = self._ecs_client.create_service( 440 | cluster=self._ecs_name, 441 | serviceName="metrics", 442 | taskDefinition=task_arn, 443 | desiredCount=1, 444 | deploymentConfiguration={ 445 | "minimumHealthyPercent": 0, 446 | "maximumPercent": 100 447 | }, 448 | placementConstraints=[ 449 | { 450 | "type": "distinctInstance" 451 | } 452 | ] 453 | ) 454 | service_arn = service_result["service"]["serviceArn"] 455 | return dict(task_arn=task_arn, service_arn=service_arn) 456 | 457 | def run_metric_creation_task(self, container_instance, grafana_auth, 458 | dashboard=None, 459 | dashboard_name=None): 460 | # type: (str, Tuple[str, str], Optional[str], Optional[str]) -> None 461 | """Starts the metric creation task""" 462 | env = { 463 | "__ARDERE_GRAFANA_USER__": grafana_auth[0], 464 | "__ARDERE_GRAFANA_PASS__": grafana_auth[1], 465 | "__ARDERE_PYTHON_SCRIPT__": self.metric_create_script, 466 | "__ARDERE_INFLUXDB_NAME__": self.influx_db_name 467 | } 468 | 469 | if dashboard: 470 | env["__ARDERE_DASHBOARD__"] = dashboard 471 | env["__ARDERE_DASHBOARD_NAME__"] = dashboard_name 472 | 473 | self._ecs_client.start_task( 474 | cluster=self._ecs_name, 475 | taskDefinition=self.metrics_setup_family_name(), 476 | overrides={ 477 | 'containerOverrides': [ 478 | { 479 | "name": "metricsetup", 480 | "environment": [ 481 | {"name": key, "value": value} for key, value in 482 | env.items() 483 | ] 484 | } 485 | ] 486 | }, 487 | containerInstances=[container_instance], 488 | startedBy=self.plan_uuid 489 | ) 490 | 491 | def create_service(self, step): 492 | # type: (Dict[str, Any]) -> Dict[str, Any] 493 | """Creates an ECS service for a step and returns its info""" 494 | logger.info("CreateService called with: {}".format(step)) 495 | 496 | # Prep the shell command 497 | wfc_var = '__ARDERE_WAITFORCLUSTER_SH__' 498 | wfc_cmd = 'sh -c "${}" waitforcluster.sh {} {}'.format( 499 | wfc_var, 500 | self.s3_ready_file, 501 | step.get("run_delay", 0) 502 | ) 503 | service_cmd = step["cmd"] 504 | cmd = ['sh', '-c', '{} && {}'.format(wfc_cmd, service_cmd)] 505 | 506 | # Prep the env vars 507 | env_vars = [{"name": wfc_var, "value": self.wait_script}] 508 | for name, value in step.get("env", {}).items(): 509 | env_vars.append({"name": name, "value": value}) 510 | 511 | # ECS wants a family name for task definitions, no spaces, 255 chars 512 | family_name = step["name"] + "-" + self._plan_uuid 513 | 514 | # Use cpu_unit if provided, otherwise monopolize 515 | cpu_units = step.get( 516 | "cpu_units", 517 | cpu_units_for_instance_type(step["instance_type"]) 518 | ) 519 | 520 | # Setup the container definition 521 | container_def = { 522 | "name": step["name"], 523 | "image": step["container_name"], 524 | "cpu": cpu_units, 525 | 526 | # using only memoryReservation sets no hard limit 527 | "memoryReservation": 256, 528 | "privileged": True, 529 | "environment": env_vars, 530 | "entryPoint": cmd, 531 | "ulimits": [ 532 | dict(name="nofile", softLimit=1000000, hardLimit=1000000) 533 | ], 534 | "logConfiguration": self.log_config 535 | } 536 | if "port_mapping" in step: 537 | ports = [{"containerPort": port} for port in step["port_mapping"]] 538 | container_def["portMappings"] = ports 539 | 540 | # Setup the telegraf container definition 541 | cmd = """\ 542 | echo "${__ARDERE_TELEGRAF_CONF__}" > /etc/telegraf/telegraf.conf && \ 543 | export __ARDERE_TELEGRAF_HOST__=`wget -qO- http://169.254.169.254/latest/meta-data/instance-id` && \ 544 | telegraf \ 545 | """ # noqa 546 | cmd = ['sh', '-c', '{}'.format(cmd)] 547 | telegraf_def = { 548 | "name": "telegraf", 549 | "image": self.telegraf_container, 550 | "cpu": 512, 551 | "memoryReservation": 256, 552 | "entryPoint": cmd, 553 | "portMappings": [ 554 | {"containerPort": 8125} 555 | ], 556 | "privileged": True, 557 | "environment": [ 558 | {"name": "__ARDERE_TELEGRAF_CONF__", 559 | "value": self.telegraf_script}, 560 | {"name": "__ARDERE_TELEGRAF_STEP__", 561 | "value": step["name"]}, 562 | {"name": "__ARDERE_INFLUX_ADDR__", 563 | "value": "{}:8086".format(self._plan["influxdb_private_ip"])}, 564 | {"name": "__ARDERE_INFLUX_DB__", 565 | "value": self.influx_db_name}, 566 | {"name": "__ARDERE_TELEGRAF_TYPE__", 567 | "value": step["docker_series"]} 568 | ], 569 | "logConfiguration": self.log_config 570 | } 571 | 572 | task_response = self._ecs_client.register_task_definition( 573 | family=family_name, 574 | containerDefinitions=[ 575 | container_def, 576 | telegraf_def 577 | ], 578 | # use host network mode for optimal performance 579 | networkMode="host", 580 | 581 | placementConstraints=[ 582 | # Ensure the service is confined to the right instance type 583 | { 584 | "type": "memberOf", 585 | "expression": "attribute:ecs.instance-type == {}".format( 586 | step["instance_type"]), 587 | } 588 | ] 589 | ) 590 | task_arn = task_response["taskDefinition"]["taskDefinitionArn"] 591 | step["taskArn"] = task_arn 592 | service_result = self._ecs_client.create_service( 593 | cluster=self._ecs_name, 594 | serviceName=step["name"], 595 | taskDefinition=task_arn, 596 | desiredCount=step["instance_count"], 597 | deploymentConfiguration={ 598 | "minimumHealthyPercent": 0, 599 | "maximumPercent": 100 600 | }, 601 | placementConstraints=[ 602 | { 603 | "type": "distinctInstance" 604 | } 605 | ] 606 | ) 607 | step["serviceArn"] = service_result["service"]["serviceArn"] 608 | step["service_status"] = "STARTED" 609 | return step 610 | 611 | def create_services(self, steps): 612 | # type: (List[Dict[str, Any]]) -> None 613 | """Create ECS Services given a list of steps""" 614 | with ThreadPoolExecutor(max_workers=8) as executor: 615 | list(executor.map(self.create_service, steps)) 616 | 617 | def service_ready(self, step): 618 | # type: (Dict[str, Any]) -> bool 619 | """Query a service and return whether all its tasks are running""" 620 | service_name = step["name"] 621 | response = self._ecs_client.describe_services( 622 | cluster=self._ecs_name, 623 | services=[service_name] 624 | ) 625 | 626 | try: 627 | deploy = response["services"][0]["deployments"][0] 628 | except (TypeError, IndexError): 629 | return False 630 | return deploy["desiredCount"] == deploy["runningCount"] 631 | 632 | def all_services_ready(self, steps): 633 | # type: (List[Dict[str, Any]]) -> bool 634 | """Queries all service ARN's in the plan to see if they're ready""" 635 | with ThreadPoolExecutor(max_workers=8) as executor: 636 | results = executor.map(self.service_ready, steps) 637 | return all(results) 638 | 639 | def service_done(self, step): 640 | # type: (Dict[str, Any]) -> bool 641 | """Query a service to return whether its fully drained and back to 642 | INACTIVE""" 643 | service_name = step["name"] 644 | response = self._ecs_client.describe_services( 645 | cluster=self._ecs_name, 646 | services=[service_name] 647 | ) 648 | 649 | service = response["services"][0] 650 | return service["status"] == "INACTIVE" 651 | 652 | def all_services_done(self, steps): 653 | # type: (List[Dict[str, Any]]) -> bool 654 | """Queries all service ARN's in the plan to see if they're fully 655 | DRAINED and now INACTIVE""" 656 | with ThreadPoolExecutor(max_workers=8) as executor: 657 | results = executor.map(self.service_done, steps) 658 | return all(results) 659 | 660 | def stop_finished_service(self, start_time, step): 661 | # type: (start_time, Dict[str, Any]) -> None 662 | """Stops a service if it needs to shutdown""" 663 | if step["service_status"] == "STOPPED": 664 | return 665 | 666 | # Calculate time 667 | step_duration = step.get("run_delay", 0) + step["run_max_time"] 668 | now = time.time() 669 | if now < (start_time + step_duration): 670 | return 671 | 672 | # Running long enough to shutdown 673 | self._ecs_client.update_service( 674 | cluster=self._ecs_name, 675 | service=step["name"], 676 | desiredCount=0 677 | ) 678 | step["service_status"] = "STOPPED" 679 | 680 | def stop_finished_services(self, start_time, steps): 681 | # type: (int, List[Dict[str, Any]]) -> None 682 | """Shuts down any services that have run for their max time""" 683 | for step in steps: 684 | self.stop_finished_service(start_time, step) 685 | 686 | def shutdown_plan(self, steps): 687 | # type: (List[Dict[str, Any]]) -> None 688 | """Terminate the entire plan, ensure all services and task 689 | definitions are completely cleaned up and removed""" 690 | # Locate all the services for the ECS Cluster 691 | paginator = self._ecs_client.get_paginator('list_services') 692 | response_iterator = paginator.paginate( 693 | cluster=self._ecs_name 694 | ) 695 | 696 | # Collect all the service ARN's 697 | service_arns = [] 698 | for page in response_iterator: 699 | service_arns.extend(page["serviceArns"]) 700 | 701 | # Avoid shutting down metrics if tear down was not requested 702 | # We have to exclude it from the services discovered above if we 703 | # should NOT tear it down 704 | if not self._plan["metrics_options"]["tear_down"]: 705 | metric_service = self.locate_metrics_service() 706 | if metric_service and metric_service["serviceArn"] in service_arns: 707 | service_arns.remove(metric_service["serviceArn"]) 708 | 709 | for service_arn in service_arns: 710 | try: 711 | self._ecs_client.update_service( 712 | cluster=self._ecs_name, 713 | service=service_arn, 714 | desiredCount=0 715 | ) 716 | except botocore.exceptions.ClientError: 717 | continue 718 | 719 | try: 720 | self._ecs_client.delete_service( 721 | cluster=self._ecs_name, 722 | service=service_arn 723 | ) 724 | except botocore.exceptions.ClientError: 725 | pass 726 | 727 | # Locate all the task definitions for this plan 728 | step_family_names = [self.family_name(step) for step in steps] 729 | 730 | # Add in the metrics family name if we need to tear_down 731 | if self._plan["metrics_options"]["tear_down"]: 732 | step_family_names.append(self.metrics_family_name()) 733 | step_family_names.append(self.metrics_setup_family_name()) 734 | 735 | for family_name in step_family_names: 736 | try: 737 | response = self._ecs_client.describe_task_definition( 738 | taskDefinition=family_name 739 | ) 740 | except botocore.exceptions.ClientError: 741 | continue 742 | 743 | task_arn = response["taskDefinition"]["taskDefinitionArn"] 744 | 745 | # Deregister the task 746 | try: 747 | self._ecs_client.deregister_task_definition( 748 | taskDefinition=task_arn 749 | ) 750 | except botocore.exceptions.ClientError: 751 | pass 752 | -------------------------------------------------------------------------------- /ardere/exceptions.py: -------------------------------------------------------------------------------- 1 | class ServicesStartingException(Exception): 2 | """Exception to indicate Services are still Starting""" 3 | 4 | 5 | class ShutdownPlanException(Exception): 6 | """Exception to indicate the Plan should be Shutdown""" 7 | 8 | 9 | class ValidationException(Exception): 10 | """Exception to indicate validation error parsing input""" 11 | 12 | 13 | class UndrainedInstancesException(Exception): 14 | """There are still ACTIVE or DRAINING instances in the cluster""" 15 | 16 | 17 | class CreatingMetricSourceException(Exception): 18 | """Metric creation task hasn't completed yet""" 19 | -------------------------------------------------------------------------------- /ardere/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | -------------------------------------------------------------------------------- /ardere/scripts/metric_creator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import json 3 | import os 4 | 5 | import boto3 6 | import influxdb 7 | import requests 8 | 9 | try: 10 | from typing import Any, Dict # noqa 11 | except ImportError: # pragma: nocover 12 | pass 13 | 14 | logging.basicConfig(level=logging.INFO) 15 | logger = logging.getLogger() 16 | 17 | 18 | class DashboardSetup(object): 19 | # For testing purposes 20 | boto = boto3 21 | req = requests 22 | influx = influxdb 23 | 24 | def __init__(self): 25 | self.influx_db_name = os.environ["__ARDERE_INFLUXDB_NAME__"] 26 | self.dashboard = os.environ.get("__ARDERE_DASHBOARD__") 27 | self.dashboard_name = os.environ.get("__ARDERE_DASHBOARD_NAME__") 28 | self.grafana_auth = ( 29 | os.environ.get("__ARDERE_GRAFANA_USER__"), 30 | os.environ.get("__ARDERE_GRAFANA_PASS__") 31 | ) 32 | 33 | def _load_dashboard(self): 34 | # type: () -> Dict[str, Any] 35 | """Load dashboard from S3 and update JSON contents""" 36 | logger.info("Fetching dashboard from S3") 37 | bucket, filename = self.dashboard.split(":") 38 | s3 = self.boto.resource('s3') 39 | dash_file = s3.Object(bucket, filename) 40 | file_contents = dash_file.get()['Body'].read().decode('utf-8') 41 | dash_contents = json.loads(file_contents) 42 | dash_contents["title"] = self.dashboard_name 43 | dash_contents["id"] = None 44 | logger.info("Fetched dashboard file") 45 | return dash_contents 46 | 47 | def _create_dashboard(self, grafana_url): 48 | # type: (str) -> None 49 | """Create the dashboard in grafana""" 50 | dash_contents = self._load_dashboard() 51 | logger.info("Creating dashboard in grafana") 52 | response = self.req.post(grafana_url + "/api/dashboards/db", 53 | auth=self.grafana_auth, 54 | json=dict( 55 | dashboard=dash_contents, 56 | overwrite=True 57 | )) 58 | if response.status_code != 200: 59 | raise Exception("Error creating dashboard: {}".format( 60 | response.status_code)) 61 | 62 | def _ensure_dashboard(self, grafana_url): 63 | # type: (str) -> None 64 | """Ensure the dashboard is present""" 65 | # Verify whether the dashboard exists 66 | response = self.req.get(grafana_url + "/api/search", 67 | auth=self.grafana_auth, 68 | params=dict(query=self.dashboard_name)) 69 | if response.status_code != 200: 70 | raise Exception("Failure to search dashboards") 71 | 72 | # search results for dashboard 73 | results = filter(lambda x: x["title"] == self.dashboard_name, 74 | response.json()) 75 | if not results: 76 | self._create_dashboard(grafana_url) 77 | 78 | def create_datasources(self): 79 | # type: () -> None 80 | # Create an influxdb for this run 81 | logger.info("Create influx database") 82 | influx_client = self.influx.InfluxDBClient() 83 | influx_client.create_database(self.influx_db_name) 84 | 85 | # Setup the grafana datasource 86 | grafana_url = "http://127.0.0.1:3000" 87 | ds_api_url = "http://127.0.0.1:3000/api/datasources" 88 | logger.info("Create datasource in grafana") 89 | self.req.post(ds_api_url, auth=self.grafana_auth, json=dict( 90 | name=self.influx_db_name, 91 | type="influxdb", 92 | url="http://localhost:8086", 93 | database=self.influx_db_name, 94 | access="proxy", 95 | basicAuth=False 96 | )) 97 | 98 | # Setup the grafana dashboard if needed/desired 99 | if self.dashboard: 100 | self._ensure_dashboard(grafana_url) 101 | 102 | 103 | if __name__ == "__main__": # pragma: no cover 104 | logger.info("Creating datasources") 105 | DashboardSetup().create_datasources() 106 | logger.info("Finished.") 107 | -------------------------------------------------------------------------------- /ardere/step_functions.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import re 4 | import time 5 | from collections import defaultdict 6 | 7 | import boto3 8 | import botocore 9 | import toml 10 | from marshmallow import ( 11 | Schema, 12 | decorators, 13 | fields, 14 | validate, 15 | ValidationError, 16 | ) 17 | from typing import Any, Dict, List # noqa 18 | 19 | from ardere.aws import ( 20 | ECSManager, 21 | ec2_vcpu_by_type, 22 | ) 23 | from ardere.exceptions import ( 24 | CreatingMetricSourceException, 25 | ServicesStartingException, 26 | ShutdownPlanException, 27 | ValidationException, 28 | UndrainedInstancesException, 29 | ) 30 | 31 | logger = logging.getLogger() 32 | logger.setLevel(logging.INFO) 33 | 34 | # Step name is used as the Log stream name. 35 | # Log stream names are limited to 512 characters (no ":" or "*") 36 | # Name format is 37 | # ardere-UUID/STEP_NAME/LUUID 38 | # where UUID is dashed, and LUUID is not 39 | # therefore: 512 - (9 + 36 + 32) = max name len 40 | MAX_NAME_LEN = 435 41 | INVALID_NAME_CHECK = re.compile("([:\*]+)") 42 | 43 | 44 | class StepValidator(Schema): 45 | name = fields.String(required=True) 46 | instance_count = fields.Int(required=True) 47 | instance_type = fields.String( 48 | required=True, 49 | validate=validate.OneOf(ec2_vcpu_by_type.keys()) 50 | ) 51 | run_max_time = fields.Int(required=True) 52 | run_delay = fields.Int(missing=0) 53 | container_name = fields.String(required=True) 54 | cmd = fields.String(required=True) 55 | port_mapping = fields.List(fields.Int()) 56 | env = fields.Dict() 57 | docker_series = fields.String(missing="default") 58 | 59 | @decorators.validates("name") 60 | def validate_name(self, value): 61 | if len(value) == 0: 62 | raise ValidationError("Step name missing") 63 | if len(value) > MAX_NAME_LEN: 64 | raise ValidationError("Step name too long") 65 | if INVALID_NAME_CHECK.search(value): 66 | raise ValidationError("Step name contains invalid characters") 67 | 68 | 69 | class DashboardOptions(Schema): 70 | admin_user = fields.String(missing="admin") 71 | admin_password = fields.String(required=True) 72 | name = fields.String(required=True) 73 | filename = fields.String(required=True) 74 | 75 | 76 | class MetricsOptions(Schema): 77 | enabled = fields.Bool(missing=True) 78 | instance_type = fields.String( 79 | missing="c4.large", 80 | validate=validate.OneOf(ec2_vcpu_by_type.keys()) 81 | ) 82 | dashboard = fields.Nested(DashboardOptions) 83 | tear_down = fields.Bool(missing=False) 84 | 85 | 86 | class PlanValidator(Schema): 87 | ecs_name = fields.String(required=True) 88 | name = fields.String(required=True) 89 | metrics_options = fields.Nested(MetricsOptions, missing={}) 90 | 91 | steps = fields.Nested(StepValidator, many=True) 92 | 93 | def _log_validate_name(self, value, name_type): 94 | if len(value) == 0: 95 | raise ValidationError("{} missing".format(name_type)) 96 | if len(value) > MAX_NAME_LEN: 97 | raise ValidationError("{} too long".format(name_type)) 98 | if INVALID_NAME_CHECK.search(value): 99 | raise ValidationError( 100 | "{} contained invalid characters".format(name_type)) 101 | 102 | @decorators.validates("ecs_name") 103 | def validate_ecs_name(self, value): 104 | """Verify a cluster exists for this name""" 105 | self._log_validate_name(value, "Plan ecs_name") 106 | client = self.context["boto"].client('ecs') 107 | response = client.describe_clusters( 108 | clusters=[value] 109 | ) 110 | if not response.get("clusters"): 111 | raise ValidationError("No cluster with the provided name.") 112 | 113 | @decorators.validates("name") 114 | def validate_name(self, value): 115 | self._log_validate_name(value, "Step name") 116 | 117 | 118 | class AsynchronousPlanRunner(object): 119 | """Asynchronous Test Plan Runner 120 | 121 | This step function based runner handles running a test plan in an 122 | asynchronous manner, where each step will wait for its run_delay if 123 | present before running. 124 | 125 | """ 126 | # For testing purposes 127 | boto = boto3 128 | 129 | def __init__(self, event, context): 130 | logger.info("Called with {}".format(event)) 131 | logger.info("Environ: {}".format(os.environ)) 132 | 133 | # Load our TOML if needed 134 | event = self._load_toml(event) 135 | 136 | self.event = event 137 | self.context = context 138 | self.ecs = ECSManager(plan=event) 139 | 140 | @property 141 | def grafana_auth(self): 142 | if not self.event["metrics_options"].get("dashboard"): 143 | return "", "" 144 | 145 | dash_opts = self.event["metrics_options"]["dashboard"] 146 | return dash_opts["admin_user"], dash_opts["admin_password"] 147 | 148 | @property 149 | def dashboard_options(self): 150 | return self.event["metrics_options"]["dashboard"] 151 | 152 | def _build_instance_map(self): 153 | """Given a JSON test-plan, build and return a dict of instance types 154 | and how many should exist for each type.""" 155 | instances = defaultdict(int) 156 | for step in self.event["steps"]: 157 | instances[step["instance_type"]] += step["instance_count"] 158 | return instances 159 | 160 | def _find_test_plan_duration(self): 161 | # type: (Dict[str, Any]) -> int 162 | """Locates and calculates the longest test plan duration from its 163 | delay through its duration of the plan.""" 164 | return max( 165 | [x.get("run_delay", 0) + x["run_max_time"] for x in 166 | self.event["steps"]] 167 | ) 168 | 169 | def _load_toml(self, event): 170 | """Loads TOML if necessary""" 171 | return toml.loads(event["toml"]) if "toml" in event else event 172 | 173 | def _validate_plan(self): 174 | """Validates that the loaded plan is correct""" 175 | schema = PlanValidator() 176 | schema.context["boto"] = self.boto 177 | data, errors = schema.load(self.event) 178 | if errors: 179 | raise ValidationException("Failed to validate: {}".format(errors)) 180 | 181 | # Replace our event with the validated 182 | self.event = data 183 | 184 | def populate_missing_instances(self): 185 | """Populate any missing EC2 instances needed for the test plan in the 186 | cluster 187 | 188 | """ 189 | # First, validate the test plan, done only as part of step 1 190 | self._validate_plan() 191 | 192 | needed = self._build_instance_map() 193 | 194 | # Ensure we have the metrics instance 195 | if self.event["metrics_options"]["enabled"]: 196 | # Query to see if we need to add a metrics node 197 | metric_inst_type = self.event["metrics_options"]["instance_type"] 198 | 199 | # We add the instance type to needed to ensure we don't leave out 200 | # more nodes since this will turn up in the query_active results 201 | needed[metric_inst_type] += 1 202 | 203 | # We create it here up-front if needed since we have different 204 | # tags 205 | if not self.ecs.has_metrics_node(metric_inst_type): 206 | self.ecs.request_instances( 207 | instances={metric_inst_type: 1}, 208 | security_group_ids=[os.environ["metric_sg"], 209 | os.environ["ec2_sg"]], 210 | additional_tags={"Role": "metrics"} 211 | ) 212 | 213 | logger.info("Plan instances needed: {}".format(needed)) 214 | current_instances = self.ecs.query_active_instances() 215 | missing_instances = self.ecs.calculate_missing_instances( 216 | desired=needed, current=current_instances 217 | ) 218 | if missing_instances: 219 | logger.info("Requesting instances: {}".format(missing_instances)) 220 | self.ecs.request_instances( 221 | instances=missing_instances, 222 | security_group_ids=[os.environ["ec2_sg"]] 223 | ) 224 | return self.event 225 | 226 | def ensure_metrics_available(self): 227 | """Start the metrics service, ensure its running, and its IP is known 228 | 229 | """ 230 | if not self.event["metrics_options"]["enabled"]: 231 | return self.event 232 | 233 | # Is the service already running? 234 | metrics = self.ecs.locate_metrics_service() 235 | logger.info("Metrics info: %s", metrics) 236 | 237 | if not metrics: 238 | # Start the metrics service, throw a retry 239 | self.ecs.create_metrics_service(self.event["metrics_options"]) 240 | raise ServicesStartingException("Triggered metrics start") 241 | 242 | deploy = metrics["deployments"][0] 243 | ready = deploy["desiredCount"] == deploy["runningCount"] 244 | logger.info("Deploy info: %s", deploy) 245 | if not ready: 246 | raise ServicesStartingException("Waiting for metrics") 247 | 248 | # Populate the IP of the metrics service 249 | metric_ip, container_arn = self.ecs.locate_metrics_container_ip() 250 | 251 | if not metric_ip: 252 | raise Exception("Unable to locate metrics IP even though its " 253 | "running") 254 | 255 | self.event["influxdb_private_ip"] = metric_ip 256 | self.event["metric_container_arn"] = container_arn 257 | return self.event 258 | 259 | def ensure_metric_sources_created(self): 260 | """Ensure the metrics db and grafana datasource are configured""" 261 | if not self.event["metrics_options"]["enabled"]: 262 | return self.event 263 | 264 | if not self.ecs.has_started_metric_creation(): 265 | dashboard = None 266 | dashboard_name = None 267 | if self.event["metrics_options"].get("dashboard"): 268 | dashboard = ":".join([os.environ["metrics_bucket"], 269 | self.dashboard_options["filename"]]) 270 | dashboard_name = self.dashboard_options["name"] 271 | self.ecs.run_metric_creation_task( 272 | container_instance=self.event["metric_container_arn"], 273 | grafana_auth=self.grafana_auth, 274 | dashboard=dashboard, 275 | dashboard_name=dashboard_name 276 | ) 277 | raise CreatingMetricSourceException("Started metric creation") 278 | 279 | if not self.ecs.has_finished_metric_creation(): 280 | raise CreatingMetricSourceException("Metric creation still " 281 | "running") 282 | 283 | metric_ip = self.event["influxdb_private_ip"] 284 | self.event["grafana_dashboard"] = "http://{}:3000".format(metric_ip) 285 | return self.event 286 | 287 | def create_ecs_services(self): 288 | """Create all the ECS services needed 289 | 290 | """ 291 | self.ecs.create_services(self.event["steps"]) 292 | return self.event 293 | 294 | def wait_for_cluster_ready(self): 295 | """Check all the ECS services to see if they're ready 296 | 297 | """ 298 | if not self.ecs.all_services_ready(self.event["steps"]): 299 | raise ServicesStartingException() 300 | return self.event 301 | 302 | def signal_cluster_start(self): 303 | """Drop a ready file in S3 to trigger the test plan to being 304 | 305 | """ 306 | s3_client = self.boto.client('s3') 307 | s3_client.put_object( 308 | ACL="public-read", 309 | Body=b'{}'.format(int(time.time())), 310 | Bucket=os.environ["s3_ready_bucket"], 311 | Key="{}.ready".format(self.ecs.plan_uuid), 312 | Metadata={ 313 | "ECSCluster": self.event["ecs_name"] 314 | } 315 | ) 316 | return self.event 317 | 318 | def check_for_cluster_done(self): 319 | """Check all the ECS services to see if they've run for their 320 | specified duration 321 | 322 | """ 323 | # Check to see if the S3 file is still around 324 | s3 = self.boto.resource('s3') 325 | try: 326 | ready_file = s3.Object( 327 | os.environ["s3_ready_bucket"], 328 | "{}.ready".format(self.ecs.plan_uuid) 329 | ) 330 | except botocore.exceptions.ClientError: 331 | # Error getting to the bucket/key, abort test run 332 | raise ShutdownPlanException("Error accessing ready file") 333 | 334 | file_contents = ready_file.get()['Body'].read().decode('utf-8') 335 | start_time = int(file_contents) 336 | 337 | # Update to running count 0 any services that should halt by now 338 | self.ecs.stop_finished_services(start_time, self.event["steps"]) 339 | 340 | # If we're totally done, exit. 341 | now = time.time() 342 | plan_duration = self._find_test_plan_duration() 343 | if now > (start_time + plan_duration): 344 | raise ShutdownPlanException("Test Plan has completed") 345 | return self.event 346 | 347 | def cleanup_cluster(self): 348 | """Shutdown all ECS services and deregister all task definitions""" 349 | self.ecs.shutdown_plan(self.event["steps"]) 350 | 351 | # Attempt to remove the S3 object 352 | s3 = self.boto.resource('s3') 353 | try: 354 | ready_file = s3.Object( 355 | os.environ["s3_ready_bucket"], 356 | "{}.ready".format(self.ecs.plan_uuid) 357 | ) 358 | ready_file.delete() 359 | except botocore.exceptions.ClientError: 360 | pass 361 | return self.event 362 | 363 | def check_drained(self): 364 | """Ensure that all services are shut down before allowing restart""" 365 | if self.ecs.all_services_done(self.event["steps"]): 366 | return self.event 367 | else: 368 | raise UndrainedInstancesException("Services still draining") 369 | -------------------------------------------------------------------------------- /config.bash: -------------------------------------------------------------------------------- 1 | #! /bin/bash -w 2 | 3 | ctrlc() 4 | { 5 | echo " Exiting..." 6 | rm ~/.aws/credentials 7 | exit 1 8 | } 9 | set -e 10 | 11 | if [[ "`which serverless`" == "" ]] 12 | then 13 | echo "Hrm, serverless is not installed. " 14 | echo "See https://serverless.com/framework/docs/providers/aws/guide/installation/" 15 | return 16 | fi 17 | if [[ ! -e ~/.aws/credentials ]] 18 | then 19 | trap ctrlc SIGINT 20 | echo " credential file was not found. Let's make one." 21 | echo "" 22 | echo " If you haven't already, you'll need to create an access key." 23 | echo " e.g. go to https://console.aws.amazon.com/iam/home#/users/${USER}/?security_credientials" 24 | echo " and click [Create access key]." 25 | echo "" 26 | read -p "Access Key ID: " access_key 27 | read -p "Secret Key ID: " secret_key 28 | echo " Thanks! Running configuration"; 29 | echo serverless config credentials --provider aws --key $access_key --secret $secret_key 30 | serverless config credentials --provider aws --key $access_key --secret $secret_key 31 | fi 32 | echo " You're configured. The next step is to deploy." 33 | 34 | -------------------------------------------------------------------------------- /default_dashboard.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [] 4 | }, 5 | "editable": true, 6 | "gnetId": null, 7 | "graphTooltip": 0, 8 | "hideControls": false, 9 | "id": 1, 10 | "links": [], 11 | "refresh": false, 12 | "rows": [ 13 | { 14 | "collapse": false, 15 | "height": "250px", 16 | "panels": [ 17 | { 18 | "aliasColors": {}, 19 | "bars": false, 20 | "datasource": "$db", 21 | "fill": 1, 22 | "id": 1, 23 | "legend": { 24 | "alignAsTable": true, 25 | "avg": false, 26 | "current": false, 27 | "max": false, 28 | "min": false, 29 | "show": true, 30 | "total": false, 31 | "values": true 32 | }, 33 | "lines": true, 34 | "linewidth": 1, 35 | "links": [], 36 | "nullPointMode": "null", 37 | "percentage": false, 38 | "pointradius": 5, 39 | "points": false, 40 | "renderer": "flot", 41 | "seriesOverrides": [], 42 | "span": 6, 43 | "stack": false, 44 | "steppedLine": false, 45 | "targets": [ 46 | { 47 | "alias": "", 48 | "dsType": "influxdb", 49 | "groupBy": [ 50 | { 51 | "params": [ 52 | "$interval" 53 | ], 54 | "type": "time" 55 | }, 56 | { 57 | "params": [ 58 | "host" 59 | ], 60 | "type": "tag" 61 | }, 62 | { 63 | "params": [ 64 | "step" 65 | ], 66 | "type": "tag" 67 | }, 68 | { 69 | "params": [ 70 | "none" 71 | ], 72 | "type": "fill" 73 | } 74 | ], 75 | "hide": false, 76 | "measurement": "cpu", 77 | "policy": "default", 78 | "query": "SELECT mean(\"usage_user\") FROM \"cpu\" WHERE $timeFilter GROUP BY time($interval) fill(null)", 79 | "rawQuery": false, 80 | "refId": "A", 81 | "resultFormat": "time_series", 82 | "select": [ 83 | [ 84 | { 85 | "params": [ 86 | "usage_system" 87 | ], 88 | "type": "field" 89 | }, 90 | { 91 | "params": [], 92 | "type": "mean" 93 | } 94 | ] 95 | ], 96 | "tags": [ 97 | { 98 | "key": "step", 99 | "operator": "=~", 100 | "value": "/^$step$/" 101 | }, 102 | { 103 | "condition": "AND", 104 | "key": "host", 105 | "operator": "=~", 106 | "value": "/^$host$/" 107 | } 108 | ] 109 | } 110 | ], 111 | "thresholds": [], 112 | "timeFrom": null, 113 | "timeShift": null, 114 | "title": "CPU Usage", 115 | "tooltip": { 116 | "shared": false, 117 | "sort": 0, 118 | "value_type": "individual" 119 | }, 120 | "type": "graph", 121 | "xaxis": { 122 | "mode": "time", 123 | "name": null, 124 | "show": true, 125 | "values": [] 126 | }, 127 | "yaxes": [ 128 | { 129 | "format": "percentunit", 130 | "label": null, 131 | "logBase": 1, 132 | "max": null, 133 | "min": null, 134 | "show": true 135 | }, 136 | { 137 | "format": "short", 138 | "label": null, 139 | "logBase": 1, 140 | "max": null, 141 | "min": null, 142 | "show": true 143 | } 144 | ] 145 | }, 146 | { 147 | "aliasColors": {}, 148 | "bars": false, 149 | "datasource": "$db", 150 | "fill": 1, 151 | "id": 2, 152 | "legend": { 153 | "alignAsTable": true, 154 | "avg": false, 155 | "current": false, 156 | "max": false, 157 | "min": false, 158 | "show": true, 159 | "total": false, 160 | "values": true 161 | }, 162 | "lines": true, 163 | "linewidth": 1, 164 | "links": [], 165 | "nullPointMode": "null", 166 | "percentage": false, 167 | "pointradius": 5, 168 | "points": false, 169 | "renderer": "flot", 170 | "seriesOverrides": [], 171 | "span": 6, 172 | "stack": false, 173 | "steppedLine": false, 174 | "targets": [ 175 | { 176 | "dsType": "influxdb", 177 | "groupBy": [ 178 | { 179 | "params": [ 180 | "$interval" 181 | ], 182 | "type": "time" 183 | }, 184 | { 185 | "params": [ 186 | "step" 187 | ], 188 | "type": "tag" 189 | }, 190 | { 191 | "params": [ 192 | "host" 193 | ], 194 | "type": "tag" 195 | }, 196 | { 197 | "params": [ 198 | "none" 199 | ], 200 | "type": "fill" 201 | } 202 | ], 203 | "measurement": "mem", 204 | "policy": "default", 205 | "refId": "A", 206 | "resultFormat": "time_series", 207 | "select": [ 208 | [ 209 | { 210 | "params": [ 211 | "used" 212 | ], 213 | "type": "field" 214 | }, 215 | { 216 | "params": [], 217 | "type": "mean" 218 | } 219 | ] 220 | ], 221 | "tags": [ 222 | { 223 | "key": "step", 224 | "operator": "=~", 225 | "value": "/^$step$/" 226 | }, 227 | { 228 | "condition": "AND", 229 | "key": "host", 230 | "operator": "=~", 231 | "value": "/^$host$/" 232 | } 233 | ] 234 | } 235 | ], 236 | "thresholds": [], 237 | "timeFrom": null, 238 | "timeShift": null, 239 | "title": "Memory Usage", 240 | "tooltip": { 241 | "shared": false, 242 | "sort": 0, 243 | "value_type": "individual" 244 | }, 245 | "type": "graph", 246 | "xaxis": { 247 | "mode": "time", 248 | "name": null, 249 | "show": true, 250 | "values": [] 251 | }, 252 | "yaxes": [ 253 | { 254 | "format": "bytes", 255 | "label": null, 256 | "logBase": 1, 257 | "max": null, 258 | "min": null, 259 | "show": true 260 | }, 261 | { 262 | "format": "short", 263 | "label": null, 264 | "logBase": 1, 265 | "max": null, 266 | "min": null, 267 | "show": true 268 | } 269 | ] 270 | } 271 | ], 272 | "repeat": null, 273 | "repeatIteration": null, 274 | "repeatRowId": null, 275 | "showTitle": false, 276 | "title": "Dashboard Row", 277 | "titleSize": "h6" 278 | }, 279 | { 280 | "collapse": false, 281 | "height": 250, 282 | "panels": [ 283 | { 284 | "aliasColors": {}, 285 | "bars": false, 286 | "datasource": "$db", 287 | "fill": 1, 288 | "id": 3, 289 | "legend": { 290 | "alignAsTable": true, 291 | "avg": false, 292 | "current": false, 293 | "max": false, 294 | "min": false, 295 | "show": true, 296 | "total": false, 297 | "values": true 298 | }, 299 | "lines": true, 300 | "linewidth": 1, 301 | "links": [], 302 | "nullPointMode": "null", 303 | "percentage": false, 304 | "pointradius": 5, 305 | "points": false, 306 | "renderer": "flot", 307 | "seriesOverrides": [ 308 | { 309 | "alias": "/^in.*/", 310 | "transform": "negative-Y" 311 | } 312 | ], 313 | "span": 6, 314 | "stack": false, 315 | "steppedLine": false, 316 | "targets": [ 317 | { 318 | "alias": "out {host: [[tag_host]] step: [[tag_step]]}", 319 | "dsType": "influxdb", 320 | "groupBy": [ 321 | { 322 | "params": [ 323 | "$interval" 324 | ], 325 | "type": "time" 326 | }, 327 | { 328 | "params": [ 329 | "step" 330 | ], 331 | "type": "tag" 332 | }, 333 | { 334 | "params": [ 335 | "host" 336 | ], 337 | "type": "tag" 338 | }, 339 | { 340 | "params": [ 341 | "null" 342 | ], 343 | "type": "fill" 344 | } 345 | ], 346 | "measurement": "net", 347 | "policy": "default", 348 | "refId": "A", 349 | "resultFormat": "time_series", 350 | "select": [ 351 | [ 352 | { 353 | "params": [ 354 | "bytes_sent" 355 | ], 356 | "type": "field" 357 | }, 358 | { 359 | "params": [], 360 | "type": "mean" 361 | }, 362 | { 363 | "params": [ 364 | "1s" 365 | ], 366 | "type": "non_negative_derivative" 367 | }, 368 | { 369 | "params": [ 370 | " *8" 371 | ], 372 | "type": "math" 373 | } 374 | ] 375 | ], 376 | "tags": [ 377 | { 378 | "key": "step", 379 | "operator": "=~", 380 | "value": "/^$step$/" 381 | }, 382 | { 383 | "condition": "AND", 384 | "key": "host", 385 | "operator": "=~", 386 | "value": "/^$host$/" 387 | } 388 | ] 389 | }, 390 | { 391 | "alias": "in {host: [[tag_host]] step: [[tag_step]]}", 392 | "dsType": "influxdb", 393 | "groupBy": [ 394 | { 395 | "params": [ 396 | "$interval" 397 | ], 398 | "type": "time" 399 | }, 400 | { 401 | "params": [ 402 | "step" 403 | ], 404 | "type": "tag" 405 | }, 406 | { 407 | "params": [ 408 | "host" 409 | ], 410 | "type": "tag" 411 | }, 412 | { 413 | "params": [ 414 | "none" 415 | ], 416 | "type": "fill" 417 | } 418 | ], 419 | "measurement": "net", 420 | "policy": "default", 421 | "query": "SELECT non_negative_derivative(mean(\"bytes_recv\"), 1s) *8 FROM \"net\" WHERE \"step\" =~ /^$step$/ AND \"host\" =~ /^$host$/ AND $timeFilter GROUP BY time($interval), \"step\", \"host\" fill(none)", 422 | "rawQuery": true, 423 | "refId": "B", 424 | "resultFormat": "time_series", 425 | "select": [ 426 | [ 427 | { 428 | "params": [ 429 | "bytes_recv" 430 | ], 431 | "type": "field" 432 | }, 433 | { 434 | "params": [], 435 | "type": "mean" 436 | }, 437 | { 438 | "params": [ 439 | "1s" 440 | ], 441 | "type": "non_negative_derivative" 442 | }, 443 | { 444 | "params": [ 445 | "*8" 446 | ], 447 | "type": "math" 448 | } 449 | ] 450 | ], 451 | "tags": [ 452 | { 453 | "key": "step", 454 | "operator": "=~", 455 | "value": "/^$step$/" 456 | }, 457 | { 458 | "condition": "AND", 459 | "key": "host", 460 | "operator": "=~", 461 | "value": "/^$host$/" 462 | } 463 | ] 464 | } 465 | ], 466 | "thresholds": [], 467 | "timeFrom": null, 468 | "timeShift": null, 469 | "title": "Network Bytes/sec", 470 | "tooltip": { 471 | "shared": false, 472 | "sort": 0, 473 | "value_type": "individual" 474 | }, 475 | "type": "graph", 476 | "xaxis": { 477 | "mode": "time", 478 | "name": null, 479 | "show": true, 480 | "values": [] 481 | }, 482 | "yaxes": [ 483 | { 484 | "format": "Bps", 485 | "label": null, 486 | "logBase": 1, 487 | "max": null, 488 | "min": null, 489 | "show": true 490 | }, 491 | { 492 | "format": "short", 493 | "label": null, 494 | "logBase": 1, 495 | "max": null, 496 | "min": null, 497 | "show": true 498 | } 499 | ] 500 | } 501 | ], 502 | "repeat": null, 503 | "repeatIteration": null, 504 | "repeatRowId": null, 505 | "showTitle": false, 506 | "title": "Dashboard Row", 507 | "titleSize": "h6" 508 | } 509 | ], 510 | "schemaVersion": 14, 511 | "style": "dark", 512 | "tags": [], 513 | "templating": { 514 | "list": [ 515 | { 516 | "current": { 517 | "tags": [], 518 | "text": "run-a3226cfb9513415bac7c7053a8b62a5f", 519 | "value": "run-a3226cfb9513415bac7c7053a8b62a5f" 520 | }, 521 | "hide": 0, 522 | "label": null, 523 | "name": "db", 524 | "options": [], 525 | "query": "influxdb", 526 | "refresh": 1, 527 | "regex": "", 528 | "type": "datasource" 529 | }, 530 | { 531 | "allValue": null, 532 | "current": { 533 | "text": "All", 534 | "value": "$__all" 535 | }, 536 | "datasource": "$db", 537 | "hide": 0, 538 | "includeAll": true, 539 | "label": null, 540 | "multi": true, 541 | "name": "step", 542 | "options": [], 543 | "query": "SHOW TAG VALUES WITH KEY = step", 544 | "refresh": 2, 545 | "regex": "", 546 | "sort": 0, 547 | "tagValuesQuery": "", 548 | "tags": [], 549 | "tagsQuery": "", 550 | "type": "query", 551 | "useTags": false 552 | }, 553 | { 554 | "allValue": null, 555 | "current": { 556 | "text": "All", 557 | "value": "$__all" 558 | }, 559 | "datasource": "$db", 560 | "hide": 0, 561 | "includeAll": true, 562 | "label": null, 563 | "multi": true, 564 | "name": "host", 565 | "options": [], 566 | "query": "SHOW TAG VALUES WITH KEY = host", 567 | "refresh": 2, 568 | "regex": "", 569 | "sort": 0, 570 | "tagValuesQuery": "SHOW TAG VALUES WITH KEY = host WHERE step = $step", 571 | "tags": [ 572 | "i-08a40a6f64d2e2cbe" 573 | ], 574 | "tagsQuery": "SHOW TAG VALUES WITH KEY = host", 575 | "type": "query", 576 | "useTags": true 577 | } 578 | ] 579 | }, 580 | "time": { 581 | "from": "2017-04-05T03:01:01.177Z", 582 | "to": "2017-04-05T03:15:19.628Z" 583 | }, 584 | "timepicker": { 585 | "refresh_intervals": [ 586 | "5s", 587 | "10s", 588 | "30s", 589 | "1m", 590 | "5m", 591 | "15m", 592 | "30m", 593 | "1h", 594 | "2h", 595 | "1d" 596 | ], 597 | "time_options": [ 598 | "5m", 599 | "15m", 600 | "1h", 601 | "6h", 602 | "12h", 603 | "24h", 604 | "2d", 605 | "7d", 606 | "30d" 607 | ] 608 | }, 609 | "timezone": "browser", 610 | "title": "loads-broker Monitor", 611 | "version": 5 612 | } -------------------------------------------------------------------------------- /handler.py: -------------------------------------------------------------------------------- 1 | # First some funky path manipulation so that we can work properly in 2 | # the AWS environment 3 | import sys 4 | import os 5 | dir_path = os.path.dirname(os.path.realpath(__file__)) 6 | sys.path.append(dir_path) 7 | 8 | from ardere.step_functions import AsynchronousPlanRunner 9 | 10 | 11 | def populate_missing_instances(event, context): 12 | runner = AsynchronousPlanRunner(event, context) 13 | return runner.populate_missing_instances() 14 | 15 | 16 | def ensure_metrics_available(event, context): 17 | runner = AsynchronousPlanRunner(event, context) 18 | return runner.ensure_metrics_available() 19 | 20 | 21 | def ensure_metric_sources_created(event, context): 22 | runner = AsynchronousPlanRunner(event, context) 23 | return runner.ensure_metric_sources_created() 24 | 25 | 26 | def create_ecs_services(event, context): 27 | runner = AsynchronousPlanRunner(event, context) 28 | return runner.create_ecs_services() 29 | 30 | 31 | def wait_for_cluster_ready(event, context): 32 | runner = AsynchronousPlanRunner(event, context) 33 | return runner.wait_for_cluster_ready() 34 | 35 | 36 | def signal_cluster_start(event, context): 37 | runner = AsynchronousPlanRunner(event, context) 38 | return runner.signal_cluster_start() 39 | 40 | 41 | def check_for_cluster_done(event, context): 42 | runner = AsynchronousPlanRunner(event, context) 43 | return runner.check_for_cluster_done() 44 | 45 | 46 | def cleanup_cluster(event, context): 47 | runner = AsynchronousPlanRunner(event, context) 48 | return runner.cleanup_cluster() 49 | 50 | 51 | def check_drain(event, context): 52 | return AsynchronousPlanRunner(event, context).check_drained() 53 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ardere", 3 | "version": "1.0.0", 4 | "description": "Serverless Service for Load-Testing", 5 | "main": "index.js", 6 | "dependencies": { 7 | "serverless": "^1.8.0", 8 | "serverless-python-requirements": "^2.0.0-beta.7", 9 | "serverless-step-functions": "^0.4.1" 10 | }, 11 | "devDependencies": {}, 12 | "scripts": { 13 | "test": "echo \"Error: no test specified\" && exit 1" 14 | }, 15 | "repository": { 16 | "type": "git", 17 | "url": "git+https://github.com/loads/ardere.git" 18 | }, 19 | "author": "", 20 | "license": "MPL-2.0", 21 | "bugs": { 22 | "url": "https://github.com/loads/ardere/issues" 23 | }, 24 | "homepage": "https://github.com/loads/ardere#readme" 25 | } 26 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | futures==3.0.5 2 | typing==3.5.3.0 3 | toml==0.9.2 4 | marshmallow==2.13.4 5 | boto3==1.4.4 6 | requests==2.13.0 -------------------------------------------------------------------------------- /serverless.yml: -------------------------------------------------------------------------------- 1 | service: ardere 2 | 3 | plugins: 4 | - serverless-step-functions 5 | - serverless-python-requirements 6 | 7 | package: 8 | exclude: 9 | - node_modules/** 10 | - ardenv/** 11 | - tests/** 12 | - lib/** 13 | - share/** 14 | - man/** 15 | - bin/** 16 | - serverless/** 17 | include: 18 | - ardere/** 19 | 20 | provider: 21 | name: aws 22 | runtime: python2.7 23 | memorySize: 128 24 | timeout: 60 25 | environment: 26 | ecs_profile: 27 | Fn::GetAtt: 28 | - EC2ContainerProfile 29 | - Arn 30 | s3_ready_bucket: 31 | Ref: "S3ReadyBucket" 32 | metrics_bucket: 33 | Ref: "MetricsBucket" 34 | ec2_sg: 35 | Fn::GetAtt: 36 | - EC2SecurityGroup 37 | - GroupId 38 | metric_sg: 39 | Fn::GetAtt: 40 | - MetricSecurityGroup 41 | - GroupId 42 | container_log_group: 43 | Ref: "ContainerLogs" 44 | 45 | iamRoleStatements: 46 | - Effect: "Allow" 47 | Action: 48 | - "ecs:CreateCluster" 49 | - "ecs:ListServices" 50 | - "ecs:ListContainerInstances" 51 | - "ecs:ListTasks" 52 | - "ecs:DescribeClusters" 53 | - "ecs:DescribeServices" 54 | - "ecs:DescribeTaskDefinition" 55 | - "ecs:DescribeTasks" 56 | - "ecs:DescribeContainerInstances" 57 | - "ecs:CreateService" 58 | - "ecs:DeleteService" 59 | - "ecs:UpdateService" 60 | - "ecs:StartTask" 61 | - "ecs:RegisterTaskDefinition" 62 | - "ecs:DeregisterTaskDefinition" 63 | Resource: 64 | - "*" 65 | - Effect: "Allow" 66 | Action: 67 | - "s3:ListBucket" 68 | - "s3:PutObject" 69 | - "s3:PutObjectAcl" 70 | Resource: 71 | - Fn::Join: ['', ['arn:aws:s3:::', Ref: "S3ReadyBucket"]] 72 | - Effect: "Allow" 73 | Action: 74 | - "s3:PutObject" 75 | - "s3:PutObjectAcl" 76 | - "s3:GetObject" 77 | - "s3:DeleteObject" 78 | Resource: 79 | - Fn::Join: ['', ['arn:aws:s3:::', Ref: "S3ReadyBucket", "/*"]] 80 | - Effect: "Allow" 81 | Action: 82 | - "s3:ListBucket" 83 | - "s3:GetObject" 84 | Resource: 85 | - Fn::Join: ['', ['arn:aws:s3:::', Ref: "MetricsBucket", "/*"]] 86 | - Effect: "Allow" 87 | Action: 88 | - "ec2:DescribeInstances" 89 | - "ec2:RunInstances" 90 | - "ec2:CreateTags" 91 | Resource: 92 | - "*" 93 | - Effect: "Allow" 94 | Action: 95 | - "iam:GetRole" 96 | - "iam:PassRole" 97 | Resource: 98 | Fn::GetAtt: 99 | - EC2ContainerRole 100 | - Arn 101 | 102 | functions: 103 | populate_missing_instances: 104 | handler: handler.populate_missing_instances 105 | timeout: 300 106 | ensure_metrics_available: 107 | handler: handler.ensure_metrics_available 108 | timeout: 300 109 | ensure_metric_sources_created: 110 | handler: handler.ensure_metric_sources_created 111 | timeout: 300 112 | create_ecs_services: 113 | handler: handler.create_ecs_services 114 | timeout: 300 115 | wait_for_cluster_ready: 116 | handler: handler.wait_for_cluster_ready 117 | signal_cluster_start: 118 | handler: handler.signal_cluster_start 119 | check_for_cluster_done: 120 | handler: handler.check_for_cluster_done 121 | cleanup_cluster: 122 | handler: handler.cleanup_cluster 123 | timeout: 300 124 | check_drain: 125 | handler: handler.check_drain 126 | 127 | stepFunctions: 128 | stateMachines: 129 | ardere: 130 | Comment: "ardere load-tester" 131 | Version: "1.0" 132 | StartAt: "Populate Missing Instances" 133 | States: 134 | "Populate Missing Instances": 135 | Type: Task 136 | Resource: populate_missing_instances 137 | Next: "Ensure Metrics Available" 138 | "Ensure Metrics Available": 139 | Type: Task 140 | Resource: ensure_metrics_available 141 | Retry: 142 | - 143 | ErrorEquals: 144 | - ServicesStartingException 145 | IntervalSeconds: 10 146 | MaxAttempts: 60 147 | BackoffRate: 1 148 | Catch: 149 | - 150 | ErrorEquals: 151 | - States.ALL 152 | ResultPath: "$.error-info" 153 | Next: "Clean-up Cluster" 154 | Next: "Ensure Metric Sources Created" 155 | "Ensure Metric Sources Created": 156 | Type: Task 157 | Resource: ensure_metric_sources_created 158 | Retry: 159 | - 160 | ErrorEquals: 161 | - CreatingMetricSourceException 162 | IntervalSeconds: 5 163 | MaxAttempts: 20 164 | BackoffRate: 1 165 | Catch: 166 | - 167 | ErrorEquals: 168 | - States.ALL 169 | ResultPath: "$.error-info" 170 | Next: "Clean-up Cluster" 171 | Next: "Create ECS Services" 172 | "Create ECS Services": 173 | Type: Task 174 | Resource: create_ecs_services 175 | Catch: 176 | - 177 | ErrorEquals: 178 | - States.ALL 179 | ResultPath: "$.error-info" 180 | Next: "Clean-up Cluster" 181 | Next: "Wait for Cluster Ready" 182 | "Wait for Cluster Ready": 183 | Type: Task 184 | Resource: wait_for_cluster_ready 185 | Retry: 186 | - 187 | ErrorEquals: 188 | - ServicesStartingException 189 | IntervalSeconds: 10 190 | MaxAttempts: 180 191 | BackoffRate: 1 192 | Catch: 193 | - 194 | ErrorEquals: 195 | - States.ALL 196 | ResultPath: "$.error-info" 197 | Next: "Clean-up Cluster" 198 | Next: "Signal Cluster Start" 199 | "Signal Cluster Start": 200 | Type: Task 201 | Resource: signal_cluster_start 202 | Catch: 203 | - 204 | ErrorEquals: 205 | - States.ALL 206 | ResultPath: "$.error-info" 207 | Next: "Clean-up Cluster" 208 | Next: "Check for Cluster Done" 209 | "Check for Cluster Done": 210 | Type: Task 211 | Resource: check_for_cluster_done 212 | Next: "Wait for Cluster Done" 213 | Retry: 214 | - 215 | ErrorEquals: 216 | - NoSuchKey 217 | IntervalSeconds: 10 218 | MaxAttempts: 2 219 | BackoffRate: 1 220 | Catch: 221 | - 222 | ErrorEquals: 223 | - States.ALL 224 | ResultPath: "$.error-info" 225 | Next: "Clean-up Cluster" 226 | "Wait for Cluster Done": 227 | Type: Wait 228 | Seconds: 10 229 | Next: "Check for Cluster Done" 230 | "Clean-up Cluster": 231 | Type: Task 232 | Resource: cleanup_cluster 233 | Next: "Checking Drain" 234 | "Checking Drain": 235 | Type: Task 236 | Resource: check_drain 237 | Retry: 238 | - 239 | ErrorEquals: 240 | - UndrainedInstancesException 241 | IntervalSeconds: 10 242 | MaxAttempts: 10 243 | BackoffRate: 1 244 | End: true 245 | 246 | resources: 247 | Resources: 248 | S3ReadyBucket: 249 | Type: "AWS::S3::Bucket" 250 | Properties: 251 | AccessControl: "PublicRead" 252 | MetricsBucket: 253 | Type: "AWS::S3::Bucket" 254 | Properties: 255 | AccessControl: "AuthenticatedRead" 256 | MetricSecurityGroup: 257 | Type: "AWS::EC2::SecurityGroup" 258 | Properties: 259 | GroupDescription: "ardere metrics" 260 | SecurityGroupIngress: 261 | - 262 | IpProtocol: tcp 263 | FromPort: 3000 264 | ToPort: 3000 265 | SourceSecurityGroupId: 266 | Fn::GetAtt: 267 | - GrafanaSecurityGroup 268 | - GroupId 269 | - 270 | IpProtocol: tcp 271 | FromPort: 8086 272 | ToPort: 8086 273 | SourceSecurityGroupId: 274 | Fn::GetAtt: 275 | - EC2SecurityGroup 276 | - GroupId 277 | GrafanaSecurityGroup: 278 | Type: "AWS::EC2::SecurityGroup" 279 | Properties: 280 | GroupDescription: "grafana access" 281 | EC2SecurityGroup: 282 | Type: "AWS::EC2::SecurityGroup" 283 | Properties: 284 | GroupDescription: "ardere load-testers" 285 | EC2ContainerRole: 286 | Type: "AWS::IAM::Role" 287 | Properties: 288 | AssumeRolePolicyDocument: 289 | Version: "2012-10-17" 290 | Statement: 291 | - 292 | Effect: "Allow" 293 | Principal: 294 | Service: 295 | - "ec2.amazonaws.com" 296 | Action: 297 | - "sts:AssumeRole" 298 | Path: "/" 299 | Policies: 300 | - 301 | PolicyName: "ecs-service" 302 | PolicyDocument: 303 | Version: "2012-10-17" 304 | Statement: 305 | - 306 | Effect: "Allow" 307 | Action: 308 | - "ecs:CreateCluster" 309 | - "ecs:DeregisterContainerInstance" 310 | - "ecs:DiscoverPollEndpoint" 311 | - "ecs:Poll" 312 | - "ecs:RegisterContainerInstance" 313 | - "ecs:StartTelemetrySession" 314 | - "ecs:SubmitContainerStateChange" 315 | - "ecs:SubmitTaskStateChange" 316 | - "ecs:Submit" 317 | - "logs:CreateLogStream" 318 | - "logs:PutLogEvents" 319 | Resource: "*" 320 | - 321 | Effect: "Allow" 322 | Action: 323 | - "s3:ListBucket" 324 | - "s3:GetObject" 325 | Resource: 326 | - Fn::Join: ['', ['arn:aws:s3:::', Ref: "MetricsBucket", "/*"]] 327 | ContainerLogs: 328 | Type: "AWS::Logs::LogGroup" 329 | Properties: 330 | RetentionInDays: 1 331 | EC2ContainerProfile: 332 | Type: "AWS::IAM::InstanceProfile" 333 | Properties: 334 | Path: "/" 335 | Roles: 336 | - 337 | Ref: "EC2ContainerRole" 338 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [nosetests] 2 | verbose=True 3 | verbosity=1 4 | detailed-errors=True 5 | with-coverage=True 6 | cover-erase=True 7 | cover-package=ardere 8 | cover-tests=True 9 | cover-inclusive=True 10 | -------------------------------------------------------------------------------- /src/shell/telegraf.toml: -------------------------------------------------------------------------------- 1 | # Telegraf Configuration 2 | # 3 | # Telegraf is entirely plugin driven. All metrics are gathered from the 4 | # declared inputs, and sent to the declared outputs. 5 | # 6 | # Plugins must be declared in here to be active. 7 | # To deactivate a plugin, comment out the name and any variables. 8 | # 9 | # Use 'telegraf -config telegraf.conf -test' to see what metrics a config 10 | # file would generate. 11 | # 12 | # Environment variables can be used anywhere in this config file, simply prepend 13 | # them with $. For strings the variable must be within quotes (ie, "$STR_VAR"), 14 | # for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR) 15 | 16 | 17 | # Global tags can be specified here in key="value" format. 18 | [global_tags] 19 | # dc = "us-east-1" # will tag all metrics with dc=us-east-1 20 | # rack = "1a" 21 | ## Environment variables can be used as tags, and throughout the config file 22 | # user = "$USER" 23 | step = "$__ARDERE_TELEGRAF_STEP__" 24 | ## type is the old "docker_series" 25 | type = "$__ARDERE_TELEGRAF_TYPE__" 26 | 27 | 28 | # Configuration for telegraf agent 29 | [agent] 30 | ## Default data collection interval for all inputs 31 | interval = "10s" 32 | ## Rounds collection interval to 'interval' 33 | ## ie, if interval="10s" then always collect on :00, :10, :20, etc. 34 | round_interval = true 35 | 36 | ## Telegraf will send metrics to outputs in batches of at most 37 | ## metric_batch_size metrics. 38 | ## This controls the size of writes that Telegraf sends to output plugins. 39 | metric_batch_size = 1000 40 | 41 | ## For failed writes, telegraf will cache metric_buffer_limit metrics for each 42 | ## output, and will flush this buffer on a successful write. Oldest metrics 43 | ## are dropped first when this buffer fills. 44 | ## This buffer only fills when writes fail to output plugin(s). 45 | metric_buffer_limit = 10000 46 | 47 | ## Collection jitter is used to jitter the collection by a random amount. 48 | ## Each plugin will sleep for a random time within jitter before collecting. 49 | ## This can be used to avoid many plugins querying things like sysfs at the 50 | ## same time, which can have a measurable effect on the system. 51 | collection_jitter = "0s" 52 | 53 | ## Default flushing interval for all outputs. You shouldn't set this below 54 | ## interval. Maximum flush_interval will be flush_interval + flush_jitter 55 | flush_interval = "10s" 56 | ## Jitter the flush interval by a random amount. This is primarily to avoid 57 | ## large write spikes for users running a large number of telegraf instances. 58 | ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s 59 | flush_jitter = "0s" 60 | ## By default, precision will be set to the same timestamp order as the 61 | ## collection interval, with the maximum being 1s. 62 | ## Precision will NOT be used for service inputs, such as logparser and statsd. 63 | ## Valid values are "ns", "us" (or "µs"), "ms", "s". 64 | precision = "" 65 | ## Logging configuration: 66 | ## Run telegraf with debug log messages. 67 | debug = false 68 | ## Run telegraf in quiet mode (error log messages only). 69 | quiet = false 70 | ## Specify the log file name. The empty string means to log to stderr. 71 | logfile = "" 72 | ## Override default hostname, if empty use os.Hostname() 73 | hostname = "$__ARDERE_TELEGRAF_HOST__" 74 | ## If set to true, do no set the "host" tag in the telegraf agent. 75 | omit_hostname = false 76 | ############################################################################### 77 | # OUTPUT PLUGINS # 78 | ############################################################################### 79 | # Configuration for influxdb server to send metrics to 80 | [[outputs.influxdb]] 81 | ## The full HTTP or UDP endpoint URL for your InfluxDB instance. 82 | ## Multiple urls can be specified as part of the same cluster, 83 | ## this means that only ONE of the urls will be written to each interval. 84 | # urls = ["udp://localhost:8089"] # UDP endpoint example 85 | urls = ["http://$__ARDERE_INFLUX_ADDR__"] # required 86 | ## The target database for metrics (telegraf will create it if not exists). 87 | database = "$__ARDERE_INFLUX_DB__" # required 88 | ## Retention policy to write to. Empty string writes to the default rp. 89 | retention_policy = "" 90 | ## Write consistency (clusters only), can be: "any", "one", "quorum", "all" 91 | write_consistency = "any" 92 | ## Write timeout (for the InfluxDB client), formatted as a string. 93 | ## If not provided, will default to 5s. 0s means no timeout (not recommended). 94 | timeout = "5s" 95 | # username = "telegraf" 96 | # password = "metricsmetricsmetricsmetrics" 97 | ## Set the user agent for HTTP POSTs (can be useful for log differentiation) 98 | # user_agent = "telegraf" 99 | ## Set UDP payload size, defaults to InfluxDB UDP Client default (512 bytes) 100 | # udp_payload = 512 101 | ## Optional SSL Config 102 | # ssl_ca = "/etc/telegraf/ca.pem" 103 | # ssl_cert = "/etc/telegraf/cert.pem" 104 | # ssl_key = "/etc/telegraf/key.pem" 105 | ## Use SSL but skip chain & host verification 106 | # insecure_skip_verify = false 107 | ############################################################################### 108 | # PROCESSOR PLUGINS # 109 | ############################################################################### 110 | # # Print all metrics that pass through this filter. 111 | # [[processors.printer]] 112 | ############################################################################### 113 | # AGGREGATOR PLUGINS # 114 | ############################################################################### 115 | # # Keep the aggregate min/max of each metric passing through. 116 | # [[aggregators.minmax]] 117 | # ## General Aggregator Arguments: 118 | # ## The period on which to flush & clear the aggregator. 119 | # period = "30s" 120 | # ## If true, the original metric will be dropped by the 121 | # ## aggregator and will not get sent to the output plugins. 122 | # drop_original = false 123 | ############################################################################### 124 | # INPUT PLUGINS # 125 | ############################################################################### 126 | # Read metrics about cpu usage 127 | [[inputs.cpu]] 128 | ## Whether to report per-cpu stats or not 129 | percpu = true 130 | ## Whether to report total system cpu stats or not 131 | totalcpu = true 132 | ## If true, collect raw CPU time metrics. 133 | collect_cpu_time = false 134 | # Read metrics about memory usage 135 | [[inputs.mem]] 136 | # no configuration 137 | # Read TCP metrics such as established, time wait and sockets counts. 138 | [[inputs.netstat]] 139 | # no configuration 140 | ############################################################################### 141 | # SERVICE INPUT PLUGINS # 142 | ############################################################################### 143 | # Statsd Server 144 | [[inputs.statsd]] 145 | ## Address and port to host UDP listener on 146 | service_address = ":8125" 147 | ## The following configuration options control when telegraf clears it's cache 148 | ## of previous values. If set to false, then telegraf will only clear it's 149 | ## cache when the daemon is restarted. 150 | ## Reset gauges every interval (default=true) 151 | delete_gauges = true 152 | ## Reset counters every interval (default=true) 153 | delete_counters = true 154 | ## Reset sets every interval (default=true) 155 | delete_sets = true 156 | ## Reset timings & histograms every interval (default=true) 157 | delete_timings = true 158 | ## Percentiles to calculate for timing & histogram stats 159 | percentiles = [90] 160 | ## separator to use between elements of a statsd metric 161 | metric_separator = "_" 162 | ## Parses tags in the datadog statsd format 163 | ## http://docs.datadoghq.com/guides/dogstatsd/ 164 | parse_data_dog_tags = false 165 | ## Statsd data translation templates, more info can be read here: 166 | ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md#graphite 167 | # templates = [ 168 | # "cpu.* measurement*" 169 | # ] 170 | ## Number of UDP messages allowed to queue up, once filled, 171 | ## the statsd server will start dropping packets 172 | allowed_pending_messages = 10000 173 | ## Number of timing/histogram values to track per-measurement in the 174 | ## calculation of percentiles. Raising this limit increases the accuracy 175 | ## of percentiles but also increases the memory usage and cpu time. 176 | #percentile_limit = 1000 177 | percentile_limit = 10 -------------------------------------------------------------------------------- /src/shell/waitforcluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # waits for a cluster to be ready + a run_delay 4 | # 5 | # cluster readiness is indicated by existence of ready_url, containing 6 | # a timestamp (seconds since epoch) of when it was made so. timestamp 7 | # is factored into the run_delay. 8 | # 9 | 10 | # Polling frequency in seconds 11 | POLL_TIME=4 12 | 13 | if [ $# != 2 ]; then 14 | echo "usage $0: ready_url run_delay" 15 | exit 1 16 | fi 17 | READY_URL=$1 18 | RUN_DELAY=$2 19 | 20 | # XXX: a random jitter, backoff? 21 | JITTER=0 22 | 23 | while true; do 24 | START_TIME=`wget -qO- ${READY_URL}` && break 25 | sleep $(( ${POLL_TIME} + ${JITTER} )) 26 | done 27 | 28 | CURRENT_TIME=`date +%s` 29 | SINCE=$(( ${CURRENT_TIME} - ${START_TIME} )) 30 | if [ ${SINCE} -lt 0 ]; then 31 | echo "Clock skew: ${SINCE}" >&2 32 | SINCE=0 33 | fi 34 | 35 | RUN_DELAY=$(( ${RUN_DELAY} - ${SINCE} )) 36 | if [ ${RUN_DELAY} -gt 0 ]; then 37 | FMT_START_TIME=`date '+%FT%T+00:00' -d @${START_TIME}` 38 | echo "Cluster ready @ ${FMT_START_TIME}" \ 39 | "(sleeping for run_delay=${RUN_DELAY}s)" 40 | sleep $RUN_DELAY 41 | fi 42 | -------------------------------------------------------------------------------- /test-requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | nose==1.3.7 3 | mock==2.0.0 4 | coverage==4.3.4 5 | boto3==1.4.4 6 | influxdb==4.0.0 7 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/loads/ardere/0c1b7736c514d2b9fd1563cf96bbe0fd75244f95/tests/__init__.py -------------------------------------------------------------------------------- /tests/fixtures.py: -------------------------------------------------------------------------------- 1 | sample_basic_test_plan = """ 2 | { 3 | "ecs_name": "ardere-test", 4 | "name": "Loadtest", 5 | "description": "Run all APLT scenarios", 6 | "metrics_options": { 7 | "enabled": true, 8 | "dashboard": { 9 | "admin_user": "admin", 10 | "admin_password": "testing", 11 | "name": "ap-loadtester", 12 | "filename": "gf_basic_dashboard.json" 13 | } 14 | }, 15 | "steps": [ 16 | { 17 | "name": "TestCluster", 18 | "instance_count": 1, 19 | "instance_type": "t2.medium", 20 | "run_max_time": 140, 21 | "env": { 22 | "SOME_VAR": "great-value" 23 | }, 24 | "port_mapping": [8000, 4000], 25 | "container_name": "bbangert/ap-loadtester:latest", 26 | "cmd": "./apenv/bin/aplt_testplan wss://autopush.stage.mozaws.net 'aplt.scenarios:notification_forever,1000,1,0' --statsd_host=localhost --statsd_port=8125" 27 | } 28 | ] 29 | } 30 | """ 31 | 32 | sample_toml = """ 33 | ecs_name = "ardere-test" 34 | name = "connection loadtest" 35 | description = "autopush: connect and idle forever" 36 | 37 | 38 | [[steps]] 39 | name = "***************** RUN #01 ***********************" 40 | instance_count = 8 41 | instance_type = "m3.medium" 42 | container_name = "bbangert/ap-loadtester:latest" 43 | cmd = "./apenv/bin/aplt_testplan wss://autopush.stage.mozaws.net 'aplt.scenarios:connect_and_idle_forever,10000,5,0'" 44 | run_max_time = 300 45 | volume_mapping = "/var/log:/var/log/$RUN_ID:rw" 46 | docker_series = "push_tests" 47 | 48 | [[steps]] 49 | name = "***************** RUN #02 ***********************" 50 | instance_count = 8 51 | run_delay = 330 52 | instance_type = "m3.medium" 53 | container_name = "bbangert/ap-loadtester:latest" 54 | cmd = "./apenv/bin/aplt_testplan wss://autopush.stage.mozaws.net 'aplt.scenarios:connect_and_idle_forever,10000,5,0'" 55 | run_max_time = 300 56 | volume_mapping = "/var/log:/var/log/$RUN_ID:rw" 57 | docker_series = "push_tests" 58 | 59 | """ 60 | 61 | future_hypothetical_test=""" 62 | { 63 | "name": "TestCluster", 64 | "instance_count": 1, 65 | "instance_type": "t2.medium", 66 | "run_max_time": 140, 67 | "container_name": "bbangert/pushgo:1.5rc4", 68 | "port_mapping": "8080,8081,3000,8082", 69 | "load_balancer": { 70 | "env_var": "TEST_CLUSTER", 71 | "ping_path": "/status/health", 72 | "ping_port": 8080, 73 | "ping_protocol": "http", 74 | "listeners": [ 75 | { 76 | "listen_protocol": "ssl", 77 | "listen_port": 443, 78 | "backend_protocol": "tcp", 79 | "backend_port": 8080 80 | }, 81 | { 82 | "listen_protocol": "https", 83 | "listen_port": 9000, 84 | "backend_protocol": "http", 85 | "backend_port": 8090 86 | } 87 | ] 88 | } 89 | } 90 | """ -------------------------------------------------------------------------------- /tests/test_aws.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import time 4 | import unittest 5 | 6 | import mock 7 | from nose.tools import assert_raises, eq_, ok_ 8 | 9 | from tests import fixtures 10 | 11 | 12 | class TestECSManager(unittest.TestCase): 13 | def _make_FUT(self, plan=None): 14 | from ardere.aws import ECSManager 15 | os.environ["s3_ready_bucket"] = "test_bucket" 16 | os.environ["ecs_profile"] = "arn:something:fantastic:::" 17 | os.environ["container_log_group"] = "ardere" 18 | self.boto_mock = mock.Mock() 19 | ECSManager.boto = self.boto_mock 20 | if not plan: 21 | plan = json.loads(fixtures.sample_basic_test_plan) 22 | plan["metrics_options"] = dict( 23 | dashboard=dict( 24 | admin_user="admin", 25 | admin_password="admin" 26 | ), 27 | tear_down=False 28 | ) 29 | return ECSManager(plan) 30 | 31 | def test_init(self): 32 | ecs = self._make_FUT() 33 | eq_(ecs._plan["plan_run_uuid"], ecs._plan_uuid) 34 | eq_(ecs.plan_uuid, ecs._plan_uuid) 35 | 36 | def test_ready_file(self): 37 | ecs = self._make_FUT() 38 | ready_filename = ecs.s3_ready_file 39 | ok_("test_bucket" in ready_filename) 40 | ok_(ecs._plan_uuid in ready_filename) 41 | 42 | def test_query_active(self): 43 | mock_paginator = mock.Mock() 44 | mock_paginator.paginate.return_value = [ 45 | {"Reservations": [ 46 | { 47 | "Instances": [ 48 | { 49 | "State": { 50 | "Code": 16 51 | }, 52 | "InstanceType": "t2.medium" 53 | } 54 | ] 55 | } 56 | ]} 57 | ] 58 | 59 | ecs = self._make_FUT() 60 | ecs._ec2_client.get_paginator.return_value = mock_paginator 61 | instance_dct = ecs.query_active_instances() 62 | eq_(len(instance_dct.values()), 1) 63 | 64 | def test_calculate_missing_instances(self): 65 | ecs = self._make_FUT() 66 | result = ecs.calculate_missing_instances( 67 | desired={"t2.medium": 2}, current={"t2.medium": 1} 68 | ) 69 | eq_(result, {"t2.medium": 1}) 70 | 71 | def test_has_metrics_node(self): 72 | mock_paginator = mock.Mock() 73 | mock_paginator.paginate.return_value = [ 74 | {"Reservations": [ 75 | { 76 | "Instances": [ 77 | { 78 | "State": { 79 | "Code": 16 80 | }, 81 | "InstanceType": "t2.medium" 82 | } 83 | ] 84 | } 85 | ]} 86 | ] 87 | 88 | ecs = self._make_FUT() 89 | ecs._ec2_client.get_paginator.return_value = mock_paginator 90 | resp = ecs.has_metrics_node("t2.medium") 91 | eq_(resp, True) 92 | 93 | def test_has_started_metric_creation(self): 94 | ecs = self._make_FUT() 95 | ecs._ecs_client.list_tasks.return_value = {"taskArns": [123]} 96 | eq_(ecs.has_started_metric_creation(), True) 97 | 98 | def test_has_finished_metric_creation(self): 99 | ecs = self._make_FUT() 100 | ecs._ecs_client.list_tasks.return_value = {"taskArns": [123]} 101 | eq_(ecs.has_finished_metric_creation(), True) 102 | 103 | def test_request_instances(self): 104 | instances = { 105 | "t2.medium": 10 106 | } 107 | ecs = self._make_FUT() 108 | ecs._ec2_client.run_instances.return_value = { 109 | "Instances": [{"InstanceId": 12345}] 110 | } 111 | ecs.request_instances(instances, ["i-382842"], {"Role": "metrics"}) 112 | ecs._ec2_client.run_instances.assert_called() 113 | 114 | def test_locate_metrics_container_ip(self): 115 | ecs = self._make_FUT() 116 | ecs._ecs_client.list_container_instances.return_value = { 117 | "containerInstanceArns": ["arn:of:some:container::"] 118 | } 119 | ecs._ecs_client.describe_container_instances.return_value = { 120 | "containerInstances": [ 121 | {"ec2InstanceId": "e-28193823"} 122 | ] 123 | } 124 | mock_resource = mock.Mock() 125 | ecs.boto.resource.return_value = mock_resource 126 | ecs.locate_metrics_container_ip() 127 | ecs.boto.resource.assert_called() 128 | 129 | def test_locate_metrics_container_ip_not_found(self): 130 | ecs = self._make_FUT() 131 | ecs._ecs_client.list_container_instances.return_value = { 132 | "containerInstanceArns": [] 133 | } 134 | result = ecs.locate_metrics_container_ip() 135 | eq_(result, (None, None)) 136 | 137 | def test_locate_metrics_service(self): 138 | ecs = self._make_FUT() 139 | ecs._ecs_client.describe_services.return_value = { 140 | "services": [ 141 | {"stuff": 1, "status": "ACTIVE"} 142 | ] 143 | } 144 | result = ecs.locate_metrics_service() 145 | eq_(result, {"stuff": 1, "status": "ACTIVE"}) 146 | 147 | def test_locate_metrics_service_not_found(self): 148 | ecs = self._make_FUT() 149 | ecs._ecs_client.describe_services.return_value = { 150 | "services": [] 151 | } 152 | result = ecs.locate_metrics_service() 153 | eq_(result, None) 154 | 155 | def test_create_metrics_service(self): 156 | ecs = self._make_FUT() 157 | 158 | # Setup mocks 159 | ecs._ecs_client.register_task_definition.return_value = { 160 | "taskDefinition": { 161 | "taskDefinitionArn": "arn:of:some:task::" 162 | } 163 | } 164 | ecs._ecs_client.create_service.return_value = { 165 | "service": {"serviceArn": "arn:of:some:service::"} 166 | } 167 | 168 | result = ecs.create_metrics_service(dict(instance_type="c4.large")) 169 | eq_(result["service_arn"], "arn:of:some:service::") 170 | 171 | def test_run_metric_creation_task(self): 172 | ecs = self._make_FUT() 173 | ecs.run_metric_creation_task("arn:::", ("admin", "admin"), 174 | "asdf", "atitle") 175 | ecs._ecs_client.start_task.assert_called() 176 | 177 | def test_create_service(self): 178 | ecs = self._make_FUT() 179 | 180 | step = ecs._plan["steps"][0] 181 | ecs._plan["influxdb_private_ip"] = "1.1.1.1" 182 | step["docker_series"] = "default" 183 | 184 | # Setup mocks 185 | ecs._ecs_client.register_task_definition.return_value = { 186 | "taskDefinition": { 187 | "taskDefinitionArn": "arn:of:some:task::" 188 | } 189 | } 190 | ecs._ecs_client.create_service.return_value = { 191 | "service": {"serviceArn": "arn:of:some:service::"} 192 | } 193 | 194 | ecs.create_service(step) 195 | 196 | eq_(step["serviceArn"], "arn:of:some:service::") 197 | ecs._ecs_client.register_task_definition.assert_called() 198 | _, kwargs = ecs._ecs_client.register_task_definition.call_args 199 | container_def = kwargs["containerDefinitions"][0] 200 | 201 | eq_(container_def["cpu"], 1536) 202 | 203 | _, kwargs = ecs._ecs_client.register_task_definition.call_args 204 | container_def = kwargs["containerDefinitions"][0] 205 | ok_("portMappings" in container_def) 206 | 207 | def test_create_services(self): 208 | ecs = self._make_FUT() 209 | ecs.create_service = mock.Mock() 210 | ecs.create_services(ecs._plan["steps"]) 211 | ecs.create_service.assert_called() 212 | 213 | def test_create_services_ecs_error(self): 214 | from botocore.exceptions import ClientError 215 | ecs = self._make_FUT() 216 | 217 | step = ecs._plan["steps"][0] 218 | ecs._plan["influxdb_private_ip"] = "1.1.1.1" 219 | step["docker_series"] = "default" 220 | ecs._ecs_client.register_task_definition.side_effect = ClientError( 221 | {"Error": {}}, "some_op" 222 | ) 223 | 224 | with assert_raises(ClientError): 225 | ecs.create_services(ecs._plan["steps"]) 226 | 227 | def test_service_ready_true(self): 228 | ecs = self._make_FUT() 229 | step = ecs._plan["steps"][0] 230 | 231 | ecs._ecs_client.describe_services.return_value = { 232 | "services": [{ 233 | "deployments": [{ 234 | "desiredCount": 2, 235 | "runningCount": 2 236 | }] 237 | }] 238 | } 239 | 240 | result = ecs.service_ready(step) 241 | eq_(result, True) 242 | 243 | def test_service_not_known_yet(self): 244 | ecs = self._make_FUT() 245 | step = ecs._plan["steps"][0] 246 | 247 | ecs._ecs_client.describe_services.return_value = { 248 | "services": [] 249 | } 250 | 251 | result = ecs.service_ready(step) 252 | eq_(result, False) 253 | 254 | def test_all_services_ready(self): 255 | ecs = self._make_FUT() 256 | ecs.service_ready = mock.Mock() 257 | 258 | ecs.all_services_ready(ecs._plan["steps"]) 259 | ecs.service_ready.assert_called() 260 | 261 | def test_service_done_true(self): 262 | ecs = self._make_FUT() 263 | step = ecs._plan["steps"][0] 264 | 265 | ecs._ecs_client.describe_services.return_value = { 266 | "services": [{ 267 | "status": "INACTIVE" 268 | }] 269 | } 270 | 271 | result = ecs.service_done(step) 272 | eq_(result, True) 273 | 274 | def test_service_not_known(self): 275 | ecs = self._make_FUT() 276 | step = ecs._plan["steps"][0] 277 | 278 | ecs._ecs_client.describe_services.return_value = { 279 | "services": [{ 280 | "status": "DRAINING" 281 | }] 282 | } 283 | 284 | result = ecs.service_done(step) 285 | eq_(result, False) 286 | 287 | def test_all_services_done(self): 288 | ecs = self._make_FUT() 289 | ecs.service_done = mock.Mock() 290 | ecs.all_services_done(ecs._plan["steps"]) 291 | ecs.service_done.assert_called() 292 | 293 | def test_stop_finished_service_stopped(self): 294 | ecs = self._make_FUT() 295 | ecs._ecs_client.update_service = mock.Mock() 296 | step = ecs._plan["steps"][0] 297 | step["service_status"] = "STARTED" 298 | past = time.time() - 400 299 | ecs.stop_finished_service(past, step) 300 | ecs._ecs_client.update_service.assert_called() 301 | eq_(step["service_status"], "STOPPED") 302 | 303 | def test_stop_finished_service_stop_already_stopped(self): 304 | ecs = self._make_FUT() 305 | ecs._ecs_client.update_service = mock.Mock() 306 | step = ecs._plan["steps"][0] 307 | step["service_status"] = "STOPPED" 308 | past = time.time() - 400 309 | ecs.stop_finished_service(past, step) 310 | ecs._ecs_client.update_service.assert_not_called() 311 | eq_(step["service_status"], "STOPPED") 312 | 313 | def test_stop_finished_service_still_running(self): 314 | ecs = self._make_FUT() 315 | ecs._ecs_client.update_service = mock.Mock() 316 | step = ecs._plan["steps"][0] 317 | step["service_status"] = "STARTED" 318 | past = time.time() - 100 319 | ecs.stop_finished_service(past, step) 320 | ecs._ecs_client.update_service.assert_not_called() 321 | eq_(step["service_status"], "STARTED") 322 | 323 | def test_stop_finished_services(self): 324 | ecs = self._make_FUT() 325 | ecs.stop_finished_service = mock.Mock() 326 | 327 | past = time.time() - 100 328 | ecs.stop_finished_services(past, ecs._plan["steps"]) 329 | ecs.stop_finished_service.assert_called() 330 | 331 | def test_shutdown_plan(self): 332 | mock_paginator = mock.Mock() 333 | mock_paginator.paginate.return_value = [ 334 | {"serviceArns": ["arn:123:::", "arn:456:::"]} 335 | ] 336 | 337 | ecs = self._make_FUT() 338 | ecs.locate_metrics_service = mock.Mock() 339 | ecs.locate_metrics_service.return_value = dict( 340 | serviceArn="arn:456:::" 341 | ) 342 | ecs._ecs_client.get_paginator.return_value = mock_paginator 343 | ecs._ecs_client.describe_task_definition.return_value = { 344 | "taskDefinition": {"taskDefinitionArn": "arn:task:::"} 345 | } 346 | 347 | ecs.shutdown_plan(ecs._plan["steps"]) 348 | ecs._ecs_client.deregister_task_definition.assert_called() 349 | ecs._ecs_client.delete_service.assert_called() 350 | 351 | def test_shutdown_plan_update_error(self): 352 | from botocore.exceptions import ClientError 353 | 354 | mock_paginator = mock.Mock() 355 | mock_paginator.paginate.return_value = [ 356 | {"serviceArns": ["arn:123:::", "arn:456:::"]} 357 | ] 358 | 359 | ecs = self._make_FUT() 360 | ecs.locate_metrics_service = mock.Mock() 361 | ecs.locate_metrics_service.return_value = dict( 362 | serviceArn="arn:456:::" 363 | ) 364 | ecs._ecs_client.get_paginator.return_value = mock_paginator 365 | ecs._ecs_client.describe_task_definition.return_value = { 366 | "taskDefinition": {"taskDefinitionArn": "arn:task:::"} 367 | } 368 | ecs._ecs_client.update_service.side_effect = ClientError( 369 | {"Error": {}}, "some_op" 370 | ) 371 | 372 | ecs.shutdown_plan(ecs._plan["steps"]) 373 | ecs._ecs_client.delete_service.assert_not_called() 374 | 375 | def test_shutdown_plan_describe_error(self): 376 | from botocore.exceptions import ClientError 377 | 378 | mock_paginator = mock.Mock() 379 | mock_paginator.paginate.return_value = [ 380 | {"serviceArns": ["arn:123:::", "arn:456:::"]} 381 | ] 382 | 383 | ecs = self._make_FUT() 384 | ecs.locate_metrics_service = mock.Mock() 385 | ecs.locate_metrics_service.return_value = dict( 386 | serviceArn="arn:456:::" 387 | ) 388 | ecs._plan["steps"] = ecs._plan["steps"][:1] 389 | ecs._ecs_client.get_paginator.return_value = mock_paginator 390 | ecs._ecs_client.describe_task_definition.side_effect = ClientError( 391 | {"Error": {}}, "some_op" 392 | ) 393 | 394 | ecs.shutdown_plan(ecs._plan["steps"]) 395 | ecs._ecs_client.deregister_task_definition.assert_not_called() 396 | 397 | def test_shutdown_plan_delete_error(self): 398 | from botocore.exceptions import ClientError 399 | 400 | mock_paginator = mock.Mock() 401 | mock_paginator.paginate.return_value = [ 402 | {"serviceArns": ["arn:123:::", "arn:456:::"]} 403 | ] 404 | 405 | ecs = self._make_FUT() 406 | ecs.locate_metrics_service = mock.Mock() 407 | ecs.locate_metrics_service.return_value = dict( 408 | serviceArn="arn:456:::" 409 | ) 410 | ecs._ecs_client.get_paginator.return_value = mock_paginator 411 | ecs._ecs_client.describe_task_definition.return_value = { 412 | "taskDefinition": {"taskDefinitionArn": "arn:task:::"} 413 | } 414 | ecs._ecs_client.delete_service.side_effect = ClientError( 415 | {"Error": {}}, "some_op" 416 | ) 417 | 418 | ecs.shutdown_plan(ecs._plan["steps"]) 419 | ecs._ecs_client.delete_service.assert_called() 420 | 421 | def test_shutdown_plan_deregister_error(self): 422 | from botocore.exceptions import ClientError 423 | 424 | mock_paginator = mock.Mock() 425 | mock_paginator.paginate.return_value = [ 426 | {"serviceArns": ["arn:123:::", "arn:456:::"]} 427 | ] 428 | 429 | ecs = self._make_FUT() 430 | ecs.locate_metrics_service = mock.Mock() 431 | ecs.locate_metrics_service.return_value = dict( 432 | serviceArn="arn:456:::" 433 | ) 434 | ecs._plan["metrics_options"]["tear_down"] = True 435 | ecs._ecs_client.get_paginator.return_value = mock_paginator 436 | ecs._ecs_client.describe_task_definition.return_value = { 437 | "taskDefinition": {"taskDefinitionArn": "arn:task:::"} 438 | } 439 | ecs._ecs_client.deregister_task_definition.side_effect = ClientError( 440 | {"Error": {}}, "some_op" 441 | ) 442 | 443 | ecs.shutdown_plan(ecs._plan["steps"]) 444 | ecs._ecs_client.delete_service.assert_called() 445 | -------------------------------------------------------------------------------- /tests/test_metric_creator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | import mock 5 | from nose.tools import assert_raises, eq_ 6 | 7 | 8 | class TestMetricRunner(unittest.TestCase): 9 | def _make_FUT(self): 10 | from ardere.scripts.metric_creator import DashboardSetup 11 | # Setup the env vars we need 12 | os.environ["__ARDERE_INFLUXDB_NAME__"] = "ardere" 13 | return DashboardSetup() 14 | 15 | def test_load_dashboard(self): 16 | ds = self._make_FUT() 17 | mock_file = mock.Mock() 18 | mock_file.get.return_value = {"Body": mock_file} 19 | mock_file.read.return_value = "{}".encode( 20 | 'utf-8') 21 | mock_s3_obj = mock.Mock() 22 | mock_s3_obj.Object.return_value = mock_file 23 | 24 | ds.boto = mock.Mock() 25 | ds.boto.resource.return_value = mock_s3_obj 26 | ds.dashboard = "asdf:asdf" 27 | result = ds._load_dashboard() 28 | eq_(result, dict(id=None, title=None)) 29 | 30 | def test_create_dashboard(self): 31 | ds = self._make_FUT() 32 | ds._load_dashboard = mock.Mock() 33 | ds.req = mock.Mock() 34 | ds.req.post.return_value = mock.Mock(status_code=200) 35 | ds._create_dashboard("http://localhost") 36 | ds._load_dashboard.assert_called() 37 | 38 | def test_create_dashboard_exception(self): 39 | ds = self._make_FUT() 40 | ds._load_dashboard = mock.Mock() 41 | ds.req = mock.Mock() 42 | ds.req.post.return_value = mock.Mock(status_code=500) 43 | assert_raises(Exception, ds._create_dashboard, "http://localhost") 44 | 45 | def test_ensure_dashboard_create(self): 46 | ds = self._make_FUT() 47 | ds.req = mock.Mock() 48 | mock_response = mock.Mock() 49 | mock_response.status_code = 200 50 | mock_response.json.return_value = [] 51 | ds._create_dashboard = mock.Mock() 52 | ds.req.get.return_value = mock_response 53 | 54 | ds._ensure_dashboard("http://localhost") 55 | ds._create_dashboard.assert_called() 56 | 57 | def test_ensure_dashboard_exception(self): 58 | ds = self._make_FUT() 59 | ds.req = mock.Mock() 60 | mock_response = mock.Mock() 61 | mock_response.status_code = 500 62 | ds.req.get.return_value = mock_response 63 | assert_raises(Exception, ds._ensure_dashboard, "http://localhost") 64 | 65 | def test_create_datasources(self): 66 | ds = self._make_FUT() 67 | ds.dashboard = True 68 | ds.influx = mock.Mock() 69 | ds.req = mock.Mock() 70 | mock_client = mock.Mock() 71 | ds._ensure_dashboard = mock.Mock() 72 | ds.influx.InfluxDBClient.return_value = mock_client 73 | 74 | ds.create_datasources() 75 | mock_client.create_database.assert_called() 76 | ds._ensure_dashboard.assert_called() 77 | -------------------------------------------------------------------------------- /tests/test_step_functions.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import time 4 | import unittest 5 | import uuid 6 | 7 | import mock 8 | from botocore.exceptions import ClientError 9 | from nose.tools import eq_, assert_raises 10 | 11 | from tests import fixtures 12 | 13 | 14 | class TestAsyncPlanRunner(unittest.TestCase): 15 | def setUp(self): 16 | self.mock_ecs = mock.Mock() 17 | self._patcher = mock.patch("ardere.step_functions.ECSManager") 18 | mock_manager = self._patcher.start() 19 | mock_manager.return_value = self.mock_ecs 20 | 21 | from ardere.step_functions import AsynchronousPlanRunner 22 | 23 | self.plan = json.loads(fixtures.sample_basic_test_plan) 24 | self.runner = AsynchronousPlanRunner(self.plan, {}) 25 | self.runner.boto = self.mock_boto = mock.Mock() 26 | 27 | def tearDown(self): 28 | self._patcher.stop() 29 | 30 | def test_build_instance_map(self): 31 | result = self.runner._build_instance_map() 32 | eq_(len(result), 1) 33 | eq_(result, {"t2.medium": 1}) 34 | 35 | def test_find_test_plan_duration(self): 36 | result = self.runner._find_test_plan_duration() 37 | eq_(result, 140) 38 | 39 | def test_load_toml(self): 40 | from ardere.step_functions import AsynchronousPlanRunner 41 | 42 | self.runner = AsynchronousPlanRunner({"toml": fixtures.sample_toml}, 43 | None) 44 | eq_(len(self.runner.event["steps"]), 2) 45 | eq_(self.runner.event["steps"][0]["instance_count"], 8) 46 | eq_(self.runner.event["ecs_name"], "ardere-test") 47 | 48 | def test_populate_missing_instances(self): 49 | os.environ["ec2_sg"] = "i-23232" 50 | os.environ["metric_sg"] = "i-84828" 51 | self.mock_ecs.has_metrics_node.return_value = False 52 | self.runner.populate_missing_instances() 53 | self.mock_ecs.query_active_instances.assert_called() 54 | self.mock_ecs.request_instances.assert_called() 55 | 56 | def test_populate_missing_instances_fail(self): 57 | from ardere.exceptions import ValidationException 58 | mock_client = mock.Mock() 59 | self.mock_boto.client.return_value = mock_client 60 | mock_client.describe_clusters.return_value = {"clusters": []} 61 | assert_raises(ValidationException, 62 | self.runner.populate_missing_instances) 63 | 64 | def test_ensure_metrics_available_running_create(self): 65 | from ardere.exceptions import ServicesStartingException 66 | 67 | self.plan["metrics_options"] = dict(enabled=True) 68 | self.mock_ecs.locate_metrics_service.return_value = None 69 | 70 | assert_raises(ServicesStartingException, 71 | self.runner.ensure_metrics_available) 72 | self.mock_ecs.create_metrics_service.assert_called() 73 | 74 | def test_ensure_metrics_available_running_waiting(self): 75 | from ardere.exceptions import ServicesStartingException 76 | 77 | self.plan["metrics_options"] = dict(enabled=True) 78 | self.mock_ecs.locate_metrics_service.return_value = { 79 | "deployments": [{ 80 | "desiredCount": 1, 81 | "runningCount": 0 82 | }] 83 | } 84 | 85 | assert_raises(ServicesStartingException, 86 | self.runner.ensure_metrics_available) 87 | 88 | def test_ensure_metrics_available_running_error(self): 89 | self.plan["metrics_options"] = dict(enabled=True) 90 | self.mock_ecs.locate_metrics_service.return_value = { 91 | "deployments": [{ 92 | "desiredCount": 1, 93 | "runningCount": 1 94 | }] 95 | } 96 | self.mock_ecs.locate_metrics_container_ip.return_value = None 97 | 98 | assert_raises(Exception, self.runner.ensure_metrics_available) 99 | 100 | def test_ensure_metrics_available_running(self): 101 | os.environ["metrics_bucket"] = "metrics" 102 | self.plan["metrics_options"] = dict( 103 | enabled=True, 104 | dashboard=dict(admin_user="admin", 105 | admin_password="admin", name="fred", 106 | filename="smith") 107 | ) 108 | self.mock_ecs.locate_metrics_service.return_value = { 109 | "deployments": [{ 110 | "desiredCount": 1, 111 | "runningCount": 1 112 | }] 113 | } 114 | self.mock_ecs.locate_metrics_container_ip.return_value = ( 115 | "1.1.1.1", "arn:::" 116 | ) 117 | 118 | self.runner.ensure_metrics_available() 119 | self.mock_ecs.locate_metrics_container_ip.assert_called() 120 | 121 | def test_ensure_metrics_available_running_no_metric_ip(self): 122 | os.environ["metrics_bucket"] = "metrics" 123 | self.plan["metrics_options"] = dict( 124 | enabled=True, 125 | dashboard=dict(admin_user="admin", 126 | admin_password="admin", name="fred", 127 | filename="smith") 128 | ) 129 | self.mock_ecs.locate_metrics_service.return_value = { 130 | "deployments": [{ 131 | "desiredCount": 1, 132 | "runningCount": 1 133 | }] 134 | } 135 | self.mock_ecs.locate_metrics_container_ip.return_value = ( 136 | None, None 137 | ) 138 | 139 | assert_raises(Exception, self.runner.ensure_metrics_available) 140 | self.mock_ecs.locate_metrics_container_ip.assert_called() 141 | 142 | def test_ensure_metrics_available_disabled(self): 143 | self.plan["metrics_options"] = dict(enabled=False) 144 | self.runner.ensure_metrics_available() 145 | 146 | def test_ensure_metric_sources_created(self): 147 | os.environ["metrics_bucket"] = "metrics" 148 | self.plan["influxdb_private_ip"] = "1.1.1.1" 149 | self.plan["metrics_options"] = dict( 150 | enabled=True, 151 | dashboard=dict() 152 | ) 153 | self.mock_ecs.has_started_metric_creation.return_value = True 154 | self.runner.ensure_metric_sources_created() 155 | self.mock_ecs.has_started_metric_creation.assert_called() 156 | 157 | def test_ensure_metric_sources_created_not_finished(self): 158 | from ardere.exceptions import CreatingMetricSourceException 159 | os.environ["metrics_bucket"] = "metrics" 160 | self.plan["influxdb_private_ip"] = "1.1.1.1" 161 | self.plan["metrics_options"] = dict( 162 | enabled=True, 163 | ) 164 | self.mock_ecs.has_started_metric_creation.return_value = True 165 | self.mock_ecs.has_finished_metric_creation.return_value = False 166 | assert_raises(CreatingMetricSourceException, 167 | self.runner.ensure_metric_sources_created) 168 | self.mock_ecs.has_started_metric_creation.assert_called() 169 | 170 | def test_ensure_metric_sources_created_not_enabled(self): 171 | self.plan["metrics_options"] = dict( 172 | enabled=False, 173 | dashboard=dict() 174 | ) 175 | self.runner.ensure_metric_sources_created() 176 | 177 | def test_ensure_metric_sources_created_not_started(self): 178 | from ardere.exceptions import CreatingMetricSourceException 179 | os.environ["metrics_bucket"] = "metrics" 180 | self.plan["influxdb_private_ip"] = "1.1.1.1" 181 | self.plan["metric_container_arn"] = "arn:::" 182 | self.plan["metrics_options"] = dict( 183 | enabled=True, 184 | dashboard=dict( 185 | admin_user="admin", 186 | admin_password="admin", 187 | filename="asdf", 188 | name="a title" 189 | ) 190 | ) 191 | self.mock_ecs.has_started_metric_creation.return_value = False 192 | assert_raises(CreatingMetricSourceException, 193 | self.runner.ensure_metric_sources_created) 194 | self.mock_ecs.has_started_metric_creation.assert_called() 195 | 196 | def test_ensure_metric_sources_created_not_started_no_dash(self): 197 | from ardere.exceptions import CreatingMetricSourceException 198 | os.environ["metrics_bucket"] = "metrics" 199 | self.plan["influxdb_private_ip"] = "1.1.1.1" 200 | self.plan["metric_container_arn"] = "arn:::" 201 | self.plan["metrics_options"] = dict( 202 | enabled=True, 203 | ) 204 | self.mock_ecs.has_started_metric_creation.return_value = False 205 | assert_raises(CreatingMetricSourceException, 206 | self.runner.ensure_metric_sources_created) 207 | self.mock_ecs.has_started_metric_creation.assert_called() 208 | 209 | def test_create_ecs_services(self): 210 | self.runner.create_ecs_services() 211 | self.mock_ecs.create_services.assert_called_with(self.plan["steps"]) 212 | 213 | def test_wait_for_cluster_ready_not_ready(self): 214 | from ardere.exceptions import ServicesStartingException 215 | 216 | self.mock_ecs.all_services_ready.return_value = False 217 | assert_raises(ServicesStartingException, 218 | self.runner.wait_for_cluster_ready) 219 | 220 | def test_wait_for_cluster_ready_all_ready(self): 221 | self.mock_ecs.all_services_ready.return_value = True 222 | self.runner.wait_for_cluster_ready() 223 | self.mock_ecs.all_services_ready.assert_called() 224 | 225 | def test_signal_cluster_start(self): 226 | self.plan["plan_run_uuid"] = str(uuid.uuid4()) 227 | 228 | self.runner.signal_cluster_start() 229 | self.mock_boto.client.assert_called() 230 | 231 | def test_check_for_cluster_done_not_done(self): 232 | os.environ["s3_ready_bucket"] = "test_bucket" 233 | mock_file = mock.Mock() 234 | mock_file.get.return_value = {"Body": mock_file} 235 | mock_file.read.return_value = "{}".format( 236 | int(time.time()) - 100).encode( 237 | 'utf-8') 238 | mock_s3_obj = mock.Mock() 239 | mock_s3_obj.Object.return_value = mock_file 240 | self.mock_boto.resource.return_value = mock_s3_obj 241 | 242 | self.plan["plan_run_uuid"] = str(uuid.uuid4()) 243 | self.runner.check_for_cluster_done() 244 | 245 | def test_check_for_cluster_done_shutdown(self): 246 | from ardere.exceptions import ShutdownPlanException 247 | 248 | os.environ["s3_ready_bucket"] = "test_bucket" 249 | mock_file = mock.Mock() 250 | mock_file.get.return_value = {"Body": mock_file} 251 | mock_file.read.return_value = "{}".format( 252 | int(time.time()) - 400).encode( 253 | 'utf-8') 254 | mock_s3_obj = mock.Mock() 255 | mock_s3_obj.Object.return_value = mock_file 256 | self.mock_boto.resource.return_value = mock_s3_obj 257 | 258 | self.plan["plan_run_uuid"] = str(uuid.uuid4()) 259 | assert_raises(ShutdownPlanException, self.runner.check_for_cluster_done) 260 | 261 | def test_check_for_cluster_done_object_error(self): 262 | from ardere.exceptions import ShutdownPlanException 263 | 264 | os.environ["s3_ready_bucket"] = "test_bucket" 265 | mock_file = mock.Mock() 266 | mock_file.get.return_value = {"Body": mock_file} 267 | mock_file.read.return_value = "{}".format( 268 | int(time.time()) - 400).encode( 269 | 'utf-8') 270 | mock_s3_obj = mock.Mock() 271 | mock_s3_obj.Object.side_effect = ClientError( 272 | {"Error": {}}, None 273 | ) 274 | self.mock_boto.resource.return_value = mock_s3_obj 275 | 276 | self.plan["plan_run_uuid"] = str(uuid.uuid4()) 277 | assert_raises(ShutdownPlanException, self.runner.check_for_cluster_done) 278 | 279 | def test_cleanup_cluster(self): 280 | self.plan["plan_run_uuid"] = str(uuid.uuid4()) 281 | 282 | self.runner.cleanup_cluster() 283 | self.mock_boto.resource.assert_called() 284 | 285 | def test_cleanup_cluster_error(self): 286 | self.plan["plan_run_uuid"] = str(uuid.uuid4()) 287 | 288 | mock_s3 = mock.Mock() 289 | self.mock_boto.resource.return_value = mock_s3 290 | mock_s3.Object.side_effect = ClientError( 291 | {"Error": {}}, None 292 | ) 293 | self.runner.cleanup_cluster() 294 | mock_s3.Object.assert_called() 295 | 296 | def test_drain_check_draining(self): 297 | from ardere.exceptions import UndrainedInstancesException 298 | self.mock_ecs.all_services_done.return_value = True 299 | self.runner.check_drained() 300 | self.mock_ecs.all_services_done.return_value = False 301 | assert_raises(UndrainedInstancesException, 302 | self.runner.check_drained) 303 | 304 | 305 | class TestValidation(unittest.TestCase): 306 | def _make_FUT(self): 307 | from ardere.step_functions import PlanValidator 308 | return PlanValidator() 309 | 310 | def test_validate_success(self): 311 | schema = self._make_FUT() 312 | schema.context["boto"] = mock.Mock() 313 | plan = json.loads(fixtures.sample_basic_test_plan) 314 | data, errors = schema.load(plan) 315 | eq_(errors, {}) 316 | eq_(len(data["steps"]), len(plan["steps"])) 317 | 318 | def test_validate_fail_ecs_name(self): 319 | schema = self._make_FUT() 320 | schema.context["boto"] = mock.Mock() 321 | plan = json.loads(fixtures.sample_basic_test_plan) 322 | plan['ecs_name'] = '' 323 | data, errors = schema.load(plan) 324 | eq_(errors, {'ecs_name': ['Plan ecs_name missing']}) 325 | plan['ecs_name'] += '*' 326 | data, errors = schema.load(plan) 327 | eq_(errors, {'ecs_name': 328 | ['Plan ecs_name contained invalid characters']}) 329 | plan['ecs_name'] = 'a' * 512 330 | data, errors = schema.load(plan) 331 | eq_(errors, {'ecs_name': ['Plan ecs_name too long']}) 332 | 333 | def test_validate_fail_step_name(self): 334 | schema = self._make_FUT() 335 | schema.context["boto"] = mock.Mock() 336 | plan = json.loads(fixtures.sample_basic_test_plan) 337 | plan['steps'][0]['name'] = '' 338 | data, errors = schema.load(plan) 339 | eq_(errors, {'steps': {0: {'name': ['Step name missing']}}}) 340 | plan['steps'][0]['name'] = '*' 341 | data, errors = schema.load(plan) 342 | eq_(errors, 343 | {'steps': {0: {'name': ['Step name contains invalid characters']}}} 344 | ) 345 | plan['steps'][0]['name'] = 'a' * 512 346 | data, errors = schema.load(plan) 347 | eq_(errors, {'steps': {0: {'name': ['Step name too long']}}}) 348 | 349 | def test_validate_fail(self): 350 | schema = self._make_FUT() 351 | schema.context["boto"] = mock_boto = mock.Mock() 352 | mock_client = mock.Mock() 353 | mock_boto.client.return_value = mock_client 354 | mock_client.describe_clusters.return_value = {"clusters": []} 355 | plan = json.loads(fixtures.sample_basic_test_plan) 356 | data, errors = schema.load(plan) 357 | eq_(len(data["steps"]), len(plan["steps"])) 358 | eq_(len(errors), 1) 359 | --------------------------------------------------------------------------------