├── .clog.toml
├── .coveragerc
├── .flooignore
├── .gitignore
├── .travis.yml
├── CHANGELOG.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── ardere
    ├── __init__.py
    ├── aws.py
    ├── exceptions.py
    ├── scripts
    │   ├── __init__.py
    │   └── metric_creator.py
    └── step_functions.py
├── config.bash
├── default_dashboard.json
├── handler.py
├── package.json
├── requirements.txt
├── serverless.yml
├── setup.cfg
├── src
    └── shell
    │   ├── telegraf.toml
    │   └── waitforcluster.sh
├── test-requirements.txt
└── tests
    ├── __init__.py
    ├── fixtures.py
    ├── test_aws.py
    ├── test_metric_creator.py
    └── test_step_functions.py


/.clog.toml:
--------------------------------------------------------------------------------
 1 | [clog]
 2 | repository = "https://github.com/loads/ardere"
 3 | changelog = "CHANGELOG.md"
 4 | from-latest-tag = true
 5 | link-style = "github"
 6 | 
 7 | [sections]
 8 | Refactor = ["refactor"]
 9 | Test = ["test"]
10 | Doc = ["docs"]
11 | Chore = ["chore"]
12 | Features = ["feat", "feature"]
13 | "Bug Fixes" = ["fix", "bug"]
14 | 


--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [report]
2 | show_missing = True
3 | 


--------------------------------------------------------------------------------
/.flooignore:
--------------------------------------------------------------------------------
 1 | # Distribution / packaging
 2 | .Python
 3 | env/
 4 | build/
 5 | develop-eggs/
 6 | dist/
 7 | downloads/
 8 | eggs/
 9 | .eggs/
10 | lib/
11 | lib64/
12 | parts/
13 | sdist/
14 | var/
15 | *.egg-info/
16 | .installed.cfg
17 | *.egg
18 | node_modules/
19 | 
20 | # Serverless directories
21 | .serverless
22 | .requirements
23 | ardenv/
24 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Distribution / packaging
 2 | .coverage
 3 | .floo
 4 | .idea
 5 | .Python
 6 | .requirements
 7 | .npmignore
 8 | ardenv/
 9 | env/
10 | build/
11 | develop-eggs/
12 | dist/
13 | downloads/
14 | eggs/
15 | .eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | *.egg-info/
22 | .installed.cfg
23 | *.pyc
24 | *.egg
25 | node_modules/
26 | 
27 | # Serverless directories
28 | .serverless
29 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | cache: pip
 3 | sudo: required
 4 | dist: precise
 5 | 
 6 | matrix:
 7 |   include:
 8 |     - python: 2.7
 9 |       env: CODECOV=true
10 | 
11 | install:
12 | - pip install -r test-requirements.txt
13 | - pip install ${CODECOV:+codecov}
14 | script:
15 | - nosetests -d tests ${CODECOV:+--with-coverage --cover-xml --cover-package=ardere}
16 | after_success:
17 | - codecov
18 | notifications:
19 |   slack:
20 |     secure: vT9sWtUuxk28g6xYKAsQmiPZllErOYVfx5lcL+/jo1eRFrmbpYnyndT6s+FxGI1547oizZ0IqZbHVvB7BUoSJixXJyQJYXW2MchwN1UeHrey8mYpF1GNEaJT7FMfqSkxUU9gvAZ3IU7zstNeTLbfG1GkLuzybp0WAiHl/ocUTz8=
21 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | <a name="0.1.1"></a>
 2 | ## 0.1.1 (2017-05-15)
 3 | 
 4 | 
 5 | #### Doc
 6 | 
 7 | *   update README with run steps ([3d6b5aa2](https://github.com/loads/ardere/commit/3d6b5aa2e6277a33e1a464d30168bbc2f406c512))
 8 | 
 9 | #### Bug Fixes
10 | 
11 | *   bump wait for cluster ready from 10 to 30 minutes ([a23115b8](https://github.com/loads/ardere/commit/a23115b8bc20f4e7b44ef4bf78b3687069ea1253))
12 | 
13 | 
14 | 
15 | <a name="0.1"></a>
16 | ## 0.1 (2017-04-25)
17 | 
18 | 
19 | #### Features
20 | 
21 | *   secure metrics/load-test nodes from outside access ([3d08dccd](https://github.com/loads/ardere/commit/3d08dccd2376f85976b2f7bd026295c504560485), closes [#54](https://github.com/loads/ardere/issues/54))
22 | *   Check names for invalid characters and lengths ([c886f6a9](https://github.com/loads/ardere/commit/c886f6a9598badb084871720515ff1663e61c032))
23 | *   use security groups to restrict node access ([6395f9cd](https://github.com/loads/ardere/commit/6395f9cd52ab0c74a2735a8fecc2b30a217ddfda), closes [#48](https://github.com/loads/ardere/issues/48))
24 | *   add grafana dashboarding ([a7a30df8](https://github.com/loads/ardere/commit/a7a30df8210429341e711ad713510e00acdc80c1), closes [#40](https://github.com/loads/ardere/issues/40))
25 | *   add telegraf setup for per-container stat reporting ([7749e2eb](https://github.com/loads/ardere/commit/7749e2eb373a6f6afc49b2a7d03fcf5c4f9a18fb), closes [#33](https://github.com/loads/ardere/issues/33))
26 | *   start influxdb with test runs ([8ddc48b5](https://github.com/loads/ardere/commit/8ddc48b5d3d395d54a914166e9803a6ab41ecf3f), closes [#19](https://github.com/loads/ardere/issues/19))
27 | *   validate test plan before running ([0314fae7](https://github.com/loads/ardere/commit/0314fae70962f6281a261499e32500291ff764ab), closes [#21](https://github.com/loads/ardere/issues/21))
28 | *   remove need to specify cpu_units ([e99eddea](https://github.com/loads/ardere/commit/e99eddead4b4119e508546aa38dc34873efa9632), closes [#20](https://github.com/loads/ardere/issues/20))
29 | *   add port mapping for containers ([af054af1](https://github.com/loads/ardere/commit/af054af18e6ab5e4cd163c903867dc2cfe415168), closes [#24](https://github.com/loads/ardere/issues/24))
30 | *   add toml loading as a test plan ([8342cb11](https://github.com/loads/ardere/commit/8342cb11902f6a225925cd1f8fd430d31a614cf9), closes [#32](https://github.com/loads/ardere/issues/32))
31 | *   use cloudwatch logs for container output ([8bafa09f](https://github.com/loads/ardere/commit/8bafa09f82ad0116e31cc49849b7bd679219506c), closes [#27](https://github.com/loads/ardere/issues/27))
32 | *   setup environment data from the test plan ([7e2ad2da](https://github.com/loads/ardere/commit/7e2ad2dad361336a4d46166e6aec32cd80c15e03), closes [#25](https://github.com/loads/ardere/issues/25))
33 | *   fixup readme and test suite ([047a7fa6](https://github.com/loads/ardere/commit/047a7fa6381f4d034fd0c2955e90319a29730c76), closes [#22](https://github.com/loads/ardere/issues/22))
34 | *   create MVP using serverless w/python ([9aa80467](https://github.com/loads/ardere/commit/9aa80467ce86b95e330886c1dcf57e5d84004e83), closes [#17](https://github.com/loads/ardere/issues/17))
35 | *   add the lambda to start the run by writing to s3 ([e45a2789](https://github.com/loads/ardere/commit/e45a278930589b8dddbf88e3fe151f979d388edd))
36 | *   add lambda function and basic CF templates for use ([0cb63bff](https://github.com/loads/ardere/commit/0cb63bff8f1d7b2533ee40a81a932e3bb618236f), closes [#11](https://github.com/loads/ardere/issues/11))
37 | *   add an initial state machine impl ([2f571b0a](https://github.com/loads/ardere/commit/2f571b0aec7df9252c8d0fce44da252c17985fa2))
38 | *   initial waiter script (#9) ([c07749c0](https://github.com/loads/ardere/commit/c07749c06a97bba50fe1701a2896d9b5a11dd18e))
39 | 
40 | #### Doc
41 | 
42 | *   update for use of cloud formation in setup (#2) ([243a4a11](https://github.com/loads/ardere/commit/243a4a11da3343735815dd42a0c78bb6936adf56))
43 | *   initial design docs from autoconf ([eead6dd8](https://github.com/loads/ardere/commit/eead6dd80a43c24b40047fc5c22571122878ce05))
44 | 
45 | #### Bug Fixes
46 | 
47 | *   check service drained vs container draining ([fd4907e1](https://github.com/loads/ardere/commit/fd4907e10be9103cc9e20511c2e16c4ae906e469), closes [#62](https://github.com/loads/ardere/issues/62))
48 | *   Do not check 'metrics' instance for draining ([40e8cd01](https://github.com/loads/ardere/commit/40e8cd01fc996f1596370c5ddb6ff6998b04ffdc))
49 | *   Ensure all containers drained before exiting ([4cbea2fd](https://github.com/loads/ardere/commit/4cbea2fd0a280993d4312f82ba52354a0bf15f7f))
50 | *   add proper tagging and socket limits ([15dc023e](https://github.com/loads/ardere/commit/15dc023efc91a0b3b644084a71f3f6f46be77158), closes [#44](https://github.com/loads/ardere/issues/44))
51 | 
52 | 
53 | 
54 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contribution Guidelines
 2 | 
 3 | Anyone is welcome to contribute to this project. Feel free to get in touch with
 4 | other community members on IRC, the mailing list or through issues here on
 5 | GitHub.
 6 | 
 7 | [See the README](/README.md) for contact information.
 8 | 
 9 | ## Bug Reports
10 | 
11 | You can file issues here on GitHub. Please try to include as much information as
12 | you can and under what conditions you saw the issue.
13 | 
14 | ## Sending Pull Requests
15 | 
16 | Patches should be submitted as pull requests (PR).
17 | 
18 | Before submitting a PR:
19 | - Your code must run and pass all the automated tests before you submit your PR
20 |   for review. "Work in progress" pull requests are allowed to be submitted, but
21 |   should be clearly labeled as such and should not be merged until all tests
22 |   pass and the code has been reviewed.
23 | - Your patch should include new tests that cover your changes. It is your and
24 |   your reviewer's responsibility to ensure your patch includes adequate tests.
25 | 
26 | When submitting a PR:
27 | - You agree to license your code under the project's open source license
28 |   ([MPL 2.0](/LICENSE)).
29 | - Base your branch off the current `master` (see below for an example workflow).
30 | - Add both your code and new tests if relevant.
31 | - Run the test suite to make sure your code passes linting and tests.
32 | - Please do not include merge commits in pull requests; include only commits with the new relevant code.
33 | 
34 | See the main [README.md](/README.md) for information on prerequisites, installing, running and testing.
35 | 
36 | ## Code Review
37 | 
38 | This project is production Mozilla code and subject to our [engineering practices and quality standards](https://developer.mozilla.org/en-US/docs/Mozilla/Developer_guide/Committing_Rules_and_Responsibilities). Every patch must be peer reviewed.
39 | 
40 | ## Git Commit Guidelines
41 | 
42 | We loosely follow the [Angular commit guidelines](https://github.com/angular/angular.js/blob/master/CONTRIBUTING.md#type) of `<type>(<scope>): <subject>` where `type` must be one of:
43 | 
44 | * **feat**: A new feature
45 | * **fix**: A bug fix
46 | * **docs**: Documentation only changes
47 | * **style**: Changes that do not affect the meaning of the code (white-space, formatting, missing
48 |   semi-colons, etc)
49 | * **refactor**: A code change that neither fixes a bug or adds a feature
50 | * **perf**: A code change that improves performance
51 | * **test**: Adding missing tests
52 | * **chore**: Changes to the build process or auxiliary tools and libraries such as documentation
53 |   generation
54 | 
55 | ### Scope
56 | The scope could be anything specifying place of the commit change.
57 | 
58 | ### Subject
59 | The subject contains succinct description of the change:
60 | 
61 | * use the imperative, present tense: "change" not "changed" nor "changes"
62 | * don't capitalize first letter
63 | * no dot (.) at the end
64 | 
65 | ###Body
66 | In order to maintain a reference to the context of the commit, add
67 | `fixes #<issue_number>` if it closes a related issue or `issue #<issue_number>`
68 | if it's a partial fix.
69 | 
70 | You can also write a detailed description of the commit: Just as in the
71 | **subject**, use the imperative, present tense: "change" not "changed" nor
72 | "changes" It should include the motivation for the change and contrast this with
73 | previous behavior.
74 | 
75 | ###Footer
76 | The footer should contain any information about **Breaking Changes** and is also
77 | the place to reference GitHub issues that this commit **Closes**.
78 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | Mozilla Public License Version 2.0
  2 | ==================================
  3 | 
  4 | 1. Definitions
  5 | --------------
  6 | 
  7 | 1.1. "Contributor"
  8 |     means each individual or legal entity that creates, contributes to
  9 |     the creation of, or owns Covered Software.
 10 | 
 11 | 1.2. "Contributor Version"
 12 |     means the combination of the Contributions of others (if any) used
 13 |     by a Contributor and that particular Contributor's Contribution.
 14 | 
 15 | 1.3. "Contribution"
 16 |     means Covered Software of a particular Contributor.
 17 | 
 18 | 1.4. "Covered Software"
 19 |     means Source Code Form to which the initial Contributor has attached
 20 |     the notice in Exhibit A, the Executable Form of such Source Code
 21 |     Form, and Modifications of such Source Code Form, in each case
 22 |     including portions thereof.
 23 | 
 24 | 1.5. "Incompatible With Secondary Licenses"
 25 |     means
 26 | 
 27 |     (a) that the initial Contributor has attached the notice described
 28 |         in Exhibit B to the Covered Software; or
 29 | 
 30 |     (b) that the Covered Software was made available under the terms of
 31 |         version 1.1 or earlier of the License, but not also under the
 32 |         terms of a Secondary License.
 33 | 
 34 | 1.6. "Executable Form"
 35 |     means any form of the work other than Source Code Form.
 36 | 
 37 | 1.7. "Larger Work"
 38 |     means a work that combines Covered Software with other material, in
 39 |     a separate file or files, that is not Covered Software.
 40 | 
 41 | 1.8. "License"
 42 |     means this document.
 43 | 
 44 | 1.9. "Licensable"
 45 |     means having the right to grant, to the maximum extent possible,
 46 |     whether at the time of the initial grant or subsequently, any and
 47 |     all of the rights conveyed by this License.
 48 | 
 49 | 1.10. "Modifications"
 50 |     means any of the following:
 51 | 
 52 |     (a) any file in Source Code Form that results from an addition to,
 53 |         deletion from, or modification of the contents of Covered
 54 |         Software; or
 55 | 
 56 |     (b) any new file in Source Code Form that contains any Covered
 57 |         Software.
 58 | 
 59 | 1.11. "Patent Claims" of a Contributor
 60 |     means any patent claim(s), including without limitation, method,
 61 |     process, and apparatus claims, in any patent Licensable by such
 62 |     Contributor that would be infringed, but for the grant of the
 63 |     License, by the making, using, selling, offering for sale, having
 64 |     made, import, or transfer of either its Contributions or its
 65 |     Contributor Version.
 66 | 
 67 | 1.12. "Secondary License"
 68 |     means either the GNU General Public License, Version 2.0, the GNU
 69 |     Lesser General Public License, Version 2.1, the GNU Affero General
 70 |     Public License, Version 3.0, or any later versions of those
 71 |     licenses.
 72 | 
 73 | 1.13. "Source Code Form"
 74 |     means the form of the work preferred for making modifications.
 75 | 
 76 | 1.14. "You" (or "Your")
 77 |     means an individual or a legal entity exercising rights under this
 78 |     License. For legal entities, "You" includes any entity that
 79 |     controls, is controlled by, or is under common control with You. For
 80 |     purposes of this definition, "control" means (a) the power, direct
 81 |     or indirect, to cause the direction or management of such entity,
 82 |     whether by contract or otherwise, or (b) ownership of more than
 83 |     fifty percent (50%) of the outstanding shares or beneficial
 84 |     ownership of such entity.
 85 | 
 86 | 2. License Grants and Conditions
 87 | --------------------------------
 88 | 
 89 | 2.1. Grants
 90 | 
 91 | Each Contributor hereby grants You a world-wide, royalty-free,
 92 | non-exclusive license:
 93 | 
 94 | (a) under intellectual property rights (other than patent or trademark)
 95 |     Licensable by such Contributor to use, reproduce, make available,
 96 |     modify, display, perform, distribute, and otherwise exploit its
 97 |     Contributions, either on an unmodified basis, with Modifications, or
 98 |     as part of a Larger Work; and
 99 | 
100 | (b) under Patent Claims of such Contributor to make, use, sell, offer
101 |     for sale, have made, import, and otherwise transfer either its
102 |     Contributions or its Contributor Version.
103 | 
104 | 2.2. Effective Date
105 | 
106 | The licenses granted in Section 2.1 with respect to any Contribution
107 | become effective for each Contribution on the date the Contributor first
108 | distributes such Contribution.
109 | 
110 | 2.3. Limitations on Grant Scope
111 | 
112 | The licenses granted in this Section 2 are the only rights granted under
113 | this License. No additional rights or licenses will be implied from the
114 | distribution or licensing of Covered Software under this License.
115 | Notwithstanding Section 2.1(b) above, no patent license is granted by a
116 | Contributor:
117 | 
118 | (a) for any code that a Contributor has removed from Covered Software;
119 |     or
120 | 
121 | (b) for infringements caused by: (i) Your and any other third party's
122 |     modifications of Covered Software, or (ii) the combination of its
123 |     Contributions with other software (except as part of its Contributor
124 |     Version); or
125 | 
126 | (c) under Patent Claims infringed by Covered Software in the absence of
127 |     its Contributions.
128 | 
129 | This License does not grant any rights in the trademarks, service marks,
130 | or logos of any Contributor (except as may be necessary to comply with
131 | the notice requirements in Section 3.4).
132 | 
133 | 2.4. Subsequent Licenses
134 | 
135 | No Contributor makes additional grants as a result of Your choice to
136 | distribute the Covered Software under a subsequent version of this
137 | License (see Section 10.2) or under the terms of a Secondary License (if
138 | permitted under the terms of Section 3.3).
139 | 
140 | 2.5. Representation
141 | 
142 | Each Contributor represents that the Contributor believes its
143 | Contributions are its original creation(s) or it has sufficient rights
144 | to grant the rights to its Contributions conveyed by this License.
145 | 
146 | 2.6. Fair Use
147 | 
148 | This License is not intended to limit any rights You have under
149 | applicable copyright doctrines of fair use, fair dealing, or other
150 | equivalents.
151 | 
152 | 2.7. Conditions
153 | 
154 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
155 | in Section 2.1.
156 | 
157 | 3. Responsibilities
158 | -------------------
159 | 
160 | 3.1. Distribution of Source Form
161 | 
162 | All distribution of Covered Software in Source Code Form, including any
163 | Modifications that You create or to which You contribute, must be under
164 | the terms of this License. You must inform recipients that the Source
165 | Code Form of the Covered Software is governed by the terms of this
166 | License, and how they can obtain a copy of this License. You may not
167 | attempt to alter or restrict the recipients' rights in the Source Code
168 | Form.
169 | 
170 | 3.2. Distribution of Executable Form
171 | 
172 | If You distribute Covered Software in Executable Form then:
173 | 
174 | (a) such Covered Software must also be made available in Source Code
175 |     Form, as described in Section 3.1, and You must inform recipients of
176 |     the Executable Form how they can obtain a copy of such Source Code
177 |     Form by reasonable means in a timely manner, at a charge no more
178 |     than the cost of distribution to the recipient; and
179 | 
180 | (b) You may distribute such Executable Form under the terms of this
181 |     License, or sublicense it under different terms, provided that the
182 |     license for the Executable Form does not attempt to limit or alter
183 |     the recipients' rights in the Source Code Form under this License.
184 | 
185 | 3.3. Distribution of a Larger Work
186 | 
187 | You may create and distribute a Larger Work under terms of Your choice,
188 | provided that You also comply with the requirements of this License for
189 | the Covered Software. If the Larger Work is a combination of Covered
190 | Software with a work governed by one or more Secondary Licenses, and the
191 | Covered Software is not Incompatible With Secondary Licenses, this
192 | License permits You to additionally distribute such Covered Software
193 | under the terms of such Secondary License(s), so that the recipient of
194 | the Larger Work may, at their option, further distribute the Covered
195 | Software under the terms of either this License or such Secondary
196 | License(s).
197 | 
198 | 3.4. Notices
199 | 
200 | You may not remove or alter the substance of any license notices
201 | (including copyright notices, patent notices, disclaimers of warranty,
202 | or limitations of liability) contained within the Source Code Form of
203 | the Covered Software, except that You may alter any license notices to
204 | the extent required to remedy known factual inaccuracies.
205 | 
206 | 3.5. Application of Additional Terms
207 | 
208 | You may choose to offer, and to charge a fee for, warranty, support,
209 | indemnity or liability obligations to one or more recipients of Covered
210 | Software. However, You may do so only on Your own behalf, and not on
211 | behalf of any Contributor. You must make it absolutely clear that any
212 | such warranty, support, indemnity, or liability obligation is offered by
213 | You alone, and You hereby agree to indemnify every Contributor for any
214 | liability incurred by such Contributor as a result of warranty, support,
215 | indemnity or liability terms You offer. You may include additional
216 | disclaimers of warranty and limitations of liability specific to any
217 | jurisdiction.
218 | 
219 | 4. Inability to Comply Due to Statute or Regulation
220 | ---------------------------------------------------
221 | 
222 | If it is impossible for You to comply with any of the terms of this
223 | License with respect to some or all of the Covered Software due to
224 | statute, judicial order, or regulation then You must: (a) comply with
225 | the terms of this License to the maximum extent possible; and (b)
226 | describe the limitations and the code they affect. Such description must
227 | be placed in a text file included with all distributions of the Covered
228 | Software under this License. Except to the extent prohibited by statute
229 | or regulation, such description must be sufficiently detailed for a
230 | recipient of ordinary skill to be able to understand it.
231 | 
232 | 5. Termination
233 | --------------
234 | 
235 | 5.1. The rights granted under this License will terminate automatically
236 | if You fail to comply with any of its terms. However, if You become
237 | compliant, then the rights granted under this License from a particular
238 | Contributor are reinstated (a) provisionally, unless and until such
239 | Contributor explicitly and finally terminates Your grants, and (b) on an
240 | ongoing basis, if such Contributor fails to notify You of the
241 | non-compliance by some reasonable means prior to 60 days after You have
242 | come back into compliance. Moreover, Your grants from a particular
243 | Contributor are reinstated on an ongoing basis if such Contributor
244 | notifies You of the non-compliance by some reasonable means, this is the
245 | first time You have received notice of non-compliance with this License
246 | from such Contributor, and You become compliant prior to 30 days after
247 | Your receipt of the notice.
248 | 
249 | 5.2. If You initiate litigation against any entity by asserting a patent
250 | infringement claim (excluding declaratory judgment actions,
251 | counter-claims, and cross-claims) alleging that a Contributor Version
252 | directly or indirectly infringes any patent, then the rights granted to
253 | You by any and all Contributors for the Covered Software under Section
254 | 2.1 of this License shall terminate.
255 | 
256 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all
257 | end user license agreements (excluding distributors and resellers) which
258 | have been validly granted by You or Your distributors under this License
259 | prior to termination shall survive termination.
260 | 
261 | ************************************************************************
262 | *                                                                      *
263 | *  6. Disclaimer of Warranty                                           *
264 | *  -------------------------                                           *
265 | *                                                                      *
266 | *  Covered Software is provided under this License on an "as is"       *
267 | *  basis, without warranty of any kind, either expressed, implied, or  *
268 | *  statutory, including, without limitation, warranties that the       *
269 | *  Covered Software is free of defects, merchantable, fit for a        *
270 | *  particular purpose or non-infringing. The entire risk as to the     *
271 | *  quality and performance of the Covered Software is with You.        *
272 | *  Should any Covered Software prove defective in any respect, You     *
273 | *  (not any Contributor) assume the cost of any necessary servicing,   *
274 | *  repair, or correction. This disclaimer of warranty constitutes an   *
275 | *  essential part of this License. No use of any Covered Software is   *
276 | *  authorized under this License except under this disclaimer.         *
277 | *                                                                      *
278 | ************************************************************************
279 | 
280 | ************************************************************************
281 | *                                                                      *
282 | *  7. Limitation of Liability                                          *
283 | *  --------------------------                                          *
284 | *                                                                      *
285 | *  Under no circumstances and under no legal theory, whether tort      *
286 | *  (including negligence), contract, or otherwise, shall any           *
287 | *  Contributor, or anyone who distributes Covered Software as          *
288 | *  permitted above, be liable to You for any direct, indirect,         *
289 | *  special, incidental, or consequential damages of any character      *
290 | *  including, without limitation, damages for lost profits, loss of    *
291 | *  goodwill, work stoppage, computer failure or malfunction, or any    *
292 | *  and all other commercial damages or losses, even if such party      *
293 | *  shall have been informed of the possibility of such damages. This   *
294 | *  limitation of liability shall not apply to liability for death or   *
295 | *  personal injury resulting from such party's negligence to the       *
296 | *  extent applicable law prohibits such limitation. Some               *
297 | *  jurisdictions do not allow the exclusion or limitation of           *
298 | *  incidental or consequential damages, so this exclusion and          *
299 | *  limitation may not apply to You.                                    *
300 | *                                                                      *
301 | ************************************************************************
302 | 
303 | 8. Litigation
304 | -------------
305 | 
306 | Any litigation relating to this License may be brought only in the
307 | courts of a jurisdiction where the defendant maintains its principal
308 | place of business and such litigation shall be governed by laws of that
309 | jurisdiction, without reference to its conflict-of-law provisions.
310 | Nothing in this Section shall prevent a party's ability to bring
311 | cross-claims or counter-claims.
312 | 
313 | 9. Miscellaneous
314 | ----------------
315 | 
316 | This License represents the complete agreement concerning the subject
317 | matter hereof. If any provision of this License is held to be
318 | unenforceable, such provision shall be reformed only to the extent
319 | necessary to make it enforceable. Any law or regulation which provides
320 | that the language of a contract shall be construed against the drafter
321 | shall not be used to construe this License against a Contributor.
322 | 
323 | 10. Versions of the License
324 | ---------------------------
325 | 
326 | 10.1. New Versions
327 | 
328 | Mozilla Foundation is the license steward. Except as provided in Section
329 | 10.3, no one other than the license steward has the right to modify or
330 | publish new versions of this License. Each version will be given a
331 | distinguishing version number.
332 | 
333 | 10.2. Effect of New Versions
334 | 
335 | You may distribute the Covered Software under the terms of the version
336 | of the License under which You originally received the Covered Software,
337 | or under the terms of any subsequent version published by the license
338 | steward.
339 | 
340 | 10.3. Modified Versions
341 | 
342 | If you create software not governed by this License, and you want to
343 | create a new license for such software, you may create and use a
344 | modified version of this License if you rename the license and remove
345 | any references to the name of the license steward (except to note that
346 | such modified license differs from this License).
347 | 
348 | 10.4. Distributing Source Code Form that is Incompatible With Secondary
349 | Licenses
350 | 
351 | If You choose to distribute Source Code Form that is Incompatible With
352 | Secondary Licenses under the terms of this version of the License, the
353 | notice described in Exhibit B of this License must be attached.
354 | 
355 | Exhibit A - Source Code Form License Notice
356 | -------------------------------------------
357 | 
358 |   This Source Code Form is subject to the terms of the Mozilla Public
359 |   License, v. 2.0. If a copy of the MPL was not distributed with this
360 |   file, You can obtain one at http://mozilla.org/MPL/2.0/.
361 | 
362 | If it is not possible or desirable to put the notice in a particular
363 | file, then You may include the notice in a location (such as a LICENSE
364 | file in a relevant directory) where a recipient would be likely to look
365 | for such a notice.
366 | 
367 | You may add additional accurate notices of copyright ownership.
368 | 
369 | Exhibit B - "Incompatible With Secondary Licenses" Notice
370 | ---------------------------------------------------------
371 | 
372 |   This Source Code Form is "Incompatible With Secondary Licenses", as
373 |   defined by the Mozilla Public License, v. 2.0.
374 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # ardere
 2 | *AWS Serverless Service for Load-Testing*
 3 | 
 4 | ardere runs as a serverless service using AWS to orchestrate
 5 | load-tests consisting of docker container configurations arranged as
 6 | test plans.
 7 | 
 8 | ## Installation
 9 | 
10 | Pre-requisite: 
11 | installation requires node > v6 
12 | 
13 | To deploy ardere to your AWS account, you will need a fairly recent
14 | install of Node, then install the Node packages required:
15 | 
16 |     $ npm install
17 |     
18 | You will need to ensure your have AWS access and secret keys configured
19 | for serverless:
20 | 
21 |     $ sls config
22 |     
23 | To deploy the ardere lambda's and required AWS stack:
24 | 
25 |     $ sls deploy
26 | 
27 | Then you can deploy the ardere Step Function:
28 | 
29 |     $ sls deploy stepf
30 | 
31 | 
32 | ## Developing
33 | 
34 | ardere is written in Python and deployed via serverless to AWS. To an
35 | extent testing it on AWS is the most reliable indicator it works as
36 | intended. However, there are sets of tests that ensure the Python code
37 | is valid and works with arguments as intended that may be run locally.
38 | 
39 | Create a Python virtualenv, and install the test requirements:
40 | 
41 |     $ virtualenv ardenv
42 |     $ source ardenv/bin/activate
43 |     $ pip install -r test-requirements.txt
44 | 
45 | The tests can now be run with nose:
46 | 
47 |     $ nosetests
48 |   
49 | Note that **you cannot run the sls deploy while the virtualenv is active**
50 | due to how the serverless Python requirements plugin operates.
51 | 
52 | ## Run Test
53 | 
54 | 1. Login to AWS console
55 |    (mozilla-services use: stage)
56 | 2. Go to Step Functions > Dashboard
57 | 3. Select your state machine
58 |    (mozilla-services use: "ardere-dev-ardere")
59 | 4. Click on "New Execution" button
60 | 5. Paste your json config into text area
61 |    (example: [**mozilla-services/screenshots-loadtests** /ardere.json](https://github.com/mozilla-services/screenshots-loadtests/blob/master/ardere.json))
62 | 6. Optional: Assign a name to your execution
63 | 7. Click on "Start Execution"
64 | 8. Monitor execution in Dashboard
65 | 9. Test load should be visible in DataDog, NewRelic, etc.
66 | 
67 | ## Monitoring
68 | 
69 | ### Metrics Node Monitoring (Grafana) 
70 | 
71 | 1. ssh -L 3000:\<ip\_metrics\_node\>:3000  \<ip\_bastion\_host\>
72 | 2. open local browser to http://localhost:3000
73 | 3. login using credentials specified in your ardere (JSON) config file
74 | 


--------------------------------------------------------------------------------
/ardere/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = '0.1.1'  # pragma: nocover
2 | 


--------------------------------------------------------------------------------
/ardere/aws.py:
--------------------------------------------------------------------------------
  1 | """AWS Helper Classes"""
  2 | import logging
  3 | import os
  4 | import time
  5 | import uuid
  6 | from collections import defaultdict
  7 | 
  8 | import boto3
  9 | import botocore
 10 | from concurrent.futures import ThreadPoolExecutor
 11 | from typing import Any, Dict, List, Optional, Tuple  # noqa
 12 | 
 13 | logger = logging.getLogger()
 14 | logger.setLevel(logging.INFO)
 15 | 
 16 | # Setup script paths
 17 | dir_path = os.path.dirname(os.path.realpath(__file__))
 18 | parent_dir_path = os.path.dirname(dir_path)
 19 | wait_script_path = os.path.join(parent_dir_path, "src", "shell",
 20 |                                 "waitforcluster.sh")
 21 | telegraf_script_path = os.path.join(parent_dir_path, "src", "shell",
 22 |                                     "telegraf.toml")
 23 | metric_create_script = os.path.join(parent_dir_path, "ardere", "scripts",
 24 |                                     "metric_creator.py")
 25 | 
 26 | # EC2 userdata to setup values on load
 27 | # Settings for net.ipv4 settings based on:
 28 | #    http://stackoverflow.com/questions/410616/increasing-the-maximum-number-of-tcp-ip-connections-in-linux
 29 | # Other settings are from operations on kernel tweaks they've done to handle
 30 | # large socket conditions.
 31 | EC2_USER_DATA = """#!/bin/bash
 32 | echo ECS_CLUSTER='{ecs_name}' >> /etc/ecs/ecs.config
 33 | sysctl net.core.rmem_default=8388608
 34 | sysctl net.core.rmem_max=16777216
 35 | sysctl net.core.wmem_max=16777216
 36 | sysctl net.core.netdev_max_backlog=2500
 37 | sysctl net.core.somaxconn=3240000
 38 | sysctl net.netfilter.nf_conntrack_tcp_timeout_established=600
 39 | sysctl net.nf_conntrack_max=1000000
 40 | sysctl net.ipv4.ip_local_port_range="1024 65535"
 41 | sysctl net.ipv4.netfilter.ip_conntrack_max=4999999
 42 | sysctl net.ipv4.netfilter.ip_conntrack_tcp_timeout_time_wait=1
 43 | sysctl net.ipv4.netfilter.ip_conntrack_tcp_timeout_established=54000
 44 | sysctl net.ipv4.tcp_fin_timeout=5
 45 | sysctl net.ipv4.tcp_keepalive_time=30
 46 | sysctl net.ipv4.tcp_keepalive_intvl=15
 47 | sysctl net.ipv4.tcp_keepalive_probes=6
 48 | sysctl net.ipv4.tcp_window_scaling=1
 49 | sysctl net.ipv4.tcp_rmem="4096 87380 16777216"
 50 | sysctl net.ipv4.tcp_wmem="4096 65536 16777216"
 51 | sysctl net.ipv4.tcp_mem="786432 1048576 26777216"
 52 | sysctl net.ipv4.tcp_max_tw_buckets=360000
 53 | sysctl net.ipv4.tcp_max_syn_backlog=3240000
 54 | sysctl net.ipv4.tcp_max_tw_buckets=1440000
 55 | sysctl net.ipv4.tcp_slow_start_after_idle=0
 56 | sysctl net.ipv4.tcp_retries2=5
 57 | sysctl net.ipv4.tcp_tw_recycle=1
 58 | sysctl net.ipv4.tcp_tw_reuse=1
 59 | sysctl vm.min_free_kbytes=65536
 60 | sysctl -w fs.file-max=1000000
 61 | ulimit -n 1000000
 62 | """
 63 | 
 64 | # List tracking vcpu's of all instance types for cpu unit reservations
 65 | # We are intentionally leaving out the following instance types as they're
 66 | # considered overkill for load-testing purposes or any instance req's we have
 67 | # experienced so far:
 68 | #     P2, G2, F1, I3, D2
 69 | ec2_type_by_vcpu = {
 70 |     1: ["t2.nano", "t2.micro", "t2.small", "m3.medium"],
 71 |     2: ["t2.medium", "t2.large", "m3.large", "m4.large", "c3.large",
 72 |         "c4.large", "r3.large", "r4.large"],
 73 |     4: ["t2.xlarge", "m3.xlarge", "m4.xlarge", "c3.xlarge", "c4.xlarge",
 74 |         "r3.xlarge", "r4.xlarge"],
 75 |     8: ["t2.2xlarge", "m3.2xlarge", "m4.2xlarge", "c3.2xlarge", "c4.2xlarge",
 76 |         "r3.2xlarge", "r4.2xlarge"],
 77 |     16: ["m4.4xlarge", "c3.4xlarge", "c4.4xlarge", "r3.4xlarge", "r4.4xlarge"],
 78 |     32: ["c3.8xlarge", "r3.8xlarge", "r4.8xlarge"],
 79 |     36: ["c4.8xlarge"],
 80 |     40: ["m4.10xlarge"],
 81 |     64: ["m4.16xlarge", "x1.16xlarge", "r4.16xlarge"],
 82 |     128: ["x1.32xlarge"]
 83 | }
 84 | 
 85 | # Build a list of vcpu's by instance type
 86 | ec2_vcpu_by_type = {}
 87 | for vcpu, instance_types in ec2_type_by_vcpu.items():
 88 |     for instance_type in instance_types:
 89 |         ec2_vcpu_by_type[instance_type] = vcpu
 90 | 
 91 | 
 92 | def cpu_units_for_instance_type(instance_type):
 93 |     # type: (str) -> int
 94 |     """Calculate how many CPU units to allocate for an instance_type
 95 | 
 96 |     We calculate cpu_units as 1024 * vcpu's for each instance to allocate
 97 |     almost the entirety of the instance's cpu units to the load-testing
 98 |     container. We take out 512 to ensure some leftover capacity for other
 99 |     utility containers we run with the load-testing container.
100 | 
101 |     """
102 |     return (ec2_vcpu_by_type[instance_type] * 1024) - 512
103 | 
104 | 
105 | class ECSManager(object):
106 |     """ECS Manager queries and manages an ECS cluster"""
107 |     # For testing purposes
108 |     boto = boto3
109 | 
110 |     # ECS optimized AMI id's
111 |     ecs_ami_ids = {
112 |         "us-east-1": "ami-275ffe31",
113 |         "us-east-2": "ami-62745007",
114 |         "us-west-1": "ami-689bc208",
115 |         "us-west-2": "ami-62d35c02"
116 |     }
117 | 
118 |     influxdb_container = "influxdb:1.2-alpine"
119 |     telegraf_container = "telegraf:1.2-alpine"
120 |     grafana_container = "grafana/grafana:4.1.2"
121 |     python_container = "jfloff/alpine-python:2.7-slim"
122 | 
123 |     _wait_script = None
124 |     _telegraf_script = None
125 |     _metric_create_script = None
126 | 
127 |     def __init__(self, plan):
128 |         # type: (Dict[str, Any]) -> None
129 |         """Create and return a ECSManager for a cluster of the given name."""
130 |         self._ecs_client = self.boto.client('ecs')
131 |         self._ec2_client = self.boto.client('ec2')
132 |         self._ecs_name = plan["ecs_name"]
133 |         self._plan = plan
134 | 
135 |         # Pull out the env vars
136 |         self.s3_ready_bucket = os.environ["s3_ready_bucket"]
137 |         self.container_log_group = os.environ["container_log_group"]
138 |         self.ecs_profile = os.environ["ecs_profile"]
139 | 
140 |         if "plan_run_uuid" not in plan:
141 |             plan["plan_run_uuid"] = uuid.uuid4().hex
142 | 
143 |         self._plan_uuid = plan["plan_run_uuid"]
144 | 
145 |     @property
146 |     def wait_script(self):
147 |         if not self._wait_script:
148 |             with open(wait_script_path, 'r') as f:
149 |                 self._wait_script = f.read()
150 |         return self._wait_script
151 | 
152 |     @property
153 |     def telegraf_script(self):
154 |         if not self._telegraf_script:
155 |             with open(telegraf_script_path, 'r') as f:
156 |                 self._telegraf_script = f.read()
157 |         return self._telegraf_script
158 | 
159 |     @property
160 |     def metric_create_script(self):
161 |         if not self._metric_create_script:
162 |             with open(metric_create_script, 'r') as f:
163 |                 self._metric_create_script = f.read()
164 |         return self._metric_create_script
165 | 
166 |     @property
167 |     def plan_uuid(self):
168 |         return self._plan_uuid
169 | 
170 |     @property
171 |     def s3_ready_file(self):
172 |         return "https://s3.amazonaws.com/{bucket}/{key}".format(
173 |             bucket=self.s3_ready_bucket,
174 |             key="{}.ready".format(self._plan_uuid)
175 |         )
176 | 
177 |     @property
178 |     def log_config(self):
179 |         return {
180 |             "logDriver": "awslogs",
181 |             "options": {"awslogs-group": self.container_log_group,
182 |                         "awslogs-region": "us-east-1",
183 |                         "awslogs-stream-prefix":
184 |                             "ardere-{}".format(self.plan_uuid)
185 |                         }
186 |         }
187 | 
188 |     @property
189 |     def influx_db_name(self):
190 |         return "run-{}".format(self.plan_uuid)
191 | 
192 |     @property
193 |     def grafana_admin_user(self):
194 |         return self._plan["metrics_options"]["dashboard"]["admin_user"]
195 | 
196 |     @property
197 |     def grafana_admin_password(self):
198 |         return self._plan["metrics_options"]["dashboard"]["admin_password"]
199 | 
200 |     def family_name(self, step):
201 |         # type: (Dict[str, Any]) -> str
202 |         """Generate a consistent family name for a given step"""
203 |         return step["name"] + "-" + self._plan_uuid
204 | 
205 |     def metrics_family_name(self):
206 |         # type: () -> str
207 |         """Generate a consistent metrics family name"""
208 |         return "{}-metrics".format(self._ecs_name)
209 | 
210 |     def metrics_setup_family_name(self):
211 |         # type: () -> str
212 |         """Generate a consistent metric setup family name"""
213 |         return "{}-metrics-setup".format(self._ecs_name)
214 | 
215 |     def query_active_instances(self, additional_tags=None):
216 |         # type: (Optional[Dict[str, str]]) -> Dict[str, int]
217 |         """Query EC2 for all the instances owned by ardere for this cluster."""
218 |         instance_dict = defaultdict(int)
219 |         paginator = self._ec2_client.get_paginator('describe_instances')
220 |         filters = {"Owner": "ardere", "ECSCluster": self._ecs_name}
221 |         if additional_tags:
222 |             filters.update(additional_tags)
223 |         response_iterator = paginator.paginate(
224 |             Filters=[
225 |                 {
226 |                     "Name": "tag:{}".format(tag_name),
227 |                     "Values": [tag_value]
228 |                 } for tag_name, tag_value in filters.items()
229 |             ]
230 |         )
231 |         for page in response_iterator:
232 |             for reservation in page["Reservations"]:
233 |                 for instance in reservation["Instances"]:
234 |                     # Determine if the instance is pending/running and count
235 |                     # 0 = Pending, 16 = Running, > is all shutting down, etc.
236 |                     if instance["State"]["Code"] <= 16:
237 |                         instance_dict[instance["InstanceType"]] += 1
238 |         return instance_dict
239 | 
240 |     def calculate_missing_instances(self, desired, current):
241 |         # type: (Dict[str, int], Dict[str, int]) -> Dict[str, int]
242 |         """Determine how many of what instance types are needed to ensure
243 |         the current instance dict has all the desired instance count/types."""
244 |         needed = {}
245 |         for instance_type, instance_count in desired.items():
246 |             cur = current.get(instance_type, 0)
247 |             if cur < instance_count:
248 |                 needed[instance_type] = instance_count - cur
249 |         return needed
250 | 
251 |     def has_metrics_node(self, instance_type):
252 |         # type: (str) -> bool
253 |         """Return whether a metrics node with this instance type exists"""
254 |         instances = self.query_active_instances(
255 |             additional_tags=dict(Role="metrics")
256 |         )
257 |         return instance_type in instances
258 | 
259 |     def has_started_metric_creation(self):
260 |         # type: () -> bool
261 |         """Return whether the metric creation container was started"""
262 |         response = self._ecs_client.list_tasks(
263 |             cluster=self._ecs_name,
264 |             startedBy=self.plan_uuid
265 |         )
266 |         return bool(response["taskArns"])
267 | 
268 |     def has_finished_metric_creation(self):
269 |         # type: () -> bool
270 |         """Return whether the metric creation container has finished"""
271 |         response = self._ecs_client.list_tasks(
272 |             cluster=self._ecs_name,
273 |             startedBy=self.plan_uuid,
274 |             desiredStatus="STOPPED"
275 |         )
276 |         return bool(response["taskArns"])
277 | 
278 |     def request_instances(self, instances, security_group_ids,
279 |                           additional_tags=None):
280 |         # type: (Dict[str, int], List[str], Optional[Dict[str, str]]) -> None
281 |         """Create requested types/quantities of instances for this cluster"""
282 |         ami_id = self.ecs_ami_ids["us-east-1"]
283 |         tags = dict(Name=self._ecs_name, Owner="ardere",
284 |                     ECSCluster=self._ecs_name)
285 |         if additional_tags:
286 |             tags.update(additional_tags)
287 |         for instance_type, instance_count in instances.items():
288 |             self._ec2_client.run_instances(
289 |                 ImageId=ami_id,
290 |                 MinCount=instance_count,
291 |                 MaxCount=instance_count,
292 |                 InstanceType=instance_type,
293 |                 UserData=EC2_USER_DATA.format(ecs_name=self._ecs_name),
294 |                 IamInstanceProfile={"Arn": self.ecs_profile},
295 |                 SecurityGroupIds=security_group_ids,
296 |                 TagSpecifications=[
297 |                     {
298 |                         "ResourceType": "instance",
299 |                         "Tags": [
300 |                             dict(Key=tag_name, Value=tag_value)
301 |                             for tag_name, tag_value in tags.items()
302 |                         ]
303 |                     }
304 |                 ]
305 |             )
306 | 
307 |     def locate_metrics_container_ip(self):
308 |         # type: () -> Tuple[Optional[str], Optional[str]]
309 |         """Locates the metrics container IP and container instance arn
310 | 
311 |         Returns a tuple of (public_ip, container_arn)
312 | 
313 |         """
314 |         response = self._ecs_client.list_container_instances(
315 |             cluster=self._ecs_name,
316 |             filter="task:group == service:metrics"
317 |         )
318 |         if not response["containerInstanceArns"]:
319 |             return None, None
320 | 
321 |         container_arn = response["containerInstanceArns"][0]
322 |         response = self._ecs_client.describe_container_instances(
323 |             cluster=self._ecs_name,
324 |             containerInstances=[container_arn]
325 |         )
326 | 
327 |         container_instance = response["containerInstances"][0]
328 |         ec2_instance_id = container_instance["ec2InstanceId"]
329 |         instance = self.boto.resource("ec2").Instance(ec2_instance_id)
330 |         return instance.private_ip_address, container_arn
331 | 
332 |     def locate_metrics_service(self):
333 |         # type: () -> Optional[str]
334 |         """Locate and return the metrics service arn if any"""
335 |         response = self._ecs_client.describe_services(
336 |             cluster=self._ecs_name,
337 |             services=["metrics"]
338 |         )
339 |         if response["services"] and response["services"][0]["status"] == \
340 |                 "ACTIVE":
341 |             return response["services"][0]
342 |         else:
343 |             return None
344 | 
345 |     def create_metrics_service(self, options):
346 |         # type: (Dict[str, Any]) -> Dict[str, Any]
347 |         """Creates an ECS service to run InfluxDB and Grafana for metric
348 |         reporting and returns its info"""
349 |         logger.info("Creating InfluxDB service with options: {}".format(
350 |             options))
351 | 
352 |         cmd = """\
353 |         export GF_DEFAULT_INSTANCE_NAME=`wget -qO- http://169.254.169.254/latest/meta-data/instance-id` && \
354 |         export GF_SECURITY_ADMIN_USER=%s && \
355 |         export GF_SECURITY_ADMIN_PASSWORD=%s && \
356 |         export GF_USERS_ALLOW_SIGN_UP=false && \
357 |         mkdir "${GF_DASHBOARDS_JSON_PATH}" && \
358 |         ./run.sh
359 |         """ % (self.grafana_admin_user, self.grafana_admin_password)  # noqa
360 |         cmd = ['sh', '-c', '{}'.format(cmd)]
361 | 
362 |         gf_env = {
363 |             "GF_DASHBOARDS_JSON_ENABLED": "true",
364 |             "GF_DASHBOARDS_JSON_PATH": "/var/lib/grafana/dashboards",
365 |             "__ARDERE_GRAFANA_URL__":
366 |                 "http://admin:admin@localhost:3000/api/datasources"
367 |         }
368 | 
369 |         # Setup the task definition for setting up influxdb/grafana instances
370 |         # per run
371 |         mc_cmd = """\
372 |         pip install influxdb requests boto3 && \
373 |         echo "${__ARDERE_PYTHON_SCRIPT__}" > setup_db.py && \
374 |         python setup_db.py
375 |         """
376 |         mc_cmd = ['sh', '-c', '{}'.format(mc_cmd)]
377 |         self._ecs_client.register_task_definition(
378 |             family=self.metrics_setup_family_name(),
379 |             containerDefinitions=[
380 |                 {
381 |                     "name": "metricsetup",
382 |                     "image": self.python_container,
383 |                     "cpu": 128,
384 |                     "entryPoint": mc_cmd,
385 |                     "memoryReservation": 256,
386 |                     "privileged": True,
387 |                     "logConfiguration": self.log_config
388 |                 }
389 |             ],
390 |             networkMode="host"
391 |         )
392 | 
393 |         task_response = self._ecs_client.register_task_definition(
394 |             family=self.metrics_family_name(),
395 |             containerDefinitions=[
396 |                 {
397 |                     "name": "influxdb",
398 |                     "image": self.influxdb_container,
399 |                     "cpu": cpu_units_for_instance_type(
400 |                         options["instance_type"]),
401 |                     "memoryReservation": 256,
402 |                     "privileged": True,
403 |                     "portMappings": [
404 |                         {"containerPort": 8086},
405 |                         {"containerPort": 8088}
406 |                     ],
407 |                     "logConfiguration": self.log_config
408 |                 },
409 |                 {
410 |                     "name": "grafana",
411 |                     "image": self.grafana_container,
412 |                     "cpu": 256,
413 |                     "memoryReservation": 256,
414 |                     "entryPoint": cmd,
415 |                     "portMappings": [
416 |                         {"containerPort": 3000}
417 |                     ],
418 |                     "privileged": True,
419 |                     "environment": [
420 |                         {"name": key, "value": value} for key, value in
421 |                         gf_env.items()
422 |                     ],
423 |                     "logConfiguration": self.log_config
424 |                 }
425 |             ],
426 |             # use host network mode for optimal performance
427 |             networkMode="host",
428 | 
429 |             placementConstraints=[
430 |                 # Ensure the service is confined to the right instance type
431 |                 {
432 |                     "type": "memberOf",
433 |                     "expression": "attribute:ecs.instance-type == {}".format(
434 |                         options["instance_type"]),
435 |                 }
436 |             ],
437 |         )
438 |         task_arn = task_response["taskDefinition"]["taskDefinitionArn"]
439 |         service_result = self._ecs_client.create_service(
440 |             cluster=self._ecs_name,
441 |             serviceName="metrics",
442 |             taskDefinition=task_arn,
443 |             desiredCount=1,
444 |             deploymentConfiguration={
445 |                 "minimumHealthyPercent": 0,
446 |                 "maximumPercent": 100
447 |             },
448 |             placementConstraints=[
449 |                 {
450 |                     "type": "distinctInstance"
451 |                 }
452 |             ]
453 |         )
454 |         service_arn = service_result["service"]["serviceArn"]
455 |         return dict(task_arn=task_arn, service_arn=service_arn)
456 | 
457 |     def run_metric_creation_task(self, container_instance, grafana_auth,
458 |                                  dashboard=None,
459 |                                  dashboard_name=None):
460 |         # type: (str, Tuple[str, str], Optional[str], Optional[str]) -> None
461 |         """Starts the metric creation task"""
462 |         env = {
463 |             "__ARDERE_GRAFANA_USER__": grafana_auth[0],
464 |             "__ARDERE_GRAFANA_PASS__": grafana_auth[1],
465 |             "__ARDERE_PYTHON_SCRIPT__": self.metric_create_script,
466 |             "__ARDERE_INFLUXDB_NAME__": self.influx_db_name
467 |         }
468 | 
469 |         if dashboard:
470 |             env["__ARDERE_DASHBOARD__"] = dashboard
471 |             env["__ARDERE_DASHBOARD_NAME__"] = dashboard_name
472 | 
473 |         self._ecs_client.start_task(
474 |             cluster=self._ecs_name,
475 |             taskDefinition=self.metrics_setup_family_name(),
476 |             overrides={
477 |                 'containerOverrides': [
478 |                     {
479 |                         "name": "metricsetup",
480 |                         "environment": [
481 |                             {"name": key, "value": value} for key, value in
482 |                             env.items()
483 |                         ]
484 |                     }
485 |                 ]
486 |             },
487 |             containerInstances=[container_instance],
488 |             startedBy=self.plan_uuid
489 |         )
490 | 
491 |     def create_service(self, step):
492 |         # type: (Dict[str, Any]) -> Dict[str, Any]
493 |         """Creates an ECS service for a step and returns its info"""
494 |         logger.info("CreateService called with: {}".format(step))
495 | 
496 |         # Prep the shell command
497 |         wfc_var = '__ARDERE_WAITFORCLUSTER_SH__'
498 |         wfc_cmd = 'sh -c "${}" waitforcluster.sh {} {}'.format(
499 |             wfc_var,
500 |             self.s3_ready_file,
501 |             step.get("run_delay", 0)
502 |         )
503 |         service_cmd = step["cmd"]
504 |         cmd = ['sh', '-c', '{} && {}'.format(wfc_cmd, service_cmd)]
505 | 
506 |         # Prep the env vars
507 |         env_vars = [{"name": wfc_var, "value": self.wait_script}]
508 |         for name, value in step.get("env", {}).items():
509 |             env_vars.append({"name": name, "value": value})
510 | 
511 |         # ECS wants a family name for task definitions, no spaces, 255 chars
512 |         family_name = step["name"] + "-" + self._plan_uuid
513 | 
514 |         # Use cpu_unit if provided, otherwise monopolize
515 |         cpu_units = step.get(
516 |             "cpu_units",
517 |             cpu_units_for_instance_type(step["instance_type"])
518 |         )
519 | 
520 |         # Setup the container definition
521 |         container_def = {
522 |             "name": step["name"],
523 |             "image": step["container_name"],
524 |             "cpu": cpu_units,
525 | 
526 |             # using only memoryReservation sets no hard limit
527 |             "memoryReservation": 256,
528 |             "privileged": True,
529 |             "environment": env_vars,
530 |             "entryPoint": cmd,
531 |             "ulimits": [
532 |                 dict(name="nofile", softLimit=1000000, hardLimit=1000000)
533 |             ],
534 |             "logConfiguration": self.log_config
535 |         }
536 |         if "port_mapping" in step:
537 |             ports = [{"containerPort": port} for port in step["port_mapping"]]
538 |             container_def["portMappings"] = ports
539 | 
540 |         # Setup the telegraf container definition
541 |         cmd = """\
542 |         echo "${__ARDERE_TELEGRAF_CONF__}" > /etc/telegraf/telegraf.conf && \
543 |         export __ARDERE_TELEGRAF_HOST__=`wget -qO- http://169.254.169.254/latest/meta-data/instance-id` && \
544 |         telegraf \
545 |         """  # noqa
546 |         cmd = ['sh', '-c', '{}'.format(cmd)]
547 |         telegraf_def = {
548 |             "name": "telegraf",
549 |             "image": self.telegraf_container,
550 |             "cpu": 512,
551 |             "memoryReservation": 256,
552 |             "entryPoint": cmd,
553 |             "portMappings": [
554 |                 {"containerPort": 8125}
555 |             ],
556 |             "privileged": True,
557 |             "environment": [
558 |                 {"name": "__ARDERE_TELEGRAF_CONF__",
559 |                  "value": self.telegraf_script},
560 |                 {"name": "__ARDERE_TELEGRAF_STEP__",
561 |                  "value": step["name"]},
562 |                 {"name": "__ARDERE_INFLUX_ADDR__",
563 |                  "value": "{}:8086".format(self._plan["influxdb_private_ip"])},
564 |                 {"name": "__ARDERE_INFLUX_DB__",
565 |                  "value": self.influx_db_name},
566 |                 {"name": "__ARDERE_TELEGRAF_TYPE__",
567 |                  "value": step["docker_series"]}
568 |             ],
569 |             "logConfiguration": self.log_config
570 |         }
571 | 
572 |         task_response = self._ecs_client.register_task_definition(
573 |             family=family_name,
574 |             containerDefinitions=[
575 |                 container_def,
576 |                 telegraf_def
577 |             ],
578 |             # use host network mode for optimal performance
579 |             networkMode="host",
580 | 
581 |             placementConstraints=[
582 |                 # Ensure the service is confined to the right instance type
583 |                 {
584 |                     "type": "memberOf",
585 |                     "expression": "attribute:ecs.instance-type == {}".format(
586 |                         step["instance_type"]),
587 |                 }
588 |             ]
589 |         )
590 |         task_arn = task_response["taskDefinition"]["taskDefinitionArn"]
591 |         step["taskArn"] = task_arn
592 |         service_result = self._ecs_client.create_service(
593 |             cluster=self._ecs_name,
594 |             serviceName=step["name"],
595 |             taskDefinition=task_arn,
596 |             desiredCount=step["instance_count"],
597 |             deploymentConfiguration={
598 |                 "minimumHealthyPercent": 0,
599 |                 "maximumPercent": 100
600 |             },
601 |             placementConstraints=[
602 |                 {
603 |                     "type": "distinctInstance"
604 |                 }
605 |             ]
606 |         )
607 |         step["serviceArn"] = service_result["service"]["serviceArn"]
608 |         step["service_status"] = "STARTED"
609 |         return step
610 | 
611 |     def create_services(self, steps):
612 |         # type: (List[Dict[str, Any]]) -> None
613 |         """Create ECS Services given a list of steps"""
614 |         with ThreadPoolExecutor(max_workers=8) as executor:
615 |             list(executor.map(self.create_service, steps))
616 | 
617 |     def service_ready(self, step):
618 |         # type: (Dict[str, Any]) -> bool
619 |         """Query a service and return whether all its tasks are running"""
620 |         service_name = step["name"]
621 |         response = self._ecs_client.describe_services(
622 |             cluster=self._ecs_name,
623 |             services=[service_name]
624 |         )
625 | 
626 |         try:
627 |             deploy = response["services"][0]["deployments"][0]
628 |         except (TypeError, IndexError):
629 |             return False
630 |         return deploy["desiredCount"] == deploy["runningCount"]
631 | 
632 |     def all_services_ready(self, steps):
633 |         # type: (List[Dict[str, Any]]) -> bool
634 |         """Queries all service ARN's in the plan to see if they're ready"""
635 |         with ThreadPoolExecutor(max_workers=8) as executor:
636 |             results = executor.map(self.service_ready, steps)
637 |         return all(results)
638 | 
639 |     def service_done(self, step):
640 |         # type: (Dict[str, Any]) -> bool
641 |         """Query a service to return whether its fully drained and back to
642 |         INACTIVE"""
643 |         service_name = step["name"]
644 |         response = self._ecs_client.describe_services(
645 |             cluster=self._ecs_name,
646 |             services=[service_name]
647 |         )
648 | 
649 |         service = response["services"][0]
650 |         return service["status"] == "INACTIVE"
651 | 
652 |     def all_services_done(self, steps):
653 |         # type: (List[Dict[str, Any]]) -> bool
654 |         """Queries all service ARN's in the plan to see if they're fully
655 |         DRAINED and now INACTIVE"""
656 |         with ThreadPoolExecutor(max_workers=8) as executor:
657 |             results = executor.map(self.service_done, steps)
658 |         return all(results)
659 | 
660 |     def stop_finished_service(self, start_time, step):
661 |         # type: (start_time, Dict[str, Any]) -> None
662 |         """Stops a service if it needs to shutdown"""
663 |         if step["service_status"] == "STOPPED":
664 |             return
665 | 
666 |         # Calculate time
667 |         step_duration = step.get("run_delay", 0) + step["run_max_time"]
668 |         now = time.time()
669 |         if now < (start_time + step_duration):
670 |             return
671 | 
672 |         # Running long enough to shutdown
673 |         self._ecs_client.update_service(
674 |             cluster=self._ecs_name,
675 |             service=step["name"],
676 |             desiredCount=0
677 |         )
678 |         step["service_status"] = "STOPPED"
679 | 
680 |     def stop_finished_services(self, start_time, steps):
681 |         # type: (int, List[Dict[str, Any]]) -> None
682 |         """Shuts down any services that have run for their max time"""
683 |         for step in steps:
684 |             self.stop_finished_service(start_time, step)
685 | 
686 |     def shutdown_plan(self, steps):
687 |         # type: (List[Dict[str, Any]]) -> None
688 |         """Terminate the entire plan, ensure all services and task
689 |         definitions are completely cleaned up and removed"""
690 |         # Locate all the services for the ECS Cluster
691 |         paginator = self._ecs_client.get_paginator('list_services')
692 |         response_iterator = paginator.paginate(
693 |             cluster=self._ecs_name
694 |         )
695 | 
696 |         # Collect all the service ARN's
697 |         service_arns = []
698 |         for page in response_iterator:
699 |             service_arns.extend(page["serviceArns"])
700 | 
701 |         # Avoid shutting down metrics if tear down was not requested
702 |         # We have to exclude it from the services discovered above if we
703 |         # should NOT tear it down
704 |         if not self._plan["metrics_options"]["tear_down"]:
705 |             metric_service = self.locate_metrics_service()
706 |             if metric_service and metric_service["serviceArn"] in service_arns:
707 |                 service_arns.remove(metric_service["serviceArn"])
708 | 
709 |         for service_arn in service_arns:
710 |             try:
711 |                 self._ecs_client.update_service(
712 |                     cluster=self._ecs_name,
713 |                     service=service_arn,
714 |                     desiredCount=0
715 |                 )
716 |             except botocore.exceptions.ClientError:
717 |                 continue
718 | 
719 |             try:
720 |                 self._ecs_client.delete_service(
721 |                     cluster=self._ecs_name,
722 |                     service=service_arn
723 |                 )
724 |             except botocore.exceptions.ClientError:
725 |                 pass
726 | 
727 |         # Locate all the task definitions for this plan
728 |         step_family_names = [self.family_name(step) for step in steps]
729 | 
730 |         # Add in the metrics family name if we need to tear_down
731 |         if self._plan["metrics_options"]["tear_down"]:
732 |             step_family_names.append(self.metrics_family_name())
733 |             step_family_names.append(self.metrics_setup_family_name())
734 | 
735 |         for family_name in step_family_names:
736 |             try:
737 |                 response = self._ecs_client.describe_task_definition(
738 |                     taskDefinition=family_name
739 |                 )
740 |             except botocore.exceptions.ClientError:
741 |                 continue
742 | 
743 |             task_arn = response["taskDefinition"]["taskDefinitionArn"]
744 | 
745 |             # Deregister the task
746 |             try:
747 |                 self._ecs_client.deregister_task_definition(
748 |                     taskDefinition=task_arn
749 |                 )
750 |             except botocore.exceptions.ClientError:
751 |                 pass
752 | 


--------------------------------------------------------------------------------
/ardere/exceptions.py:
--------------------------------------------------------------------------------
 1 | class ServicesStartingException(Exception):
 2 |     """Exception to indicate Services are still Starting"""
 3 | 
 4 | 
 5 | class ShutdownPlanException(Exception):
 6 |     """Exception to indicate the Plan should be Shutdown"""
 7 | 
 8 | 
 9 | class ValidationException(Exception):
10 |     """Exception to indicate validation error parsing input"""
11 | 
12 | 
13 | class UndrainedInstancesException(Exception):
14 |     """There are still ACTIVE or DRAINING instances in the cluster"""
15 | 
16 | 
17 | class CreatingMetricSourceException(Exception):
18 |     """Metric creation task hasn't completed yet"""
19 | 


--------------------------------------------------------------------------------
/ardere/scripts/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 | 


--------------------------------------------------------------------------------
/ardere/scripts/metric_creator.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import json
  3 | import os
  4 | 
  5 | import boto3
  6 | import influxdb
  7 | import requests
  8 | 
  9 | try:
 10 |     from typing import Any, Dict  # noqa
 11 | except ImportError:  # pragma: nocover
 12 |     pass
 13 | 
 14 | logging.basicConfig(level=logging.INFO)
 15 | logger = logging.getLogger()
 16 | 
 17 | 
 18 | class DashboardSetup(object):
 19 |     # For testing purposes
 20 |     boto = boto3
 21 |     req = requests
 22 |     influx = influxdb
 23 | 
 24 |     def __init__(self):
 25 |         self.influx_db_name = os.environ["__ARDERE_INFLUXDB_NAME__"]
 26 |         self.dashboard = os.environ.get("__ARDERE_DASHBOARD__")
 27 |         self.dashboard_name = os.environ.get("__ARDERE_DASHBOARD_NAME__")
 28 |         self.grafana_auth = (
 29 |             os.environ.get("__ARDERE_GRAFANA_USER__"),
 30 |             os.environ.get("__ARDERE_GRAFANA_PASS__")
 31 |         )
 32 | 
 33 |     def _load_dashboard(self):
 34 |         # type: () -> Dict[str, Any]
 35 |         """Load dashboard from S3 and update JSON contents"""
 36 |         logger.info("Fetching dashboard from S3")
 37 |         bucket, filename = self.dashboard.split(":")
 38 |         s3 = self.boto.resource('s3')
 39 |         dash_file = s3.Object(bucket, filename)
 40 |         file_contents = dash_file.get()['Body'].read().decode('utf-8')
 41 |         dash_contents = json.loads(file_contents)
 42 |         dash_contents["title"] = self.dashboard_name
 43 |         dash_contents["id"] = None
 44 |         logger.info("Fetched dashboard file")
 45 |         return dash_contents
 46 | 
 47 |     def _create_dashboard(self, grafana_url):
 48 |         # type: (str) -> None
 49 |         """Create the dashboard in grafana"""
 50 |         dash_contents = self._load_dashboard()
 51 |         logger.info("Creating dashboard in grafana")
 52 |         response = self.req.post(grafana_url + "/api/dashboards/db",
 53 |                                  auth=self.grafana_auth,
 54 |                                  json=dict(
 55 |                                      dashboard=dash_contents,
 56 |                                      overwrite=True
 57 |                                  ))
 58 |         if response.status_code != 200:
 59 |             raise Exception("Error creating dashboard: {}".format(
 60 |                 response.status_code))
 61 | 
 62 |     def _ensure_dashboard(self, grafana_url):
 63 |         # type: (str) -> None
 64 |         """Ensure the dashboard is present"""
 65 |         # Verify whether the dashboard exists
 66 |         response = self.req.get(grafana_url + "/api/search",
 67 |                                 auth=self.grafana_auth,
 68 |                                 params=dict(query=self.dashboard_name))
 69 |         if response.status_code != 200:
 70 |             raise Exception("Failure to search dashboards")
 71 | 
 72 |         # search results for dashboard
 73 |         results = filter(lambda x: x["title"] == self.dashboard_name,
 74 |                          response.json())
 75 |         if not results:
 76 |             self._create_dashboard(grafana_url)
 77 | 
 78 |     def create_datasources(self):
 79 |         # type: () -> None
 80 |         # Create an influxdb for this run
 81 |         logger.info("Create influx database")
 82 |         influx_client = self.influx.InfluxDBClient()
 83 |         influx_client.create_database(self.influx_db_name)
 84 | 
 85 |         # Setup the grafana datasource
 86 |         grafana_url = "http://127.0.0.1:3000"
 87 |         ds_api_url = "http://127.0.0.1:3000/api/datasources"
 88 |         logger.info("Create datasource in grafana")
 89 |         self.req.post(ds_api_url, auth=self.grafana_auth, json=dict(
 90 |             name=self.influx_db_name,
 91 |             type="influxdb",
 92 |             url="http://localhost:8086",
 93 |             database=self.influx_db_name,
 94 |             access="proxy",
 95 |             basicAuth=False
 96 |         ))
 97 | 
 98 |         # Setup the grafana dashboard if needed/desired
 99 |         if self.dashboard:
100 |             self._ensure_dashboard(grafana_url)
101 | 
102 | 
103 | if __name__ == "__main__":  # pragma: no cover
104 |     logger.info("Creating datasources")
105 |     DashboardSetup().create_datasources()
106 |     logger.info("Finished.")
107 | 


--------------------------------------------------------------------------------
/ardere/step_functions.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import os
  3 | import re
  4 | import time
  5 | from collections import defaultdict
  6 | 
  7 | import boto3
  8 | import botocore
  9 | import toml
 10 | from marshmallow import (
 11 |     Schema,
 12 |     decorators,
 13 |     fields,
 14 |     validate,
 15 |     ValidationError,
 16 | )
 17 | from typing import Any, Dict, List  # noqa
 18 | 
 19 | from ardere.aws import (
 20 |     ECSManager,
 21 |     ec2_vcpu_by_type,
 22 | )
 23 | from ardere.exceptions import (
 24 |     CreatingMetricSourceException,
 25 |     ServicesStartingException,
 26 |     ShutdownPlanException,
 27 |     ValidationException,
 28 |     UndrainedInstancesException,
 29 | )
 30 | 
 31 | logger = logging.getLogger()
 32 | logger.setLevel(logging.INFO)
 33 | 
 34 | # Step name is used as the Log stream name.
 35 | # Log stream names are limited to 512 characters (no ":" or "*")
 36 | # Name format is
 37 | # ardere-UUID/STEP_NAME/LUUID
 38 | # where UUID is dashed, and LUUID is not
 39 | # therefore: 512 - (9 + 36 + 32) = max name len
 40 | MAX_NAME_LEN = 435
 41 | INVALID_NAME_CHECK = re.compile("([:\*]+)")
 42 | 
 43 | 
 44 | class StepValidator(Schema):
 45 |     name = fields.String(required=True)
 46 |     instance_count = fields.Int(required=True)
 47 |     instance_type = fields.String(
 48 |         required=True,
 49 |         validate=validate.OneOf(ec2_vcpu_by_type.keys())
 50 |     )
 51 |     run_max_time = fields.Int(required=True)
 52 |     run_delay = fields.Int(missing=0)
 53 |     container_name = fields.String(required=True)
 54 |     cmd = fields.String(required=True)
 55 |     port_mapping = fields.List(fields.Int())
 56 |     env = fields.Dict()
 57 |     docker_series = fields.String(missing="default")
 58 | 
 59 |     @decorators.validates("name")
 60 |     def validate_name(self, value):
 61 |         if len(value) == 0:
 62 |             raise ValidationError("Step name missing")
 63 |         if len(value) > MAX_NAME_LEN:
 64 |             raise ValidationError("Step name too long")
 65 |         if INVALID_NAME_CHECK.search(value):
 66 |             raise ValidationError("Step name contains invalid characters")
 67 | 
 68 | 
 69 | class DashboardOptions(Schema):
 70 |     admin_user = fields.String(missing="admin")
 71 |     admin_password = fields.String(required=True)
 72 |     name = fields.String(required=True)
 73 |     filename = fields.String(required=True)
 74 | 
 75 | 
 76 | class MetricsOptions(Schema):
 77 |     enabled = fields.Bool(missing=True)
 78 |     instance_type = fields.String(
 79 |         missing="c4.large",
 80 |         validate=validate.OneOf(ec2_vcpu_by_type.keys())
 81 |     )
 82 |     dashboard = fields.Nested(DashboardOptions)
 83 |     tear_down = fields.Bool(missing=False)
 84 | 
 85 | 
 86 | class PlanValidator(Schema):
 87 |     ecs_name = fields.String(required=True)
 88 |     name = fields.String(required=True)
 89 |     metrics_options = fields.Nested(MetricsOptions, missing={})
 90 | 
 91 |     steps = fields.Nested(StepValidator, many=True)
 92 | 
 93 |     def _log_validate_name(self, value, name_type):
 94 |         if len(value) == 0:
 95 |             raise ValidationError("{} missing".format(name_type))
 96 |         if len(value) > MAX_NAME_LEN:
 97 |             raise ValidationError("{} too long".format(name_type))
 98 |         if INVALID_NAME_CHECK.search(value):
 99 |             raise ValidationError(
100 |                 "{} contained invalid characters".format(name_type))
101 | 
102 |     @decorators.validates("ecs_name")
103 |     def validate_ecs_name(self, value):
104 |         """Verify a cluster exists for this name"""
105 |         self._log_validate_name(value, "Plan ecs_name")
106 |         client = self.context["boto"].client('ecs')
107 |         response = client.describe_clusters(
108 |             clusters=[value]
109 |         )
110 |         if not response.get("clusters"):
111 |             raise ValidationError("No cluster with the provided name.")
112 | 
113 |     @decorators.validates("name")
114 |     def validate_name(self, value):
115 |         self._log_validate_name(value, "Step name")
116 | 
117 | 
118 | class AsynchronousPlanRunner(object):
119 |     """Asynchronous Test Plan Runner
120 | 
121 |     This step function based runner handles running a test plan in an
122 |     asynchronous manner, where each step will wait for its run_delay if
123 |     present before running.
124 | 
125 |     """
126 |     # For testing purposes
127 |     boto = boto3
128 | 
129 |     def __init__(self, event, context):
130 |         logger.info("Called with {}".format(event))
131 |         logger.info("Environ: {}".format(os.environ))
132 | 
133 |         # Load our TOML if needed
134 |         event = self._load_toml(event)
135 | 
136 |         self.event = event
137 |         self.context = context
138 |         self.ecs = ECSManager(plan=event)
139 | 
140 |     @property
141 |     def grafana_auth(self):
142 |         if not self.event["metrics_options"].get("dashboard"):
143 |             return "", ""
144 | 
145 |         dash_opts = self.event["metrics_options"]["dashboard"]
146 |         return dash_opts["admin_user"], dash_opts["admin_password"]
147 | 
148 |     @property
149 |     def dashboard_options(self):
150 |         return self.event["metrics_options"]["dashboard"]
151 | 
152 |     def _build_instance_map(self):
153 |         """Given a JSON test-plan, build and return a dict of instance types
154 |         and how many should exist for each type."""
155 |         instances = defaultdict(int)
156 |         for step in self.event["steps"]:
157 |             instances[step["instance_type"]] += step["instance_count"]
158 |         return instances
159 | 
160 |     def _find_test_plan_duration(self):
161 |         # type: (Dict[str, Any]) -> int
162 |         """Locates and calculates the longest test plan duration from its
163 |         delay through its duration of the plan."""
164 |         return max(
165 |             [x.get("run_delay", 0) + x["run_max_time"] for x in
166 |              self.event["steps"]]
167 |         )
168 | 
169 |     def _load_toml(self, event):
170 |         """Loads TOML if necessary"""
171 |         return toml.loads(event["toml"]) if "toml" in event else event
172 | 
173 |     def _validate_plan(self):
174 |         """Validates that the loaded plan is correct"""
175 |         schema = PlanValidator()
176 |         schema.context["boto"] = self.boto
177 |         data, errors = schema.load(self.event)
178 |         if errors:
179 |             raise ValidationException("Failed to validate: {}".format(errors))
180 | 
181 |         # Replace our event with the validated
182 |         self.event = data
183 | 
184 |     def populate_missing_instances(self):
185 |         """Populate any missing EC2 instances needed for the test plan in the
186 |         cluster
187 | 
188 |         """
189 |         # First, validate the test plan, done only as part of step 1
190 |         self._validate_plan()
191 | 
192 |         needed = self._build_instance_map()
193 | 
194 |         # Ensure we have the metrics instance
195 |         if self.event["metrics_options"]["enabled"]:
196 |             # Query to see if we need to add a metrics node
197 |             metric_inst_type = self.event["metrics_options"]["instance_type"]
198 | 
199 |             # We add the instance type to needed to ensure we don't leave out
200 |             # more nodes since this will turn up in the query_active results
201 |             needed[metric_inst_type] += 1
202 | 
203 |             # We create it here up-front if needed since we have different
204 |             # tags
205 |             if not self.ecs.has_metrics_node(metric_inst_type):
206 |                 self.ecs.request_instances(
207 |                     instances={metric_inst_type: 1},
208 |                     security_group_ids=[os.environ["metric_sg"],
209 |                                         os.environ["ec2_sg"]],
210 |                     additional_tags={"Role": "metrics"}
211 |                 )
212 | 
213 |         logger.info("Plan instances needed: {}".format(needed))
214 |         current_instances = self.ecs.query_active_instances()
215 |         missing_instances = self.ecs.calculate_missing_instances(
216 |             desired=needed, current=current_instances
217 |         )
218 |         if missing_instances:
219 |             logger.info("Requesting instances: {}".format(missing_instances))
220 |             self.ecs.request_instances(
221 |                 instances=missing_instances,
222 |                 security_group_ids=[os.environ["ec2_sg"]]
223 |             )
224 |         return self.event
225 | 
226 |     def ensure_metrics_available(self):
227 |         """Start the metrics service, ensure its running, and its IP is known
228 | 
229 |         """
230 |         if not self.event["metrics_options"]["enabled"]:
231 |             return self.event
232 | 
233 |         # Is the service already running?
234 |         metrics = self.ecs.locate_metrics_service()
235 |         logger.info("Metrics info: %s", metrics)
236 | 
237 |         if not metrics:
238 |             # Start the metrics service, throw a retry
239 |             self.ecs.create_metrics_service(self.event["metrics_options"])
240 |             raise ServicesStartingException("Triggered metrics start")
241 | 
242 |         deploy = metrics["deployments"][0]
243 |         ready = deploy["desiredCount"] == deploy["runningCount"]
244 |         logger.info("Deploy info: %s", deploy)
245 |         if not ready:
246 |             raise ServicesStartingException("Waiting for metrics")
247 | 
248 |         # Populate the IP of the metrics service
249 |         metric_ip, container_arn = self.ecs.locate_metrics_container_ip()
250 | 
251 |         if not metric_ip:
252 |             raise Exception("Unable to locate metrics IP even though its "
253 |                             "running")
254 | 
255 |         self.event["influxdb_private_ip"] = metric_ip
256 |         self.event["metric_container_arn"] = container_arn
257 |         return self.event
258 | 
259 |     def ensure_metric_sources_created(self):
260 |         """Ensure the metrics db and grafana datasource are configured"""
261 |         if not self.event["metrics_options"]["enabled"]:
262 |             return self.event
263 | 
264 |         if not self.ecs.has_started_metric_creation():
265 |             dashboard = None
266 |             dashboard_name = None
267 |             if self.event["metrics_options"].get("dashboard"):
268 |                 dashboard = ":".join([os.environ["metrics_bucket"],
269 |                                       self.dashboard_options["filename"]])
270 |                 dashboard_name = self.dashboard_options["name"]
271 |             self.ecs.run_metric_creation_task(
272 |                 container_instance=self.event["metric_container_arn"],
273 |                 grafana_auth=self.grafana_auth,
274 |                 dashboard=dashboard,
275 |                 dashboard_name=dashboard_name
276 |             )
277 |             raise CreatingMetricSourceException("Started metric creation")
278 | 
279 |         if not self.ecs.has_finished_metric_creation():
280 |             raise CreatingMetricSourceException("Metric creation still "
281 |                                                 "running")
282 | 
283 |         metric_ip = self.event["influxdb_private_ip"]
284 |         self.event["grafana_dashboard"] = "http://{}:3000".format(metric_ip)
285 |         return self.event
286 | 
287 |     def create_ecs_services(self):
288 |         """Create all the ECS services needed
289 | 
290 |         """
291 |         self.ecs.create_services(self.event["steps"])
292 |         return self.event
293 | 
294 |     def wait_for_cluster_ready(self):
295 |         """Check all the ECS services to see if they're ready
296 | 
297 |         """
298 |         if not self.ecs.all_services_ready(self.event["steps"]):
299 |             raise ServicesStartingException()
300 |         return self.event
301 | 
302 |     def signal_cluster_start(self):
303 |         """Drop a ready file in S3 to trigger the test plan to being
304 | 
305 |         """
306 |         s3_client = self.boto.client('s3')
307 |         s3_client.put_object(
308 |             ACL="public-read",
309 |             Body=b'{}'.format(int(time.time())),
310 |             Bucket=os.environ["s3_ready_bucket"],
311 |             Key="{}.ready".format(self.ecs.plan_uuid),
312 |             Metadata={
313 |                 "ECSCluster": self.event["ecs_name"]
314 |             }
315 |         )
316 |         return self.event
317 | 
318 |     def check_for_cluster_done(self):
319 |         """Check all the ECS services to see if they've run for their
320 |         specified duration
321 | 
322 |         """
323 |         # Check to see if the S3 file is still around
324 |         s3 = self.boto.resource('s3')
325 |         try:
326 |             ready_file = s3.Object(
327 |                 os.environ["s3_ready_bucket"],
328 |                 "{}.ready".format(self.ecs.plan_uuid)
329 |             )
330 |         except botocore.exceptions.ClientError:
331 |             # Error getting to the bucket/key, abort test run
332 |             raise ShutdownPlanException("Error accessing ready file")
333 | 
334 |         file_contents = ready_file.get()['Body'].read().decode('utf-8')
335 |         start_time = int(file_contents)
336 | 
337 |         # Update to running count 0 any services that should halt by now
338 |         self.ecs.stop_finished_services(start_time, self.event["steps"])
339 | 
340 |         # If we're totally done, exit.
341 |         now = time.time()
342 |         plan_duration = self._find_test_plan_duration()
343 |         if now > (start_time + plan_duration):
344 |             raise ShutdownPlanException("Test Plan has completed")
345 |         return self.event
346 | 
347 |     def cleanup_cluster(self):
348 |         """Shutdown all ECS services and deregister all task definitions"""
349 |         self.ecs.shutdown_plan(self.event["steps"])
350 | 
351 |         # Attempt to remove the S3 object
352 |         s3 = self.boto.resource('s3')
353 |         try:
354 |             ready_file = s3.Object(
355 |                 os.environ["s3_ready_bucket"],
356 |                 "{}.ready".format(self.ecs.plan_uuid)
357 |             )
358 |             ready_file.delete()
359 |         except botocore.exceptions.ClientError:
360 |             pass
361 |         return self.event
362 | 
363 |     def check_drained(self):
364 |         """Ensure that all services are shut down before allowing restart"""
365 |         if self.ecs.all_services_done(self.event["steps"]):
366 |             return self.event
367 |         else:
368 |             raise UndrainedInstancesException("Services still draining")
369 | 


--------------------------------------------------------------------------------
/config.bash:
--------------------------------------------------------------------------------
 1 | #! /bin/bash -w
 2 | 
 3 | ctrlc()
 4 | {
 5 |     echo "  Exiting..."
 6 |     rm ~/.aws/credentials
 7 |     exit 1
 8 | }
 9 | set -e
10 | 
11 | if [[ "`which serverless`" == "" ]]
12 | then
13 |     echo "Hrm, serverless is not installed. "
14 |     echo "See https://serverless.com/framework/docs/providers/aws/guide/installation/"
15 |     return
16 | fi
17 | if [[ ! -e ~/.aws/credentials ]]
18 | then
19 |     trap ctrlc SIGINT
20 |     echo "  credential file was not found. Let's make one."
21 |     echo ""
22 |     echo "  If you haven't already, you'll need to create an access key."
23 |     echo "  e.g. go to https://console.aws.amazon.com/iam/home#/users/${USER}/?security_credientials"
24 |     echo "  and click [Create access key]."
25 |     echo ""
26 |     read -p "Access Key ID: " access_key
27 |     read -p "Secret Key ID: " secret_key
28 |     echo "  Thanks! Running configuration";
29 |     echo serverless config credentials --provider aws --key $access_key --secret $secret_key
30 |     serverless config credentials --provider aws --key $access_key --secret $secret_key
31 | fi
32 | echo "  You're configured. The next step is to deploy."
33 | 
34 | 


--------------------------------------------------------------------------------
/default_dashboard.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "annotations": {
  3 |     "list": []
  4 |   },
  5 |   "editable": true,
  6 |   "gnetId": null,
  7 |   "graphTooltip": 0,
  8 |   "hideControls": false,
  9 |   "id": 1,
 10 |   "links": [],
 11 |   "refresh": false,
 12 |   "rows": [
 13 |     {
 14 |       "collapse": false,
 15 |       "height": "250px",
 16 |       "panels": [
 17 |         {
 18 |           "aliasColors": {},
 19 |           "bars": false,
 20 |           "datasource": "$db",
 21 |           "fill": 1,
 22 |           "id": 1,
 23 |           "legend": {
 24 |             "alignAsTable": true,
 25 |             "avg": false,
 26 |             "current": false,
 27 |             "max": false,
 28 |             "min": false,
 29 |             "show": true,
 30 |             "total": false,
 31 |             "values": true
 32 |           },
 33 |           "lines": true,
 34 |           "linewidth": 1,
 35 |           "links": [],
 36 |           "nullPointMode": "null",
 37 |           "percentage": false,
 38 |           "pointradius": 5,
 39 |           "points": false,
 40 |           "renderer": "flot",
 41 |           "seriesOverrides": [],
 42 |           "span": 6,
 43 |           "stack": false,
 44 |           "steppedLine": false,
 45 |           "targets": [
 46 |             {
 47 |               "alias": "",
 48 |               "dsType": "influxdb",
 49 |               "groupBy": [
 50 |                 {
 51 |                   "params": [
 52 |                     "$interval"
 53 |                   ],
 54 |                   "type": "time"
 55 |                 },
 56 |                 {
 57 |                   "params": [
 58 |                     "host"
 59 |                   ],
 60 |                   "type": "tag"
 61 |                 },
 62 |                 {
 63 |                   "params": [
 64 |                     "step"
 65 |                   ],
 66 |                   "type": "tag"
 67 |                 },
 68 |                 {
 69 |                   "params": [
 70 |                     "none"
 71 |                   ],
 72 |                   "type": "fill"
 73 |                 }
 74 |               ],
 75 |               "hide": false,
 76 |               "measurement": "cpu",
 77 |               "policy": "default",
 78 |               "query": "SELECT mean(\"usage_user\") FROM \"cpu\" WHERE $timeFilter GROUP BY time($interval) fill(null)",
 79 |               "rawQuery": false,
 80 |               "refId": "A",
 81 |               "resultFormat": "time_series",
 82 |               "select": [
 83 |                 [
 84 |                   {
 85 |                     "params": [
 86 |                       "usage_system"
 87 |                     ],
 88 |                     "type": "field"
 89 |                   },
 90 |                   {
 91 |                     "params": [],
 92 |                     "type": "mean"
 93 |                   }
 94 |                 ]
 95 |               ],
 96 |               "tags": [
 97 |                 {
 98 |                   "key": "step",
 99 |                   "operator": "=~",
100 |                   "value": "/^$step$/"
101 |                 },
102 |                 {
103 |                   "condition": "AND",
104 |                   "key": "host",
105 |                   "operator": "=~",
106 |                   "value": "/^$host$/"
107 |                 }
108 |               ]
109 |             }
110 |           ],
111 |           "thresholds": [],
112 |           "timeFrom": null,
113 |           "timeShift": null,
114 |           "title": "CPU Usage",
115 |           "tooltip": {
116 |             "shared": false,
117 |             "sort": 0,
118 |             "value_type": "individual"
119 |           },
120 |           "type": "graph",
121 |           "xaxis": {
122 |             "mode": "time",
123 |             "name": null,
124 |             "show": true,
125 |             "values": []
126 |           },
127 |           "yaxes": [
128 |             {
129 |               "format": "percentunit",
130 |               "label": null,
131 |               "logBase": 1,
132 |               "max": null,
133 |               "min": null,
134 |               "show": true
135 |             },
136 |             {
137 |               "format": "short",
138 |               "label": null,
139 |               "logBase": 1,
140 |               "max": null,
141 |               "min": null,
142 |               "show": true
143 |             }
144 |           ]
145 |         },
146 |         {
147 |           "aliasColors": {},
148 |           "bars": false,
149 |           "datasource": "$db",
150 |           "fill": 1,
151 |           "id": 2,
152 |           "legend": {
153 |             "alignAsTable": true,
154 |             "avg": false,
155 |             "current": false,
156 |             "max": false,
157 |             "min": false,
158 |             "show": true,
159 |             "total": false,
160 |             "values": true
161 |           },
162 |           "lines": true,
163 |           "linewidth": 1,
164 |           "links": [],
165 |           "nullPointMode": "null",
166 |           "percentage": false,
167 |           "pointradius": 5,
168 |           "points": false,
169 |           "renderer": "flot",
170 |           "seriesOverrides": [],
171 |           "span": 6,
172 |           "stack": false,
173 |           "steppedLine": false,
174 |           "targets": [
175 |             {
176 |               "dsType": "influxdb",
177 |               "groupBy": [
178 |                 {
179 |                   "params": [
180 |                     "$interval"
181 |                   ],
182 |                   "type": "time"
183 |                 },
184 |                 {
185 |                   "params": [
186 |                     "step"
187 |                   ],
188 |                   "type": "tag"
189 |                 },
190 |                 {
191 |                   "params": [
192 |                     "host"
193 |                   ],
194 |                   "type": "tag"
195 |                 },
196 |                 {
197 |                   "params": [
198 |                     "none"
199 |                   ],
200 |                   "type": "fill"
201 |                 }
202 |               ],
203 |               "measurement": "mem",
204 |               "policy": "default",
205 |               "refId": "A",
206 |               "resultFormat": "time_series",
207 |               "select": [
208 |                 [
209 |                   {
210 |                     "params": [
211 |                       "used"
212 |                     ],
213 |                     "type": "field"
214 |                   },
215 |                   {
216 |                     "params": [],
217 |                     "type": "mean"
218 |                   }
219 |                 ]
220 |               ],
221 |               "tags": [
222 |                 {
223 |                   "key": "step",
224 |                   "operator": "=~",
225 |                   "value": "/^$step$/"
226 |                 },
227 |                 {
228 |                   "condition": "AND",
229 |                   "key": "host",
230 |                   "operator": "=~",
231 |                   "value": "/^$host$/"
232 |                 }
233 |               ]
234 |             }
235 |           ],
236 |           "thresholds": [],
237 |           "timeFrom": null,
238 |           "timeShift": null,
239 |           "title": "Memory Usage",
240 |           "tooltip": {
241 |             "shared": false,
242 |             "sort": 0,
243 |             "value_type": "individual"
244 |           },
245 |           "type": "graph",
246 |           "xaxis": {
247 |             "mode": "time",
248 |             "name": null,
249 |             "show": true,
250 |             "values": []
251 |           },
252 |           "yaxes": [
253 |             {
254 |               "format": "bytes",
255 |               "label": null,
256 |               "logBase": 1,
257 |               "max": null,
258 |               "min": null,
259 |               "show": true
260 |             },
261 |             {
262 |               "format": "short",
263 |               "label": null,
264 |               "logBase": 1,
265 |               "max": null,
266 |               "min": null,
267 |               "show": true
268 |             }
269 |           ]
270 |         }
271 |       ],
272 |       "repeat": null,
273 |       "repeatIteration": null,
274 |       "repeatRowId": null,
275 |       "showTitle": false,
276 |       "title": "Dashboard Row",
277 |       "titleSize": "h6"
278 |     },
279 |     {
280 |       "collapse": false,
281 |       "height": 250,
282 |       "panels": [
283 |         {
284 |           "aliasColors": {},
285 |           "bars": false,
286 |           "datasource": "$db",
287 |           "fill": 1,
288 |           "id": 3,
289 |           "legend": {
290 |             "alignAsTable": true,
291 |             "avg": false,
292 |             "current": false,
293 |             "max": false,
294 |             "min": false,
295 |             "show": true,
296 |             "total": false,
297 |             "values": true
298 |           },
299 |           "lines": true,
300 |           "linewidth": 1,
301 |           "links": [],
302 |           "nullPointMode": "null",
303 |           "percentage": false,
304 |           "pointradius": 5,
305 |           "points": false,
306 |           "renderer": "flot",
307 |           "seriesOverrides": [
308 |             {
309 |               "alias": "/^in.*/",
310 |               "transform": "negative-Y"
311 |             }
312 |           ],
313 |           "span": 6,
314 |           "stack": false,
315 |           "steppedLine": false,
316 |           "targets": [
317 |             {
318 |               "alias": "out {host: [[tag_host]] step: [[tag_step]]}",
319 |               "dsType": "influxdb",
320 |               "groupBy": [
321 |                 {
322 |                   "params": [
323 |                     "$interval"
324 |                   ],
325 |                   "type": "time"
326 |                 },
327 |                 {
328 |                   "params": [
329 |                     "step"
330 |                   ],
331 |                   "type": "tag"
332 |                 },
333 |                 {
334 |                   "params": [
335 |                     "host"
336 |                   ],
337 |                   "type": "tag"
338 |                 },
339 |                 {
340 |                   "params": [
341 |                     "null"
342 |                   ],
343 |                   "type": "fill"
344 |                 }
345 |               ],
346 |               "measurement": "net",
347 |               "policy": "default",
348 |               "refId": "A",
349 |               "resultFormat": "time_series",
350 |               "select": [
351 |                 [
352 |                   {
353 |                     "params": [
354 |                       "bytes_sent"
355 |                     ],
356 |                     "type": "field"
357 |                   },
358 |                   {
359 |                     "params": [],
360 |                     "type": "mean"
361 |                   },
362 |                   {
363 |                     "params": [
364 |                       "1s"
365 |                     ],
366 |                     "type": "non_negative_derivative"
367 |                   },
368 |                   {
369 |                     "params": [
370 |                       " *8"
371 |                     ],
372 |                     "type": "math"
373 |                   }
374 |                 ]
375 |               ],
376 |               "tags": [
377 |                 {
378 |                   "key": "step",
379 |                   "operator": "=~",
380 |                   "value": "/^$step$/"
381 |                 },
382 |                 {
383 |                   "condition": "AND",
384 |                   "key": "host",
385 |                   "operator": "=~",
386 |                   "value": "/^$host$/"
387 |                 }
388 |               ]
389 |             },
390 |             {
391 |               "alias": "in {host: [[tag_host]] step: [[tag_step]]}",
392 |               "dsType": "influxdb",
393 |               "groupBy": [
394 |                 {
395 |                   "params": [
396 |                     "$interval"
397 |                   ],
398 |                   "type": "time"
399 |                 },
400 |                 {
401 |                   "params": [
402 |                     "step"
403 |                   ],
404 |                   "type": "tag"
405 |                 },
406 |                 {
407 |                   "params": [
408 |                     "host"
409 |                   ],
410 |                   "type": "tag"
411 |                 },
412 |                 {
413 |                   "params": [
414 |                     "none"
415 |                   ],
416 |                   "type": "fill"
417 |                 }
418 |               ],
419 |               "measurement": "net",
420 |               "policy": "default",
421 |               "query": "SELECT non_negative_derivative(mean(\"bytes_recv\"), 1s) *8 FROM \"net\" WHERE \"step\" =~ /^$step$/ AND \"host\" =~ /^$host$/ AND $timeFilter GROUP BY time($interval), \"step\", \"host\" fill(none)",
422 |               "rawQuery": true,
423 |               "refId": "B",
424 |               "resultFormat": "time_series",
425 |               "select": [
426 |                 [
427 |                   {
428 |                     "params": [
429 |                       "bytes_recv"
430 |                     ],
431 |                     "type": "field"
432 |                   },
433 |                   {
434 |                     "params": [],
435 |                     "type": "mean"
436 |                   },
437 |                   {
438 |                     "params": [
439 |                       "1s"
440 |                     ],
441 |                     "type": "non_negative_derivative"
442 |                   },
443 |                   {
444 |                     "params": [
445 |                       "*8"
446 |                     ],
447 |                     "type": "math"
448 |                   }
449 |                 ]
450 |               ],
451 |               "tags": [
452 |                 {
453 |                   "key": "step",
454 |                   "operator": "=~",
455 |                   "value": "/^$step$/"
456 |                 },
457 |                 {
458 |                   "condition": "AND",
459 |                   "key": "host",
460 |                   "operator": "=~",
461 |                   "value": "/^$host$/"
462 |                 }
463 |               ]
464 |             }
465 |           ],
466 |           "thresholds": [],
467 |           "timeFrom": null,
468 |           "timeShift": null,
469 |           "title": "Network Bytes/sec",
470 |           "tooltip": {
471 |             "shared": false,
472 |             "sort": 0,
473 |             "value_type": "individual"
474 |           },
475 |           "type": "graph",
476 |           "xaxis": {
477 |             "mode": "time",
478 |             "name": null,
479 |             "show": true,
480 |             "values": []
481 |           },
482 |           "yaxes": [
483 |             {
484 |               "format": "Bps",
485 |               "label": null,
486 |               "logBase": 1,
487 |               "max": null,
488 |               "min": null,
489 |               "show": true
490 |             },
491 |             {
492 |               "format": "short",
493 |               "label": null,
494 |               "logBase": 1,
495 |               "max": null,
496 |               "min": null,
497 |               "show": true
498 |             }
499 |           ]
500 |         }
501 |       ],
502 |       "repeat": null,
503 |       "repeatIteration": null,
504 |       "repeatRowId": null,
505 |       "showTitle": false,
506 |       "title": "Dashboard Row",
507 |       "titleSize": "h6"
508 |     }
509 |   ],
510 |   "schemaVersion": 14,
511 |   "style": "dark",
512 |   "tags": [],
513 |   "templating": {
514 |     "list": [
515 |       {
516 |         "current": {
517 |           "tags": [],
518 |           "text": "run-a3226cfb9513415bac7c7053a8b62a5f",
519 |           "value": "run-a3226cfb9513415bac7c7053a8b62a5f"
520 |         },
521 |         "hide": 0,
522 |         "label": null,
523 |         "name": "db",
524 |         "options": [],
525 |         "query": "influxdb",
526 |         "refresh": 1,
527 |         "regex": "",
528 |         "type": "datasource"
529 |       },
530 |       {
531 |         "allValue": null,
532 |         "current": {
533 |           "text": "All",
534 |           "value": "$__all"
535 |         },
536 |         "datasource": "$db",
537 |         "hide": 0,
538 |         "includeAll": true,
539 |         "label": null,
540 |         "multi": true,
541 |         "name": "step",
542 |         "options": [],
543 |         "query": "SHOW TAG VALUES WITH KEY = step",
544 |         "refresh": 2,
545 |         "regex": "",
546 |         "sort": 0,
547 |         "tagValuesQuery": "",
548 |         "tags": [],
549 |         "tagsQuery": "",
550 |         "type": "query",
551 |         "useTags": false
552 |       },
553 |       {
554 |         "allValue": null,
555 |         "current": {
556 |           "text": "All",
557 |           "value": "$__all"
558 |         },
559 |         "datasource": "$db",
560 |         "hide": 0,
561 |         "includeAll": true,
562 |         "label": null,
563 |         "multi": true,
564 |         "name": "host",
565 |         "options": [],
566 |         "query": "SHOW TAG VALUES WITH KEY = host",
567 |         "refresh": 2,
568 |         "regex": "",
569 |         "sort": 0,
570 |         "tagValuesQuery": "SHOW TAG VALUES WITH KEY = host WHERE step = $step",
571 |         "tags": [
572 |           "i-08a40a6f64d2e2cbe"
573 |         ],
574 |         "tagsQuery": "SHOW TAG VALUES WITH KEY = host",
575 |         "type": "query",
576 |         "useTags": true
577 |       }
578 |     ]
579 |   },
580 |   "time": {
581 |     "from": "2017-04-05T03:01:01.177Z",
582 |     "to": "2017-04-05T03:15:19.628Z"
583 |   },
584 |   "timepicker": {
585 |     "refresh_intervals": [
586 |       "5s",
587 |       "10s",
588 |       "30s",
589 |       "1m",
590 |       "5m",
591 |       "15m",
592 |       "30m",
593 |       "1h",
594 |       "2h",
595 |       "1d"
596 |     ],
597 |     "time_options": [
598 |       "5m",
599 |       "15m",
600 |       "1h",
601 |       "6h",
602 |       "12h",
603 |       "24h",
604 |       "2d",
605 |       "7d",
606 |       "30d"
607 |     ]
608 |   },
609 |   "timezone": "browser",
610 |   "title": "loads-broker Monitor",
611 |   "version": 5
612 | }


--------------------------------------------------------------------------------
/handler.py:
--------------------------------------------------------------------------------
 1 | # First some funky path manipulation so that we can work properly in
 2 | # the AWS environment
 3 | import sys
 4 | import os
 5 | dir_path = os.path.dirname(os.path.realpath(__file__))
 6 | sys.path.append(dir_path)
 7 | 
 8 | from ardere.step_functions import AsynchronousPlanRunner
 9 | 
10 | 
11 | def populate_missing_instances(event, context):
12 |     runner = AsynchronousPlanRunner(event, context)
13 |     return runner.populate_missing_instances()
14 | 
15 | 
16 | def ensure_metrics_available(event, context):
17 |     runner = AsynchronousPlanRunner(event, context)
18 |     return runner.ensure_metrics_available()
19 | 
20 | 
21 | def ensure_metric_sources_created(event, context):
22 |     runner = AsynchronousPlanRunner(event, context)
23 |     return runner.ensure_metric_sources_created()
24 | 
25 | 
26 | def create_ecs_services(event, context):
27 |     runner = AsynchronousPlanRunner(event, context)
28 |     return runner.create_ecs_services()
29 | 
30 | 
31 | def wait_for_cluster_ready(event, context):
32 |     runner = AsynchronousPlanRunner(event, context)
33 |     return runner.wait_for_cluster_ready()
34 | 
35 | 
36 | def signal_cluster_start(event, context):
37 |     runner = AsynchronousPlanRunner(event, context)
38 |     return runner.signal_cluster_start()
39 | 
40 | 
41 | def check_for_cluster_done(event, context):
42 |     runner = AsynchronousPlanRunner(event, context)
43 |     return runner.check_for_cluster_done()
44 | 
45 | 
46 | def cleanup_cluster(event, context):
47 |     runner = AsynchronousPlanRunner(event, context)
48 |     return runner.cleanup_cluster()
49 | 
50 | 
51 | def check_drain(event, context):
52 |     return AsynchronousPlanRunner(event, context).check_drained()
53 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "ardere",
 3 |   "version": "1.0.0",
 4 |   "description": "Serverless Service for Load-Testing",
 5 |   "main": "index.js",
 6 |   "dependencies": {
 7 |     "serverless": "^1.8.0",
 8 |     "serverless-python-requirements": "^2.0.0-beta.7",
 9 |     "serverless-step-functions": "^0.4.1"
10 |   },
11 |   "devDependencies": {},
12 |   "scripts": {
13 |     "test": "echo \"Error: no test specified\" && exit 1"
14 |   },
15 |   "repository": {
16 |     "type": "git",
17 |     "url": "git+https://github.com/loads/ardere.git"
18 |   },
19 |   "author": "",
20 |   "license": "MPL-2.0",
21 |   "bugs": {
22 |     "url": "https://github.com/loads/ardere/issues"
23 |   },
24 |   "homepage": "https://github.com/loads/ardere#readme"
25 | }
26 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | futures==3.0.5
2 | typing==3.5.3.0
3 | toml==0.9.2
4 | marshmallow==2.13.4
5 | boto3==1.4.4
6 | requests==2.13.0


--------------------------------------------------------------------------------
/serverless.yml:
--------------------------------------------------------------------------------
  1 | service: ardere
  2 | 
  3 | plugins:
  4 |   - serverless-step-functions
  5 |   - serverless-python-requirements
  6 | 
  7 | package:
  8 |   exclude:
  9 |     - node_modules/**
 10 |     - ardenv/**
 11 |     - tests/**
 12 |     - lib/**
 13 |     - share/**
 14 |     - man/**
 15 |     - bin/**
 16 |     - serverless/**
 17 |   include:
 18 |     - ardere/**
 19 | 
 20 | provider:
 21 |   name: aws
 22 |   runtime: python2.7
 23 |   memorySize: 128
 24 |   timeout: 60
 25 |   environment:
 26 |     ecs_profile:
 27 |       Fn::GetAtt:
 28 |         - EC2ContainerProfile
 29 |         - Arn
 30 |     s3_ready_bucket:
 31 |       Ref: "S3ReadyBucket"
 32 |     metrics_bucket:
 33 |       Ref: "MetricsBucket"
 34 |     ec2_sg:
 35 |       Fn::GetAtt:
 36 |         - EC2SecurityGroup
 37 |         - GroupId
 38 |     metric_sg:
 39 |       Fn::GetAtt:
 40 |         - MetricSecurityGroup
 41 |         - GroupId
 42 |     container_log_group:
 43 |       Ref: "ContainerLogs"
 44 | 
 45 |   iamRoleStatements:
 46 |     -  Effect: "Allow"
 47 |        Action:
 48 |          - "ecs:CreateCluster"
 49 |          - "ecs:ListServices"
 50 |          - "ecs:ListContainerInstances"
 51 |          - "ecs:ListTasks"
 52 |          - "ecs:DescribeClusters"
 53 |          - "ecs:DescribeServices"
 54 |          - "ecs:DescribeTaskDefinition"
 55 |          - "ecs:DescribeTasks"
 56 |          - "ecs:DescribeContainerInstances"
 57 |          - "ecs:CreateService"
 58 |          - "ecs:DeleteService"
 59 |          - "ecs:UpdateService"
 60 |          - "ecs:StartTask"
 61 |          - "ecs:RegisterTaskDefinition"
 62 |          - "ecs:DeregisterTaskDefinition"
 63 |        Resource:
 64 |          - "*"
 65 |     -  Effect: "Allow"
 66 |        Action:
 67 |          - "s3:ListBucket"
 68 |          - "s3:PutObject"
 69 |          - "s3:PutObjectAcl"
 70 |        Resource:
 71 |          - Fn::Join: ['', ['arn:aws:s3:::', Ref: "S3ReadyBucket"]]
 72 |     -  Effect: "Allow"
 73 |        Action:
 74 |          - "s3:PutObject"
 75 |          - "s3:PutObjectAcl"
 76 |          - "s3:GetObject"
 77 |          - "s3:DeleteObject"
 78 |        Resource:
 79 |          - Fn::Join: ['', ['arn:aws:s3:::', Ref: "S3ReadyBucket", "/*"]]
 80 |     -  Effect: "Allow"
 81 |        Action:
 82 |          - "s3:ListBucket"
 83 |          - "s3:GetObject"
 84 |        Resource:
 85 |          - Fn::Join: ['', ['arn:aws:s3:::', Ref: "MetricsBucket", "/*"]]
 86 |     -  Effect: "Allow"
 87 |        Action:
 88 |          - "ec2:DescribeInstances"
 89 |          - "ec2:RunInstances"
 90 |          - "ec2:CreateTags"
 91 |        Resource:
 92 |          - "*"
 93 |     -  Effect: "Allow"
 94 |        Action:
 95 |          - "iam:GetRole"
 96 |          - "iam:PassRole"
 97 |        Resource:
 98 |          Fn::GetAtt:
 99 |            - EC2ContainerRole
100 |            - Arn
101 | 
102 | functions:
103 |   populate_missing_instances:
104 |     handler: handler.populate_missing_instances
105 |     timeout: 300
106 |   ensure_metrics_available:
107 |     handler: handler.ensure_metrics_available
108 |     timeout: 300
109 |   ensure_metric_sources_created:
110 |     handler: handler.ensure_metric_sources_created
111 |     timeout: 300
112 |   create_ecs_services:
113 |     handler: handler.create_ecs_services
114 |     timeout: 300
115 |   wait_for_cluster_ready:
116 |     handler: handler.wait_for_cluster_ready
117 |   signal_cluster_start:
118 |     handler: handler.signal_cluster_start
119 |   check_for_cluster_done:
120 |     handler: handler.check_for_cluster_done
121 |   cleanup_cluster:
122 |     handler: handler.cleanup_cluster
123 |     timeout: 300
124 |   check_drain:
125 |     handler: handler.check_drain
126 | 
127 | stepFunctions:
128 |   stateMachines:
129 |     ardere:
130 |       Comment: "ardere load-tester"
131 |       Version: "1.0"
132 |       StartAt: "Populate Missing Instances"
133 |       States:
134 |         "Populate Missing Instances":
135 |           Type: Task
136 |           Resource: populate_missing_instances
137 |           Next: "Ensure Metrics Available"
138 |         "Ensure Metrics Available":
139 |           Type: Task
140 |           Resource: ensure_metrics_available
141 |           Retry:
142 |             -
143 |               ErrorEquals:
144 |                 - ServicesStartingException
145 |               IntervalSeconds: 10
146 |               MaxAttempts: 60
147 |               BackoffRate: 1
148 |           Catch:
149 |             -
150 |               ErrorEquals:
151 |                 - States.ALL
152 |               ResultPath: "$.error-info"
153 |               Next: "Clean-up Cluster"
154 |           Next: "Ensure Metric Sources Created"
155 |         "Ensure Metric Sources Created":
156 |           Type: Task
157 |           Resource: ensure_metric_sources_created
158 |           Retry:
159 |             -
160 |               ErrorEquals:
161 |                 - CreatingMetricSourceException
162 |               IntervalSeconds: 5
163 |               MaxAttempts: 20
164 |               BackoffRate: 1
165 |           Catch:
166 |             -
167 |               ErrorEquals:
168 |                 - States.ALL
169 |               ResultPath: "$.error-info"
170 |               Next: "Clean-up Cluster"
171 |           Next: "Create ECS Services"
172 |         "Create ECS Services":
173 |           Type: Task
174 |           Resource: create_ecs_services
175 |           Catch:
176 |             -
177 |               ErrorEquals:
178 |                 - States.ALL
179 |               ResultPath: "$.error-info"
180 |               Next: "Clean-up Cluster"
181 |           Next: "Wait for Cluster Ready"
182 |         "Wait for Cluster Ready":
183 |           Type: Task
184 |           Resource: wait_for_cluster_ready
185 |           Retry:
186 |             -
187 |               ErrorEquals:
188 |                 - ServicesStartingException
189 |               IntervalSeconds: 10
190 |               MaxAttempts: 180
191 |               BackoffRate: 1
192 |           Catch:
193 |             -
194 |               ErrorEquals:
195 |                 - States.ALL
196 |               ResultPath: "$.error-info"
197 |               Next: "Clean-up Cluster"
198 |           Next: "Signal Cluster Start"
199 |         "Signal Cluster Start":
200 |           Type: Task
201 |           Resource: signal_cluster_start
202 |           Catch:
203 |             -
204 |               ErrorEquals:
205 |                 - States.ALL
206 |               ResultPath: "$.error-info"
207 |               Next: "Clean-up Cluster"
208 |           Next: "Check for Cluster Done"
209 |         "Check for Cluster Done":
210 |           Type: Task
211 |           Resource: check_for_cluster_done
212 |           Next: "Wait for Cluster Done"
213 |           Retry:
214 |             -
215 |               ErrorEquals:
216 |                 - NoSuchKey
217 |               IntervalSeconds: 10
218 |               MaxAttempts: 2
219 |               BackoffRate: 1
220 |           Catch:
221 |             -
222 |               ErrorEquals:
223 |                 - States.ALL
224 |               ResultPath: "$.error-info"
225 |               Next: "Clean-up Cluster"
226 |         "Wait for Cluster Done":
227 |           Type: Wait
228 |           Seconds: 10
229 |           Next: "Check for Cluster Done"
230 |         "Clean-up Cluster":
231 |           Type: Task
232 |           Resource: cleanup_cluster
233 |           Next: "Checking Drain"
234 |         "Checking Drain":
235 |           Type: Task
236 |           Resource: check_drain
237 |           Retry:
238 |             -
239 |               ErrorEquals:
240 |                 - UndrainedInstancesException
241 |               IntervalSeconds: 10
242 |               MaxAttempts: 10
243 |               BackoffRate: 1
244 |           End: true
245 | 
246 | resources:
247 |   Resources:
248 |     S3ReadyBucket:
249 |       Type: "AWS::S3::Bucket"
250 |       Properties:
251 |         AccessControl: "PublicRead"
252 |     MetricsBucket:
253 |       Type: "AWS::S3::Bucket"
254 |       Properties:
255 |         AccessControl: "AuthenticatedRead"
256 |     MetricSecurityGroup:
257 |       Type: "AWS::EC2::SecurityGroup"
258 |       Properties:
259 |         GroupDescription: "ardere metrics"
260 |         SecurityGroupIngress:
261 |           -
262 |             IpProtocol: tcp
263 |             FromPort: 3000
264 |             ToPort: 3000
265 |             SourceSecurityGroupId:
266 |               Fn::GetAtt:
267 |                 - GrafanaSecurityGroup
268 |                 - GroupId
269 |           -
270 |             IpProtocol: tcp
271 |             FromPort: 8086
272 |             ToPort: 8086
273 |             SourceSecurityGroupId:
274 |               Fn::GetAtt:
275 |                 - EC2SecurityGroup
276 |                 - GroupId
277 |     GrafanaSecurityGroup:
278 |       Type: "AWS::EC2::SecurityGroup"
279 |       Properties:
280 |         GroupDescription: "grafana access"
281 |     EC2SecurityGroup:
282 |       Type: "AWS::EC2::SecurityGroup"
283 |       Properties:
284 |         GroupDescription: "ardere load-testers"
285 |     EC2ContainerRole:
286 |       Type: "AWS::IAM::Role"
287 |       Properties:
288 |         AssumeRolePolicyDocument:
289 |           Version: "2012-10-17"
290 |           Statement:
291 |             -
292 |               Effect: "Allow"
293 |               Principal:
294 |                 Service:
295 |                   - "ec2.amazonaws.com"
296 |               Action:
297 |                 - "sts:AssumeRole"
298 |         Path: "/"
299 |         Policies:
300 |           -
301 |             PolicyName: "ecs-service"
302 |             PolicyDocument:
303 |               Version: "2012-10-17"
304 |               Statement:
305 |                 -
306 |                   Effect: "Allow"
307 |                   Action:
308 |                     - "ecs:CreateCluster"
309 |                     - "ecs:DeregisterContainerInstance"
310 |                     - "ecs:DiscoverPollEndpoint"
311 |                     - "ecs:Poll"
312 |                     - "ecs:RegisterContainerInstance"
313 |                     - "ecs:StartTelemetrySession"
314 |                     - "ecs:SubmitContainerStateChange"
315 |                     - "ecs:SubmitTaskStateChange"
316 |                     - "ecs:Submit"
317 |                     - "logs:CreateLogStream"
318 |                     - "logs:PutLogEvents"
319 |                   Resource: "*"
320 |                 -
321 |                   Effect: "Allow"
322 |                   Action:
323 |                     - "s3:ListBucket"
324 |                     - "s3:GetObject"
325 |                   Resource:
326 |                     - Fn::Join: ['', ['arn:aws:s3:::', Ref: "MetricsBucket", "/*"]]
327 |     ContainerLogs:
328 |       Type: "AWS::Logs::LogGroup"
329 |       Properties:
330 |         RetentionInDays: 1
331 |     EC2ContainerProfile:
332 |       Type: "AWS::IAM::InstanceProfile"
333 |       Properties:
334 |         Path: "/"
335 |         Roles:
336 |           -
337 |             Ref: "EC2ContainerRole"
338 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [nosetests]
 2 | verbose=True
 3 | verbosity=1
 4 | detailed-errors=True
 5 | with-coverage=True
 6 | cover-erase=True
 7 | cover-package=ardere
 8 | cover-tests=True
 9 | cover-inclusive=True
10 | 


--------------------------------------------------------------------------------
/src/shell/telegraf.toml:
--------------------------------------------------------------------------------
  1 | # Telegraf Configuration
  2 | #
  3 | # Telegraf is entirely plugin driven. All metrics are gathered from the
  4 | # declared inputs, and sent to the declared outputs.
  5 | #
  6 | # Plugins must be declared in here to be active.
  7 | # To deactivate a plugin, comment out the name and any variables.
  8 | #
  9 | # Use 'telegraf -config telegraf.conf -test' to see what metrics a config
 10 | # file would generate.
 11 | #
 12 | # Environment variables can be used anywhere in this config file, simply prepend
 13 | # them with $. For strings the variable must be within quotes (ie, "$STR_VAR"),
 14 | # for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR)
 15 | 
 16 | 
 17 | # Global tags can be specified here in key="value" format.
 18 | [global_tags]
 19 | # dc = "us-east-1" # will tag all metrics with dc=us-east-1
 20 | # rack = "1a"
 21 | ## Environment variables can be used as tags, and throughout the config file
 22 | # user = "$USER"
 23 | step = "$__ARDERE_TELEGRAF_STEP__"
 24 | ## type is the old "docker_series"
 25 | type = "$__ARDERE_TELEGRAF_TYPE__"
 26 | 
 27 | 
 28 | # Configuration for telegraf agent
 29 | [agent]
 30 | ## Default data collection interval for all inputs
 31 | interval = "10s"
 32 | ## Rounds collection interval to 'interval'
 33 | ## ie, if interval="10s" then always collect on :00, :10, :20, etc.
 34 | round_interval = true
 35 | 
 36 | ## Telegraf will send metrics to outputs in batches of at most
 37 | ## metric_batch_size metrics.
 38 | ## This controls the size of writes that Telegraf sends to output plugins.
 39 | metric_batch_size = 1000
 40 | 
 41 | ## For failed writes, telegraf will cache metric_buffer_limit metrics for each
 42 | ## output, and will flush this buffer on a successful write. Oldest metrics
 43 | ## are dropped first when this buffer fills.
 44 | ## This buffer only fills when writes fail to output plugin(s).
 45 | metric_buffer_limit = 10000
 46 | 
 47 | ## Collection jitter is used to jitter the collection by a random amount.
 48 | ## Each plugin will sleep for a random time within jitter before collecting.
 49 | ## This can be used to avoid many plugins querying things like sysfs at the
 50 | ## same time, which can have a measurable effect on the system.
 51 | collection_jitter = "0s"
 52 | 
 53 | ## Default flushing interval for all outputs. You shouldn't set this below
 54 | ## interval. Maximum flush_interval will be flush_interval + flush_jitter
 55 | flush_interval = "10s"
 56 | ## Jitter the flush interval by a random amount. This is primarily to avoid
 57 | ## large write spikes for users running a large number of telegraf instances.
 58 | ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s
 59 | flush_jitter = "0s"
 60 | ## By default, precision will be set to the same timestamp order as the
 61 | ## collection interval, with the maximum being 1s.
 62 | ## Precision will NOT be used for service inputs, such as logparser and statsd.
 63 | ## Valid values are "ns", "us" (or "µs"), "ms", "s".
 64 | precision = ""
 65 | ## Logging configuration:
 66 | ## Run telegraf with debug log messages.
 67 | debug = false
 68 | ## Run telegraf in quiet mode (error log messages only).
 69 | quiet = false
 70 | ## Specify the log file name. The empty string means to log to stderr.
 71 | logfile = ""
 72 | ## Override default hostname, if empty use os.Hostname()
 73 | hostname = "$__ARDERE_TELEGRAF_HOST__"
 74 | ## If set to true, do no set the "host" tag in the telegraf agent.
 75 | omit_hostname = false
 76 | ###############################################################################
 77 | #                            OUTPUT PLUGINS                                   #
 78 | ###############################################################################
 79 | # Configuration for influxdb server to send metrics to
 80 | [[outputs.influxdb]]
 81 | ## The full HTTP or UDP endpoint URL for your InfluxDB instance.
 82 | ## Multiple urls can be specified as part of the same cluster,
 83 | ## this means that only ONE of the urls will be written to each interval.
 84 | # urls = ["udp://localhost:8089"] # UDP endpoint example
 85 | urls = ["http://$__ARDERE_INFLUX_ADDR__"] # required
 86 | ## The target database for metrics (telegraf will create it if not exists).
 87 | database = "$__ARDERE_INFLUX_DB__" # required
 88 | ## Retention policy to write to. Empty string writes to the default rp.
 89 | retention_policy = ""
 90 | ## Write consistency (clusters only), can be: "any", "one", "quorum", "all"
 91 | write_consistency = "any"
 92 | ## Write timeout (for the InfluxDB client), formatted as a string.
 93 | ## If not provided, will default to 5s. 0s means no timeout (not recommended).
 94 | timeout = "5s"
 95 | # username = "telegraf"
 96 | # password = "metricsmetricsmetricsmetrics"
 97 | ## Set the user agent for HTTP POSTs (can be useful for log differentiation)
 98 | # user_agent = "telegraf"
 99 | ## Set UDP payload size, defaults to InfluxDB UDP Client default (512 bytes)
100 | # udp_payload = 512
101 | ## Optional SSL Config
102 | # ssl_ca = "/etc/telegraf/ca.pem"
103 | # ssl_cert = "/etc/telegraf/cert.pem"
104 | # ssl_key = "/etc/telegraf/key.pem"
105 | ## Use SSL but skip chain & host verification
106 | # insecure_skip_verify = false
107 | ###############################################################################
108 | #                            PROCESSOR PLUGINS                                #
109 | ###############################################################################
110 | # # Print all metrics that pass through this filter.
111 | # [[processors.printer]]
112 | ###############################################################################
113 | #                            AGGREGATOR PLUGINS                               #
114 | ###############################################################################
115 | # # Keep the aggregate min/max of each metric passing through.
116 | # [[aggregators.minmax]]
117 | #   ## General Aggregator Arguments:
118 | #   ## The period on which to flush & clear the aggregator.
119 | #   period = "30s"
120 | #   ## If true, the original metric will be dropped by the
121 | #   ## aggregator and will not get sent to the output plugins.
122 | #   drop_original = false
123 | ###############################################################################
124 | #                            INPUT PLUGINS                                    #
125 | ###############################################################################
126 | # Read metrics about cpu usage
127 | [[inputs.cpu]]
128 | ## Whether to report per-cpu stats or not
129 | percpu = true
130 | ## Whether to report total system cpu stats or not
131 | totalcpu = true
132 | ## If true, collect raw CPU time metrics.
133 | collect_cpu_time = false
134 | # Read metrics about memory usage
135 | [[inputs.mem]]
136 | # no configuration
137 | # Read TCP metrics such as established, time wait and sockets counts.
138 | [[inputs.netstat]]
139 | # no configuration
140 | ###############################################################################
141 | #                            SERVICE INPUT PLUGINS                            #
142 | ###############################################################################
143 | # Statsd Server
144 | [[inputs.statsd]]
145 | ## Address and port to host UDP listener on
146 | service_address = ":8125"
147 | ## The following configuration options control when telegraf clears it's cache
148 | ## of previous values. If set to false, then telegraf will only clear it's
149 | ## cache when the daemon is restarted.
150 | ## Reset gauges every interval (default=true)
151 | delete_gauges = true
152 | ## Reset counters every interval (default=true)
153 | delete_counters = true
154 | ## Reset sets every interval (default=true)
155 | delete_sets = true
156 | ## Reset timings & histograms every interval (default=true)
157 | delete_timings = true
158 | ## Percentiles to calculate for timing & histogram stats
159 | percentiles = [90]
160 | ## separator to use between elements of a statsd metric
161 | metric_separator = "_"
162 | ## Parses tags in the datadog statsd format
163 | ## http://docs.datadoghq.com/guides/dogstatsd/
164 | parse_data_dog_tags = false
165 | ## Statsd data translation templates, more info can be read here:
166 | ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md#graphite
167 | # templates = [
168 | #     "cpu.* measurement*"
169 | # ]
170 | ## Number of UDP messages allowed to queue up, once filled,
171 | ## the statsd server will start dropping packets
172 | allowed_pending_messages = 10000
173 | ## Number of timing/histogram values to track per-measurement in the
174 | ## calculation of percentiles. Raising this limit increases the accuracy
175 | ## of percentiles but also increases the memory usage and cpu time.
176 | #percentile_limit = 1000
177 | percentile_limit = 10


--------------------------------------------------------------------------------
/src/shell/waitforcluster.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #
 3 | # waits for a cluster to be ready + a run_delay
 4 | #
 5 | # cluster readiness is indicated by existence of ready_url, containing
 6 | # a timestamp (seconds since epoch) of when it was made so. timestamp
 7 | # is factored into the run_delay.
 8 | #
 9 | 
10 | # Polling frequency in seconds
11 | POLL_TIME=4
12 | 
13 | if [ $# != 2 ]; then
14 |     echo "usage $0: ready_url run_delay"
15 |     exit 1
16 | fi
17 | READY_URL=$1
18 | RUN_DELAY=$2
19 | 
20 | # XXX: a random jitter, backoff?
21 | JITTER=0
22 | 
23 | while true; do
24 |     START_TIME=`wget -qO- ${READY_URL}` && break
25 |     sleep $(( ${POLL_TIME} + ${JITTER} ))
26 | done
27 | 
28 | CURRENT_TIME=`date +%s`
29 | SINCE=$(( ${CURRENT_TIME} - ${START_TIME} ))
30 | if [ ${SINCE} -lt 0 ]; then
31 |     echo "Clock skew: ${SINCE}" >&2
32 |     SINCE=0
33 | fi
34 | 
35 | RUN_DELAY=$(( ${RUN_DELAY} - ${SINCE} ))
36 | if [ ${RUN_DELAY} -gt 0 ]; then
37 |     FMT_START_TIME=`date '+%FT%T+00:00' -d @${START_TIME}`
38 |     echo "Cluster ready @ ${FMT_START_TIME}" \
39 |          "(sleeping for run_delay=${RUN_DELAY}s)"
40 |     sleep $RUN_DELAY
41 | fi
42 | 


--------------------------------------------------------------------------------
/test-requirements.txt:
--------------------------------------------------------------------------------
1 | -r requirements.txt
2 | nose==1.3.7
3 | mock==2.0.0
4 | coverage==4.3.4
5 | boto3==1.4.4
6 | influxdb==4.0.0
7 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/loads/ardere/0c1b7736c514d2b9fd1563cf96bbe0fd75244f95/tests/__init__.py


--------------------------------------------------------------------------------
/tests/fixtures.py:
--------------------------------------------------------------------------------
 1 | sample_basic_test_plan = """
 2 | {
 3 |   "ecs_name": "ardere-test",
 4 |   "name": "Loadtest",
 5 |   "description": "Run all APLT scenarios",
 6 |   "metrics_options": {
 7 |     "enabled": true,
 8 |     "dashboard": {
 9 |         "admin_user": "admin",
10 |         "admin_password": "testing",
11 |         "name": "ap-loadtester",
12 |         "filename": "gf_basic_dashboard.json"
13 |     }
14 |   },
15 |   "steps": [
16 |     {
17 |       "name": "TestCluster",
18 |       "instance_count": 1,
19 |       "instance_type": "t2.medium",
20 |       "run_max_time": 140,
21 |       "env": {
22 |           "SOME_VAR": "great-value"
23 |       },
24 |       "port_mapping": [8000, 4000],
25 |       "container_name": "bbangert/ap-loadtester:latest",
26 |       "cmd": "./apenv/bin/aplt_testplan wss://autopush.stage.mozaws.net 'aplt.scenarios:notification_forever,1000,1,0' --statsd_host=localhost --statsd_port=8125"
27 |     }
28 |   ]
29 | }
30 | """
31 | 
32 | sample_toml = """
33 | ecs_name = "ardere-test"
34 | name = "connection loadtest"
35 | description = "autopush: connect and idle forever"
36 | 
37 | 
38 | [[steps]]
39 |     name = "***************** RUN #01 ***********************"
40 |     instance_count = 8
41 |     instance_type = "m3.medium"
42 |     container_name = "bbangert/ap-loadtester:latest"
43 |     cmd = "./apenv/bin/aplt_testplan wss://autopush.stage.mozaws.net 'aplt.scenarios:connect_and_idle_forever,10000,5,0'"
44 |     run_max_time = 300
45 |     volume_mapping = "/var/log:/var/log/$RUN_ID:rw"
46 |     docker_series = "push_tests"
47 | 
48 | [[steps]]
49 |     name = "***************** RUN #02 ***********************"
50 |     instance_count = 8
51 |     run_delay = 330
52 |     instance_type = "m3.medium"
53 |     container_name = "bbangert/ap-loadtester:latest"
54 |     cmd = "./apenv/bin/aplt_testplan wss://autopush.stage.mozaws.net 'aplt.scenarios:connect_and_idle_forever,10000,5,0'"
55 |     run_max_time = 300
56 |     volume_mapping = "/var/log:/var/log/$RUN_ID:rw"
57 |     docker_series = "push_tests"
58 | 
59 | """
60 | 
61 | future_hypothetical_test="""
62 | {
63 |     "name": "TestCluster",
64 |     "instance_count": 1,
65 |     "instance_type": "t2.medium",
66 |     "run_max_time": 140,
67 |     "container_name": "bbangert/pushgo:1.5rc4",
68 |     "port_mapping": "8080,8081,3000,8082",
69 |     "load_balancer": {
70 |         "env_var": "TEST_CLUSTER",
71 |         "ping_path": "/status/health",
72 |         "ping_port": 8080,
73 |         "ping_protocol": "http",
74 |         "listeners": [
75 |             {
76 |                 "listen_protocol": "ssl",
77 |                 "listen_port": 443,
78 |                 "backend_protocol": "tcp",
79 |                 "backend_port": 8080
80 |             },
81 |             {
82 |                 "listen_protocol": "https",
83 |                 "listen_port": 9000,
84 |                 "backend_protocol": "http",
85 |                 "backend_port": 8090
86 |             }
87 |         ]
88 |     }
89 | }
90 | """


--------------------------------------------------------------------------------
/tests/test_aws.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import os
  3 | import time
  4 | import unittest
  5 | 
  6 | import mock
  7 | from nose.tools import assert_raises, eq_, ok_
  8 | 
  9 | from tests import fixtures
 10 | 
 11 | 
 12 | class TestECSManager(unittest.TestCase):
 13 |     def _make_FUT(self, plan=None):
 14 |         from ardere.aws import ECSManager
 15 |         os.environ["s3_ready_bucket"] = "test_bucket"
 16 |         os.environ["ecs_profile"] = "arn:something:fantastic:::"
 17 |         os.environ["container_log_group"] = "ardere"
 18 |         self.boto_mock = mock.Mock()
 19 |         ECSManager.boto = self.boto_mock
 20 |         if not plan:
 21 |             plan = json.loads(fixtures.sample_basic_test_plan)
 22 |             plan["metrics_options"] = dict(
 23 |                 dashboard=dict(
 24 |                     admin_user="admin",
 25 |                     admin_password="admin"
 26 |                 ),
 27 |                 tear_down=False
 28 |             )
 29 |         return ECSManager(plan)
 30 | 
 31 |     def test_init(self):
 32 |         ecs = self._make_FUT()
 33 |         eq_(ecs._plan["plan_run_uuid"], ecs._plan_uuid)
 34 |         eq_(ecs.plan_uuid, ecs._plan_uuid)
 35 | 
 36 |     def test_ready_file(self):
 37 |         ecs = self._make_FUT()
 38 |         ready_filename = ecs.s3_ready_file
 39 |         ok_("test_bucket" in ready_filename)
 40 |         ok_(ecs._plan_uuid in ready_filename)
 41 | 
 42 |     def test_query_active(self):
 43 |         mock_paginator = mock.Mock()
 44 |         mock_paginator.paginate.return_value = [
 45 |             {"Reservations": [
 46 |                 {
 47 |                     "Instances": [
 48 |                         {
 49 |                             "State": {
 50 |                                 "Code": 16
 51 |                             },
 52 |                             "InstanceType": "t2.medium"
 53 |                         }
 54 |                     ]
 55 |                 }
 56 |             ]}
 57 |         ]
 58 | 
 59 |         ecs = self._make_FUT()
 60 |         ecs._ec2_client.get_paginator.return_value = mock_paginator
 61 |         instance_dct = ecs.query_active_instances()
 62 |         eq_(len(instance_dct.values()), 1)
 63 | 
 64 |     def test_calculate_missing_instances(self):
 65 |         ecs = self._make_FUT()
 66 |         result = ecs.calculate_missing_instances(
 67 |             desired={"t2.medium": 2}, current={"t2.medium": 1}
 68 |         )
 69 |         eq_(result, {"t2.medium": 1})
 70 | 
 71 |     def test_has_metrics_node(self):
 72 |         mock_paginator = mock.Mock()
 73 |         mock_paginator.paginate.return_value = [
 74 |             {"Reservations": [
 75 |                 {
 76 |                     "Instances": [
 77 |                         {
 78 |                             "State": {
 79 |                                 "Code": 16
 80 |                             },
 81 |                             "InstanceType": "t2.medium"
 82 |                         }
 83 |                     ]
 84 |                 }
 85 |             ]}
 86 |         ]
 87 | 
 88 |         ecs = self._make_FUT()
 89 |         ecs._ec2_client.get_paginator.return_value = mock_paginator
 90 |         resp = ecs.has_metrics_node("t2.medium")
 91 |         eq_(resp, True)
 92 | 
 93 |     def test_has_started_metric_creation(self):
 94 |         ecs = self._make_FUT()
 95 |         ecs._ecs_client.list_tasks.return_value = {"taskArns": [123]}
 96 |         eq_(ecs.has_started_metric_creation(), True)
 97 | 
 98 |     def test_has_finished_metric_creation(self):
 99 |         ecs = self._make_FUT()
100 |         ecs._ecs_client.list_tasks.return_value = {"taskArns": [123]}
101 |         eq_(ecs.has_finished_metric_creation(), True)
102 | 
103 |     def test_request_instances(self):
104 |         instances = {
105 |             "t2.medium": 10
106 |         }
107 |         ecs = self._make_FUT()
108 |         ecs._ec2_client.run_instances.return_value = {
109 |             "Instances": [{"InstanceId": 12345}]
110 |         }
111 |         ecs.request_instances(instances, ["i-382842"], {"Role": "metrics"})
112 |         ecs._ec2_client.run_instances.assert_called()
113 | 
114 |     def test_locate_metrics_container_ip(self):
115 |         ecs = self._make_FUT()
116 |         ecs._ecs_client.list_container_instances.return_value = {
117 |             "containerInstanceArns": ["arn:of:some:container::"]
118 |         }
119 |         ecs._ecs_client.describe_container_instances.return_value = {
120 |             "containerInstances": [
121 |                 {"ec2InstanceId": "e-28193823"}
122 |             ]
123 |         }
124 |         mock_resource = mock.Mock()
125 |         ecs.boto.resource.return_value = mock_resource
126 |         ecs.locate_metrics_container_ip()
127 |         ecs.boto.resource.assert_called()
128 | 
129 |     def test_locate_metrics_container_ip_not_found(self):
130 |         ecs = self._make_FUT()
131 |         ecs._ecs_client.list_container_instances.return_value = {
132 |             "containerInstanceArns": []
133 |         }
134 |         result = ecs.locate_metrics_container_ip()
135 |         eq_(result, (None, None))
136 | 
137 |     def test_locate_metrics_service(self):
138 |         ecs = self._make_FUT()
139 |         ecs._ecs_client.describe_services.return_value = {
140 |             "services": [
141 |                 {"stuff": 1, "status": "ACTIVE"}
142 |             ]
143 |         }
144 |         result = ecs.locate_metrics_service()
145 |         eq_(result, {"stuff": 1, "status": "ACTIVE"})
146 | 
147 |     def test_locate_metrics_service_not_found(self):
148 |         ecs = self._make_FUT()
149 |         ecs._ecs_client.describe_services.return_value = {
150 |             "services": []
151 |         }
152 |         result = ecs.locate_metrics_service()
153 |         eq_(result, None)
154 | 
155 |     def test_create_metrics_service(self):
156 |         ecs = self._make_FUT()
157 | 
158 |         # Setup mocks
159 |         ecs._ecs_client.register_task_definition.return_value = {
160 |             "taskDefinition": {
161 |                 "taskDefinitionArn": "arn:of:some:task::"
162 |             }
163 |         }
164 |         ecs._ecs_client.create_service.return_value = {
165 |             "service": {"serviceArn": "arn:of:some:service::"}
166 |         }
167 | 
168 |         result = ecs.create_metrics_service(dict(instance_type="c4.large"))
169 |         eq_(result["service_arn"], "arn:of:some:service::")
170 | 
171 |     def test_run_metric_creation_task(self):
172 |         ecs = self._make_FUT()
173 |         ecs.run_metric_creation_task("arn:::", ("admin", "admin"),
174 |                                      "asdf", "atitle")
175 |         ecs._ecs_client.start_task.assert_called()
176 | 
177 |     def test_create_service(self):
178 |         ecs = self._make_FUT()
179 | 
180 |         step = ecs._plan["steps"][0]
181 |         ecs._plan["influxdb_private_ip"] = "1.1.1.1"
182 |         step["docker_series"] = "default"
183 | 
184 |         # Setup mocks
185 |         ecs._ecs_client.register_task_definition.return_value = {
186 |             "taskDefinition": {
187 |                 "taskDefinitionArn": "arn:of:some:task::"
188 |             }
189 |         }
190 |         ecs._ecs_client.create_service.return_value = {
191 |             "service": {"serviceArn": "arn:of:some:service::"}
192 |         }
193 | 
194 |         ecs.create_service(step)
195 | 
196 |         eq_(step["serviceArn"], "arn:of:some:service::")
197 |         ecs._ecs_client.register_task_definition.assert_called()
198 |         _, kwargs = ecs._ecs_client.register_task_definition.call_args
199 |         container_def = kwargs["containerDefinitions"][0]
200 | 
201 |         eq_(container_def["cpu"], 1536)
202 | 
203 |         _, kwargs = ecs._ecs_client.register_task_definition.call_args
204 |         container_def = kwargs["containerDefinitions"][0]
205 |         ok_("portMappings" in container_def)
206 | 
207 |     def test_create_services(self):
208 |         ecs = self._make_FUT()
209 |         ecs.create_service = mock.Mock()
210 |         ecs.create_services(ecs._plan["steps"])
211 |         ecs.create_service.assert_called()
212 | 
213 |     def test_create_services_ecs_error(self):
214 |         from botocore.exceptions import ClientError
215 |         ecs = self._make_FUT()
216 | 
217 |         step = ecs._plan["steps"][0]
218 |         ecs._plan["influxdb_private_ip"] = "1.1.1.1"
219 |         step["docker_series"] = "default"
220 |         ecs._ecs_client.register_task_definition.side_effect = ClientError(
221 |             {"Error": {}}, "some_op"
222 |         )
223 | 
224 |         with assert_raises(ClientError):
225 |             ecs.create_services(ecs._plan["steps"])
226 | 
227 |     def test_service_ready_true(self):
228 |         ecs = self._make_FUT()
229 |         step = ecs._plan["steps"][0]
230 | 
231 |         ecs._ecs_client.describe_services.return_value = {
232 |             "services": [{
233 |                 "deployments": [{
234 |                     "desiredCount": 2,
235 |                     "runningCount": 2
236 |                 }]
237 |             }]
238 |         }
239 | 
240 |         result = ecs.service_ready(step)
241 |         eq_(result, True)
242 | 
243 |     def test_service_not_known_yet(self):
244 |         ecs = self._make_FUT()
245 |         step = ecs._plan["steps"][0]
246 | 
247 |         ecs._ecs_client.describe_services.return_value = {
248 |             "services": []
249 |         }
250 | 
251 |         result = ecs.service_ready(step)
252 |         eq_(result, False)
253 | 
254 |     def test_all_services_ready(self):
255 |         ecs = self._make_FUT()
256 |         ecs.service_ready = mock.Mock()
257 | 
258 |         ecs.all_services_ready(ecs._plan["steps"])
259 |         ecs.service_ready.assert_called()
260 | 
261 |     def test_service_done_true(self):
262 |         ecs = self._make_FUT()
263 |         step = ecs._plan["steps"][0]
264 | 
265 |         ecs._ecs_client.describe_services.return_value = {
266 |             "services": [{
267 |                 "status": "INACTIVE"
268 |             }]
269 |         }
270 | 
271 |         result = ecs.service_done(step)
272 |         eq_(result, True)
273 | 
274 |     def test_service_not_known(self):
275 |         ecs = self._make_FUT()
276 |         step = ecs._plan["steps"][0]
277 | 
278 |         ecs._ecs_client.describe_services.return_value = {
279 |             "services": [{
280 |                 "status": "DRAINING"
281 |             }]
282 |         }
283 | 
284 |         result = ecs.service_done(step)
285 |         eq_(result, False)
286 | 
287 |     def test_all_services_done(self):
288 |         ecs = self._make_FUT()
289 |         ecs.service_done = mock.Mock()
290 |         ecs.all_services_done(ecs._plan["steps"])
291 |         ecs.service_done.assert_called()
292 | 
293 |     def test_stop_finished_service_stopped(self):
294 |         ecs = self._make_FUT()
295 |         ecs._ecs_client.update_service = mock.Mock()
296 |         step = ecs._plan["steps"][0]
297 |         step["service_status"] = "STARTED"
298 |         past = time.time() - 400
299 |         ecs.stop_finished_service(past, step)
300 |         ecs._ecs_client.update_service.assert_called()
301 |         eq_(step["service_status"], "STOPPED")
302 | 
303 |     def test_stop_finished_service_stop_already_stopped(self):
304 |         ecs = self._make_FUT()
305 |         ecs._ecs_client.update_service = mock.Mock()
306 |         step = ecs._plan["steps"][0]
307 |         step["service_status"] = "STOPPED"
308 |         past = time.time() - 400
309 |         ecs.stop_finished_service(past, step)
310 |         ecs._ecs_client.update_service.assert_not_called()
311 |         eq_(step["service_status"], "STOPPED")
312 | 
313 |     def test_stop_finished_service_still_running(self):
314 |         ecs = self._make_FUT()
315 |         ecs._ecs_client.update_service = mock.Mock()
316 |         step = ecs._plan["steps"][0]
317 |         step["service_status"] = "STARTED"
318 |         past = time.time() - 100
319 |         ecs.stop_finished_service(past, step)
320 |         ecs._ecs_client.update_service.assert_not_called()
321 |         eq_(step["service_status"], "STARTED")
322 | 
323 |     def test_stop_finished_services(self):
324 |         ecs = self._make_FUT()
325 |         ecs.stop_finished_service = mock.Mock()
326 | 
327 |         past = time.time() - 100
328 |         ecs.stop_finished_services(past, ecs._plan["steps"])
329 |         ecs.stop_finished_service.assert_called()
330 | 
331 |     def test_shutdown_plan(self):
332 |         mock_paginator = mock.Mock()
333 |         mock_paginator.paginate.return_value = [
334 |             {"serviceArns": ["arn:123:::", "arn:456:::"]}
335 |         ]
336 | 
337 |         ecs = self._make_FUT()
338 |         ecs.locate_metrics_service = mock.Mock()
339 |         ecs.locate_metrics_service.return_value = dict(
340 |             serviceArn="arn:456:::"
341 |         )
342 |         ecs._ecs_client.get_paginator.return_value = mock_paginator
343 |         ecs._ecs_client.describe_task_definition.return_value = {
344 |             "taskDefinition": {"taskDefinitionArn": "arn:task:::"}
345 |         }
346 | 
347 |         ecs.shutdown_plan(ecs._plan["steps"])
348 |         ecs._ecs_client.deregister_task_definition.assert_called()
349 |         ecs._ecs_client.delete_service.assert_called()
350 | 
351 |     def test_shutdown_plan_update_error(self):
352 |         from botocore.exceptions import ClientError
353 | 
354 |         mock_paginator = mock.Mock()
355 |         mock_paginator.paginate.return_value = [
356 |             {"serviceArns": ["arn:123:::", "arn:456:::"]}
357 |         ]
358 | 
359 |         ecs = self._make_FUT()
360 |         ecs.locate_metrics_service = mock.Mock()
361 |         ecs.locate_metrics_service.return_value = dict(
362 |             serviceArn="arn:456:::"
363 |         )
364 |         ecs._ecs_client.get_paginator.return_value = mock_paginator
365 |         ecs._ecs_client.describe_task_definition.return_value = {
366 |             "taskDefinition": {"taskDefinitionArn": "arn:task:::"}
367 |         }
368 |         ecs._ecs_client.update_service.side_effect = ClientError(
369 |             {"Error": {}}, "some_op"
370 |         )
371 | 
372 |         ecs.shutdown_plan(ecs._plan["steps"])
373 |         ecs._ecs_client.delete_service.assert_not_called()
374 | 
375 |     def test_shutdown_plan_describe_error(self):
376 |         from botocore.exceptions import ClientError
377 | 
378 |         mock_paginator = mock.Mock()
379 |         mock_paginator.paginate.return_value = [
380 |             {"serviceArns": ["arn:123:::", "arn:456:::"]}
381 |         ]
382 | 
383 |         ecs = self._make_FUT()
384 |         ecs.locate_metrics_service = mock.Mock()
385 |         ecs.locate_metrics_service.return_value = dict(
386 |             serviceArn="arn:456:::"
387 |         )
388 |         ecs._plan["steps"] = ecs._plan["steps"][:1]
389 |         ecs._ecs_client.get_paginator.return_value = mock_paginator
390 |         ecs._ecs_client.describe_task_definition.side_effect = ClientError(
391 |             {"Error": {}}, "some_op"
392 |         )
393 | 
394 |         ecs.shutdown_plan(ecs._plan["steps"])
395 |         ecs._ecs_client.deregister_task_definition.assert_not_called()
396 | 
397 |     def test_shutdown_plan_delete_error(self):
398 |         from botocore.exceptions import ClientError
399 | 
400 |         mock_paginator = mock.Mock()
401 |         mock_paginator.paginate.return_value = [
402 |             {"serviceArns": ["arn:123:::", "arn:456:::"]}
403 |         ]
404 | 
405 |         ecs = self._make_FUT()
406 |         ecs.locate_metrics_service = mock.Mock()
407 |         ecs.locate_metrics_service.return_value = dict(
408 |             serviceArn="arn:456:::"
409 |         )
410 |         ecs._ecs_client.get_paginator.return_value = mock_paginator
411 |         ecs._ecs_client.describe_task_definition.return_value = {
412 |             "taskDefinition": {"taskDefinitionArn": "arn:task:::"}
413 |         }
414 |         ecs._ecs_client.delete_service.side_effect = ClientError(
415 |             {"Error": {}}, "some_op"
416 |         )
417 | 
418 |         ecs.shutdown_plan(ecs._plan["steps"])
419 |         ecs._ecs_client.delete_service.assert_called()
420 | 
421 |     def test_shutdown_plan_deregister_error(self):
422 |         from botocore.exceptions import ClientError
423 | 
424 |         mock_paginator = mock.Mock()
425 |         mock_paginator.paginate.return_value = [
426 |             {"serviceArns": ["arn:123:::", "arn:456:::"]}
427 |         ]
428 | 
429 |         ecs = self._make_FUT()
430 |         ecs.locate_metrics_service = mock.Mock()
431 |         ecs.locate_metrics_service.return_value = dict(
432 |             serviceArn="arn:456:::"
433 |         )
434 |         ecs._plan["metrics_options"]["tear_down"] = True
435 |         ecs._ecs_client.get_paginator.return_value = mock_paginator
436 |         ecs._ecs_client.describe_task_definition.return_value = {
437 |             "taskDefinition": {"taskDefinitionArn": "arn:task:::"}
438 |         }
439 |         ecs._ecs_client.deregister_task_definition.side_effect = ClientError(
440 |             {"Error": {}}, "some_op"
441 |         )
442 | 
443 |         ecs.shutdown_plan(ecs._plan["steps"])
444 |         ecs._ecs_client.delete_service.assert_called()
445 | 


--------------------------------------------------------------------------------
/tests/test_metric_creator.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import unittest
 3 | 
 4 | import mock
 5 | from nose.tools import assert_raises, eq_
 6 | 
 7 | 
 8 | class TestMetricRunner(unittest.TestCase):
 9 |     def _make_FUT(self):
10 |         from ardere.scripts.metric_creator import DashboardSetup
11 |         # Setup the env vars we need
12 |         os.environ["__ARDERE_INFLUXDB_NAME__"] = "ardere"
13 |         return DashboardSetup()
14 | 
15 |     def test_load_dashboard(self):
16 |         ds = self._make_FUT()
17 |         mock_file = mock.Mock()
18 |         mock_file.get.return_value = {"Body": mock_file}
19 |         mock_file.read.return_value = "{}".encode(
20 |             'utf-8')
21 |         mock_s3_obj = mock.Mock()
22 |         mock_s3_obj.Object.return_value = mock_file
23 | 
24 |         ds.boto = mock.Mock()
25 |         ds.boto.resource.return_value = mock_s3_obj
26 |         ds.dashboard = "asdf:asdf"
27 |         result = ds._load_dashboard()
28 |         eq_(result, dict(id=None, title=None))
29 | 
30 |     def test_create_dashboard(self):
31 |         ds = self._make_FUT()
32 |         ds._load_dashboard = mock.Mock()
33 |         ds.req = mock.Mock()
34 |         ds.req.post.return_value = mock.Mock(status_code=200)
35 |         ds._create_dashboard("http://localhost")
36 |         ds._load_dashboard.assert_called()
37 | 
38 |     def test_create_dashboard_exception(self):
39 |         ds = self._make_FUT()
40 |         ds._load_dashboard = mock.Mock()
41 |         ds.req = mock.Mock()
42 |         ds.req.post.return_value = mock.Mock(status_code=500)
43 |         assert_raises(Exception, ds._create_dashboard, "http://localhost")
44 | 
45 |     def test_ensure_dashboard_create(self):
46 |         ds = self._make_FUT()
47 |         ds.req = mock.Mock()
48 |         mock_response = mock.Mock()
49 |         mock_response.status_code = 200
50 |         mock_response.json.return_value = []
51 |         ds._create_dashboard = mock.Mock()
52 |         ds.req.get.return_value = mock_response
53 | 
54 |         ds._ensure_dashboard("http://localhost")
55 |         ds._create_dashboard.assert_called()
56 | 
57 |     def test_ensure_dashboard_exception(self):
58 |         ds = self._make_FUT()
59 |         ds.req = mock.Mock()
60 |         mock_response = mock.Mock()
61 |         mock_response.status_code = 500
62 |         ds.req.get.return_value = mock_response
63 |         assert_raises(Exception, ds._ensure_dashboard, "http://localhost")
64 | 
65 |     def test_create_datasources(self):
66 |         ds = self._make_FUT()
67 |         ds.dashboard = True
68 |         ds.influx = mock.Mock()
69 |         ds.req = mock.Mock()
70 |         mock_client = mock.Mock()
71 |         ds._ensure_dashboard = mock.Mock()
72 |         ds.influx.InfluxDBClient.return_value = mock_client
73 | 
74 |         ds.create_datasources()
75 |         mock_client.create_database.assert_called()
76 |         ds._ensure_dashboard.assert_called()
77 | 


--------------------------------------------------------------------------------
/tests/test_step_functions.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import os
  3 | import time
  4 | import unittest
  5 | import uuid
  6 | 
  7 | import mock
  8 | from botocore.exceptions import ClientError
  9 | from nose.tools import eq_, assert_raises
 10 | 
 11 | from tests import fixtures
 12 | 
 13 | 
 14 | class TestAsyncPlanRunner(unittest.TestCase):
 15 |     def setUp(self):
 16 |         self.mock_ecs = mock.Mock()
 17 |         self._patcher = mock.patch("ardere.step_functions.ECSManager")
 18 |         mock_manager = self._patcher.start()
 19 |         mock_manager.return_value = self.mock_ecs
 20 | 
 21 |         from ardere.step_functions import AsynchronousPlanRunner
 22 | 
 23 |         self.plan = json.loads(fixtures.sample_basic_test_plan)
 24 |         self.runner = AsynchronousPlanRunner(self.plan, {})
 25 |         self.runner.boto = self.mock_boto = mock.Mock()
 26 | 
 27 |     def tearDown(self):
 28 |         self._patcher.stop()
 29 | 
 30 |     def test_build_instance_map(self):
 31 |         result = self.runner._build_instance_map()
 32 |         eq_(len(result), 1)
 33 |         eq_(result, {"t2.medium": 1})
 34 | 
 35 |     def test_find_test_plan_duration(self):
 36 |         result = self.runner._find_test_plan_duration()
 37 |         eq_(result, 140)
 38 | 
 39 |     def test_load_toml(self):
 40 |         from ardere.step_functions import AsynchronousPlanRunner
 41 | 
 42 |         self.runner = AsynchronousPlanRunner({"toml": fixtures.sample_toml},
 43 |                                              None)
 44 |         eq_(len(self.runner.event["steps"]), 2)
 45 |         eq_(self.runner.event["steps"][0]["instance_count"], 8)
 46 |         eq_(self.runner.event["ecs_name"], "ardere-test")
 47 | 
 48 |     def test_populate_missing_instances(self):
 49 |         os.environ["ec2_sg"] = "i-23232"
 50 |         os.environ["metric_sg"] = "i-84828"
 51 |         self.mock_ecs.has_metrics_node.return_value = False
 52 |         self.runner.populate_missing_instances()
 53 |         self.mock_ecs.query_active_instances.assert_called()
 54 |         self.mock_ecs.request_instances.assert_called()
 55 | 
 56 |     def test_populate_missing_instances_fail(self):
 57 |         from ardere.exceptions import ValidationException
 58 |         mock_client = mock.Mock()
 59 |         self.mock_boto.client.return_value = mock_client
 60 |         mock_client.describe_clusters.return_value = {"clusters": []}
 61 |         assert_raises(ValidationException,
 62 |                       self.runner.populate_missing_instances)
 63 | 
 64 |     def test_ensure_metrics_available_running_create(self):
 65 |         from ardere.exceptions import ServicesStartingException
 66 | 
 67 |         self.plan["metrics_options"] = dict(enabled=True)
 68 |         self.mock_ecs.locate_metrics_service.return_value = None
 69 | 
 70 |         assert_raises(ServicesStartingException,
 71 |                       self.runner.ensure_metrics_available)
 72 |         self.mock_ecs.create_metrics_service.assert_called()
 73 | 
 74 |     def test_ensure_metrics_available_running_waiting(self):
 75 |         from ardere.exceptions import ServicesStartingException
 76 | 
 77 |         self.plan["metrics_options"] = dict(enabled=True)
 78 |         self.mock_ecs.locate_metrics_service.return_value = {
 79 |             "deployments": [{
 80 |                 "desiredCount": 1,
 81 |                 "runningCount": 0
 82 |             }]
 83 |         }
 84 | 
 85 |         assert_raises(ServicesStartingException,
 86 |                       self.runner.ensure_metrics_available)
 87 | 
 88 |     def test_ensure_metrics_available_running_error(self):
 89 |         self.plan["metrics_options"] = dict(enabled=True)
 90 |         self.mock_ecs.locate_metrics_service.return_value = {
 91 |             "deployments": [{
 92 |                 "desiredCount": 1,
 93 |                 "runningCount": 1
 94 |             }]
 95 |         }
 96 |         self.mock_ecs.locate_metrics_container_ip.return_value = None
 97 | 
 98 |         assert_raises(Exception, self.runner.ensure_metrics_available)
 99 | 
100 |     def test_ensure_metrics_available_running(self):
101 |         os.environ["metrics_bucket"] = "metrics"
102 |         self.plan["metrics_options"] = dict(
103 |             enabled=True,
104 |             dashboard=dict(admin_user="admin",
105 |                            admin_password="admin", name="fred",
106 |                            filename="smith")
107 |         )
108 |         self.mock_ecs.locate_metrics_service.return_value = {
109 |             "deployments": [{
110 |                 "desiredCount": 1,
111 |                 "runningCount": 1
112 |             }]
113 |         }
114 |         self.mock_ecs.locate_metrics_container_ip.return_value = (
115 |             "1.1.1.1", "arn:::"
116 |         )
117 | 
118 |         self.runner.ensure_metrics_available()
119 |         self.mock_ecs.locate_metrics_container_ip.assert_called()
120 | 
121 |     def test_ensure_metrics_available_running_no_metric_ip(self):
122 |         os.environ["metrics_bucket"] = "metrics"
123 |         self.plan["metrics_options"] = dict(
124 |             enabled=True,
125 |             dashboard=dict(admin_user="admin",
126 |                            admin_password="admin", name="fred",
127 |                            filename="smith")
128 |         )
129 |         self.mock_ecs.locate_metrics_service.return_value = {
130 |             "deployments": [{
131 |                 "desiredCount": 1,
132 |                 "runningCount": 1
133 |             }]
134 |         }
135 |         self.mock_ecs.locate_metrics_container_ip.return_value = (
136 |             None, None
137 |         )
138 | 
139 |         assert_raises(Exception, self.runner.ensure_metrics_available)
140 |         self.mock_ecs.locate_metrics_container_ip.assert_called()
141 | 
142 |     def test_ensure_metrics_available_disabled(self):
143 |         self.plan["metrics_options"] = dict(enabled=False)
144 |         self.runner.ensure_metrics_available()
145 | 
146 |     def test_ensure_metric_sources_created(self):
147 |         os.environ["metrics_bucket"] = "metrics"
148 |         self.plan["influxdb_private_ip"] = "1.1.1.1"
149 |         self.plan["metrics_options"] = dict(
150 |             enabled=True,
151 |             dashboard=dict()
152 |         )
153 |         self.mock_ecs.has_started_metric_creation.return_value = True
154 |         self.runner.ensure_metric_sources_created()
155 |         self.mock_ecs.has_started_metric_creation.assert_called()
156 | 
157 |     def test_ensure_metric_sources_created_not_finished(self):
158 |         from ardere.exceptions import CreatingMetricSourceException
159 |         os.environ["metrics_bucket"] = "metrics"
160 |         self.plan["influxdb_private_ip"] = "1.1.1.1"
161 |         self.plan["metrics_options"] = dict(
162 |             enabled=True,
163 |         )
164 |         self.mock_ecs.has_started_metric_creation.return_value = True
165 |         self.mock_ecs.has_finished_metric_creation.return_value = False
166 |         assert_raises(CreatingMetricSourceException,
167 |                       self.runner.ensure_metric_sources_created)
168 |         self.mock_ecs.has_started_metric_creation.assert_called()
169 | 
170 |     def test_ensure_metric_sources_created_not_enabled(self):
171 |         self.plan["metrics_options"] = dict(
172 |             enabled=False,
173 |             dashboard=dict()
174 |         )
175 |         self.runner.ensure_metric_sources_created()
176 | 
177 |     def test_ensure_metric_sources_created_not_started(self):
178 |         from ardere.exceptions import CreatingMetricSourceException
179 |         os.environ["metrics_bucket"] = "metrics"
180 |         self.plan["influxdb_private_ip"] = "1.1.1.1"
181 |         self.plan["metric_container_arn"] = "arn:::"
182 |         self.plan["metrics_options"] = dict(
183 |             enabled=True,
184 |             dashboard=dict(
185 |                 admin_user="admin",
186 |                 admin_password="admin",
187 |                 filename="asdf",
188 |                 name="a title"
189 |             )
190 |         )
191 |         self.mock_ecs.has_started_metric_creation.return_value = False
192 |         assert_raises(CreatingMetricSourceException,
193 |                       self.runner.ensure_metric_sources_created)
194 |         self.mock_ecs.has_started_metric_creation.assert_called()
195 | 
196 |     def test_ensure_metric_sources_created_not_started_no_dash(self):
197 |         from ardere.exceptions import CreatingMetricSourceException
198 |         os.environ["metrics_bucket"] = "metrics"
199 |         self.plan["influxdb_private_ip"] = "1.1.1.1"
200 |         self.plan["metric_container_arn"] = "arn:::"
201 |         self.plan["metrics_options"] = dict(
202 |             enabled=True,
203 |         )
204 |         self.mock_ecs.has_started_metric_creation.return_value = False
205 |         assert_raises(CreatingMetricSourceException,
206 |                       self.runner.ensure_metric_sources_created)
207 |         self.mock_ecs.has_started_metric_creation.assert_called()
208 | 
209 |     def test_create_ecs_services(self):
210 |         self.runner.create_ecs_services()
211 |         self.mock_ecs.create_services.assert_called_with(self.plan["steps"])
212 | 
213 |     def test_wait_for_cluster_ready_not_ready(self):
214 |         from ardere.exceptions import ServicesStartingException
215 | 
216 |         self.mock_ecs.all_services_ready.return_value = False
217 |         assert_raises(ServicesStartingException,
218 |                       self.runner.wait_for_cluster_ready)
219 | 
220 |     def test_wait_for_cluster_ready_all_ready(self):
221 |         self.mock_ecs.all_services_ready.return_value = True
222 |         self.runner.wait_for_cluster_ready()
223 |         self.mock_ecs.all_services_ready.assert_called()
224 | 
225 |     def test_signal_cluster_start(self):
226 |         self.plan["plan_run_uuid"] = str(uuid.uuid4())
227 | 
228 |         self.runner.signal_cluster_start()
229 |         self.mock_boto.client.assert_called()
230 | 
231 |     def test_check_for_cluster_done_not_done(self):
232 |         os.environ["s3_ready_bucket"] = "test_bucket"
233 |         mock_file = mock.Mock()
234 |         mock_file.get.return_value = {"Body": mock_file}
235 |         mock_file.read.return_value = "{}".format(
236 |             int(time.time()) - 100).encode(
237 |             'utf-8')
238 |         mock_s3_obj = mock.Mock()
239 |         mock_s3_obj.Object.return_value = mock_file
240 |         self.mock_boto.resource.return_value = mock_s3_obj
241 | 
242 |         self.plan["plan_run_uuid"] = str(uuid.uuid4())
243 |         self.runner.check_for_cluster_done()
244 | 
245 |     def test_check_for_cluster_done_shutdown(self):
246 |         from ardere.exceptions import ShutdownPlanException
247 | 
248 |         os.environ["s3_ready_bucket"] = "test_bucket"
249 |         mock_file = mock.Mock()
250 |         mock_file.get.return_value = {"Body": mock_file}
251 |         mock_file.read.return_value = "{}".format(
252 |             int(time.time()) - 400).encode(
253 |             'utf-8')
254 |         mock_s3_obj = mock.Mock()
255 |         mock_s3_obj.Object.return_value = mock_file
256 |         self.mock_boto.resource.return_value = mock_s3_obj
257 | 
258 |         self.plan["plan_run_uuid"] = str(uuid.uuid4())
259 |         assert_raises(ShutdownPlanException, self.runner.check_for_cluster_done)
260 | 
261 |     def test_check_for_cluster_done_object_error(self):
262 |         from ardere.exceptions import ShutdownPlanException
263 | 
264 |         os.environ["s3_ready_bucket"] = "test_bucket"
265 |         mock_file = mock.Mock()
266 |         mock_file.get.return_value = {"Body": mock_file}
267 |         mock_file.read.return_value = "{}".format(
268 |             int(time.time()) - 400).encode(
269 |             'utf-8')
270 |         mock_s3_obj = mock.Mock()
271 |         mock_s3_obj.Object.side_effect = ClientError(
272 |             {"Error": {}}, None
273 |         )
274 |         self.mock_boto.resource.return_value = mock_s3_obj
275 | 
276 |         self.plan["plan_run_uuid"] = str(uuid.uuid4())
277 |         assert_raises(ShutdownPlanException, self.runner.check_for_cluster_done)
278 | 
279 |     def test_cleanup_cluster(self):
280 |         self.plan["plan_run_uuid"] = str(uuid.uuid4())
281 | 
282 |         self.runner.cleanup_cluster()
283 |         self.mock_boto.resource.assert_called()
284 | 
285 |     def test_cleanup_cluster_error(self):
286 |         self.plan["plan_run_uuid"] = str(uuid.uuid4())
287 | 
288 |         mock_s3 = mock.Mock()
289 |         self.mock_boto.resource.return_value = mock_s3
290 |         mock_s3.Object.side_effect = ClientError(
291 |             {"Error": {}}, None
292 |         )
293 |         self.runner.cleanup_cluster()
294 |         mock_s3.Object.assert_called()
295 | 
296 |     def test_drain_check_draining(self):
297 |         from ardere.exceptions import UndrainedInstancesException
298 |         self.mock_ecs.all_services_done.return_value = True
299 |         self.runner.check_drained()
300 |         self.mock_ecs.all_services_done.return_value = False
301 |         assert_raises(UndrainedInstancesException,
302 |                       self.runner.check_drained)
303 | 
304 | 
305 | class TestValidation(unittest.TestCase):
306 |     def _make_FUT(self):
307 |         from ardere.step_functions import PlanValidator
308 |         return PlanValidator()
309 | 
310 |     def test_validate_success(self):
311 |         schema = self._make_FUT()
312 |         schema.context["boto"] = mock.Mock()
313 |         plan = json.loads(fixtures.sample_basic_test_plan)
314 |         data, errors = schema.load(plan)
315 |         eq_(errors, {})
316 |         eq_(len(data["steps"]), len(plan["steps"]))
317 | 
318 |     def test_validate_fail_ecs_name(self):
319 |         schema = self._make_FUT()
320 |         schema.context["boto"] = mock.Mock()
321 |         plan = json.loads(fixtures.sample_basic_test_plan)
322 |         plan['ecs_name'] = ''
323 |         data, errors = schema.load(plan)
324 |         eq_(errors, {'ecs_name': ['Plan ecs_name missing']})
325 |         plan['ecs_name'] += '*'
326 |         data, errors = schema.load(plan)
327 |         eq_(errors, {'ecs_name':
328 |                          ['Plan ecs_name contained invalid characters']})
329 |         plan['ecs_name'] = 'a' * 512
330 |         data, errors = schema.load(plan)
331 |         eq_(errors, {'ecs_name': ['Plan ecs_name too long']})
332 | 
333 |     def test_validate_fail_step_name(self):
334 |         schema = self._make_FUT()
335 |         schema.context["boto"] = mock.Mock()
336 |         plan = json.loads(fixtures.sample_basic_test_plan)
337 |         plan['steps'][0]['name'] = ''
338 |         data, errors = schema.load(plan)
339 |         eq_(errors, {'steps': {0: {'name': ['Step name missing']}}})
340 |         plan['steps'][0]['name'] = '*'
341 |         data, errors = schema.load(plan)
342 |         eq_(errors,
343 |             {'steps': {0: {'name': ['Step name contains invalid characters']}}}
344 |             )
345 |         plan['steps'][0]['name'] = 'a' * 512
346 |         data, errors = schema.load(plan)
347 |         eq_(errors, {'steps': {0: {'name': ['Step name too long']}}})
348 | 
349 |     def test_validate_fail(self):
350 |         schema = self._make_FUT()
351 |         schema.context["boto"] = mock_boto = mock.Mock()
352 |         mock_client = mock.Mock()
353 |         mock_boto.client.return_value = mock_client
354 |         mock_client.describe_clusters.return_value = {"clusters": []}
355 |         plan = json.loads(fixtures.sample_basic_test_plan)
356 |         data, errors = schema.load(plan)
357 |         eq_(len(data["steps"]), len(plan["steps"]))
358 |         eq_(len(errors), 1)
359 | 


--------------------------------------------------------------------------------