├── .clog.toml
├── .coveragerc
├── .flooignore
├── .gitignore
├── .travis.yml
├── CHANGELOG.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── ardere
├── __init__.py
├── aws.py
├── exceptions.py
├── scripts
│ ├── __init__.py
│ └── metric_creator.py
└── step_functions.py
├── config.bash
├── default_dashboard.json
├── handler.py
├── package.json
├── requirements.txt
├── serverless.yml
├── setup.cfg
├── src
└── shell
│ ├── telegraf.toml
│ └── waitforcluster.sh
├── test-requirements.txt
└── tests
├── __init__.py
├── fixtures.py
├── test_aws.py
├── test_metric_creator.py
└── test_step_functions.py
/.clog.toml:
--------------------------------------------------------------------------------
1 | [clog]
2 | repository = "https://github.com/loads/ardere"
3 | changelog = "CHANGELOG.md"
4 | from-latest-tag = true
5 | link-style = "github"
6 |
7 | [sections]
8 | Refactor = ["refactor"]
9 | Test = ["test"]
10 | Doc = ["docs"]
11 | Chore = ["chore"]
12 | Features = ["feat", "feature"]
13 | "Bug Fixes" = ["fix", "bug"]
14 |
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [report]
2 | show_missing = True
3 |
--------------------------------------------------------------------------------
/.flooignore:
--------------------------------------------------------------------------------
1 | # Distribution / packaging
2 | .Python
3 | env/
4 | build/
5 | develop-eggs/
6 | dist/
7 | downloads/
8 | eggs/
9 | .eggs/
10 | lib/
11 | lib64/
12 | parts/
13 | sdist/
14 | var/
15 | *.egg-info/
16 | .installed.cfg
17 | *.egg
18 | node_modules/
19 |
20 | # Serverless directories
21 | .serverless
22 | .requirements
23 | ardenv/
24 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Distribution / packaging
2 | .coverage
3 | .floo
4 | .idea
5 | .Python
6 | .requirements
7 | .npmignore
8 | ardenv/
9 | env/
10 | build/
11 | develop-eggs/
12 | dist/
13 | downloads/
14 | eggs/
15 | .eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | *.egg-info/
22 | .installed.cfg
23 | *.pyc
24 | *.egg
25 | node_modules/
26 |
27 | # Serverless directories
28 | .serverless
29 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | cache: pip
3 | sudo: required
4 | dist: precise
5 |
6 | matrix:
7 | include:
8 | - python: 2.7
9 | env: CODECOV=true
10 |
11 | install:
12 | - pip install -r test-requirements.txt
13 | - pip install ${CODECOV:+codecov}
14 | script:
15 | - nosetests -d tests ${CODECOV:+--with-coverage --cover-xml --cover-package=ardere}
16 | after_success:
17 | - codecov
18 | notifications:
19 | slack:
20 | secure: vT9sWtUuxk28g6xYKAsQmiPZllErOYVfx5lcL+/jo1eRFrmbpYnyndT6s+FxGI1547oizZ0IqZbHVvB7BUoSJixXJyQJYXW2MchwN1UeHrey8mYpF1GNEaJT7FMfqSkxUU9gvAZ3IU7zstNeTLbfG1GkLuzybp0WAiHl/ocUTz8=
21 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 |
2 | ## 0.1.1 (2017-05-15)
3 |
4 |
5 | #### Doc
6 |
7 | * update README with run steps ([3d6b5aa2](https://github.com/loads/ardere/commit/3d6b5aa2e6277a33e1a464d30168bbc2f406c512))
8 |
9 | #### Bug Fixes
10 |
11 | * bump wait for cluster ready from 10 to 30 minutes ([a23115b8](https://github.com/loads/ardere/commit/a23115b8bc20f4e7b44ef4bf78b3687069ea1253))
12 |
13 |
14 |
15 |
16 | ## 0.1 (2017-04-25)
17 |
18 |
19 | #### Features
20 |
21 | * secure metrics/load-test nodes from outside access ([3d08dccd](https://github.com/loads/ardere/commit/3d08dccd2376f85976b2f7bd026295c504560485), closes [#54](https://github.com/loads/ardere/issues/54))
22 | * Check names for invalid characters and lengths ([c886f6a9](https://github.com/loads/ardere/commit/c886f6a9598badb084871720515ff1663e61c032))
23 | * use security groups to restrict node access ([6395f9cd](https://github.com/loads/ardere/commit/6395f9cd52ab0c74a2735a8fecc2b30a217ddfda), closes [#48](https://github.com/loads/ardere/issues/48))
24 | * add grafana dashboarding ([a7a30df8](https://github.com/loads/ardere/commit/a7a30df8210429341e711ad713510e00acdc80c1), closes [#40](https://github.com/loads/ardere/issues/40))
25 | * add telegraf setup for per-container stat reporting ([7749e2eb](https://github.com/loads/ardere/commit/7749e2eb373a6f6afc49b2a7d03fcf5c4f9a18fb), closes [#33](https://github.com/loads/ardere/issues/33))
26 | * start influxdb with test runs ([8ddc48b5](https://github.com/loads/ardere/commit/8ddc48b5d3d395d54a914166e9803a6ab41ecf3f), closes [#19](https://github.com/loads/ardere/issues/19))
27 | * validate test plan before running ([0314fae7](https://github.com/loads/ardere/commit/0314fae70962f6281a261499e32500291ff764ab), closes [#21](https://github.com/loads/ardere/issues/21))
28 | * remove need to specify cpu_units ([e99eddea](https://github.com/loads/ardere/commit/e99eddead4b4119e508546aa38dc34873efa9632), closes [#20](https://github.com/loads/ardere/issues/20))
29 | * add port mapping for containers ([af054af1](https://github.com/loads/ardere/commit/af054af18e6ab5e4cd163c903867dc2cfe415168), closes [#24](https://github.com/loads/ardere/issues/24))
30 | * add toml loading as a test plan ([8342cb11](https://github.com/loads/ardere/commit/8342cb11902f6a225925cd1f8fd430d31a614cf9), closes [#32](https://github.com/loads/ardere/issues/32))
31 | * use cloudwatch logs for container output ([8bafa09f](https://github.com/loads/ardere/commit/8bafa09f82ad0116e31cc49849b7bd679219506c), closes [#27](https://github.com/loads/ardere/issues/27))
32 | * setup environment data from the test plan ([7e2ad2da](https://github.com/loads/ardere/commit/7e2ad2dad361336a4d46166e6aec32cd80c15e03), closes [#25](https://github.com/loads/ardere/issues/25))
33 | * fixup readme and test suite ([047a7fa6](https://github.com/loads/ardere/commit/047a7fa6381f4d034fd0c2955e90319a29730c76), closes [#22](https://github.com/loads/ardere/issues/22))
34 | * create MVP using serverless w/python ([9aa80467](https://github.com/loads/ardere/commit/9aa80467ce86b95e330886c1dcf57e5d84004e83), closes [#17](https://github.com/loads/ardere/issues/17))
35 | * add the lambda to start the run by writing to s3 ([e45a2789](https://github.com/loads/ardere/commit/e45a278930589b8dddbf88e3fe151f979d388edd))
36 | * add lambda function and basic CF templates for use ([0cb63bff](https://github.com/loads/ardere/commit/0cb63bff8f1d7b2533ee40a81a932e3bb618236f), closes [#11](https://github.com/loads/ardere/issues/11))
37 | * add an initial state machine impl ([2f571b0a](https://github.com/loads/ardere/commit/2f571b0aec7df9252c8d0fce44da252c17985fa2))
38 | * initial waiter script (#9) ([c07749c0](https://github.com/loads/ardere/commit/c07749c06a97bba50fe1701a2896d9b5a11dd18e))
39 |
40 | #### Doc
41 |
42 | * update for use of cloud formation in setup (#2) ([243a4a11](https://github.com/loads/ardere/commit/243a4a11da3343735815dd42a0c78bb6936adf56))
43 | * initial design docs from autoconf ([eead6dd8](https://github.com/loads/ardere/commit/eead6dd80a43c24b40047fc5c22571122878ce05))
44 |
45 | #### Bug Fixes
46 |
47 | * check service drained vs container draining ([fd4907e1](https://github.com/loads/ardere/commit/fd4907e10be9103cc9e20511c2e16c4ae906e469), closes [#62](https://github.com/loads/ardere/issues/62))
48 | * Do not check 'metrics' instance for draining ([40e8cd01](https://github.com/loads/ardere/commit/40e8cd01fc996f1596370c5ddb6ff6998b04ffdc))
49 | * Ensure all containers drained before exiting ([4cbea2fd](https://github.com/loads/ardere/commit/4cbea2fd0a280993d4312f82ba52354a0bf15f7f))
50 | * add proper tagging and socket limits ([15dc023e](https://github.com/loads/ardere/commit/15dc023efc91a0b3b644084a71f3f6f46be77158), closes [#44](https://github.com/loads/ardere/issues/44))
51 |
52 |
53 |
54 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contribution Guidelines
2 |
3 | Anyone is welcome to contribute to this project. Feel free to get in touch with
4 | other community members on IRC, the mailing list or through issues here on
5 | GitHub.
6 |
7 | [See the README](/README.md) for contact information.
8 |
9 | ## Bug Reports
10 |
11 | You can file issues here on GitHub. Please try to include as much information as
12 | you can and under what conditions you saw the issue.
13 |
14 | ## Sending Pull Requests
15 |
16 | Patches should be submitted as pull requests (PR).
17 |
18 | Before submitting a PR:
19 | - Your code must run and pass all the automated tests before you submit your PR
20 | for review. "Work in progress" pull requests are allowed to be submitted, but
21 | should be clearly labeled as such and should not be merged until all tests
22 | pass and the code has been reviewed.
23 | - Your patch should include new tests that cover your changes. It is your and
24 | your reviewer's responsibility to ensure your patch includes adequate tests.
25 |
26 | When submitting a PR:
27 | - You agree to license your code under the project's open source license
28 | ([MPL 2.0](/LICENSE)).
29 | - Base your branch off the current `master` (see below for an example workflow).
30 | - Add both your code and new tests if relevant.
31 | - Run the test suite to make sure your code passes linting and tests.
32 | - Please do not include merge commits in pull requests; include only commits with the new relevant code.
33 |
34 | See the main [README.md](/README.md) for information on prerequisites, installing, running and testing.
35 |
36 | ## Code Review
37 |
38 | This project is production Mozilla code and subject to our [engineering practices and quality standards](https://developer.mozilla.org/en-US/docs/Mozilla/Developer_guide/Committing_Rules_and_Responsibilities). Every patch must be peer reviewed.
39 |
40 | ## Git Commit Guidelines
41 |
42 | We loosely follow the [Angular commit guidelines](https://github.com/angular/angular.js/blob/master/CONTRIBUTING.md#type) of `<type>(<scope>): <subject>` where `type` must be one of:
43 |
44 | * **feat**: A new feature
45 | * **fix**: A bug fix
46 | * **docs**: Documentation only changes
47 | * **style**: Changes that do not affect the meaning of the code (white-space, formatting, missing
48 | semi-colons, etc)
49 | * **refactor**: A code change that neither fixes a bug nor adds a feature
50 | * **perf**: A code change that improves performance
51 | * **test**: Adding missing tests
52 | * **chore**: Changes to the build process or auxiliary tools and libraries such as documentation
53 | generation
54 |
55 | ### Scope
56 | The scope could be anything specifying place of the commit change.
57 |
58 | ### Subject
59 | The subject contains a succinct description of the change:
60 |
61 | * use the imperative, present tense: "change" not "changed" nor "changes"
62 | * don't capitalize first letter
63 | * no dot (.) at the end
64 |
65 | ### Body
66 | In order to maintain a reference to the context of the commit, add
67 | `fixes #<issue number>` if it closes a related issue or `issue #<issue number>`
68 | if it's a partial fix.
69 |
70 | You can also write a detailed description of the commit: Just as in the
71 | **subject**, use the imperative, present tense: "change" not "changed" nor
72 | "changes". It should include the motivation for the change and contrast this with
73 | previous behavior.
74 |
75 | ### Footer
76 | The footer should contain any information about **Breaking Changes** and is also
77 | the place to reference GitHub issues that this commit **Closes**.
78 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Mozilla Public License Version 2.0
2 | ==================================
3 |
4 | 1. Definitions
5 | --------------
6 |
7 | 1.1. "Contributor"
8 | means each individual or legal entity that creates, contributes to
9 | the creation of, or owns Covered Software.
10 |
11 | 1.2. "Contributor Version"
12 | means the combination of the Contributions of others (if any) used
13 | by a Contributor and that particular Contributor's Contribution.
14 |
15 | 1.3. "Contribution"
16 | means Covered Software of a particular Contributor.
17 |
18 | 1.4. "Covered Software"
19 | means Source Code Form to which the initial Contributor has attached
20 | the notice in Exhibit A, the Executable Form of such Source Code
21 | Form, and Modifications of such Source Code Form, in each case
22 | including portions thereof.
23 |
24 | 1.5. "Incompatible With Secondary Licenses"
25 | means
26 |
27 | (a) that the initial Contributor has attached the notice described
28 | in Exhibit B to the Covered Software; or
29 |
30 | (b) that the Covered Software was made available under the terms of
31 | version 1.1 or earlier of the License, but not also under the
32 | terms of a Secondary License.
33 |
34 | 1.6. "Executable Form"
35 | means any form of the work other than Source Code Form.
36 |
37 | 1.7. "Larger Work"
38 | means a work that combines Covered Software with other material, in
39 | a separate file or files, that is not Covered Software.
40 |
41 | 1.8. "License"
42 | means this document.
43 |
44 | 1.9. "Licensable"
45 | means having the right to grant, to the maximum extent possible,
46 | whether at the time of the initial grant or subsequently, any and
47 | all of the rights conveyed by this License.
48 |
49 | 1.10. "Modifications"
50 | means any of the following:
51 |
52 | (a) any file in Source Code Form that results from an addition to,
53 | deletion from, or modification of the contents of Covered
54 | Software; or
55 |
56 | (b) any new file in Source Code Form that contains any Covered
57 | Software.
58 |
59 | 1.11. "Patent Claims" of a Contributor
60 | means any patent claim(s), including without limitation, method,
61 | process, and apparatus claims, in any patent Licensable by such
62 | Contributor that would be infringed, but for the grant of the
63 | License, by the making, using, selling, offering for sale, having
64 | made, import, or transfer of either its Contributions or its
65 | Contributor Version.
66 |
67 | 1.12. "Secondary License"
68 | means either the GNU General Public License, Version 2.0, the GNU
69 | Lesser General Public License, Version 2.1, the GNU Affero General
70 | Public License, Version 3.0, or any later versions of those
71 | licenses.
72 |
73 | 1.13. "Source Code Form"
74 | means the form of the work preferred for making modifications.
75 |
76 | 1.14. "You" (or "Your")
77 | means an individual or a legal entity exercising rights under this
78 | License. For legal entities, "You" includes any entity that
79 | controls, is controlled by, or is under common control with You. For
80 | purposes of this definition, "control" means (a) the power, direct
81 | or indirect, to cause the direction or management of such entity,
82 | whether by contract or otherwise, or (b) ownership of more than
83 | fifty percent (50%) of the outstanding shares or beneficial
84 | ownership of such entity.
85 |
86 | 2. License Grants and Conditions
87 | --------------------------------
88 |
89 | 2.1. Grants
90 |
91 | Each Contributor hereby grants You a world-wide, royalty-free,
92 | non-exclusive license:
93 |
94 | (a) under intellectual property rights (other than patent or trademark)
95 | Licensable by such Contributor to use, reproduce, make available,
96 | modify, display, perform, distribute, and otherwise exploit its
97 | Contributions, either on an unmodified basis, with Modifications, or
98 | as part of a Larger Work; and
99 |
100 | (b) under Patent Claims of such Contributor to make, use, sell, offer
101 | for sale, have made, import, and otherwise transfer either its
102 | Contributions or its Contributor Version.
103 |
104 | 2.2. Effective Date
105 |
106 | The licenses granted in Section 2.1 with respect to any Contribution
107 | become effective for each Contribution on the date the Contributor first
108 | distributes such Contribution.
109 |
110 | 2.3. Limitations on Grant Scope
111 |
112 | The licenses granted in this Section 2 are the only rights granted under
113 | this License. No additional rights or licenses will be implied from the
114 | distribution or licensing of Covered Software under this License.
115 | Notwithstanding Section 2.1(b) above, no patent license is granted by a
116 | Contributor:
117 |
118 | (a) for any code that a Contributor has removed from Covered Software;
119 | or
120 |
121 | (b) for infringements caused by: (i) Your and any other third party's
122 | modifications of Covered Software, or (ii) the combination of its
123 | Contributions with other software (except as part of its Contributor
124 | Version); or
125 |
126 | (c) under Patent Claims infringed by Covered Software in the absence of
127 | its Contributions.
128 |
129 | This License does not grant any rights in the trademarks, service marks,
130 | or logos of any Contributor (except as may be necessary to comply with
131 | the notice requirements in Section 3.4).
132 |
133 | 2.4. Subsequent Licenses
134 |
135 | No Contributor makes additional grants as a result of Your choice to
136 | distribute the Covered Software under a subsequent version of this
137 | License (see Section 10.2) or under the terms of a Secondary License (if
138 | permitted under the terms of Section 3.3).
139 |
140 | 2.5. Representation
141 |
142 | Each Contributor represents that the Contributor believes its
143 | Contributions are its original creation(s) or it has sufficient rights
144 | to grant the rights to its Contributions conveyed by this License.
145 |
146 | 2.6. Fair Use
147 |
148 | This License is not intended to limit any rights You have under
149 | applicable copyright doctrines of fair use, fair dealing, or other
150 | equivalents.
151 |
152 | 2.7. Conditions
153 |
154 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
155 | in Section 2.1.
156 |
157 | 3. Responsibilities
158 | -------------------
159 |
160 | 3.1. Distribution of Source Form
161 |
162 | All distribution of Covered Software in Source Code Form, including any
163 | Modifications that You create or to which You contribute, must be under
164 | the terms of this License. You must inform recipients that the Source
165 | Code Form of the Covered Software is governed by the terms of this
166 | License, and how they can obtain a copy of this License. You may not
167 | attempt to alter or restrict the recipients' rights in the Source Code
168 | Form.
169 |
170 | 3.2. Distribution of Executable Form
171 |
172 | If You distribute Covered Software in Executable Form then:
173 |
174 | (a) such Covered Software must also be made available in Source Code
175 | Form, as described in Section 3.1, and You must inform recipients of
176 | the Executable Form how they can obtain a copy of such Source Code
177 | Form by reasonable means in a timely manner, at a charge no more
178 | than the cost of distribution to the recipient; and
179 |
180 | (b) You may distribute such Executable Form under the terms of this
181 | License, or sublicense it under different terms, provided that the
182 | license for the Executable Form does not attempt to limit or alter
183 | the recipients' rights in the Source Code Form under this License.
184 |
185 | 3.3. Distribution of a Larger Work
186 |
187 | You may create and distribute a Larger Work under terms of Your choice,
188 | provided that You also comply with the requirements of this License for
189 | the Covered Software. If the Larger Work is a combination of Covered
190 | Software with a work governed by one or more Secondary Licenses, and the
191 | Covered Software is not Incompatible With Secondary Licenses, this
192 | License permits You to additionally distribute such Covered Software
193 | under the terms of such Secondary License(s), so that the recipient of
194 | the Larger Work may, at their option, further distribute the Covered
195 | Software under the terms of either this License or such Secondary
196 | License(s).
197 |
198 | 3.4. Notices
199 |
200 | You may not remove or alter the substance of any license notices
201 | (including copyright notices, patent notices, disclaimers of warranty,
202 | or limitations of liability) contained within the Source Code Form of
203 | the Covered Software, except that You may alter any license notices to
204 | the extent required to remedy known factual inaccuracies.
205 |
206 | 3.5. Application of Additional Terms
207 |
208 | You may choose to offer, and to charge a fee for, warranty, support,
209 | indemnity or liability obligations to one or more recipients of Covered
210 | Software. However, You may do so only on Your own behalf, and not on
211 | behalf of any Contributor. You must make it absolutely clear that any
212 | such warranty, support, indemnity, or liability obligation is offered by
213 | You alone, and You hereby agree to indemnify every Contributor for any
214 | liability incurred by such Contributor as a result of warranty, support,
215 | indemnity or liability terms You offer. You may include additional
216 | disclaimers of warranty and limitations of liability specific to any
217 | jurisdiction.
218 |
219 | 4. Inability to Comply Due to Statute or Regulation
220 | ---------------------------------------------------
221 |
222 | If it is impossible for You to comply with any of the terms of this
223 | License with respect to some or all of the Covered Software due to
224 | statute, judicial order, or regulation then You must: (a) comply with
225 | the terms of this License to the maximum extent possible; and (b)
226 | describe the limitations and the code they affect. Such description must
227 | be placed in a text file included with all distributions of the Covered
228 | Software under this License. Except to the extent prohibited by statute
229 | or regulation, such description must be sufficiently detailed for a
230 | recipient of ordinary skill to be able to understand it.
231 |
232 | 5. Termination
233 | --------------
234 |
235 | 5.1. The rights granted under this License will terminate automatically
236 | if You fail to comply with any of its terms. However, if You become
237 | compliant, then the rights granted under this License from a particular
238 | Contributor are reinstated (a) provisionally, unless and until such
239 | Contributor explicitly and finally terminates Your grants, and (b) on an
240 | ongoing basis, if such Contributor fails to notify You of the
241 | non-compliance by some reasonable means prior to 60 days after You have
242 | come back into compliance. Moreover, Your grants from a particular
243 | Contributor are reinstated on an ongoing basis if such Contributor
244 | notifies You of the non-compliance by some reasonable means, this is the
245 | first time You have received notice of non-compliance with this License
246 | from such Contributor, and You become compliant prior to 30 days after
247 | Your receipt of the notice.
248 |
249 | 5.2. If You initiate litigation against any entity by asserting a patent
250 | infringement claim (excluding declaratory judgment actions,
251 | counter-claims, and cross-claims) alleging that a Contributor Version
252 | directly or indirectly infringes any patent, then the rights granted to
253 | You by any and all Contributors for the Covered Software under Section
254 | 2.1 of this License shall terminate.
255 |
256 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all
257 | end user license agreements (excluding distributors and resellers) which
258 | have been validly granted by You or Your distributors under this License
259 | prior to termination shall survive termination.
260 |
261 | ************************************************************************
262 | * *
263 | * 6. Disclaimer of Warranty *
264 | * ------------------------- *
265 | * *
266 | * Covered Software is provided under this License on an "as is" *
267 | * basis, without warranty of any kind, either expressed, implied, or *
268 | * statutory, including, without limitation, warranties that the *
269 | * Covered Software is free of defects, merchantable, fit for a *
270 | * particular purpose or non-infringing. The entire risk as to the *
271 | * quality and performance of the Covered Software is with You. *
272 | * Should any Covered Software prove defective in any respect, You *
273 | * (not any Contributor) assume the cost of any necessary servicing, *
274 | * repair, or correction. This disclaimer of warranty constitutes an *
275 | * essential part of this License. No use of any Covered Software is *
276 | * authorized under this License except under this disclaimer. *
277 | * *
278 | ************************************************************************
279 |
280 | ************************************************************************
281 | * *
282 | * 7. Limitation of Liability *
283 | * -------------------------- *
284 | * *
285 | * Under no circumstances and under no legal theory, whether tort *
286 | * (including negligence), contract, or otherwise, shall any *
287 | * Contributor, or anyone who distributes Covered Software as *
288 | * permitted above, be liable to You for any direct, indirect, *
289 | * special, incidental, or consequential damages of any character *
290 | * including, without limitation, damages for lost profits, loss of *
291 | * goodwill, work stoppage, computer failure or malfunction, or any *
292 | * and all other commercial damages or losses, even if such party *
293 | * shall have been informed of the possibility of such damages. This *
294 | * limitation of liability shall not apply to liability for death or *
295 | * personal injury resulting from such party's negligence to the *
296 | * extent applicable law prohibits such limitation. Some *
297 | * jurisdictions do not allow the exclusion or limitation of *
298 | * incidental or consequential damages, so this exclusion and *
299 | * limitation may not apply to You. *
300 | * *
301 | ************************************************************************
302 |
303 | 8. Litigation
304 | -------------
305 |
306 | Any litigation relating to this License may be brought only in the
307 | courts of a jurisdiction where the defendant maintains its principal
308 | place of business and such litigation shall be governed by laws of that
309 | jurisdiction, without reference to its conflict-of-law provisions.
310 | Nothing in this Section shall prevent a party's ability to bring
311 | cross-claims or counter-claims.
312 |
313 | 9. Miscellaneous
314 | ----------------
315 |
316 | This License represents the complete agreement concerning the subject
317 | matter hereof. If any provision of this License is held to be
318 | unenforceable, such provision shall be reformed only to the extent
319 | necessary to make it enforceable. Any law or regulation which provides
320 | that the language of a contract shall be construed against the drafter
321 | shall not be used to construe this License against a Contributor.
322 |
323 | 10. Versions of the License
324 | ---------------------------
325 |
326 | 10.1. New Versions
327 |
328 | Mozilla Foundation is the license steward. Except as provided in Section
329 | 10.3, no one other than the license steward has the right to modify or
330 | publish new versions of this License. Each version will be given a
331 | distinguishing version number.
332 |
333 | 10.2. Effect of New Versions
334 |
335 | You may distribute the Covered Software under the terms of the version
336 | of the License under which You originally received the Covered Software,
337 | or under the terms of any subsequent version published by the license
338 | steward.
339 |
340 | 10.3. Modified Versions
341 |
342 | If you create software not governed by this License, and you want to
343 | create a new license for such software, you may create and use a
344 | modified version of this License if you rename the license and remove
345 | any references to the name of the license steward (except to note that
346 | such modified license differs from this License).
347 |
348 | 10.4. Distributing Source Code Form that is Incompatible With Secondary
349 | Licenses
350 |
351 | If You choose to distribute Source Code Form that is Incompatible With
352 | Secondary Licenses under the terms of this version of the License, the
353 | notice described in Exhibit B of this License must be attached.
354 |
355 | Exhibit A - Source Code Form License Notice
356 | -------------------------------------------
357 |
358 | This Source Code Form is subject to the terms of the Mozilla Public
359 | License, v. 2.0. If a copy of the MPL was not distributed with this
360 | file, You can obtain one at http://mozilla.org/MPL/2.0/.
361 |
362 | If it is not possible or desirable to put the notice in a particular
363 | file, then You may include the notice in a location (such as a LICENSE
364 | file in a relevant directory) where a recipient would be likely to look
365 | for such a notice.
366 |
367 | You may add additional accurate notices of copyright ownership.
368 |
369 | Exhibit B - "Incompatible With Secondary Licenses" Notice
370 | ---------------------------------------------------------
371 |
372 | This Source Code Form is "Incompatible With Secondary Licenses", as
373 | defined by the Mozilla Public License, v. 2.0.
374 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ardere
2 | *AWS Serverless Service for Load-Testing*
3 |
4 | ardere runs as a serverless service using AWS to orchestrate
5 | load-tests consisting of docker container configurations arranged as
6 | test plans.
7 |
8 | ## Installation
9 |
10 | Pre-requisite:
11 | installation requires node > v6
12 |
13 | To deploy ardere to your AWS account, you will need a fairly recent
14 | install of Node, then install the Node packages required:
15 |
16 | $ npm install
17 |
18 | You will need to ensure your have AWS access and secret keys configured
19 | for serverless:
20 |
21 | $ sls config
22 |
23 | To deploy the ardere lambdas and required AWS stack:
24 |
25 | $ sls deploy
26 |
27 | Then you can deploy the ardere Step Function:
28 |
29 | $ sls deploy stepf
30 |
31 |
32 | ## Developing
33 |
34 | ardere is written in Python and deployed via serverless to AWS. To an
35 | extent testing it on AWS is the most reliable indicator it works as
36 | intended. However, there are sets of tests that ensure the Python code
37 | is valid and works with arguments as intended that may be run locally.
38 |
39 | Create a Python virtualenv, and install the test requirements:
40 |
41 | $ virtualenv ardenv
42 | $ source ardenv/bin/activate
43 | $ pip install -r test-requirements.txt
44 |
45 | The tests can now be run with nose:
46 |
47 | $ nosetests
48 |
49 | Note that **you cannot run the sls deploy while the virtualenv is active**
50 | due to how the serverless Python requirements plugin operates.
51 |
52 | ## Run Test
53 |
54 | 1. Login to AWS console
55 | (mozilla-services use: stage)
56 | 2. Go to Step Functions > Dashboard
57 | 3. Select your state machine
58 | (mozilla-services use: "ardere-dev-ardere")
59 | 4. Click on "New Execution" button
60 | 5. Paste your json config into text area
61 | (example: [**mozilla-services/screenshots-loadtests** /ardere.json](https://github.com/mozilla-services/screenshots-loadtests/blob/master/ardere.json))
62 | 6. Optional: Assign a name to your execution
63 | 7. Click on "Start Execution"
64 | 8. Monitor execution in Dashboard
65 | 9. Test load should be visible in DataDog, NewRelic, etc.
66 |
67 | ## Monitoring
68 |
69 | ### Metrics Node Monitoring (Grafana)
70 |
1. `ssh -L 3000:<metrics-node-ip>:3000 <ssh-host>` (forward the Grafana port from the metrics node)
72 | 2. open local browser to http://localhost:3000
73 | 3. login using credentials specified in your ardere (JSON) config file
74 |
--------------------------------------------------------------------------------
/ardere/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = '0.1.1' # pragma: nocover
2 |
--------------------------------------------------------------------------------
/ardere/aws.py:
--------------------------------------------------------------------------------
1 | """AWS Helper Classes"""
2 | import logging
3 | import os
4 | import time
5 | import uuid
6 | from collections import defaultdict
7 |
8 | import boto3
9 | import botocore
10 | from concurrent.futures import ThreadPoolExecutor
11 | from typing import Any, Dict, List, Optional, Tuple # noqa
12 |
# Root logger at INFO so messages show up in Lambda/CloudWatch logs
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Setup script paths
# These assets ship alongside the package and are injected into containers
# at run time via environment variables (see ECSManager below).
dir_path = os.path.dirname(os.path.realpath(__file__))
parent_dir_path = os.path.dirname(dir_path)
wait_script_path = os.path.join(parent_dir_path, "src", "shell",
                                "waitforcluster.sh")
telegraf_script_path = os.path.join(parent_dir_path, "src", "shell",
                                    "telegraf.toml")
metric_create_script = os.path.join(parent_dir_path, "ardere", "scripts",
                                    "metric_creator.py")
25 |
# EC2 userdata to setup values on load
# Settings for net.ipv4 settings based on:
# http://stackoverflow.com/questions/410616/increasing-the-maximum-number-of-tcp-ip-connections-in-linux
# Other settings are from operations on kernel tweaks they've done to handle
# large socket conditions.
# NOTE: run by cloud-init at instance boot; {ecs_name} is filled in by
# ECSManager.request_instances() below.
EC2_USER_DATA = """#!/bin/bash
echo ECS_CLUSTER='{ecs_name}' >> /etc/ecs/ecs.config
sysctl net.core.rmem_default=8388608
sysctl net.core.rmem_max=16777216
sysctl net.core.wmem_max=16777216
sysctl net.core.netdev_max_backlog=2500
sysctl net.core.somaxconn=3240000
sysctl net.netfilter.nf_conntrack_tcp_timeout_established=600
sysctl net.nf_conntrack_max=1000000
sysctl net.ipv4.ip_local_port_range="1024 65535"
sysctl net.ipv4.netfilter.ip_conntrack_max=4999999
sysctl net.ipv4.netfilter.ip_conntrack_tcp_timeout_time_wait=1
sysctl net.ipv4.netfilter.ip_conntrack_tcp_timeout_established=54000
sysctl net.ipv4.tcp_fin_timeout=5
sysctl net.ipv4.tcp_keepalive_time=30
sysctl net.ipv4.tcp_keepalive_intvl=15
sysctl net.ipv4.tcp_keepalive_probes=6
sysctl net.ipv4.tcp_window_scaling=1
sysctl net.ipv4.tcp_rmem="4096 87380 16777216"
sysctl net.ipv4.tcp_wmem="4096 65536 16777216"
sysctl net.ipv4.tcp_mem="786432 1048576 26777216"
sysctl net.ipv4.tcp_max_tw_buckets=360000
sysctl net.ipv4.tcp_max_syn_backlog=3240000
sysctl net.ipv4.tcp_max_tw_buckets=1440000
sysctl net.ipv4.tcp_slow_start_after_idle=0
sysctl net.ipv4.tcp_retries2=5
sysctl net.ipv4.tcp_tw_recycle=1
sysctl net.ipv4.tcp_tw_reuse=1
sysctl vm.min_free_kbytes=65536
sysctl -w fs.file-max=1000000
ulimit -n 1000000
"""
63 |
# List tracking vcpu's of all instance types for cpu unit reservations
# We are intentionally leaving out the following instance types as they're
# considered overkill for load-testing purposes or any instance req's we have
# experienced so far:
# P2, G2, F1, I3, D2
ec2_type_by_vcpu = {
    1: ["t2.nano", "t2.micro", "t2.small", "m3.medium"],
    2: ["t2.medium", "t2.large", "m3.large", "m4.large", "c3.large",
        "c4.large", "r3.large", "r4.large"],
    4: ["t2.xlarge", "m3.xlarge", "m4.xlarge", "c3.xlarge", "c4.xlarge",
        "r3.xlarge", "r4.xlarge"],
    8: ["t2.2xlarge", "m3.2xlarge", "m4.2xlarge", "c3.2xlarge", "c4.2xlarge",
        "r3.2xlarge", "r4.2xlarge"],
    16: ["m4.4xlarge", "c3.4xlarge", "c4.4xlarge", "r3.4xlarge", "r4.4xlarge"],
    32: ["c3.8xlarge", "r3.8xlarge", "r4.8xlarge"],
    36: ["c4.8xlarge"],
    40: ["m4.10xlarge"],
    64: ["m4.16xlarge", "x1.16xlarge", "r4.16xlarge"],
    128: ["x1.32xlarge"]
}

# Inverted index of vcpu count keyed by instance type. A dict comprehension
# avoids leaking loop variables into the module namespace (the original
# for-loop left vcpu/instance_types/instance_type defined at module scope).
ec2_vcpu_by_type = {
    instance_type: vcpu
    for vcpu, instance_types in ec2_type_by_vcpu.items()
    for instance_type in instance_types
}


def cpu_units_for_instance_type(instance_type):
    # type: (str) -> int
    """Calculate how many CPU units to allocate for an instance_type

    We calculate cpu_units as 1024 * vcpu's for each instance to allocate
    almost the entirety of the instance's cpu units to the load-testing
    container. We take out 512 to ensure some leftover capacity for other
    utility containers we run with the load-testing container.

    :raises KeyError: for instance types not listed in ``ec2_type_by_vcpu``.

    """
    return (ec2_vcpu_by_type[instance_type] * 1024) - 512
103 |
104 |
class ECSManager(object):
    """ECS Manager queries and manages an ECS cluster"""
    # Class attribute so tests can substitute a stubbed boto3 module
    boto = boto3

    # ECS optimized AMI id's
    ecs_ami_ids = {
        "us-east-1": "ami-275ffe31",
        "us-east-2": "ami-62745007",
        "us-west-1": "ami-689bc208",
        "us-west-2": "ami-62d35c02"
    }

    # Container images used for metrics collection and reporting
    influxdb_container = "influxdb:1.2-alpine"
    telegraf_container = "telegraf:1.2-alpine"
    grafana_container = "grafana/grafana:4.1.2"
    python_container = "jfloff/alpine-python:2.7-slim"

    # Lazily-read file contents, cached at the class level on first access
    # (see the wait_script/telegraf_script/metric_create_script properties)
    _wait_script = None
    _telegraf_script = None
    _metric_create_script = None

    def __init__(self, plan):
        # type: (Dict[str, Any]) -> None
        """Create and return a ECSManager for a cluster of the given name.

        :param plan: Test plan dict; must contain "ecs_name". A
            "plan_run_uuid" is generated and stored into the plan if not
            already present.
        :raises KeyError: if the s3_ready_bucket, container_log_group, or
            ecs_profile environment variables are unset.

        """
        self._ecs_client = self.boto.client('ecs')
        self._ec2_client = self.boto.client('ec2')
        self._ecs_name = plan["ecs_name"]
        self._plan = plan

        # Pull out the env vars
        self.s3_ready_bucket = os.environ["s3_ready_bucket"]
        self.container_log_group = os.environ["container_log_group"]
        self.ecs_profile = os.environ["ecs_profile"]

        # Unique id per plan run; used to namespace task families, log
        # stream prefixes, the influx database name, and startedBy tags
        if "plan_run_uuid" not in plan:
            plan["plan_run_uuid"] = uuid.uuid4().hex

        self._plan_uuid = plan["plan_run_uuid"]

    @property
    def wait_script(self):
        # type: () -> str
        """Contents of waitforcluster.sh, read on first access and cached."""
        if not self._wait_script:
            with open(wait_script_path, 'r') as f:
                self._wait_script = f.read()
        return self._wait_script

    @property
    def telegraf_script(self):
        # type: () -> str
        """Contents of telegraf.toml, read on first access and cached."""
        if not self._telegraf_script:
            with open(telegraf_script_path, 'r') as f:
                self._telegraf_script = f.read()
        return self._telegraf_script

    @property
    def metric_create_script(self):
        # type: () -> str
        """Contents of metric_creator.py, read on first access and cached."""
        if not self._metric_create_script:
            with open(metric_create_script, 'r') as f:
                self._metric_create_script = f.read()
        return self._metric_create_script

    @property
    def plan_uuid(self):
        # type: () -> str
        """Unique id of this plan run."""
        return self._plan_uuid

    @property
    def s3_ready_file(self):
        # type: () -> str
        """S3 URL of the ready-file that step containers poll before
        starting their load (consumed by waitforcluster.sh)."""
        return "https://s3.amazonaws.com/{bucket}/{key}".format(
            bucket=self.s3_ready_bucket,
            key="{}.ready".format(self._plan_uuid)
        )

    @property
    def log_config(self):
        # type: () -> Dict[str, Any]
        """awslogs logConfiguration shared by every container definition.

        NOTE(review): awslogs-region is hard-coded to us-east-1 -- confirm
        whether it should track the deployment region.

        """
        return {
            "logDriver": "awslogs",
            "options": {"awslogs-group": self.container_log_group,
                        "awslogs-region": "us-east-1",
                        "awslogs-stream-prefix":
                            "ardere-{}".format(self.plan_uuid)
                        }
        }

    @property
    def influx_db_name(self):
        # type: () -> str
        """Per-run InfluxDB database name."""
        return "run-{}".format(self.plan_uuid)

    @property
    def grafana_admin_user(self):
        # type: () -> str
        return self._plan["metrics_options"]["dashboard"]["admin_user"]

    @property
    def grafana_admin_password(self):
        # type: () -> str
        return self._plan["metrics_options"]["dashboard"]["admin_password"]

    def family_name(self, step):
        # type: (Dict[str, Any]) -> str
        """Generate a consistent family name for a given step"""
        return step["name"] + "-" + self._plan_uuid

    def metrics_family_name(self):
        # type: () -> str
        """Generate a consistent metrics family name

        Keyed on the cluster name rather than the run uuid, since the
        metrics service may outlive a single run when tear_down is False
        (see shutdown_plan).

        """
        return "{}-metrics".format(self._ecs_name)

    def metrics_setup_family_name(self):
        # type: () -> str
        """Generate a consistent metric setup family name"""
        return "{}-metrics-setup".format(self._ecs_name)

    def query_active_instances(self, additional_tags=None):
        # type: (Optional[Dict[str, str]]) -> Dict[str, int]
        """Query EC2 for all the instances owned by ardere for this cluster.

        :returns: dict mapping instance type to the count of pending or
            running instances that carry all of the ardere tags (plus any
            additional_tags; tag filters are AND'd together).

        """
        instance_dict = defaultdict(int)
        paginator = self._ec2_client.get_paginator('describe_instances')
        filters = {"Owner": "ardere", "ECSCluster": self._ecs_name}
        if additional_tags:
            filters.update(additional_tags)
        response_iterator = paginator.paginate(
            Filters=[
                {
                    "Name": "tag:{}".format(tag_name),
                    "Values": [tag_value]
                } for tag_name, tag_value in filters.items()
            ]
        )
        for page in response_iterator:
            for reservation in page["Reservations"]:
                for instance in reservation["Instances"]:
                    # Determine if the instance is pending/running and count
                    # 0 = Pending, 16 = Running, > is all shutting down, etc.
                    if instance["State"]["Code"] <= 16:
                        instance_dict[instance["InstanceType"]] += 1
        return instance_dict

    def calculate_missing_instances(self, desired, current):
        # type: (Dict[str, int], Dict[str, int]) -> Dict[str, int]
        """Determine how many of what instance types are needed to ensure
        the current instance dict has all the desired instance count/types."""
        needed = {}
        for instance_type, instance_count in desired.items():
            cur = current.get(instance_type, 0)
            if cur < instance_count:
                needed[instance_type] = instance_count - cur
        return needed

    def has_metrics_node(self, instance_type):
        # type: (str) -> bool
        """Return whether a metrics node with this instance type exists"""
        instances = self.query_active_instances(
            additional_tags=dict(Role="metrics")
        )
        return instance_type in instances

    def has_started_metric_creation(self):
        # type: () -> bool
        """Return whether the metric creation container was started

        The metric creation task is tagged startedBy=plan_uuid when it is
        launched (see run_metric_creation_task), which is how we find it.

        """
        response = self._ecs_client.list_tasks(
            cluster=self._ecs_name,
            startedBy=self.plan_uuid
        )
        return bool(response["taskArns"])

    def has_finished_metric_creation(self):
        # type: () -> bool
        """Return whether the metric creation container has finished"""
        response = self._ecs_client.list_tasks(
            cluster=self._ecs_name,
            startedBy=self.plan_uuid,
            desiredStatus="STOPPED"
        )
        return bool(response["taskArns"])

    def request_instances(self, instances, security_group_ids,
                          additional_tags=None):
        # type: (Dict[str, int], List[str], Optional[Dict[str, str]]) -> None
        """Create requested types/quantities of instances for this cluster

        NOTE(review): the AMI used is always the us-east-1 ECS-optimized
        image even though ecs_ami_ids lists other regions -- confirm this
        is intentional.

        :param instances: dict mapping instance type to desired count.
        :param security_group_ids: security groups to attach.
        :param additional_tags: extra tags merged over the ardere defaults.

        """
        ami_id = self.ecs_ami_ids["us-east-1"]
        tags = dict(Name=self._ecs_name, Owner="ardere",
                    ECSCluster=self._ecs_name)
        if additional_tags:
            tags.update(additional_tags)
        for instance_type, instance_count in instances.items():
            self._ec2_client.run_instances(
                ImageId=ami_id,
                MinCount=instance_count,
                MaxCount=instance_count,
                InstanceType=instance_type,
                UserData=EC2_USER_DATA.format(ecs_name=self._ecs_name),
                IamInstanceProfile={"Arn": self.ecs_profile},
                SecurityGroupIds=security_group_ids,
                TagSpecifications=[
                    {
                        "ResourceType": "instance",
                        "Tags": [
                            dict(Key=tag_name, Value=tag_value)
                            for tag_name, tag_value in tags.items()
                        ]
                    }
                ]
            )

    def locate_metrics_container_ip(self):
        # type: () -> Tuple[Optional[str], Optional[str]]
        """Locates the metrics container IP and container instance arn

        Returns a tuple of (private_ip, container_arn), or (None, None)
        when no container instance is running the metrics service.

        """
        response = self._ecs_client.list_container_instances(
            cluster=self._ecs_name,
            filter="task:group == service:metrics"
        )
        if not response["containerInstanceArns"]:
            return None, None

        container_arn = response["containerInstanceArns"][0]
        response = self._ecs_client.describe_container_instances(
            cluster=self._ecs_name,
            containerInstances=[container_arn]
        )

        container_instance = response["containerInstances"][0]
        ec2_instance_id = container_instance["ec2InstanceId"]
        instance = self.boto.resource("ec2").Instance(ec2_instance_id)
        return instance.private_ip_address, container_arn

    def locate_metrics_service(self):
        # type: () -> Optional[Dict[str, Any]]
        """Locate and return the metrics service description if it is
        ACTIVE, otherwise None.

        Note: returns the full service dict (callers read "serviceArn"
        from it), not just the arn.

        """
        response = self._ecs_client.describe_services(
            cluster=self._ecs_name,
            services=["metrics"]
        )
        if response["services"] and response["services"][0]["status"] == \
                "ACTIVE":
            return response["services"][0]
        else:
            return None

    def create_metrics_service(self, options):
        # type: (Dict[str, Any]) -> Dict[str, Any]
        """Creates an ECS service to run InfluxDB and Grafana for metric
        reporting and returns its info

        Registers two task definitions -- the one-shot metric-setup task
        (launched later by run_metric_creation_task) and the metrics task
        (influxdb + grafana) -- then creates the "metrics" service.

        :param options: metrics options; must include "instance_type".
        :returns: dict with the new task_arn and service_arn.

        """
        logger.info("Creating InfluxDB service with options: {}".format(
            options))

        # Shell command configuring grafana through GF_* env vars before
        # handing off to its normal entry point
        cmd = """\
export GF_DEFAULT_INSTANCE_NAME=`wget -qO- http://169.254.169.254/latest/meta-data/instance-id` && \
export GF_SECURITY_ADMIN_USER=%s && \
export GF_SECURITY_ADMIN_PASSWORD=%s && \
export GF_USERS_ALLOW_SIGN_UP=false && \
mkdir "${GF_DASHBOARDS_JSON_PATH}" && \
./run.sh
        """ % (self.grafana_admin_user, self.grafana_admin_password)  # noqa
        cmd = ['sh', '-c', '{}'.format(cmd)]

        gf_env = {
            "GF_DASHBOARDS_JSON_ENABLED": "true",
            "GF_DASHBOARDS_JSON_PATH": "/var/lib/grafana/dashboards",
            "__ARDERE_GRAFANA_URL__":
                "http://admin:admin@localhost:3000/api/datasources"
        }

        # Setup the task definition for setting up influxdb/grafana instances
        # per run
        mc_cmd = """\
pip install influxdb requests boto3 && \
echo "${__ARDERE_PYTHON_SCRIPT__}" > setup_db.py && \
python setup_db.py
        """
        mc_cmd = ['sh', '-c', '{}'.format(mc_cmd)]
        self._ecs_client.register_task_definition(
            family=self.metrics_setup_family_name(),
            containerDefinitions=[
                {
                    "name": "metricsetup",
                    "image": self.python_container,
                    "cpu": 128,
                    "entryPoint": mc_cmd,
                    "memoryReservation": 256,
                    "privileged": True,
                    "logConfiguration": self.log_config
                }
            ],
            networkMode="host"
        )

        task_response = self._ecs_client.register_task_definition(
            family=self.metrics_family_name(),
            containerDefinitions=[
                {
                    "name": "influxdb",
                    "image": self.influxdb_container,
                    # Give influxdb nearly the whole instance's cpu units
                    "cpu": cpu_units_for_instance_type(
                        options["instance_type"]),
                    "memoryReservation": 256,
                    "privileged": True,
                    "portMappings": [
                        {"containerPort": 8086},
                        {"containerPort": 8088}
                    ],
                    "logConfiguration": self.log_config
                },
                {
                    "name": "grafana",
                    "image": self.grafana_container,
                    "cpu": 256,
                    "memoryReservation": 256,
                    "entryPoint": cmd,
                    "portMappings": [
                        {"containerPort": 3000}
                    ],
                    "privileged": True,
                    "environment": [
                        {"name": key, "value": value} for key, value in
                        gf_env.items()
                    ],
                    "logConfiguration": self.log_config
                }
            ],
            # use host network mode for optimal performance
            networkMode="host",

            placementConstraints=[
                # Ensure the service is confined to the right instance type
                {
                    "type": "memberOf",
                    "expression": "attribute:ecs.instance-type == {}".format(
                        options["instance_type"]),
                }
            ],
        )
        task_arn = task_response["taskDefinition"]["taskDefinitionArn"]
        service_result = self._ecs_client.create_service(
            cluster=self._ecs_name,
            serviceName="metrics",
            taskDefinition=task_arn,
            desiredCount=1,
            deploymentConfiguration={
                "minimumHealthyPercent": 0,
                "maximumPercent": 100
            },
            placementConstraints=[
                {
                    "type": "distinctInstance"
                }
            ]
        )
        service_arn = service_result["service"]["serviceArn"]
        return dict(task_arn=task_arn, service_arn=service_arn)

    def run_metric_creation_task(self, container_instance, grafana_auth,
                                 dashboard=None,
                                 dashboard_name=None):
        # type: (str, Tuple[str, str], Optional[str], Optional[str]) -> None
        """Starts the metric creation task

        :param container_instance: arn of the container instance to run on.
        :param grafana_auth: (user, password) tuple for grafana.
        :param dashboard: optional S3 location of a dashboard file; when
            given, dashboard_name should be supplied as well.

        """
        env = {
            "__ARDERE_GRAFANA_USER__": grafana_auth[0],
            "__ARDERE_GRAFANA_PASS__": grafana_auth[1],
            "__ARDERE_PYTHON_SCRIPT__": self.metric_create_script,
            "__ARDERE_INFLUXDB_NAME__": self.influx_db_name
        }

        if dashboard:
            env["__ARDERE_DASHBOARD__"] = dashboard
            env["__ARDERE_DASHBOARD_NAME__"] = dashboard_name

        # startedBy is tagged with the plan uuid so that
        # has_started/has_finished_metric_creation can locate this task
        self._ecs_client.start_task(
            cluster=self._ecs_name,
            taskDefinition=self.metrics_setup_family_name(),
            overrides={
                'containerOverrides': [
                    {
                        "name": "metricsetup",
                        "environment": [
                            {"name": key, "value": value} for key, value in
                            env.items()
                        ]
                    }
                ]
            },
            containerInstances=[container_instance],
            startedBy=self.plan_uuid
        )

    def create_service(self, step):
        # type: (Dict[str, Any]) -> Dict[str, Any]
        """Creates an ECS service for a step and returns its info

        Mutates the step dict in place, adding "taskArn", "serviceArn" and
        "service_status" keys before returning it.

        """
        logger.info("CreateService called with: {}".format(step))

        # Prep the shell command: wait on the ready file (plus optional
        # delay) before running the step's own command
        wfc_var = '__ARDERE_WAITFORCLUSTER_SH__'
        wfc_cmd = 'sh -c "${}" waitforcluster.sh {} {}'.format(
            wfc_var,
            self.s3_ready_file,
            step.get("run_delay", 0)
        )
        service_cmd = step["cmd"]
        cmd = ['sh', '-c', '{} && {}'.format(wfc_cmd, service_cmd)]

        # Prep the env vars
        env_vars = [{"name": wfc_var, "value": self.wait_script}]
        for name, value in step.get("env", {}).items():
            env_vars.append({"name": name, "value": value})

        # ECS wants a family name for task definitions, no spaces, 255 chars
        # (equivalent to self.family_name(step))
        family_name = step["name"] + "-" + self._plan_uuid

        # Use cpu_unit if provided, otherwise monopolize
        cpu_units = step.get(
            "cpu_units",
            cpu_units_for_instance_type(step["instance_type"])
        )

        # Setup the container definition
        container_def = {
            "name": step["name"],
            "image": step["container_name"],
            "cpu": cpu_units,

            # using only memoryReservation sets no hard limit
            "memoryReservation": 256,
            "privileged": True,
            "environment": env_vars,
            "entryPoint": cmd,
            "ulimits": [
                dict(name="nofile", softLimit=1000000, hardLimit=1000000)
            ],
            "logConfiguration": self.log_config
        }
        if "port_mapping" in step:
            ports = [{"containerPort": port} for port in step["port_mapping"]]
            container_def["portMappings"] = ports

        # Setup the telegraf container definition; it writes its config
        # from the env var, then ships stats to the run's influx database
        cmd = """\
echo "${__ARDERE_TELEGRAF_CONF__}" > /etc/telegraf/telegraf.conf && \
export __ARDERE_TELEGRAF_HOST__=`wget -qO- http://169.254.169.254/latest/meta-data/instance-id` && \
telegraf \
        """  # noqa
        cmd = ['sh', '-c', '{}'.format(cmd)]
        telegraf_def = {
            "name": "telegraf",
            "image": self.telegraf_container,
            "cpu": 512,
            "memoryReservation": 256,
            "entryPoint": cmd,
            "portMappings": [
                {"containerPort": 8125}
            ],
            "privileged": True,
            "environment": [
                {"name": "__ARDERE_TELEGRAF_CONF__",
                 "value": self.telegraf_script},
                {"name": "__ARDERE_TELEGRAF_STEP__",
                 "value": step["name"]},
                {"name": "__ARDERE_INFLUX_ADDR__",
                 "value": "{}:8086".format(self._plan["influxdb_private_ip"])},
                {"name": "__ARDERE_INFLUX_DB__",
                 "value": self.influx_db_name},
                {"name": "__ARDERE_TELEGRAF_TYPE__",
                 "value": step["docker_series"]}
            ],
            "logConfiguration": self.log_config
        }

        task_response = self._ecs_client.register_task_definition(
            family=family_name,
            containerDefinitions=[
                container_def,
                telegraf_def
            ],
            # use host network mode for optimal performance
            networkMode="host",

            placementConstraints=[
                # Ensure the service is confined to the right instance type
                {
                    "type": "memberOf",
                    "expression": "attribute:ecs.instance-type == {}".format(
                        step["instance_type"]),
                }
            ]
        )
        task_arn = task_response["taskDefinition"]["taskDefinitionArn"]
        step["taskArn"] = task_arn
        service_result = self._ecs_client.create_service(
            cluster=self._ecs_name,
            serviceName=step["name"],
            taskDefinition=task_arn,
            desiredCount=step["instance_count"],
            deploymentConfiguration={
                "minimumHealthyPercent": 0,
                "maximumPercent": 100
            },
            placementConstraints=[
                {
                    "type": "distinctInstance"
                }
            ]
        )
        step["serviceArn"] = service_result["service"]["serviceArn"]
        step["service_status"] = "STARTED"
        return step

    def create_services(self, steps):
        # type: (List[Dict[str, Any]]) -> None
        """Create ECS Services given a list of steps"""
        with ThreadPoolExecutor(max_workers=8) as executor:
            # list() forces all futures to complete (and re-raises errors)
            list(executor.map(self.create_service, steps))

    def service_ready(self, step):
        # type: (Dict[str, Any]) -> bool
        """Query a service and return whether all its tasks are running"""
        service_name = step["name"]
        response = self._ecs_client.describe_services(
            cluster=self._ecs_name,
            services=[service_name]
        )

        try:
            # Service missing or no deployments yet -> not ready
            deploy = response["services"][0]["deployments"][0]
        except (TypeError, IndexError):
            return False
        return deploy["desiredCount"] == deploy["runningCount"]

    def all_services_ready(self, steps):
        # type: (List[Dict[str, Any]]) -> bool
        """Queries all service ARN's in the plan to see if they're ready"""
        with ThreadPoolExecutor(max_workers=8) as executor:
            results = executor.map(self.service_ready, steps)
        return all(results)

    def service_done(self, step):
        # type: (Dict[str, Any]) -> bool
        """Query a service to return whether its fully drained and back to
        INACTIVE"""
        service_name = step["name"]
        response = self._ecs_client.describe_services(
            cluster=self._ecs_name,
            services=[service_name]
        )

        service = response["services"][0]
        return service["status"] == "INACTIVE"

    def all_services_done(self, steps):
        # type: (List[Dict[str, Any]]) -> bool
        """Queries all service ARN's in the plan to see if they're fully
        DRAINED and now INACTIVE"""
        with ThreadPoolExecutor(max_workers=8) as executor:
            results = executor.map(self.service_done, steps)
        return all(results)

    def stop_finished_service(self, start_time, step):
        # type: (float, Dict[str, Any]) -> None
        """Stops a service if it needs to shutdown

        :param start_time: epoch seconds (as from time.time()) the plan
            started running.

        """
        if step["service_status"] == "STOPPED":
            return

        # Calculate time
        step_duration = step.get("run_delay", 0) + step["run_max_time"]
        now = time.time()
        if now < (start_time + step_duration):
            return

        # Running long enough to shutdown
        self._ecs_client.update_service(
            cluster=self._ecs_name,
            service=step["name"],
            desiredCount=0
        )
        step["service_status"] = "STOPPED"

    def stop_finished_services(self, start_time, steps):
        # type: (float, List[Dict[str, Any]]) -> None
        """Shuts down any services that have run for their max time"""
        for step in steps:
            self.stop_finished_service(start_time, step)

    def shutdown_plan(self, steps):
        # type: (List[Dict[str, Any]]) -> None
        """Terminate the entire plan, ensure all services and task
        definitions are completely cleaned up and removed

        Best-effort: AWS ClientErrors during individual update/delete
        calls are intentionally skipped so the remaining services still
        get cleaned up.

        """
        # Locate all the services for the ECS Cluster
        paginator = self._ecs_client.get_paginator('list_services')
        response_iterator = paginator.paginate(
            cluster=self._ecs_name
        )

        # Collect all the service ARN's
        service_arns = []
        for page in response_iterator:
            service_arns.extend(page["serviceArns"])

        # Avoid shutting down metrics if tear down was not requested
        # We have to exclude it from the services discovered above if we
        # should NOT tear it down
        if not self._plan["metrics_options"]["tear_down"]:
            metric_service = self.locate_metrics_service()
            if metric_service and metric_service["serviceArn"] in service_arns:
                service_arns.remove(metric_service["serviceArn"])

        for service_arn in service_arns:
            try:
                self._ecs_client.update_service(
                    cluster=self._ecs_name,
                    service=service_arn,
                    desiredCount=0
                )
            except botocore.exceptions.ClientError:
                continue

            try:
                self._ecs_client.delete_service(
                    cluster=self._ecs_name,
                    service=service_arn
                )
            except botocore.exceptions.ClientError:
                pass

        # Locate all the task definitions for this plan
        step_family_names = [self.family_name(step) for step in steps]

        # Add in the metrics family name if we need to tear_down
        if self._plan["metrics_options"]["tear_down"]:
            step_family_names.append(self.metrics_family_name())
            step_family_names.append(self.metrics_setup_family_name())

        for family_name in step_family_names:
            try:
                response = self._ecs_client.describe_task_definition(
                    taskDefinition=family_name
                )
            except botocore.exceptions.ClientError:
                continue

            task_arn = response["taskDefinition"]["taskDefinitionArn"]

            # Deregister the task
            try:
                self._ecs_client.deregister_task_definition(
                    taskDefinition=task_arn
                )
            except botocore.exceptions.ClientError:
                pass
752 |
--------------------------------------------------------------------------------
/ardere/exceptions.py:
--------------------------------------------------------------------------------
class ServicesStartingException(Exception):
    """Exception to indicate Services are still Starting

    Raised by the step function handlers (see step_functions.py) to signal
    that polling should continue.
    """
3 |
4 |
class ShutdownPlanException(Exception):
    """Exception to indicate the Plan should be Shutdown

    Consumed by the step function handlers in step_functions.py.
    """
7 |
8 |
class ValidationException(Exception):
    """Exception to indicate validation error parsing input

    Raised when the incoming plan JSON fails schema validation.
    """
11 |
12 |
class UndrainedInstancesException(Exception):
    """There are still ACTIVE or DRAINING instances in the cluster

    Signals that cluster cleanup must wait before completing.
    """
15 |
16 |
class CreatingMetricSourceException(Exception):
    """Metric creation task hasn't completed yet

    Signals that polling for the metric-setup container should continue.
    """
19 |
--------------------------------------------------------------------------------
/ardere/scripts/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 |
--------------------------------------------------------------------------------
/ardere/scripts/metric_creator.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import json
3 | import os
4 |
5 | import boto3
6 | import influxdb
7 | import requests
8 |
9 | try:
10 | from typing import Any, Dict # noqa
11 | except ImportError: # pragma: nocover
12 | pass
13 |
14 | logging.basicConfig(level=logging.INFO)
15 | logger = logging.getLogger()
16 |
17 |
class DashboardSetup(object):
    """Configures the per-run InfluxDB database and Grafana datasource,
    and optionally installs a Grafana dashboard fetched from S3.

    Runs inside the metric-setup container; all configuration arrives via
    ``__ARDERE_*`` environment variables (see ardere/aws.py).

    """
    # For testing purposes
    boto = boto3
    req = requests
    influx = influxdb

    def __init__(self):
        # type: () -> None
        # Required: name of the influx database to create for this run
        self.influx_db_name = os.environ["__ARDERE_INFLUXDB_NAME__"]
        # Optional: "bucket:filename" location of a dashboard JSON in S3
        self.dashboard = os.environ.get("__ARDERE_DASHBOARD__")
        self.dashboard_name = os.environ.get("__ARDERE_DASHBOARD_NAME__")
        self.grafana_auth = (
            os.environ.get("__ARDERE_GRAFANA_USER__"),
            os.environ.get("__ARDERE_GRAFANA_PASS__")
        )

    def _load_dashboard(self):
        # type: () -> Dict[str, Any]
        """Load dashboard from S3 and update JSON contents"""
        logger.info("Fetching dashboard from S3")
        bucket, filename = self.dashboard.split(":")
        s3 = self.boto.resource('s3')
        dash_file = s3.Object(bucket, filename)
        file_contents = dash_file.get()['Body'].read().decode('utf-8')
        dash_contents = json.loads(file_contents)
        dash_contents["title"] = self.dashboard_name
        # Clear the id so grafana creates a new dashboard rather than
        # trying to update one with a stale id
        dash_contents["id"] = None
        logger.info("Fetched dashboard file")
        return dash_contents

    def _create_dashboard(self, grafana_url):
        # type: (str) -> None
        """Create the dashboard in grafana

        :raises Exception: if grafana rejects the dashboard POST.

        """
        dash_contents = self._load_dashboard()
        logger.info("Creating dashboard in grafana")
        response = self.req.post(grafana_url + "/api/dashboards/db",
                                 auth=self.grafana_auth,
                                 json=dict(
                                     dashboard=dash_contents,
                                     overwrite=True
                                 ))
        if response.status_code != 200:
            raise Exception("Error creating dashboard: {}".format(
                response.status_code))

    def _ensure_dashboard(self, grafana_url):
        # type: (str) -> None
        """Ensure the dashboard is present, creating it if missing"""
        # Verify whether the dashboard exists
        response = self.req.get(grafana_url + "/api/search",
                                auth=self.grafana_auth,
                                params=dict(query=self.dashboard_name))
        if response.status_code != 200:
            raise Exception("Failure to search dashboards")

        # Search results for the dashboard. Bug fix: on Python 3,
        # filter() returns a lazy iterator which is always truthy, so the
        # emptiness check below never fired and the dashboard was never
        # created. A list comprehension behaves the same on both 2 and 3.
        results = [dash for dash in response.json()
                   if dash["title"] == self.dashboard_name]
        if not results:
            self._create_dashboard(grafana_url)

    def create_datasources(self):
        # type: () -> None
        """Create the influx database and grafana datasource, then ensure
        the dashboard exists when one was configured."""
        # Create an influxdb for this run
        logger.info("Create influx database")
        influx_client = self.influx.InfluxDBClient()
        influx_client.create_database(self.influx_db_name)

        # Setup the grafana datasource
        grafana_url = "http://127.0.0.1:3000"
        ds_api_url = "http://127.0.0.1:3000/api/datasources"
        logger.info("Create datasource in grafana")
        self.req.post(ds_api_url, auth=self.grafana_auth, json=dict(
            name=self.influx_db_name,
            type="influxdb",
            url="http://localhost:8086",
            database=self.influx_db_name,
            access="proxy",
            basicAuth=False
        ))

        # Setup the grafana dashboard if needed/desired
        if self.dashboard:
            self._ensure_dashboard(grafana_url)
101 |
102 |
# Script entry point: executed inside the metric-setup container, which
# pipes this file in via the __ARDERE_PYTHON_SCRIPT__ env var (see aws.py).
if __name__ == "__main__":  # pragma: no cover
    logger.info("Creating datasources")
    DashboardSetup().create_datasources()
    logger.info("Finished.")
107 |
--------------------------------------------------------------------------------
/ardere/step_functions.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import re
4 | import time
5 | from collections import defaultdict
6 |
7 | import boto3
8 | import botocore
9 | import toml
10 | from marshmallow import (
11 | Schema,
12 | decorators,
13 | fields,
14 | validate,
15 | ValidationError,
16 | )
17 | from typing import Any, Dict, List # noqa
18 |
19 | from ardere.aws import (
20 | ECSManager,
21 | ec2_vcpu_by_type,
22 | )
23 | from ardere.exceptions import (
24 | CreatingMetricSourceException,
25 | ServicesStartingException,
26 | ShutdownPlanException,
27 | ValidationException,
28 | UndrainedInstancesException,
29 | )
30 |
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Step name is used as the Log stream name.
# Log stream names are limited to 512 characters (no ":" or "*")
# Name format is
#     ardere-UUID/STEP_NAME/LUUID
# where UUID is dashed, and LUUID is not
# therefore: 512 - (9 + 36 + 32) = max name len
MAX_NAME_LEN = 435
# Raw string avoids the invalid "\*" escape (a DeprecationWarning on
# Python 3.6+); "*" needs no escaping inside a character class.
INVALID_NAME_CHECK = re.compile(r"([:*]+)")
42 |
43 |
class StepValidator(Schema):
    """Schema for a single step of an ardere test plan."""
    name = fields.String(required=True)
    instance_count = fields.Int(required=True)
    instance_type = fields.String(
        required=True,
        # Only EC2 instance types known to the vCPU table are accepted.
        validate=validate.OneOf(ec2_vcpu_by_type.keys())
    )
    run_max_time = fields.Int(required=True)
    # Steps with no explicit delay start immediately (delay of 0).
    run_delay = fields.Int(missing=0)
    container_name = fields.String(required=True)
    cmd = fields.String(required=True)
    port_mapping = fields.List(fields.Int())
    env = fields.Dict()
    docker_series = fields.String(missing="default")

    @decorators.validates("name")
    def validate_name(self, value):
        """Reject empty, over-long, or log-stream-unsafe step names.

        The step name becomes part of a CloudWatch Log stream name, which
        is length-limited and cannot contain ":" or "*" (see MAX_NAME_LEN
        and INVALID_NAME_CHECK above).

        :raises ValidationError: when the name violates any constraint.
        """
        if len(value) == 0:
            raise ValidationError("Step name missing")
        if len(value) > MAX_NAME_LEN:
            raise ValidationError("Step name too long")
        if INVALID_NAME_CHECK.search(value):
            raise ValidationError("Step name contains invalid characters")
67 |
68 |
class DashboardOptions(Schema):
    """Schema for the optional Grafana dashboard settings inside
    metrics_options."""
    # Grafana admin login; user defaults to "admin", password is mandatory.
    admin_user = fields.String(missing="admin")
    admin_password = fields.String(required=True)
    name = fields.String(required=True)
    filename = fields.String(required=True)
74 |
75 |
class MetricsOptions(Schema):
    """Schema for the metrics (InfluxDB/Grafana) options of a plan."""
    # Metrics collection is on unless the plan explicitly disables it.
    enabled = fields.Bool(missing=True)
    # EC2 instance type used for the metrics node.
    instance_type = fields.String(
        missing="c4.large",
        validate=validate.OneOf(ec2_vcpu_by_type.keys())
    )
    dashboard = fields.Nested(DashboardOptions)
    # Defaults to False.
    tear_down = fields.Bool(missing=False)
84 |
85 |
class PlanValidator(Schema):
    """Schema for an entire ardere test plan.

    Expects ``self.context["boto"]`` to hold a boto3-compatible module so
    the ECS cluster lookup can be stubbed in tests.
    """
    ecs_name = fields.String(required=True)
    name = fields.String(required=True)
    # NOTE(review): a bare {} bypasses MetricsOptions' field defaults (no
    # "enabled" key), yet downstream code indexes ["enabled"] directly —
    # confirm real plans always supply metrics_options.
    metrics_options = fields.Nested(MetricsOptions, missing={})

    steps = fields.Nested(StepValidator, many=True)

    def _log_validate_name(self, value, name_type):
        """Shared name check: non-empty, within the CloudWatch log-stream
        length limit, and free of ":"/"*" characters.

        :param value: the name being validated.
        :param name_type: label used in the error messages.
        :raises ValidationError: when the name violates any constraint.
        """
        if len(value) == 0:
            raise ValidationError("{} missing".format(name_type))
        if len(value) > MAX_NAME_LEN:
            raise ValidationError("{} too long".format(name_type))
        if INVALID_NAME_CHECK.search(value):
            raise ValidationError(
                "{} contained invalid characters".format(name_type))

    @decorators.validates("ecs_name")
    def validate_ecs_name(self, value):
        """Verify a cluster exists for this name"""
        self._log_validate_name(value, "Plan ecs_name")
        client = self.context["boto"].client('ecs')
        response = client.describe_clusters(
            clusters=[value]
        )
        if not response.get("clusters"):
            raise ValidationError("No cluster with the provided name.")

    @decorators.validates("name")
    def validate_name(self, value):
        # NOTE(review): "Step name" is the label used in errors for the
        # *plan*-level name; "Plan name" was probably intended — confirm.
        self._log_validate_name(value, "Step name")
116 |
117 |
class AsynchronousPlanRunner(object):
    """Asynchronous Test Plan Runner

    This step function based runner handles running a test plan in an
    asynchronous manner, where each step will wait for its run_delay if
    present before running.

    Each public method backs one Lambda in the Step Function state machine
    (see serverless.yml) and returns ``self.event`` so the (possibly
    augmented) plan flows to the next state.  The custom exceptions raised
    here are deliberate: the state machine's Retry/Catch clauses match on
    their names to implement polling and clean-up.

    """
    # Class attribute (not instance) so tests can swap in a stubbed boto3.
    boto = boto3

    def __init__(self, event, context):
        # type: (Dict[str, Any], Any) -> None
        """Parse the incoming event (raw dict or TOML string) and set up
        the ECS manager for this plan."""
        logger.info("Called with {}".format(event))
        logger.info("Environ: {}".format(os.environ))

        # Load our TOML if needed
        event = self._load_toml(event)

        self.event = event
        self.context = context
        self.ecs = ECSManager(plan=event)

    @property
    def grafana_auth(self):
        """Grafana ``(user, password)`` tuple; empty strings when no
        dashboard options were supplied."""
        if not self.event["metrics_options"].get("dashboard"):
            return "", ""

        dash_opts = self.event["metrics_options"]["dashboard"]
        return dash_opts["admin_user"], dash_opts["admin_password"]

    @property
    def dashboard_options(self):
        """Raw dashboard options dict from the plan's metrics_options.

        NOTE(review): raises KeyError when no dashboard was configured;
        callers appear to guard with ``.get("dashboard")`` first — confirm.
        """
        return self.event["metrics_options"]["dashboard"]

    def _build_instance_map(self):
        # type: () -> Dict[str, int]
        """Given a JSON test-plan, build and return a dict of instance types
        and how many should exist for each type."""
        instances = defaultdict(int)
        for step in self.event["steps"]:
            instances[step["instance_type"]] += step["instance_count"]
        return instances

    def _find_test_plan_duration(self):
        # type: () -> int
        """Locates and calculates the longest test plan duration from its
        delay through its duration of the plan."""
        return max(
            [x.get("run_delay", 0) + x["run_max_time"] for x in
             self.event["steps"]]
        )

    def _load_toml(self, event):
        # type: (Dict[str, Any]) -> Dict[str, Any]
        """Loads TOML if necessary"""
        return toml.loads(event["toml"]) if "toml" in event else event

    def _validate_plan(self):
        """Validates that the loaded plan is correct

        :raises ValidationException: when the plan fails schema validation.
        """
        schema = PlanValidator()
        # Expose boto through the schema context so validators can query ECS.
        schema.context["boto"] = self.boto
        data, errors = schema.load(self.event)
        if errors:
            raise ValidationException("Failed to validate: {}".format(errors))

        # Replace our event with the validated
        self.event = data

    def populate_missing_instances(self):
        """Populate any missing EC2 instances needed for the test plan in the
        cluster

        """
        # First, validate the test plan, done only as part of step 1
        self._validate_plan()

        needed = self._build_instance_map()

        # Ensure we have the metrics instance
        if self.event["metrics_options"]["enabled"]:
            # Query to see if we need to add a metrics node
            metric_inst_type = self.event["metrics_options"]["instance_type"]

            # We add the instance type to needed to ensure we don't leave out
            # more nodes since this will turn up in the query_active results
            needed[metric_inst_type] += 1

            # We create it here up-front if needed since we have different
            # tags
            if not self.ecs.has_metrics_node(metric_inst_type):
                self.ecs.request_instances(
                    instances={metric_inst_type: 1},
                    security_group_ids=[os.environ["metric_sg"],
                                        os.environ["ec2_sg"]],
                    additional_tags={"Role": "metrics"}
                )

        logger.info("Plan instances needed: {}".format(needed))
        current_instances = self.ecs.query_active_instances()
        missing_instances = self.ecs.calculate_missing_instances(
            desired=needed, current=current_instances
        )
        if missing_instances:
            logger.info("Requesting instances: {}".format(missing_instances))
            self.ecs.request_instances(
                instances=missing_instances,
                security_group_ids=[os.environ["ec2_sg"]]
            )
        return self.event

    def ensure_metrics_available(self):
        """Start the metrics service, ensure its running, and its IP is known

        Raises ServicesStartingException to trigger the state machine's
        retry/poll loop until the service reports ready.
        """
        if not self.event["metrics_options"]["enabled"]:
            return self.event

        # Is the service already running?
        metrics = self.ecs.locate_metrics_service()
        logger.info("Metrics info: %s", metrics)

        if not metrics:
            # Start the metrics service, throw a retry
            self.ecs.create_metrics_service(self.event["metrics_options"])
            raise ServicesStartingException("Triggered metrics start")

        # Ready once the first deployment's running count matches desired.
        deploy = metrics["deployments"][0]
        ready = deploy["desiredCount"] == deploy["runningCount"]
        logger.info("Deploy info: %s", deploy)
        if not ready:
            raise ServicesStartingException("Waiting for metrics")

        # Populate the IP of the metrics service
        metric_ip, container_arn = self.ecs.locate_metrics_container_ip()

        if not metric_ip:
            raise Exception("Unable to locate metrics IP even though its "
                            "running")

        # Stash results on the event so later states can use them.
        self.event["influxdb_private_ip"] = metric_ip
        self.event["metric_container_arn"] = container_arn
        return self.event

    def ensure_metric_sources_created(self):
        """Ensure the metrics db and grafana datasource are configured

        Raises CreatingMetricSourceException to poll until the one-off
        metric-creation task has finished.
        """
        if not self.event["metrics_options"]["enabled"]:
            return self.event

        if not self.ecs.has_started_metric_creation():
            dashboard = None
            dashboard_name = None
            if self.event["metrics_options"].get("dashboard"):
                # Encode the dashboard location as "BUCKET:FILENAME".
                dashboard = ":".join([os.environ["metrics_bucket"],
                                      self.dashboard_options["filename"]])
                dashboard_name = self.dashboard_options["name"]
            self.ecs.run_metric_creation_task(
                container_instance=self.event["metric_container_arn"],
                grafana_auth=self.grafana_auth,
                dashboard=dashboard,
                dashboard_name=dashboard_name
            )
            raise CreatingMetricSourceException("Started metric creation")

        if not self.ecs.has_finished_metric_creation():
            raise CreatingMetricSourceException("Metric creation still "
                                                "running")

        # Expose the grafana URL of the metrics node on the event.
        metric_ip = self.event["influxdb_private_ip"]
        self.event["grafana_dashboard"] = "http://{}:3000".format(metric_ip)
        return self.event

    def create_ecs_services(self):
        """Create all the ECS services needed

        """
        self.ecs.create_services(self.event["steps"])
        return self.event

    def wait_for_cluster_ready(self):
        """Check all the ECS services to see if they're ready

        :raises ServicesStartingException: to drive the retry/poll loop.
        """
        if not self.ecs.all_services_ready(self.event["steps"]):
            raise ServicesStartingException()
        return self.event

    def signal_cluster_start(self):
        """Drop a ready file in S3 to trigger the test plan to being

        The file body is the epoch start time; its presence is the "go"
        signal the containers wait for (see waitforcluster.sh).
        """
        s3_client = self.boto.client('s3')
        s3_client.put_object(
            ACL="public-read",
            # NOTE(review): bytes .format only works on Python 2 (the
            # serverless runtime here is python2.7); breaks on Python 3.
            Body=b'{}'.format(int(time.time())),
            Bucket=os.environ["s3_ready_bucket"],
            Key="{}.ready".format(self.ecs.plan_uuid),
            Metadata={
                "ECSCluster": self.event["ecs_name"]
            }
        )
        return self.event

    def check_for_cluster_done(self):
        """Check all the ECS services to see if they've run for their
        specified duration

        :raises ShutdownPlanException: when the plan's total duration has
            elapsed or the ready file cannot be reached.
        """
        # Check to see if the S3 file is still around
        s3 = self.boto.resource('s3')
        try:
            ready_file = s3.Object(
                os.environ["s3_ready_bucket"],
                "{}.ready".format(self.ecs.plan_uuid)
            )
        except botocore.exceptions.ClientError:
            # Error getting to the bucket/key, abort test run
            raise ShutdownPlanException("Error accessing ready file")

        # The ready file's body is the epoch time the run started.
        file_contents = ready_file.get()['Body'].read().decode('utf-8')
        start_time = int(file_contents)

        # Update to running count 0 any services that should halt by now
        self.ecs.stop_finished_services(start_time, self.event["steps"])

        # If we're totally done, exit.
        now = time.time()
        plan_duration = self._find_test_plan_duration()
        if now > (start_time + plan_duration):
            raise ShutdownPlanException("Test Plan has completed")
        return self.event

    def cleanup_cluster(self):
        """Shutdown all ECS services and deregister all task definitions"""
        self.ecs.shutdown_plan(self.event["steps"])

        # Attempt to remove the S3 object; best-effort, a missing file or
        # access error is not fatal during clean-up.
        s3 = self.boto.resource('s3')
        try:
            ready_file = s3.Object(
                os.environ["s3_ready_bucket"],
                "{}.ready".format(self.ecs.plan_uuid)
            )
            ready_file.delete()
        except botocore.exceptions.ClientError:
            pass
        return self.event

    def check_drained(self):
        """Ensure that all services are shut down before allowing restart

        :raises UndrainedInstancesException: to drive the retry/poll loop.
        """
        if self.ecs.all_services_done(self.event["steps"]):
            return self.event
        else:
            raise UndrainedInstancesException("Services still draining")
369 |
--------------------------------------------------------------------------------
/config.bash:
--------------------------------------------------------------------------------
#!/bin/bash
# Interactive helper: checks that serverless is installed and, when no AWS
# credentials file exists yet, prompts for an access key pair and runs
# "serverless config credentials".
# (The original shebang passed "-w", which is not a valid bash option.)

# On Ctrl-C, remove the possibly half-written credentials file and bail.
ctrlc()
{
    echo " Exiting..."
    rm ~/.aws/credentials
    exit 1
}
set -e

if [[ "`which serverless`" == "" ]]
then
    echo "Hrm, serverless is not installed. "
    echo "See https://serverless.com/framework/docs/providers/aws/guide/installation/"
    # NOTE(review): "return" only works when this file is sourced; when
    # executed directly this line errors (and set -e stops the script).
    # Confirm the intended invocation style.
    return
fi
if [[ ! -e ~/.aws/credentials ]]
then
    trap ctrlc SIGINT
    echo " credential file was not found. Let's make one."
    echo ""
    echo " If you haven't already, you'll need to create an access key."
    echo " e.g. go to https://console.aws.amazon.com/iam/home#/users/${USER}/?security_credientials"
    echo " and click [Create access key]."
    echo ""
    # -r keeps backslashes in the entered keys literal.
    read -r -p "Access Key ID: " access_key
    read -r -p "Secret Key ID: " secret_key
    echo " Thanks! Running configuration";
    # Show the command being run, but never echo the secret in cleartext.
    echo serverless config credentials --provider aws --key "$access_key" --secret "****"
    # Quote the key material so shell metacharacters survive intact.
    serverless config credentials --provider aws --key "$access_key" --secret "$secret_key"
fi
echo " You're configured. The next step is to deploy."
33 |
34 |
--------------------------------------------------------------------------------
/default_dashboard.json:
--------------------------------------------------------------------------------
1 | {
2 | "annotations": {
3 | "list": []
4 | },
5 | "editable": true,
6 | "gnetId": null,
7 | "graphTooltip": 0,
8 | "hideControls": false,
9 | "id": 1,
10 | "links": [],
11 | "refresh": false,
12 | "rows": [
13 | {
14 | "collapse": false,
15 | "height": "250px",
16 | "panels": [
17 | {
18 | "aliasColors": {},
19 | "bars": false,
20 | "datasource": "$db",
21 | "fill": 1,
22 | "id": 1,
23 | "legend": {
24 | "alignAsTable": true,
25 | "avg": false,
26 | "current": false,
27 | "max": false,
28 | "min": false,
29 | "show": true,
30 | "total": false,
31 | "values": true
32 | },
33 | "lines": true,
34 | "linewidth": 1,
35 | "links": [],
36 | "nullPointMode": "null",
37 | "percentage": false,
38 | "pointradius": 5,
39 | "points": false,
40 | "renderer": "flot",
41 | "seriesOverrides": [],
42 | "span": 6,
43 | "stack": false,
44 | "steppedLine": false,
45 | "targets": [
46 | {
47 | "alias": "",
48 | "dsType": "influxdb",
49 | "groupBy": [
50 | {
51 | "params": [
52 | "$interval"
53 | ],
54 | "type": "time"
55 | },
56 | {
57 | "params": [
58 | "host"
59 | ],
60 | "type": "tag"
61 | },
62 | {
63 | "params": [
64 | "step"
65 | ],
66 | "type": "tag"
67 | },
68 | {
69 | "params": [
70 | "none"
71 | ],
72 | "type": "fill"
73 | }
74 | ],
75 | "hide": false,
76 | "measurement": "cpu",
77 | "policy": "default",
78 | "query": "SELECT mean(\"usage_user\") FROM \"cpu\" WHERE $timeFilter GROUP BY time($interval) fill(null)",
79 | "rawQuery": false,
80 | "refId": "A",
81 | "resultFormat": "time_series",
82 | "select": [
83 | [
84 | {
85 | "params": [
86 | "usage_system"
87 | ],
88 | "type": "field"
89 | },
90 | {
91 | "params": [],
92 | "type": "mean"
93 | }
94 | ]
95 | ],
96 | "tags": [
97 | {
98 | "key": "step",
99 | "operator": "=~",
100 | "value": "/^$step$/"
101 | },
102 | {
103 | "condition": "AND",
104 | "key": "host",
105 | "operator": "=~",
106 | "value": "/^$host$/"
107 | }
108 | ]
109 | }
110 | ],
111 | "thresholds": [],
112 | "timeFrom": null,
113 | "timeShift": null,
114 | "title": "CPU Usage",
115 | "tooltip": {
116 | "shared": false,
117 | "sort": 0,
118 | "value_type": "individual"
119 | },
120 | "type": "graph",
121 | "xaxis": {
122 | "mode": "time",
123 | "name": null,
124 | "show": true,
125 | "values": []
126 | },
127 | "yaxes": [
128 | {
129 | "format": "percentunit",
130 | "label": null,
131 | "logBase": 1,
132 | "max": null,
133 | "min": null,
134 | "show": true
135 | },
136 | {
137 | "format": "short",
138 | "label": null,
139 | "logBase": 1,
140 | "max": null,
141 | "min": null,
142 | "show": true
143 | }
144 | ]
145 | },
146 | {
147 | "aliasColors": {},
148 | "bars": false,
149 | "datasource": "$db",
150 | "fill": 1,
151 | "id": 2,
152 | "legend": {
153 | "alignAsTable": true,
154 | "avg": false,
155 | "current": false,
156 | "max": false,
157 | "min": false,
158 | "show": true,
159 | "total": false,
160 | "values": true
161 | },
162 | "lines": true,
163 | "linewidth": 1,
164 | "links": [],
165 | "nullPointMode": "null",
166 | "percentage": false,
167 | "pointradius": 5,
168 | "points": false,
169 | "renderer": "flot",
170 | "seriesOverrides": [],
171 | "span": 6,
172 | "stack": false,
173 | "steppedLine": false,
174 | "targets": [
175 | {
176 | "dsType": "influxdb",
177 | "groupBy": [
178 | {
179 | "params": [
180 | "$interval"
181 | ],
182 | "type": "time"
183 | },
184 | {
185 | "params": [
186 | "step"
187 | ],
188 | "type": "tag"
189 | },
190 | {
191 | "params": [
192 | "host"
193 | ],
194 | "type": "tag"
195 | },
196 | {
197 | "params": [
198 | "none"
199 | ],
200 | "type": "fill"
201 | }
202 | ],
203 | "measurement": "mem",
204 | "policy": "default",
205 | "refId": "A",
206 | "resultFormat": "time_series",
207 | "select": [
208 | [
209 | {
210 | "params": [
211 | "used"
212 | ],
213 | "type": "field"
214 | },
215 | {
216 | "params": [],
217 | "type": "mean"
218 | }
219 | ]
220 | ],
221 | "tags": [
222 | {
223 | "key": "step",
224 | "operator": "=~",
225 | "value": "/^$step$/"
226 | },
227 | {
228 | "condition": "AND",
229 | "key": "host",
230 | "operator": "=~",
231 | "value": "/^$host$/"
232 | }
233 | ]
234 | }
235 | ],
236 | "thresholds": [],
237 | "timeFrom": null,
238 | "timeShift": null,
239 | "title": "Memory Usage",
240 | "tooltip": {
241 | "shared": false,
242 | "sort": 0,
243 | "value_type": "individual"
244 | },
245 | "type": "graph",
246 | "xaxis": {
247 | "mode": "time",
248 | "name": null,
249 | "show": true,
250 | "values": []
251 | },
252 | "yaxes": [
253 | {
254 | "format": "bytes",
255 | "label": null,
256 | "logBase": 1,
257 | "max": null,
258 | "min": null,
259 | "show": true
260 | },
261 | {
262 | "format": "short",
263 | "label": null,
264 | "logBase": 1,
265 | "max": null,
266 | "min": null,
267 | "show": true
268 | }
269 | ]
270 | }
271 | ],
272 | "repeat": null,
273 | "repeatIteration": null,
274 | "repeatRowId": null,
275 | "showTitle": false,
276 | "title": "Dashboard Row",
277 | "titleSize": "h6"
278 | },
279 | {
280 | "collapse": false,
281 | "height": 250,
282 | "panels": [
283 | {
284 | "aliasColors": {},
285 | "bars": false,
286 | "datasource": "$db",
287 | "fill": 1,
288 | "id": 3,
289 | "legend": {
290 | "alignAsTable": true,
291 | "avg": false,
292 | "current": false,
293 | "max": false,
294 | "min": false,
295 | "show": true,
296 | "total": false,
297 | "values": true
298 | },
299 | "lines": true,
300 | "linewidth": 1,
301 | "links": [],
302 | "nullPointMode": "null",
303 | "percentage": false,
304 | "pointradius": 5,
305 | "points": false,
306 | "renderer": "flot",
307 | "seriesOverrides": [
308 | {
309 | "alias": "/^in.*/",
310 | "transform": "negative-Y"
311 | }
312 | ],
313 | "span": 6,
314 | "stack": false,
315 | "steppedLine": false,
316 | "targets": [
317 | {
318 | "alias": "out {host: [[tag_host]] step: [[tag_step]]}",
319 | "dsType": "influxdb",
320 | "groupBy": [
321 | {
322 | "params": [
323 | "$interval"
324 | ],
325 | "type": "time"
326 | },
327 | {
328 | "params": [
329 | "step"
330 | ],
331 | "type": "tag"
332 | },
333 | {
334 | "params": [
335 | "host"
336 | ],
337 | "type": "tag"
338 | },
339 | {
340 | "params": [
341 | "null"
342 | ],
343 | "type": "fill"
344 | }
345 | ],
346 | "measurement": "net",
347 | "policy": "default",
348 | "refId": "A",
349 | "resultFormat": "time_series",
350 | "select": [
351 | [
352 | {
353 | "params": [
354 | "bytes_sent"
355 | ],
356 | "type": "field"
357 | },
358 | {
359 | "params": [],
360 | "type": "mean"
361 | },
362 | {
363 | "params": [
364 | "1s"
365 | ],
366 | "type": "non_negative_derivative"
367 | },
368 | {
369 | "params": [
370 | " *8"
371 | ],
372 | "type": "math"
373 | }
374 | ]
375 | ],
376 | "tags": [
377 | {
378 | "key": "step",
379 | "operator": "=~",
380 | "value": "/^$step$/"
381 | },
382 | {
383 | "condition": "AND",
384 | "key": "host",
385 | "operator": "=~",
386 | "value": "/^$host$/"
387 | }
388 | ]
389 | },
390 | {
391 | "alias": "in {host: [[tag_host]] step: [[tag_step]]}",
392 | "dsType": "influxdb",
393 | "groupBy": [
394 | {
395 | "params": [
396 | "$interval"
397 | ],
398 | "type": "time"
399 | },
400 | {
401 | "params": [
402 | "step"
403 | ],
404 | "type": "tag"
405 | },
406 | {
407 | "params": [
408 | "host"
409 | ],
410 | "type": "tag"
411 | },
412 | {
413 | "params": [
414 | "none"
415 | ],
416 | "type": "fill"
417 | }
418 | ],
419 | "measurement": "net",
420 | "policy": "default",
421 | "query": "SELECT non_negative_derivative(mean(\"bytes_recv\"), 1s) *8 FROM \"net\" WHERE \"step\" =~ /^$step$/ AND \"host\" =~ /^$host$/ AND $timeFilter GROUP BY time($interval), \"step\", \"host\" fill(none)",
422 | "rawQuery": true,
423 | "refId": "B",
424 | "resultFormat": "time_series",
425 | "select": [
426 | [
427 | {
428 | "params": [
429 | "bytes_recv"
430 | ],
431 | "type": "field"
432 | },
433 | {
434 | "params": [],
435 | "type": "mean"
436 | },
437 | {
438 | "params": [
439 | "1s"
440 | ],
441 | "type": "non_negative_derivative"
442 | },
443 | {
444 | "params": [
445 | "*8"
446 | ],
447 | "type": "math"
448 | }
449 | ]
450 | ],
451 | "tags": [
452 | {
453 | "key": "step",
454 | "operator": "=~",
455 | "value": "/^$step$/"
456 | },
457 | {
458 | "condition": "AND",
459 | "key": "host",
460 | "operator": "=~",
461 | "value": "/^$host$/"
462 | }
463 | ]
464 | }
465 | ],
466 | "thresholds": [],
467 | "timeFrom": null,
468 | "timeShift": null,
469 | "title": "Network Bytes/sec",
470 | "tooltip": {
471 | "shared": false,
472 | "sort": 0,
473 | "value_type": "individual"
474 | },
475 | "type": "graph",
476 | "xaxis": {
477 | "mode": "time",
478 | "name": null,
479 | "show": true,
480 | "values": []
481 | },
482 | "yaxes": [
483 | {
484 | "format": "Bps",
485 | "label": null,
486 | "logBase": 1,
487 | "max": null,
488 | "min": null,
489 | "show": true
490 | },
491 | {
492 | "format": "short",
493 | "label": null,
494 | "logBase": 1,
495 | "max": null,
496 | "min": null,
497 | "show": true
498 | }
499 | ]
500 | }
501 | ],
502 | "repeat": null,
503 | "repeatIteration": null,
504 | "repeatRowId": null,
505 | "showTitle": false,
506 | "title": "Dashboard Row",
507 | "titleSize": "h6"
508 | }
509 | ],
510 | "schemaVersion": 14,
511 | "style": "dark",
512 | "tags": [],
513 | "templating": {
514 | "list": [
515 | {
516 | "current": {
517 | "tags": [],
518 | "text": "run-a3226cfb9513415bac7c7053a8b62a5f",
519 | "value": "run-a3226cfb9513415bac7c7053a8b62a5f"
520 | },
521 | "hide": 0,
522 | "label": null,
523 | "name": "db",
524 | "options": [],
525 | "query": "influxdb",
526 | "refresh": 1,
527 | "regex": "",
528 | "type": "datasource"
529 | },
530 | {
531 | "allValue": null,
532 | "current": {
533 | "text": "All",
534 | "value": "$__all"
535 | },
536 | "datasource": "$db",
537 | "hide": 0,
538 | "includeAll": true,
539 | "label": null,
540 | "multi": true,
541 | "name": "step",
542 | "options": [],
543 | "query": "SHOW TAG VALUES WITH KEY = step",
544 | "refresh": 2,
545 | "regex": "",
546 | "sort": 0,
547 | "tagValuesQuery": "",
548 | "tags": [],
549 | "tagsQuery": "",
550 | "type": "query",
551 | "useTags": false
552 | },
553 | {
554 | "allValue": null,
555 | "current": {
556 | "text": "All",
557 | "value": "$__all"
558 | },
559 | "datasource": "$db",
560 | "hide": 0,
561 | "includeAll": true,
562 | "label": null,
563 | "multi": true,
564 | "name": "host",
565 | "options": [],
566 | "query": "SHOW TAG VALUES WITH KEY = host",
567 | "refresh": 2,
568 | "regex": "",
569 | "sort": 0,
570 | "tagValuesQuery": "SHOW TAG VALUES WITH KEY = host WHERE step = $step",
571 | "tags": [
572 | "i-08a40a6f64d2e2cbe"
573 | ],
574 | "tagsQuery": "SHOW TAG VALUES WITH KEY = host",
575 | "type": "query",
576 | "useTags": true
577 | }
578 | ]
579 | },
580 | "time": {
581 | "from": "2017-04-05T03:01:01.177Z",
582 | "to": "2017-04-05T03:15:19.628Z"
583 | },
584 | "timepicker": {
585 | "refresh_intervals": [
586 | "5s",
587 | "10s",
588 | "30s",
589 | "1m",
590 | "5m",
591 | "15m",
592 | "30m",
593 | "1h",
594 | "2h",
595 | "1d"
596 | ],
597 | "time_options": [
598 | "5m",
599 | "15m",
600 | "1h",
601 | "6h",
602 | "12h",
603 | "24h",
604 | "2d",
605 | "7d",
606 | "30d"
607 | ]
608 | },
609 | "timezone": "browser",
610 | "title": "loads-broker Monitor",
611 | "version": 5
612 | }
--------------------------------------------------------------------------------
/handler.py:
--------------------------------------------------------------------------------
# First some funky path manipulation so that we can work properly in
# the AWS environment
import os
import sys

sys.path.append(os.path.dirname(os.path.realpath(__file__)))

from ardere.step_functions import AsynchronousPlanRunner
9 |
10 |
def populate_missing_instances(event, context):
    """Lambda entry: ensure the EC2 instances the plan needs exist."""
    return AsynchronousPlanRunner(event, context).populate_missing_instances()
14 |
15 |
def ensure_metrics_available(event, context):
    """Lambda entry: start/poll the metrics service."""
    return AsynchronousPlanRunner(event, context).ensure_metrics_available()
19 |
20 |
def ensure_metric_sources_created(event, context):
    """Lambda entry: configure the metrics db and grafana datasource."""
    runner = AsynchronousPlanRunner(event, context)
    return runner.ensure_metric_sources_created()
24 |
25 |
def create_ecs_services(event, context):
    """Lambda entry: create the ECS services for every plan step."""
    return AsynchronousPlanRunner(event, context).create_ecs_services()
29 |
30 |
def wait_for_cluster_ready(event, context):
    """Lambda entry: poll until all plan services report ready."""
    return AsynchronousPlanRunner(event, context).wait_for_cluster_ready()
34 |
35 |
def signal_cluster_start(event, context):
    """Lambda entry: drop the S3 ready file that starts the test run."""
    return AsynchronousPlanRunner(event, context).signal_cluster_start()
39 |
40 |
def check_for_cluster_done(event, context):
    """Lambda entry: stop finished services and detect plan completion."""
    return AsynchronousPlanRunner(event, context).check_for_cluster_done()
44 |
45 |
def cleanup_cluster(event, context):
    """Lambda entry: tear down services and remove the ready file."""
    return AsynchronousPlanRunner(event, context).cleanup_cluster()
49 |
50 |
def check_drain(event, context):
    """Lambda entry: verify all services have finished draining."""
    runner = AsynchronousPlanRunner(event, context)
    return runner.check_drained()
53 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "ardere",
3 | "version": "1.0.0",
4 | "description": "Serverless Service for Load-Testing",
5 | "main": "index.js",
6 | "dependencies": {
7 | "serverless": "^1.8.0",
8 | "serverless-python-requirements": "^2.0.0-beta.7",
9 | "serverless-step-functions": "^0.4.1"
10 | },
11 | "devDependencies": {},
12 | "scripts": {
13 | "test": "echo \"Error: no test specified\" && exit 1"
14 | },
15 | "repository": {
16 | "type": "git",
17 | "url": "git+https://github.com/loads/ardere.git"
18 | },
19 | "author": "",
20 | "license": "MPL-2.0",
21 | "bugs": {
22 | "url": "https://github.com/loads/ardere/issues"
23 | },
24 | "homepage": "https://github.com/loads/ardere#readme"
25 | }
26 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | futures==3.0.5
2 | typing==3.5.3.0
3 | toml==0.9.2
4 | marshmallow==2.13.4
5 | boto3==1.4.4
6 | requests==2.13.0
--------------------------------------------------------------------------------
/serverless.yml:
--------------------------------------------------------------------------------
1 | service: ardere
2 |
3 | plugins:
4 | - serverless-step-functions
5 | - serverless-python-requirements
6 |
7 | package:
8 | exclude:
9 | - node_modules/**
10 | - ardenv/**
11 | - tests/**
12 | - lib/**
13 | - share/**
14 | - man/**
15 | - bin/**
16 | - serverless/**
17 | include:
18 | - ardere/**
19 |
20 | provider:
21 | name: aws
22 | runtime: python2.7
23 | memorySize: 128
24 | timeout: 60
25 | environment:
26 | ecs_profile:
27 | Fn::GetAtt:
28 | - EC2ContainerProfile
29 | - Arn
30 | s3_ready_bucket:
31 | Ref: "S3ReadyBucket"
32 | metrics_bucket:
33 | Ref: "MetricsBucket"
34 | ec2_sg:
35 | Fn::GetAtt:
36 | - EC2SecurityGroup
37 | - GroupId
38 | metric_sg:
39 | Fn::GetAtt:
40 | - MetricSecurityGroup
41 | - GroupId
42 | container_log_group:
43 | Ref: "ContainerLogs"
44 |
45 | iamRoleStatements:
46 | - Effect: "Allow"
47 | Action:
48 | - "ecs:CreateCluster"
49 | - "ecs:ListServices"
50 | - "ecs:ListContainerInstances"
51 | - "ecs:ListTasks"
52 | - "ecs:DescribeClusters"
53 | - "ecs:DescribeServices"
54 | - "ecs:DescribeTaskDefinition"
55 | - "ecs:DescribeTasks"
56 | - "ecs:DescribeContainerInstances"
57 | - "ecs:CreateService"
58 | - "ecs:DeleteService"
59 | - "ecs:UpdateService"
60 | - "ecs:StartTask"
61 | - "ecs:RegisterTaskDefinition"
62 | - "ecs:DeregisterTaskDefinition"
63 | Resource:
64 | - "*"
65 | - Effect: "Allow"
66 | Action:
67 | - "s3:ListBucket"
68 | - "s3:PutObject"
69 | - "s3:PutObjectAcl"
70 | Resource:
71 | - Fn::Join: ['', ['arn:aws:s3:::', Ref: "S3ReadyBucket"]]
72 | - Effect: "Allow"
73 | Action:
74 | - "s3:PutObject"
75 | - "s3:PutObjectAcl"
76 | - "s3:GetObject"
77 | - "s3:DeleteObject"
78 | Resource:
79 | - Fn::Join: ['', ['arn:aws:s3:::', Ref: "S3ReadyBucket", "/*"]]
80 | - Effect: "Allow"
81 | Action:
82 | - "s3:ListBucket"
83 | - "s3:GetObject"
84 | Resource:
85 | - Fn::Join: ['', ['arn:aws:s3:::', Ref: "MetricsBucket", "/*"]]
86 | - Effect: "Allow"
87 | Action:
88 | - "ec2:DescribeInstances"
89 | - "ec2:RunInstances"
90 | - "ec2:CreateTags"
91 | Resource:
92 | - "*"
93 | - Effect: "Allow"
94 | Action:
95 | - "iam:GetRole"
96 | - "iam:PassRole"
97 | Resource:
98 | Fn::GetAtt:
99 | - EC2ContainerRole
100 | - Arn
101 |
102 | functions:
103 | populate_missing_instances:
104 | handler: handler.populate_missing_instances
105 | timeout: 300
106 | ensure_metrics_available:
107 | handler: handler.ensure_metrics_available
108 | timeout: 300
109 | ensure_metric_sources_created:
110 | handler: handler.ensure_metric_sources_created
111 | timeout: 300
112 | create_ecs_services:
113 | handler: handler.create_ecs_services
114 | timeout: 300
115 | wait_for_cluster_ready:
116 | handler: handler.wait_for_cluster_ready
117 | signal_cluster_start:
118 | handler: handler.signal_cluster_start
119 | check_for_cluster_done:
120 | handler: handler.check_for_cluster_done
121 | cleanup_cluster:
122 | handler: handler.cleanup_cluster
123 | timeout: 300
124 | check_drain:
125 | handler: handler.check_drain
126 |
127 | stepFunctions:
128 | stateMachines:
129 | ardere:
130 | Comment: "ardere load-tester"
131 | Version: "1.0"
132 | StartAt: "Populate Missing Instances"
133 | States:
134 | "Populate Missing Instances":
135 | Type: Task
136 | Resource: populate_missing_instances
137 | Next: "Ensure Metrics Available"
138 | "Ensure Metrics Available":
139 | Type: Task
140 | Resource: ensure_metrics_available
141 | Retry:
142 | -
143 | ErrorEquals:
144 | - ServicesStartingException
145 | IntervalSeconds: 10
146 | MaxAttempts: 60
147 | BackoffRate: 1
148 | Catch:
149 | -
150 | ErrorEquals:
151 | - States.ALL
152 | ResultPath: "$.error-info"
153 | Next: "Clean-up Cluster"
154 | Next: "Ensure Metric Sources Created"
155 | "Ensure Metric Sources Created":
156 | Type: Task
157 | Resource: ensure_metric_sources_created
158 | Retry:
159 | -
160 | ErrorEquals:
161 | - CreatingMetricSourceException
162 | IntervalSeconds: 5
163 | MaxAttempts: 20
164 | BackoffRate: 1
165 | Catch:
166 | -
167 | ErrorEquals:
168 | - States.ALL
169 | ResultPath: "$.error-info"
170 | Next: "Clean-up Cluster"
171 | Next: "Create ECS Services"
172 | "Create ECS Services":
173 | Type: Task
174 | Resource: create_ecs_services
175 | Catch:
176 | -
177 | ErrorEquals:
178 | - States.ALL
179 | ResultPath: "$.error-info"
180 | Next: "Clean-up Cluster"
181 | Next: "Wait for Cluster Ready"
182 | "Wait for Cluster Ready":
183 | Type: Task
184 | Resource: wait_for_cluster_ready
185 | Retry:
186 | -
187 | ErrorEquals:
188 | - ServicesStartingException
189 | IntervalSeconds: 10
190 | MaxAttempts: 180
191 | BackoffRate: 1
192 | Catch:
193 | -
194 | ErrorEquals:
195 | - States.ALL
196 | ResultPath: "$.error-info"
197 | Next: "Clean-up Cluster"
198 | Next: "Signal Cluster Start"
199 | "Signal Cluster Start":
200 | Type: Task
201 | Resource: signal_cluster_start
202 | Catch:
203 | -
204 | ErrorEquals:
205 | - States.ALL
206 | ResultPath: "$.error-info"
207 | Next: "Clean-up Cluster"
208 | Next: "Check for Cluster Done"
209 | "Check for Cluster Done":
210 | Type: Task
211 | Resource: check_for_cluster_done
212 | Next: "Wait for Cluster Done"
213 | Retry:
214 | -
215 | ErrorEquals:
216 | - NoSuchKey
217 | IntervalSeconds: 10
218 | MaxAttempts: 2
219 | BackoffRate: 1
220 | Catch:
221 | -
222 | ErrorEquals:
223 | - States.ALL
224 | ResultPath: "$.error-info"
225 | Next: "Clean-up Cluster"
226 | "Wait for Cluster Done":
227 | Type: Wait
228 | Seconds: 10
229 | Next: "Check for Cluster Done"
230 | "Clean-up Cluster":
231 | Type: Task
232 | Resource: cleanup_cluster
233 | Next: "Checking Drain"
234 | "Checking Drain":
235 | Type: Task
236 | Resource: check_drain
237 | Retry:
238 | -
239 | ErrorEquals:
240 | - UndrainedInstancesException
241 | IntervalSeconds: 10
242 | MaxAttempts: 10
243 | BackoffRate: 1
244 | End: true
245 |
246 | resources:
247 | Resources:
248 | S3ReadyBucket:
249 | Type: "AWS::S3::Bucket"
250 | Properties:
251 | AccessControl: "PublicRead"
252 | MetricsBucket:
253 | Type: "AWS::S3::Bucket"
254 | Properties:
255 | AccessControl: "AuthenticatedRead"
256 | MetricSecurityGroup:
257 | Type: "AWS::EC2::SecurityGroup"
258 | Properties:
259 | GroupDescription: "ardere metrics"
260 | SecurityGroupIngress:
261 | -
262 | IpProtocol: tcp
263 | FromPort: 3000
264 | ToPort: 3000
265 | SourceSecurityGroupId:
266 | Fn::GetAtt:
267 | - GrafanaSecurityGroup
268 | - GroupId
269 | -
270 | IpProtocol: tcp
271 | FromPort: 8086
272 | ToPort: 8086
273 | SourceSecurityGroupId:
274 | Fn::GetAtt:
275 | - EC2SecurityGroup
276 | - GroupId
277 | GrafanaSecurityGroup:
278 | Type: "AWS::EC2::SecurityGroup"
279 | Properties:
280 | GroupDescription: "grafana access"
281 | EC2SecurityGroup:
282 | Type: "AWS::EC2::SecurityGroup"
283 | Properties:
284 | GroupDescription: "ardere load-testers"
285 | EC2ContainerRole:
286 | Type: "AWS::IAM::Role"
287 | Properties:
288 | AssumeRolePolicyDocument:
289 | Version: "2012-10-17"
290 | Statement:
291 | -
292 | Effect: "Allow"
293 | Principal:
294 | Service:
295 | - "ec2.amazonaws.com"
296 | Action:
297 | - "sts:AssumeRole"
298 | Path: "/"
299 | Policies:
300 | -
301 | PolicyName: "ecs-service"
302 | PolicyDocument:
303 | Version: "2012-10-17"
304 | Statement:
305 | -
306 | Effect: "Allow"
307 | Action:
308 | - "ecs:CreateCluster"
309 | - "ecs:DeregisterContainerInstance"
310 | - "ecs:DiscoverPollEndpoint"
311 | - "ecs:Poll"
312 | - "ecs:RegisterContainerInstance"
313 | - "ecs:StartTelemetrySession"
314 | - "ecs:SubmitContainerStateChange"
315 | - "ecs:SubmitTaskStateChange"
316 | - "ecs:Submit"
317 | - "logs:CreateLogStream"
318 | - "logs:PutLogEvents"
319 | Resource: "*"
320 | -
321 | Effect: "Allow"
322 | Action:
323 | - "s3:ListBucket"
324 | - "s3:GetObject"
325 | Resource:
326 | - Fn::Join: ['', ['arn:aws:s3:::', Ref: "MetricsBucket", "/*"]]
327 | ContainerLogs:
328 | Type: "AWS::Logs::LogGroup"
329 | Properties:
330 | RetentionInDays: 1
331 | EC2ContainerProfile:
332 | Type: "AWS::IAM::InstanceProfile"
333 | Properties:
334 | Path: "/"
335 | Roles:
336 | -
337 | Ref: "EC2ContainerRole"
338 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [nosetests]
2 | verbose=True
3 | verbosity=1
4 | detailed-errors=True
5 | with-coverage=True
6 | cover-erase=True
7 | cover-package=ardere
8 | cover-tests=True
9 | cover-inclusive=True
10 |
--------------------------------------------------------------------------------
/src/shell/telegraf.toml:
--------------------------------------------------------------------------------
1 | # Telegraf Configuration
2 | #
3 | # Telegraf is entirely plugin driven. All metrics are gathered from the
4 | # declared inputs, and sent to the declared outputs.
5 | #
6 | # Plugins must be declared in here to be active.
7 | # To deactivate a plugin, comment out the name and any variables.
8 | #
9 | # Use 'telegraf -config telegraf.conf -test' to see what metrics a config
10 | # file would generate.
11 | #
12 | # Environment variables can be used anywhere in this config file, simply prepend
13 | # them with $. For strings the variable must be within quotes (ie, "$STR_VAR"),
14 | # for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR)
15 |
16 |
17 | # Global tags can be specified here in key="value" format.
18 | [global_tags]
19 | # dc = "us-east-1" # will tag all metrics with dc=us-east-1
20 | # rack = "1a"
21 | ## Environment variables can be used as tags, and throughout the config file
22 | # user = "$USER"
23 | step = "$__ARDERE_TELEGRAF_STEP__"
24 | ## type is the old "docker_series"
25 | type = "$__ARDERE_TELEGRAF_TYPE__"
26 |
27 |
28 | # Configuration for telegraf agent
29 | [agent]
30 | ## Default data collection interval for all inputs
31 | interval = "10s"
32 | ## Rounds collection interval to 'interval'
33 | ## ie, if interval="10s" then always collect on :00, :10, :20, etc.
34 | round_interval = true
35 |
36 | ## Telegraf will send metrics to outputs in batches of at most
37 | ## metric_batch_size metrics.
38 | ## This controls the size of writes that Telegraf sends to output plugins.
39 | metric_batch_size = 1000
40 |
41 | ## For failed writes, telegraf will cache metric_buffer_limit metrics for each
42 | ## output, and will flush this buffer on a successful write. Oldest metrics
43 | ## are dropped first when this buffer fills.
44 | ## This buffer only fills when writes fail to output plugin(s).
45 | metric_buffer_limit = 10000
46 |
47 | ## Collection jitter is used to jitter the collection by a random amount.
48 | ## Each plugin will sleep for a random time within jitter before collecting.
49 | ## This can be used to avoid many plugins querying things like sysfs at the
50 | ## same time, which can have a measurable effect on the system.
51 | collection_jitter = "0s"
52 |
53 | ## Default flushing interval for all outputs. You shouldn't set this below
54 | ## interval. Maximum flush_interval will be flush_interval + flush_jitter
55 | flush_interval = "10s"
56 | ## Jitter the flush interval by a random amount. This is primarily to avoid
57 | ## large write spikes for users running a large number of telegraf instances.
58 | ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s
59 | flush_jitter = "0s"
60 | ## By default, precision will be set to the same timestamp order as the
61 | ## collection interval, with the maximum being 1s.
62 | ## Precision will NOT be used for service inputs, such as logparser and statsd.
63 | ## Valid values are "ns", "us" (or "µs"), "ms", "s".
64 | precision = ""
65 | ## Logging configuration:
66 | ## Run telegraf with debug log messages.
67 | debug = false
68 | ## Run telegraf in quiet mode (error log messages only).
69 | quiet = false
70 | ## Specify the log file name. The empty string means to log to stderr.
71 | logfile = ""
72 | ## Override default hostname, if empty use os.Hostname()
73 | hostname = "$__ARDERE_TELEGRAF_HOST__"
74 | ## If set to true, do not set the "host" tag in the telegraf agent.
75 | omit_hostname = false
76 | ###############################################################################
77 | # OUTPUT PLUGINS #
78 | ###############################################################################
79 | # Configuration for influxdb server to send metrics to
80 | [[outputs.influxdb]]
81 | ## The full HTTP or UDP endpoint URL for your InfluxDB instance.
82 | ## Multiple urls can be specified as part of the same cluster,
83 | ## this means that only ONE of the urls will be written to each interval.
84 | # urls = ["udp://localhost:8089"] # UDP endpoint example
85 | urls = ["http://$__ARDERE_INFLUX_ADDR__"] # required
86 | ## The target database for metrics (telegraf will create it if not exists).
87 | database = "$__ARDERE_INFLUX_DB__" # required
88 | ## Retention policy to write to. Empty string writes to the default rp.
89 | retention_policy = ""
90 | ## Write consistency (clusters only), can be: "any", "one", "quorum", "all"
91 | write_consistency = "any"
92 | ## Write timeout (for the InfluxDB client), formatted as a string.
93 | ## If not provided, will default to 5s. 0s means no timeout (not recommended).
94 | timeout = "5s"
95 | # username = "telegraf"
96 | # password = "metricsmetricsmetricsmetrics"
97 | ## Set the user agent for HTTP POSTs (can be useful for log differentiation)
98 | # user_agent = "telegraf"
99 | ## Set UDP payload size, defaults to InfluxDB UDP Client default (512 bytes)
100 | # udp_payload = 512
101 | ## Optional SSL Config
102 | # ssl_ca = "/etc/telegraf/ca.pem"
103 | # ssl_cert = "/etc/telegraf/cert.pem"
104 | # ssl_key = "/etc/telegraf/key.pem"
105 | ## Use SSL but skip chain & host verification
106 | # insecure_skip_verify = false
107 | ###############################################################################
108 | # PROCESSOR PLUGINS #
109 | ###############################################################################
110 | # # Print all metrics that pass through this filter.
111 | # [[processors.printer]]
112 | ###############################################################################
113 | # AGGREGATOR PLUGINS #
114 | ###############################################################################
115 | # # Keep the aggregate min/max of each metric passing through.
116 | # [[aggregators.minmax]]
117 | # ## General Aggregator Arguments:
118 | # ## The period on which to flush & clear the aggregator.
119 | # period = "30s"
120 | # ## If true, the original metric will be dropped by the
121 | # ## aggregator and will not get sent to the output plugins.
122 | # drop_original = false
123 | ###############################################################################
124 | # INPUT PLUGINS #
125 | ###############################################################################
126 | # Read metrics about cpu usage
127 | [[inputs.cpu]]
128 | ## Whether to report per-cpu stats or not
129 | percpu = true
130 | ## Whether to report total system cpu stats or not
131 | totalcpu = true
132 | ## If true, collect raw CPU time metrics.
133 | collect_cpu_time = false
134 | # Read metrics about memory usage
135 | [[inputs.mem]]
136 | # no configuration
137 | # Read TCP metrics such as established, time wait and sockets counts.
138 | [[inputs.netstat]]
139 | # no configuration
140 | ###############################################################################
141 | # SERVICE INPUT PLUGINS #
142 | ###############################################################################
143 | # Statsd Server
144 | [[inputs.statsd]]
145 | ## Address and port to host UDP listener on
146 | service_address = ":8125"
147 | ## The following configuration options control when telegraf clears its cache
148 | ## of previous values. If set to false, then telegraf will only clear its
149 | ## cache when the daemon is restarted.
150 | ## Reset gauges every interval (default=true)
151 | delete_gauges = true
152 | ## Reset counters every interval (default=true)
153 | delete_counters = true
154 | ## Reset sets every interval (default=true)
155 | delete_sets = true
156 | ## Reset timings & histograms every interval (default=true)
157 | delete_timings = true
158 | ## Percentiles to calculate for timing & histogram stats
159 | percentiles = [90]
160 | ## separator to use between elements of a statsd metric
161 | metric_separator = "_"
162 | ## Parses tags in the datadog statsd format
163 | ## http://docs.datadoghq.com/guides/dogstatsd/
164 | parse_data_dog_tags = false
165 | ## Statsd data translation templates, more info can be read here:
166 | ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md#graphite
167 | # templates = [
168 | # "cpu.* measurement*"
169 | # ]
170 | ## Number of UDP messages allowed to queue up, once filled,
171 | ## the statsd server will start dropping packets
172 | allowed_pending_messages = 10000
173 | ## Number of timing/histogram values to track per-measurement in the
174 | ## calculation of percentiles. Raising this limit increases the accuracy
175 | ## of percentiles but also increases the memory usage and cpu time.
176 | #percentile_limit = 1000
177 | percentile_limit = 10
--------------------------------------------------------------------------------
/src/shell/waitforcluster.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | #
3 | # waits for a cluster to be ready + a run_delay
4 | #
5 | # cluster readiness is indicated by existence of ready_url, containing
6 | # a timestamp (seconds since epoch) of when it was made so. timestamp
7 | # is factored into the run_delay.
8 | #
9 |
10 | # Polling frequency in seconds
11 | POLL_TIME=4
12 |
13 | if [ $# != 2 ]; then
14 | echo "usage $0: ready_url run_delay"
15 | exit 1
16 | fi
17 | READY_URL=$1
18 | RUN_DELAY=$2
19 |
20 | # XXX: a random jitter, backoff?
21 | JITTER=0
22 |
23 | while true; do
24 | START_TIME=`wget -qO- ${READY_URL}` && break
25 | sleep $(( ${POLL_TIME} + ${JITTER} ))
26 | done
27 |
28 | CURRENT_TIME=`date +%s`
29 | SINCE=$(( ${CURRENT_TIME} - ${START_TIME} ))
30 | if [ ${SINCE} -lt 0 ]; then
31 | echo "Clock skew: ${SINCE}" >&2
32 | SINCE=0
33 | fi
34 |
35 | RUN_DELAY=$(( ${RUN_DELAY} - ${SINCE} ))
36 | if [ ${RUN_DELAY} -gt 0 ]; then
37 | FMT_START_TIME=`date '+%FT%T+00:00' -d @${START_TIME}`
38 | echo "Cluster ready @ ${FMT_START_TIME}" \
39 | "(sleeping for run_delay=${RUN_DELAY}s)"
40 | sleep $RUN_DELAY
41 | fi
42 |
--------------------------------------------------------------------------------
/test-requirements.txt:
--------------------------------------------------------------------------------
1 | -r requirements.txt
2 | nose==1.3.7
3 | mock==2.0.0
4 | coverage==4.3.4
5 | boto3==1.4.4
6 | influxdb==4.0.0
7 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/loads/ardere/0c1b7736c514d2b9fd1563cf96bbe0fd75244f95/tests/__init__.py
--------------------------------------------------------------------------------
/tests/fixtures.py:
--------------------------------------------------------------------------------
1 | sample_basic_test_plan = """
2 | {
3 | "ecs_name": "ardere-test",
4 | "name": "Loadtest",
5 | "description": "Run all APLT scenarios",
6 | "metrics_options": {
7 | "enabled": true,
8 | "dashboard": {
9 | "admin_user": "admin",
10 | "admin_password": "testing",
11 | "name": "ap-loadtester",
12 | "filename": "gf_basic_dashboard.json"
13 | }
14 | },
15 | "steps": [
16 | {
17 | "name": "TestCluster",
18 | "instance_count": 1,
19 | "instance_type": "t2.medium",
20 | "run_max_time": 140,
21 | "env": {
22 | "SOME_VAR": "great-value"
23 | },
24 | "port_mapping": [8000, 4000],
25 | "container_name": "bbangert/ap-loadtester:latest",
26 | "cmd": "./apenv/bin/aplt_testplan wss://autopush.stage.mozaws.net 'aplt.scenarios:notification_forever,1000,1,0' --statsd_host=localhost --statsd_port=8125"
27 | }
28 | ]
29 | }
30 | """
31 |
32 | sample_toml = """
33 | ecs_name = "ardere-test"
34 | name = "connection loadtest"
35 | description = "autopush: connect and idle forever"
36 |
37 |
38 | [[steps]]
39 | name = "***************** RUN #01 ***********************"
40 | instance_count = 8
41 | instance_type = "m3.medium"
42 | container_name = "bbangert/ap-loadtester:latest"
43 | cmd = "./apenv/bin/aplt_testplan wss://autopush.stage.mozaws.net 'aplt.scenarios:connect_and_idle_forever,10000,5,0'"
44 | run_max_time = 300
45 | volume_mapping = "/var/log:/var/log/$RUN_ID:rw"
46 | docker_series = "push_tests"
47 |
48 | [[steps]]
49 | name = "***************** RUN #02 ***********************"
50 | instance_count = 8
51 | run_delay = 330
52 | instance_type = "m3.medium"
53 | container_name = "bbangert/ap-loadtester:latest"
54 | cmd = "./apenv/bin/aplt_testplan wss://autopush.stage.mozaws.net 'aplt.scenarios:connect_and_idle_forever,10000,5,0'"
55 | run_max_time = 300
56 | volume_mapping = "/var/log:/var/log/$RUN_ID:rw"
57 | docker_series = "push_tests"
58 |
59 | """
60 |
61 | future_hypothetical_test="""
62 | {
63 | "name": "TestCluster",
64 | "instance_count": 1,
65 | "instance_type": "t2.medium",
66 | "run_max_time": 140,
67 | "container_name": "bbangert/pushgo:1.5rc4",
68 | "port_mapping": "8080,8081,3000,8082",
69 | "load_balancer": {
70 | "env_var": "TEST_CLUSTER",
71 | "ping_path": "/status/health",
72 | "ping_port": 8080,
73 | "ping_protocol": "http",
74 | "listeners": [
75 | {
76 | "listen_protocol": "ssl",
77 | "listen_port": 443,
78 | "backend_protocol": "tcp",
79 | "backend_port": 8080
80 | },
81 | {
82 | "listen_protocol": "https",
83 | "listen_port": 9000,
84 | "backend_protocol": "http",
85 | "backend_port": 8090
86 | }
87 | ]
88 | }
89 | }
90 | """
--------------------------------------------------------------------------------
/tests/test_aws.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import time
4 | import unittest
5 |
6 | import mock
7 | from nose.tools import assert_raises, eq_, ok_
8 |
9 | from tests import fixtures
10 |
11 |
12 | class TestECSManager(unittest.TestCase):
13 | def _make_FUT(self, plan=None):
14 | from ardere.aws import ECSManager
15 | os.environ["s3_ready_bucket"] = "test_bucket"
16 | os.environ["ecs_profile"] = "arn:something:fantastic:::"
17 | os.environ["container_log_group"] = "ardere"
18 | self.boto_mock = mock.Mock()
19 | ECSManager.boto = self.boto_mock
20 | if not plan:
21 | plan = json.loads(fixtures.sample_basic_test_plan)
22 | plan["metrics_options"] = dict(
23 | dashboard=dict(
24 | admin_user="admin",
25 | admin_password="admin"
26 | ),
27 | tear_down=False
28 | )
29 | return ECSManager(plan)
30 |
31 | def test_init(self):
32 | ecs = self._make_FUT()
33 | eq_(ecs._plan["plan_run_uuid"], ecs._plan_uuid)
34 | eq_(ecs.plan_uuid, ecs._plan_uuid)
35 |
36 | def test_ready_file(self):
37 | ecs = self._make_FUT()
38 | ready_filename = ecs.s3_ready_file
39 | ok_("test_bucket" in ready_filename)
40 | ok_(ecs._plan_uuid in ready_filename)
41 |
42 | def test_query_active(self):
43 | mock_paginator = mock.Mock()
44 | mock_paginator.paginate.return_value = [
45 | {"Reservations": [
46 | {
47 | "Instances": [
48 | {
49 | "State": {
50 | "Code": 16
51 | },
52 | "InstanceType": "t2.medium"
53 | }
54 | ]
55 | }
56 | ]}
57 | ]
58 |
59 | ecs = self._make_FUT()
60 | ecs._ec2_client.get_paginator.return_value = mock_paginator
61 | instance_dct = ecs.query_active_instances()
62 | eq_(len(instance_dct.values()), 1)
63 |
64 | def test_calculate_missing_instances(self):
65 | ecs = self._make_FUT()
66 | result = ecs.calculate_missing_instances(
67 | desired={"t2.medium": 2}, current={"t2.medium": 1}
68 | )
69 | eq_(result, {"t2.medium": 1})
70 |
71 | def test_has_metrics_node(self):
72 | mock_paginator = mock.Mock()
73 | mock_paginator.paginate.return_value = [
74 | {"Reservations": [
75 | {
76 | "Instances": [
77 | {
78 | "State": {
79 | "Code": 16
80 | },
81 | "InstanceType": "t2.medium"
82 | }
83 | ]
84 | }
85 | ]}
86 | ]
87 |
88 | ecs = self._make_FUT()
89 | ecs._ec2_client.get_paginator.return_value = mock_paginator
90 | resp = ecs.has_metrics_node("t2.medium")
91 | eq_(resp, True)
92 |
93 | def test_has_started_metric_creation(self):
94 | ecs = self._make_FUT()
95 | ecs._ecs_client.list_tasks.return_value = {"taskArns": [123]}
96 | eq_(ecs.has_started_metric_creation(), True)
97 |
98 | def test_has_finished_metric_creation(self):
99 | ecs = self._make_FUT()
100 | ecs._ecs_client.list_tasks.return_value = {"taskArns": [123]}
101 | eq_(ecs.has_finished_metric_creation(), True)
102 |
103 | def test_request_instances(self):
104 | instances = {
105 | "t2.medium": 10
106 | }
107 | ecs = self._make_FUT()
108 | ecs._ec2_client.run_instances.return_value = {
109 | "Instances": [{"InstanceId": 12345}]
110 | }
111 | ecs.request_instances(instances, ["i-382842"], {"Role": "metrics"})
112 | ecs._ec2_client.run_instances.assert_called()
113 |
114 | def test_locate_metrics_container_ip(self):
115 | ecs = self._make_FUT()
116 | ecs._ecs_client.list_container_instances.return_value = {
117 | "containerInstanceArns": ["arn:of:some:container::"]
118 | }
119 | ecs._ecs_client.describe_container_instances.return_value = {
120 | "containerInstances": [
121 | {"ec2InstanceId": "e-28193823"}
122 | ]
123 | }
124 | mock_resource = mock.Mock()
125 | ecs.boto.resource.return_value = mock_resource
126 | ecs.locate_metrics_container_ip()
127 | ecs.boto.resource.assert_called()
128 |
129 | def test_locate_metrics_container_ip_not_found(self):
130 | ecs = self._make_FUT()
131 | ecs._ecs_client.list_container_instances.return_value = {
132 | "containerInstanceArns": []
133 | }
134 | result = ecs.locate_metrics_container_ip()
135 | eq_(result, (None, None))
136 |
137 | def test_locate_metrics_service(self):
138 | ecs = self._make_FUT()
139 | ecs._ecs_client.describe_services.return_value = {
140 | "services": [
141 | {"stuff": 1, "status": "ACTIVE"}
142 | ]
143 | }
144 | result = ecs.locate_metrics_service()
145 | eq_(result, {"stuff": 1, "status": "ACTIVE"})
146 |
147 | def test_locate_metrics_service_not_found(self):
148 | ecs = self._make_FUT()
149 | ecs._ecs_client.describe_services.return_value = {
150 | "services": []
151 | }
152 | result = ecs.locate_metrics_service()
153 | eq_(result, None)
154 |
155 | def test_create_metrics_service(self):
156 | ecs = self._make_FUT()
157 |
158 | # Setup mocks
159 | ecs._ecs_client.register_task_definition.return_value = {
160 | "taskDefinition": {
161 | "taskDefinitionArn": "arn:of:some:task::"
162 | }
163 | }
164 | ecs._ecs_client.create_service.return_value = {
165 | "service": {"serviceArn": "arn:of:some:service::"}
166 | }
167 |
168 | result = ecs.create_metrics_service(dict(instance_type="c4.large"))
169 | eq_(result["service_arn"], "arn:of:some:service::")
170 |
171 | def test_run_metric_creation_task(self):
172 | ecs = self._make_FUT()
173 | ecs.run_metric_creation_task("arn:::", ("admin", "admin"),
174 | "asdf", "atitle")
175 | ecs._ecs_client.start_task.assert_called()
176 |
177 | def test_create_service(self):
178 | ecs = self._make_FUT()
179 |
180 | step = ecs._plan["steps"][0]
181 | ecs._plan["influxdb_private_ip"] = "1.1.1.1"
182 | step["docker_series"] = "default"
183 |
184 | # Setup mocks
185 | ecs._ecs_client.register_task_definition.return_value = {
186 | "taskDefinition": {
187 | "taskDefinitionArn": "arn:of:some:task::"
188 | }
189 | }
190 | ecs._ecs_client.create_service.return_value = {
191 | "service": {"serviceArn": "arn:of:some:service::"}
192 | }
193 |
194 | ecs.create_service(step)
195 |
196 | eq_(step["serviceArn"], "arn:of:some:service::")
197 | ecs._ecs_client.register_task_definition.assert_called()
198 | _, kwargs = ecs._ecs_client.register_task_definition.call_args
199 | container_def = kwargs["containerDefinitions"][0]
200 |
201 | eq_(container_def["cpu"], 1536)
202 |
203 | _, kwargs = ecs._ecs_client.register_task_definition.call_args
204 | container_def = kwargs["containerDefinitions"][0]
205 | ok_("portMappings" in container_def)
206 |
207 | def test_create_services(self):
208 | ecs = self._make_FUT()
209 | ecs.create_service = mock.Mock()
210 | ecs.create_services(ecs._plan["steps"])
211 | ecs.create_service.assert_called()
212 |
213 | def test_create_services_ecs_error(self):
214 | from botocore.exceptions import ClientError
215 | ecs = self._make_FUT()
216 |
217 | step = ecs._plan["steps"][0]
218 | ecs._plan["influxdb_private_ip"] = "1.1.1.1"
219 | step["docker_series"] = "default"
220 | ecs._ecs_client.register_task_definition.side_effect = ClientError(
221 | {"Error": {}}, "some_op"
222 | )
223 |
224 | with assert_raises(ClientError):
225 | ecs.create_services(ecs._plan["steps"])
226 |
227 | def test_service_ready_true(self):
228 | ecs = self._make_FUT()
229 | step = ecs._plan["steps"][0]
230 |
231 | ecs._ecs_client.describe_services.return_value = {
232 | "services": [{
233 | "deployments": [{
234 | "desiredCount": 2,
235 | "runningCount": 2
236 | }]
237 | }]
238 | }
239 |
240 | result = ecs.service_ready(step)
241 | eq_(result, True)
242 |
243 | def test_service_not_known_yet(self):
244 | ecs = self._make_FUT()
245 | step = ecs._plan["steps"][0]
246 |
247 | ecs._ecs_client.describe_services.return_value = {
248 | "services": []
249 | }
250 |
251 | result = ecs.service_ready(step)
252 | eq_(result, False)
253 |
254 | def test_all_services_ready(self):
255 | ecs = self._make_FUT()
256 | ecs.service_ready = mock.Mock()
257 |
258 | ecs.all_services_ready(ecs._plan["steps"])
259 | ecs.service_ready.assert_called()
260 |
261 | def test_service_done_true(self):
262 | ecs = self._make_FUT()
263 | step = ecs._plan["steps"][0]
264 |
265 | ecs._ecs_client.describe_services.return_value = {
266 | "services": [{
267 | "status": "INACTIVE"
268 | }]
269 | }
270 |
271 | result = ecs.service_done(step)
272 | eq_(result, True)
273 |
274 | def test_service_not_known(self):
275 | ecs = self._make_FUT()
276 | step = ecs._plan["steps"][0]
277 |
278 | ecs._ecs_client.describe_services.return_value = {
279 | "services": [{
280 | "status": "DRAINING"
281 | }]
282 | }
283 |
284 | result = ecs.service_done(step)
285 | eq_(result, False)
286 |
287 | def test_all_services_done(self):
288 | ecs = self._make_FUT()
289 | ecs.service_done = mock.Mock()
290 | ecs.all_services_done(ecs._plan["steps"])
291 | ecs.service_done.assert_called()
292 |
293 | def test_stop_finished_service_stopped(self):
294 | ecs = self._make_FUT()
295 | ecs._ecs_client.update_service = mock.Mock()
296 | step = ecs._plan["steps"][0]
297 | step["service_status"] = "STARTED"
298 | past = time.time() - 400
299 | ecs.stop_finished_service(past, step)
300 | ecs._ecs_client.update_service.assert_called()
301 | eq_(step["service_status"], "STOPPED")
302 |
303 | def test_stop_finished_service_stop_already_stopped(self):
304 | ecs = self._make_FUT()
305 | ecs._ecs_client.update_service = mock.Mock()
306 | step = ecs._plan["steps"][0]
307 | step["service_status"] = "STOPPED"
308 | past = time.time() - 400
309 | ecs.stop_finished_service(past, step)
310 | ecs._ecs_client.update_service.assert_not_called()
311 | eq_(step["service_status"], "STOPPED")
312 |
313 | def test_stop_finished_service_still_running(self):
314 | ecs = self._make_FUT()
315 | ecs._ecs_client.update_service = mock.Mock()
316 | step = ecs._plan["steps"][0]
317 | step["service_status"] = "STARTED"
318 | past = time.time() - 100
319 | ecs.stop_finished_service(past, step)
320 | ecs._ecs_client.update_service.assert_not_called()
321 | eq_(step["service_status"], "STARTED")
322 |
323 | def test_stop_finished_services(self):
324 | ecs = self._make_FUT()
325 | ecs.stop_finished_service = mock.Mock()
326 |
327 | past = time.time() - 100
328 | ecs.stop_finished_services(past, ecs._plan["steps"])
329 | ecs.stop_finished_service.assert_called()
330 |
331 | def test_shutdown_plan(self):
332 | mock_paginator = mock.Mock()
333 | mock_paginator.paginate.return_value = [
334 | {"serviceArns": ["arn:123:::", "arn:456:::"]}
335 | ]
336 |
337 | ecs = self._make_FUT()
338 | ecs.locate_metrics_service = mock.Mock()
339 | ecs.locate_metrics_service.return_value = dict(
340 | serviceArn="arn:456:::"
341 | )
342 | ecs._ecs_client.get_paginator.return_value = mock_paginator
343 | ecs._ecs_client.describe_task_definition.return_value = {
344 | "taskDefinition": {"taskDefinitionArn": "arn:task:::"}
345 | }
346 |
347 | ecs.shutdown_plan(ecs._plan["steps"])
348 | ecs._ecs_client.deregister_task_definition.assert_called()
349 | ecs._ecs_client.delete_service.assert_called()
350 |
351 | def test_shutdown_plan_update_error(self):
352 | from botocore.exceptions import ClientError
353 |
354 | mock_paginator = mock.Mock()
355 | mock_paginator.paginate.return_value = [
356 | {"serviceArns": ["arn:123:::", "arn:456:::"]}
357 | ]
358 |
359 | ecs = self._make_FUT()
360 | ecs.locate_metrics_service = mock.Mock()
361 | ecs.locate_metrics_service.return_value = dict(
362 | serviceArn="arn:456:::"
363 | )
364 | ecs._ecs_client.get_paginator.return_value = mock_paginator
365 | ecs._ecs_client.describe_task_definition.return_value = {
366 | "taskDefinition": {"taskDefinitionArn": "arn:task:::"}
367 | }
368 | ecs._ecs_client.update_service.side_effect = ClientError(
369 | {"Error": {}}, "some_op"
370 | )
371 |
372 | ecs.shutdown_plan(ecs._plan["steps"])
373 | ecs._ecs_client.delete_service.assert_not_called()
374 |
375 | def test_shutdown_plan_describe_error(self):
376 | from botocore.exceptions import ClientError
377 |
378 | mock_paginator = mock.Mock()
379 | mock_paginator.paginate.return_value = [
380 | {"serviceArns": ["arn:123:::", "arn:456:::"]}
381 | ]
382 |
383 | ecs = self._make_FUT()
384 | ecs.locate_metrics_service = mock.Mock()
385 | ecs.locate_metrics_service.return_value = dict(
386 | serviceArn="arn:456:::"
387 | )
388 | ecs._plan["steps"] = ecs._plan["steps"][:1]
389 | ecs._ecs_client.get_paginator.return_value = mock_paginator
390 | ecs._ecs_client.describe_task_definition.side_effect = ClientError(
391 | {"Error": {}}, "some_op"
392 | )
393 |
394 | ecs.shutdown_plan(ecs._plan["steps"])
395 | ecs._ecs_client.deregister_task_definition.assert_not_called()
396 |
397 | def test_shutdown_plan_delete_error(self):
398 | from botocore.exceptions import ClientError
399 |
400 | mock_paginator = mock.Mock()
401 | mock_paginator.paginate.return_value = [
402 | {"serviceArns": ["arn:123:::", "arn:456:::"]}
403 | ]
404 |
405 | ecs = self._make_FUT()
406 | ecs.locate_metrics_service = mock.Mock()
407 | ecs.locate_metrics_service.return_value = dict(
408 | serviceArn="arn:456:::"
409 | )
410 | ecs._ecs_client.get_paginator.return_value = mock_paginator
411 | ecs._ecs_client.describe_task_definition.return_value = {
412 | "taskDefinition": {"taskDefinitionArn": "arn:task:::"}
413 | }
414 | ecs._ecs_client.delete_service.side_effect = ClientError(
415 | {"Error": {}}, "some_op"
416 | )
417 |
418 | ecs.shutdown_plan(ecs._plan["steps"])
419 | ecs._ecs_client.delete_service.assert_called()
420 |
421 | def test_shutdown_plan_deregister_error(self):
422 | from botocore.exceptions import ClientError
423 |
424 | mock_paginator = mock.Mock()
425 | mock_paginator.paginate.return_value = [
426 | {"serviceArns": ["arn:123:::", "arn:456:::"]}
427 | ]
428 |
429 | ecs = self._make_FUT()
430 | ecs.locate_metrics_service = mock.Mock()
431 | ecs.locate_metrics_service.return_value = dict(
432 | serviceArn="arn:456:::"
433 | )
434 | ecs._plan["metrics_options"]["tear_down"] = True
435 | ecs._ecs_client.get_paginator.return_value = mock_paginator
436 | ecs._ecs_client.describe_task_definition.return_value = {
437 | "taskDefinition": {"taskDefinitionArn": "arn:task:::"}
438 | }
439 | ecs._ecs_client.deregister_task_definition.side_effect = ClientError(
440 | {"Error": {}}, "some_op"
441 | )
442 |
443 | ecs.shutdown_plan(ecs._plan["steps"])
444 | ecs._ecs_client.delete_service.assert_called()
445 |
--------------------------------------------------------------------------------
/tests/test_metric_creator.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 |
4 | import mock
5 | from nose.tools import assert_raises, eq_
6 |
7 |
class TestMetricRunner(unittest.TestCase):
    """Tests for ardere.scripts.metric_creator.DashboardSetup."""

    def _make_FUT(self):
        from ardere.scripts.metric_creator import DashboardSetup
        # DashboardSetup reads its influxdb name from the environment.
        os.environ["__ARDERE_INFLUXDB_NAME__"] = "ardere"
        return DashboardSetup()

    def test_load_dashboard(self):
        ds = self._make_FUT()

        # One mock serves as both the S3 object and its streaming body.
        fake_file = mock.Mock()
        fake_file.get.return_value = {"Body": fake_file}
        fake_file.read.return_value = "{}".encode(
            'utf-8')
        fake_s3 = mock.Mock()
        fake_s3.Object.return_value = fake_file

        ds.boto = mock.Mock()
        ds.boto.resource.return_value = fake_s3
        ds.dashboard = "asdf:asdf"
        eq_(ds._load_dashboard(), dict(id=None, title=None))

    def test_create_dashboard(self):
        ds = self._make_FUT()
        ds.req = mock.Mock()
        ds._load_dashboard = mock.Mock()
        # Status 200 takes the success path.
        ds.req.post.return_value = mock.Mock(status_code=200)
        ds._create_dashboard("http://localhost")
        ds._load_dashboard.assert_called()

    def test_create_dashboard_exception(self):
        ds = self._make_FUT()
        ds.req = mock.Mock()
        ds._load_dashboard = mock.Mock()
        # A non-success status should surface as an exception.
        ds.req.post.return_value = mock.Mock(status_code=500)
        assert_raises(Exception, ds._create_dashboard, "http://localhost")

    def test_ensure_dashboard_create(self):
        ds = self._make_FUT()
        ds.req = mock.Mock()
        ds._create_dashboard = mock.Mock()
        # An empty dashboard listing should trigger creation.
        listing = mock.Mock(status_code=200)
        listing.json.return_value = []
        ds.req.get.return_value = listing

        ds._ensure_dashboard("http://localhost")
        ds._create_dashboard.assert_called()

    def test_ensure_dashboard_exception(self):
        ds = self._make_FUT()
        ds.req = mock.Mock()
        ds.req.get.return_value = mock.Mock(status_code=500)
        assert_raises(Exception, ds._ensure_dashboard, "http://localhost")

    def test_create_datasources(self):
        ds = self._make_FUT()
        ds.dashboard = True
        ds.influx = mock.Mock()
        ds.req = mock.Mock()
        ds._ensure_dashboard = mock.Mock()
        influx_client = mock.Mock()
        ds.influx.InfluxDBClient.return_value = influx_client

        ds.create_datasources()
        influx_client.create_database.assert_called()
        ds._ensure_dashboard.assert_called()
77 |
--------------------------------------------------------------------------------
/tests/test_step_functions.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import time
4 | import unittest
5 | import uuid
6 |
7 | import mock
8 | from botocore.exceptions import ClientError
9 | from nose.tools import eq_, assert_raises
10 |
11 | from tests import fixtures
12 |
13 |
class TestAsyncPlanRunner(unittest.TestCase):
    """Tests for ardere.step_functions.AsynchronousPlanRunner."""

    def setUp(self):
        # Replace ECSManager with a mock so no AWS calls are made.
        self.mock_ecs = mock.Mock()
        self._patcher = mock.patch("ardere.step_functions.ECSManager")
        self._patcher.start().return_value = self.mock_ecs

        from ardere.step_functions import AsynchronousPlanRunner

        self.plan = json.loads(fixtures.sample_basic_test_plan)
        self.runner = AsynchronousPlanRunner(self.plan, {})
        self.runner.boto = self.mock_boto = mock.Mock()

    def tearDown(self):
        self._patcher.stop()

    @staticmethod
    def _deployments(desired, running):
        # Shape of an ECS service record with a single deployment.
        return {"deployments": [{"desiredCount": desired,
                                 "runningCount": running}]}

    def _s3_with_timestamp(self, age):
        """Return a mock S3 resource whose object body is a timestamp
        *age* seconds in the past (one mock doubles as object and body)."""
        body = mock.Mock()
        body.get.return_value = {"Body": body}
        body.read.return_value = "{}".format(
            int(time.time()) - age).encode(
            'utf-8')
        s3 = mock.Mock()
        s3.Object.return_value = body
        return s3

    def test_build_instance_map(self):
        instance_map = self.runner._build_instance_map()
        eq_(len(instance_map), 1)
        eq_(instance_map, {"t2.medium": 1})

    def test_find_test_plan_duration(self):
        eq_(self.runner._find_test_plan_duration(), 140)

    def test_load_toml(self):
        from ardere.step_functions import AsynchronousPlanRunner

        self.runner = AsynchronousPlanRunner({"toml": fixtures.sample_toml},
                                             None)
        event = self.runner.event
        eq_(len(event["steps"]), 2)
        eq_(event["steps"][0]["instance_count"], 8)
        eq_(event["ecs_name"], "ardere-test")

    def test_populate_missing_instances(self):
        os.environ["ec2_sg"] = "i-23232"
        os.environ["metric_sg"] = "i-84828"
        self.mock_ecs.has_metrics_node.return_value = False
        self.runner.populate_missing_instances()
        self.mock_ecs.query_active_instances.assert_called()
        self.mock_ecs.request_instances.assert_called()

    def test_populate_missing_instances_fail(self):
        from ardere.exceptions import ValidationException
        ecs_client = mock.Mock()
        ecs_client.describe_clusters.return_value = {"clusters": []}
        self.mock_boto.client.return_value = ecs_client
        assert_raises(ValidationException,
                      self.runner.populate_missing_instances)

    def test_ensure_metrics_available_running_create(self):
        from ardere.exceptions import ServicesStartingException

        self.plan["metrics_options"] = dict(enabled=True)
        # No existing metrics service -> one should be created.
        self.mock_ecs.locate_metrics_service.return_value = None

        assert_raises(ServicesStartingException,
                      self.runner.ensure_metrics_available)
        self.mock_ecs.create_metrics_service.assert_called()

    def test_ensure_metrics_available_running_waiting(self):
        from ardere.exceptions import ServicesStartingException

        self.plan["metrics_options"] = dict(enabled=True)
        # Service exists but its task hasn't started running yet.
        self.mock_ecs.locate_metrics_service.return_value = \
            self._deployments(1, 0)

        assert_raises(ServicesStartingException,
                      self.runner.ensure_metrics_available)

    def test_ensure_metrics_available_running_error(self):
        self.plan["metrics_options"] = dict(enabled=True)
        self.mock_ecs.locate_metrics_service.return_value = \
            self._deployments(1, 1)
        # Running service without a locatable container IP is an error.
        self.mock_ecs.locate_metrics_container_ip.return_value = None

        assert_raises(Exception, self.runner.ensure_metrics_available)

    def test_ensure_metrics_available_running(self):
        os.environ["metrics_bucket"] = "metrics"
        self.plan["metrics_options"] = dict(
            enabled=True,
            dashboard=dict(admin_user="admin",
                           admin_password="admin", name="fred",
                           filename="smith")
        )
        self.mock_ecs.locate_metrics_service.return_value = \
            self._deployments(1, 1)
        self.mock_ecs.locate_metrics_container_ip.return_value = (
            "1.1.1.1", "arn:::"
        )

        self.runner.ensure_metrics_available()
        self.mock_ecs.locate_metrics_container_ip.assert_called()

    def test_ensure_metrics_available_running_no_metric_ip(self):
        os.environ["metrics_bucket"] = "metrics"
        self.plan["metrics_options"] = dict(
            enabled=True,
            dashboard=dict(admin_user="admin",
                           admin_password="admin", name="fred",
                           filename="smith")
        )
        self.mock_ecs.locate_metrics_service.return_value = \
            self._deployments(1, 1)
        self.mock_ecs.locate_metrics_container_ip.return_value = (None, None)

        assert_raises(Exception, self.runner.ensure_metrics_available)
        self.mock_ecs.locate_metrics_container_ip.assert_called()

    def test_ensure_metrics_available_disabled(self):
        # Disabled metrics: call should be a no-op and not raise.
        self.plan["metrics_options"] = dict(enabled=False)
        self.runner.ensure_metrics_available()

    def test_ensure_metric_sources_created(self):
        os.environ["metrics_bucket"] = "metrics"
        self.plan["influxdb_private_ip"] = "1.1.1.1"
        self.plan["metrics_options"] = dict(enabled=True, dashboard=dict())
        self.mock_ecs.has_started_metric_creation.return_value = True
        self.runner.ensure_metric_sources_created()
        self.mock_ecs.has_started_metric_creation.assert_called()

    def test_ensure_metric_sources_created_not_finished(self):
        from ardere.exceptions import CreatingMetricSourceException
        os.environ["metrics_bucket"] = "metrics"
        self.plan["influxdb_private_ip"] = "1.1.1.1"
        self.plan["metrics_options"] = dict(enabled=True)
        # Started but not finished -> still waiting.
        self.mock_ecs.has_started_metric_creation.return_value = True
        self.mock_ecs.has_finished_metric_creation.return_value = False
        assert_raises(CreatingMetricSourceException,
                      self.runner.ensure_metric_sources_created)
        self.mock_ecs.has_started_metric_creation.assert_called()

    def test_ensure_metric_sources_created_not_enabled(self):
        self.plan["metrics_options"] = dict(enabled=False, dashboard=dict())
        self.runner.ensure_metric_sources_created()

    def test_ensure_metric_sources_created_not_started(self):
        from ardere.exceptions import CreatingMetricSourceException
        os.environ["metrics_bucket"] = "metrics"
        self.plan["influxdb_private_ip"] = "1.1.1.1"
        self.plan["metric_container_arn"] = "arn:::"
        self.plan["metrics_options"] = dict(
            enabled=True,
            dashboard=dict(
                admin_user="admin",
                admin_password="admin",
                filename="asdf",
                name="a title"
            )
        )
        self.mock_ecs.has_started_metric_creation.return_value = False
        assert_raises(CreatingMetricSourceException,
                      self.runner.ensure_metric_sources_created)
        self.mock_ecs.has_started_metric_creation.assert_called()

    def test_ensure_metric_sources_created_not_started_no_dash(self):
        from ardere.exceptions import CreatingMetricSourceException
        os.environ["metrics_bucket"] = "metrics"
        self.plan["influxdb_private_ip"] = "1.1.1.1"
        self.plan["metric_container_arn"] = "arn:::"
        self.plan["metrics_options"] = dict(enabled=True)
        self.mock_ecs.has_started_metric_creation.return_value = False
        assert_raises(CreatingMetricSourceException,
                      self.runner.ensure_metric_sources_created)
        self.mock_ecs.has_started_metric_creation.assert_called()

    def test_create_ecs_services(self):
        self.runner.create_ecs_services()
        self.mock_ecs.create_services.assert_called_with(self.plan["steps"])

    def test_wait_for_cluster_ready_not_ready(self):
        from ardere.exceptions import ServicesStartingException

        self.mock_ecs.all_services_ready.return_value = False
        assert_raises(ServicesStartingException,
                      self.runner.wait_for_cluster_ready)

    def test_wait_for_cluster_ready_all_ready(self):
        self.mock_ecs.all_services_ready.return_value = True
        self.runner.wait_for_cluster_ready()
        self.mock_ecs.all_services_ready.assert_called()

    def test_signal_cluster_start(self):
        self.plan["plan_run_uuid"] = str(uuid.uuid4())
        self.runner.signal_cluster_start()
        self.mock_boto.client.assert_called()

    def test_check_for_cluster_done_not_done(self):
        os.environ["s3_ready_bucket"] = "test_bucket"
        # Start timestamp only 100s old -> no shutdown expected.
        self.mock_boto.resource.return_value = self._s3_with_timestamp(100)
        self.plan["plan_run_uuid"] = str(uuid.uuid4())
        self.runner.check_for_cluster_done()

    def test_check_for_cluster_done_shutdown(self):
        from ardere.exceptions import ShutdownPlanException

        os.environ["s3_ready_bucket"] = "test_bucket"
        # Older (400s) start timestamp -> plan should be shut down.
        self.mock_boto.resource.return_value = self._s3_with_timestamp(400)
        self.plan["plan_run_uuid"] = str(uuid.uuid4())
        assert_raises(ShutdownPlanException, self.runner.check_for_cluster_done)

    def test_check_for_cluster_done_object_error(self):
        from ardere.exceptions import ShutdownPlanException

        os.environ["s3_ready_bucket"] = "test_bucket"
        s3 = self._s3_with_timestamp(400)
        # An errored S3 object lookup should also trigger shutdown.
        s3.Object.side_effect = ClientError(
            {"Error": {}}, None
        )
        self.mock_boto.resource.return_value = s3
        self.plan["plan_run_uuid"] = str(uuid.uuid4())
        assert_raises(ShutdownPlanException, self.runner.check_for_cluster_done)

    def test_cleanup_cluster(self):
        self.plan["plan_run_uuid"] = str(uuid.uuid4())
        self.runner.cleanup_cluster()
        self.mock_boto.resource.assert_called()

    def test_cleanup_cluster_error(self):
        self.plan["plan_run_uuid"] = str(uuid.uuid4())
        s3 = mock.Mock()
        self.mock_boto.resource.return_value = s3
        # Cleanup should tolerate S3 object errors.
        s3.Object.side_effect = ClientError(
            {"Error": {}}, None
        )
        self.runner.cleanup_cluster()
        s3.Object.assert_called()

    def test_drain_check_draining(self):
        from ardere.exceptions import UndrainedInstancesException
        # Done -> passes; not done -> raises.
        self.mock_ecs.all_services_done.return_value = True
        self.runner.check_drained()
        self.mock_ecs.all_services_done.return_value = False
        assert_raises(UndrainedInstancesException,
                      self.runner.check_drained)
303 |
304 |
class TestValidation(unittest.TestCase):
    """Tests for ardere.step_functions.PlanValidator."""

    def _make_FUT(self):
        from ardere.step_functions import PlanValidator
        return PlanValidator()

    def test_validate_success(self):
        validator = self._make_FUT()
        validator.context["boto"] = mock.Mock()
        plan = json.loads(fixtures.sample_basic_test_plan)
        data, errors = validator.load(plan)
        eq_(errors, {})
        eq_(len(data["steps"]), len(plan["steps"]))

    def test_validate_fail_ecs_name(self):
        validator = self._make_FUT()
        validator.context["boto"] = mock.Mock()
        plan = json.loads(fixtures.sample_basic_test_plan)

        # Empty name is rejected.
        plan['ecs_name'] = ''
        _, errors = validator.load(plan)
        eq_(errors, {'ecs_name': ['Plan ecs_name missing']})

        # Illegal character is rejected.
        plan['ecs_name'] += '*'
        _, errors = validator.load(plan)
        eq_(errors, {'ecs_name':
                     ['Plan ecs_name contained invalid characters']})

        # Over-long name is rejected.
        plan['ecs_name'] = 'a' * 512
        _, errors = validator.load(plan)
        eq_(errors, {'ecs_name': ['Plan ecs_name too long']})

    def test_validate_fail_step_name(self):
        validator = self._make_FUT()
        validator.context["boto"] = mock.Mock()
        plan = json.loads(fixtures.sample_basic_test_plan)

        # Empty step name is rejected.
        plan['steps'][0]['name'] = ''
        _, errors = validator.load(plan)
        eq_(errors, {'steps': {0: {'name': ['Step name missing']}}})

        # Illegal character is rejected.
        plan['steps'][0]['name'] = '*'
        _, errors = validator.load(plan)
        eq_(errors,
            {'steps': {0: {'name': ['Step name contains invalid characters']}}}
            )

        # Over-long step name is rejected.
        plan['steps'][0]['name'] = 'a' * 512
        _, errors = validator.load(plan)
        eq_(errors, {'steps': {0: {'name': ['Step name too long']}}})

    def test_validate_fail(self):
        validator = self._make_FUT()
        validator.context["boto"] = boto = mock.Mock()
        # An empty cluster listing should produce one validation error.
        ecs_client = mock.Mock()
        boto.client.return_value = ecs_client
        ecs_client.describe_clusters.return_value = {"clusters": []}
        plan = json.loads(fixtures.sample_basic_test_plan)
        data, errors = validator.load(plan)
        eq_(len(data["steps"]), len(plan["steps"]))
        eq_(len(errors), 1)
--------------------------------------------------------------------------------