├── .dockerignore ├── .gitignore ├── .gitmodules ├── Dockerfile ├── LICENSE ├── README.md ├── bin ├── manage.py ├── manager │ ├── __init__.py │ ├── client.py │ ├── config.py │ ├── discovery.py │ ├── env.py │ ├── network.py │ ├── storage │ │ ├── __init__.py │ │ ├── local.py │ │ ├── manta_stor.py │ │ └── minio_stor.py │ └── utils.py └── test.py ├── etc ├── containerpilot.json5 └── my.cnf.tmpl ├── examples ├── compose │ └── docker-compose.yml └── triton │ ├── docker-compose.yml │ └── setup.sh ├── makefile └── tests ├── Dockerfile ├── compose.sh ├── tests.py └── triton.sh /.dockerignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | **/.DS_Store 3 | _env* 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # build outputs 2 | *.pyc 3 | 4 | # credentials 5 | _env* 6 | manta 7 | manta.pub 8 | 9 | # temp 10 | python-manta/ 11 | tmp/ 12 | 13 | # macos frustration 14 | .DS_Store 15 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "tests/triton-docker-cli"] 2 | path = tests/triton-docker-cli 3 | url = https://github.com/joyent/triton-docker-cli 4 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM percona:5.6 2 | 3 | # By keeping a lot of discrete steps in a single RUN we can clean up after 4 | # ourselves in the same layer. This is gross but it saves ~100MB in the image. 5 | RUN set -ex \ 6 | && export buildDeps='python-dev gcc unzip' \ 7 | && export runDeps='python curl libffi-dev libssl-dev percona-xtrabackup ca-certificates' \ 8 | && apt-get update \ 9 | && apt-get install -y $buildDeps $runDeps --no-install-recommends \ 10 | # \ 11 | # get Python drivers for MySQL, Consul, and Manta \ 12 | # \ 13 | && curl -Lsfo /tmp/mysql-connector.deb http://dev.mysql.com/get/Downloads/Connector-Python/mysql-connector-python_2.1.3-1debian8.2_all.deb \ 14 | && dpkg -i /tmp/mysql-connector.deb \ 15 | && curl -Lsfo /tmp/mysql-utils.deb http://dev.mysql.com/get/Downloads/MySQLGUITools/mysql-utilities_1.5.6-1debian8_all.deb \ 16 | && dpkg -i /tmp/mysql-utils.deb \ 17 | && curl -Lsfo get-pip.py https://bootstrap.pypa.io/get-pip.py \ 18 | && python get-pip.py \ 19 | && pip install \ 20 | python-Consul==0.7.0 \ 21 | manta==2.5.0 \ 22 | minio==2.2.4 \ 23 | mock==2.0.0 \ 24 | json5==0.2.4 \ 25 | # \ 26 | # Add Consul from https://releases.hashicorp.com/consul \ 27 | # \ 28 | && export CHECKSUM=c8859a0a34c50115cdff147f998b2b63226f5f052e50f342209142420d1c2668 \ 29 | && curl -Lsfo /tmp/consul.zip https://releases.hashicorp.com/consul/0.8.4/consul_0.8.4_linux_amd64.zip \ 30 | && echo "${CHECKSUM} /tmp/consul.zip" | sha256sum -c \ 31 | && unzip /tmp/consul.zip -d /usr/local/bin \ 32 | && rm /tmp/consul.zip \ 33 | && mkdir /config \ 34 | # \ 35 | # clean up to minimize image layer size \ 36 | # \ 37 | && rm -rf /var/lib/apt/lists/* \ 38 | && apt-get purge -y --auto-remove $buildDeps \ 39 | && rm /tmp/mysql-connector.deb \ 40 | && rm /tmp/mysql-utils.deb \ 41 | && rm /get-pip.py \ 42 | && rm /docker-entrypoint.sh 43 | 44 | 45 | ENV CONTAINERPILOT_VER 3.1.1 46 | ENV CONTAINERPILOT /etc/containerpilot.json5 47 | 48 | # Add ContainerPilot 49 | RUN set -ex \ 50 | && export 
CONTAINERPILOT_CHECKSUM=1f159207c7dc2b622f693754f6dda77c82a88263 \ 51 | && curl -Lsfo /tmp/containerpilot.tar.gz "https://github.com/joyent/containerpilot/releases/download/${CONTAINERPILOT_VER}/containerpilot-${CONTAINERPILOT_VER}.tar.gz" \ 52 | && echo "${CONTAINERPILOT_CHECKSUM} /tmp/containerpilot.tar.gz" | sha1sum -c \ 53 | && tar zxf /tmp/containerpilot.tar.gz -C /usr/local/bin \ 54 | && rm /tmp/containerpilot.tar.gz 55 | 56 | # configure ContainerPilot and MySQL 57 | COPY etc/* /etc/ 58 | COPY bin/manager /usr/local/bin/manager 59 | COPY bin/test.py /usr/local/bin/test.py 60 | COPY bin/manage.py /usr/local/bin/manage.py 61 | 62 | # override the parent entrypoint 63 | ENTRYPOINT [] 64 | CMD ["/usr/local/bin/containerpilot"] 65 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Mozilla Public License, version 2.0 2 | 3 | 1. Definitions 4 | 5 | 1.1. "Contributor" 6 | 7 | means each individual or legal entity that creates, contributes to the 8 | creation of, or owns Covered Software. 9 | 10 | 1.2. "Contributor Version" 11 | 12 | means the combination of the Contributions of others (if any) used by a 13 | Contributor and that particular Contributor's Contribution. 14 | 15 | 1.3. "Contribution" 16 | 17 | means Covered Software of a particular Contributor. 18 | 19 | 1.4. "Covered Software" 20 | 21 | means Source Code Form to which the initial Contributor has attached the 22 | notice in Exhibit A, the Executable Form of such Source Code Form, and 23 | Modifications of such Source Code Form, in each case including portions 24 | thereof. 25 | 26 | 1.5. "Incompatible With Secondary Licenses" 27 | means 28 | 29 | a. that the initial Contributor has attached the notice described in 30 | Exhibit B to the Covered Software; or 31 | 32 | b. that the Covered Software was made available under the terms of 33 | version 1.1 or earlier of the License, but not also under the terms of 34 | a Secondary License. 35 | 36 | 1.6. "Executable Form" 37 | 38 | means any form of the work other than Source Code Form. 39 | 40 | 1.7. "Larger Work" 41 | 42 | means a work that combines Covered Software with other material, in a 43 | separate file or files, that is not Covered Software. 44 | 45 | 1.8. "License" 46 | 47 | means this document. 48 | 49 | 1.9. "Licensable" 50 | 51 | means having the right to grant, to the maximum extent possible, whether 52 | at the time of the initial grant or subsequently, any and all of the 53 | rights conveyed by this License. 54 | 55 | 1.10. "Modifications" 56 | 57 | means any of the following: 58 | 59 | a. any file in Source Code Form that results from an addition to, 60 | deletion from, or modification of the contents of Covered Software; or 61 | 62 | b. any new file in Source Code Form that contains any Covered Software. 63 | 64 | 1.11. "Patent Claims" of a Contributor 65 | 66 | means any patent claim(s), including without limitation, method, 67 | process, and apparatus claims, in any patent Licensable by such 68 | Contributor that would be infringed, but for the grant of the License, 69 | by the making, using, selling, offering for sale, having made, import, 70 | or transfer of either its Contributions or its Contributor Version. 71 | 72 | 1.12. 
"Secondary License" 73 | 74 | means either the GNU General Public License, Version 2.0, the GNU Lesser 75 | General Public License, Version 2.1, the GNU Affero General Public 76 | License, Version 3.0, or any later versions of those licenses. 77 | 78 | 1.13. "Source Code Form" 79 | 80 | means the form of the work preferred for making modifications. 81 | 82 | 1.14. "You" (or "Your") 83 | 84 | means an individual or a legal entity exercising rights under this 85 | License. For legal entities, "You" includes any entity that controls, is 86 | controlled by, or is under common control with You. For purposes of this 87 | definition, "control" means (a) the power, direct or indirect, to cause 88 | the direction or management of such entity, whether by contract or 89 | otherwise, or (b) ownership of more than fifty percent (50%) of the 90 | outstanding shares or beneficial ownership of such entity. 91 | 92 | 93 | 2. License Grants and Conditions 94 | 95 | 2.1. Grants 96 | 97 | Each Contributor hereby grants You a world-wide, royalty-free, 98 | non-exclusive license: 99 | 100 | a. under intellectual property rights (other than patent or trademark) 101 | Licensable by such Contributor to use, reproduce, make available, 102 | modify, display, perform, distribute, and otherwise exploit its 103 | Contributions, either on an unmodified basis, with Modifications, or 104 | as part of a Larger Work; and 105 | 106 | b. under Patent Claims of such Contributor to make, use, sell, offer for 107 | sale, have made, import, and otherwise transfer either its 108 | Contributions or its Contributor Version. 109 | 110 | 2.2. Effective Date 111 | 112 | The licenses granted in Section 2.1 with respect to any Contribution 113 | become effective for each Contribution on the date the Contributor first 114 | distributes such Contribution. 115 | 116 | 2.3. Limitations on Grant Scope 117 | 118 | The licenses granted in this Section 2 are the only rights granted under 119 | this License. No additional rights or licenses will be implied from the 120 | distribution or licensing of Covered Software under this License. 121 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 122 | Contributor: 123 | 124 | a. for any code that a Contributor has removed from Covered Software; or 125 | 126 | b. for infringements caused by: (i) Your and any other third party's 127 | modifications of Covered Software, or (ii) the combination of its 128 | Contributions with other software (except as part of its Contributor 129 | Version); or 130 | 131 | c. under Patent Claims infringed by Covered Software in the absence of 132 | its Contributions. 133 | 134 | This License does not grant any rights in the trademarks, service marks, 135 | or logos of any Contributor (except as may be necessary to comply with 136 | the notice requirements in Section 3.4). 137 | 138 | 2.4. Subsequent Licenses 139 | 140 | No Contributor makes additional grants as a result of Your choice to 141 | distribute the Covered Software under a subsequent version of this 142 | License (see Section 10.2) or under the terms of a Secondary License (if 143 | permitted under the terms of Section 3.3). 144 | 145 | 2.5. Representation 146 | 147 | Each Contributor represents that the Contributor believes its 148 | Contributions are its original creation(s) or it has sufficient rights to 149 | grant the rights to its Contributions conveyed by this License. 150 | 151 | 2.6. 
Fair Use 152 | 153 | This License is not intended to limit any rights You have under 154 | applicable copyright doctrines of fair use, fair dealing, or other 155 | equivalents. 156 | 157 | 2.7. Conditions 158 | 159 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted in 160 | Section 2.1. 161 | 162 | 163 | 3. Responsibilities 164 | 165 | 3.1. Distribution of Source Form 166 | 167 | All distribution of Covered Software in Source Code Form, including any 168 | Modifications that You create or to which You contribute, must be under 169 | the terms of this License. You must inform recipients that the Source 170 | Code Form of the Covered Software is governed by the terms of this 171 | License, and how they can obtain a copy of this License. You may not 172 | attempt to alter or restrict the recipients' rights in the Source Code 173 | Form. 174 | 175 | 3.2. Distribution of Executable Form 176 | 177 | If You distribute Covered Software in Executable Form then: 178 | 179 | a. such Covered Software must also be made available in Source Code Form, 180 | as described in Section 3.1, and You must inform recipients of the 181 | Executable Form how they can obtain a copy of such Source Code Form by 182 | reasonable means in a timely manner, at a charge no more than the cost 183 | of distribution to the recipient; and 184 | 185 | b. You may distribute such Executable Form under the terms of this 186 | License, or sublicense it under different terms, provided that the 187 | license for the Executable Form does not attempt to limit or alter the 188 | recipients' rights in the Source Code Form under this License. 189 | 190 | 3.3. Distribution of a Larger Work 191 | 192 | You may create and distribute a Larger Work under terms of Your choice, 193 | provided that You also comply with the requirements of this License for 194 | the Covered Software. If the Larger Work is a combination of Covered 195 | Software with a work governed by one or more Secondary Licenses, and the 196 | Covered Software is not Incompatible With Secondary Licenses, this 197 | License permits You to additionally distribute such Covered Software 198 | under the terms of such Secondary License(s), so that the recipient of 199 | the Larger Work may, at their option, further distribute the Covered 200 | Software under the terms of either this License or such Secondary 201 | License(s). 202 | 203 | 3.4. Notices 204 | 205 | You may not remove or alter the substance of any license notices 206 | (including copyright notices, patent notices, disclaimers of warranty, or 207 | limitations of liability) contained within the Source Code Form of the 208 | Covered Software, except that You may alter any license notices to the 209 | extent required to remedy known factual inaccuracies. 210 | 211 | 3.5. Application of Additional Terms 212 | 213 | You may choose to offer, and to charge a fee for, warranty, support, 214 | indemnity or liability obligations to one or more recipients of Covered 215 | Software. However, You may do so only on Your own behalf, and not on 216 | behalf of any Contributor. You must make it absolutely clear that any 217 | such warranty, support, indemnity, or liability obligation is offered by 218 | You alone, and You hereby agree to indemnify every Contributor for any 219 | liability incurred by such Contributor as a result of warranty, support, 220 | indemnity or liability terms You offer. You may include additional 221 | disclaimers of warranty and limitations of liability specific to any 222 | jurisdiction. 
223 | 224 | 4. Inability to Comply Due to Statute or Regulation 225 | 226 | If it is impossible for You to comply with any of the terms of this License 227 | with respect to some or all of the Covered Software due to statute, 228 | judicial order, or regulation then You must: (a) comply with the terms of 229 | this License to the maximum extent possible; and (b) describe the 230 | limitations and the code they affect. Such description must be placed in a 231 | text file included with all distributions of the Covered Software under 232 | this License. Except to the extent prohibited by statute or regulation, 233 | such description must be sufficiently detailed for a recipient of ordinary 234 | skill to be able to understand it. 235 | 236 | 5. Termination 237 | 238 | 5.1. The rights granted under this License will terminate automatically if You 239 | fail to comply with any of its terms. However, if You become compliant, 240 | then the rights granted under this License from a particular Contributor 241 | are reinstated (a) provisionally, unless and until such Contributor 242 | explicitly and finally terminates Your grants, and (b) on an ongoing 243 | basis, if such Contributor fails to notify You of the non-compliance by 244 | some reasonable means prior to 60 days after You have come back into 245 | compliance. Moreover, Your grants from a particular Contributor are 246 | reinstated on an ongoing basis if such Contributor notifies You of the 247 | non-compliance by some reasonable means, this is the first time You have 248 | received notice of non-compliance with this License from such 249 | Contributor, and You become compliant prior to 30 days after Your receipt 250 | of the notice. 251 | 252 | 5.2. If You initiate litigation against any entity by asserting a patent 253 | infringement claim (excluding declaratory judgment actions, 254 | counter-claims, and cross-claims) alleging that a Contributor Version 255 | directly or indirectly infringes any patent, then the rights granted to 256 | You by any and all Contributors for the Covered Software under Section 257 | 2.1 of this License shall terminate. 258 | 259 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all end user 260 | license agreements (excluding distributors and resellers) which have been 261 | validly granted by You or Your distributors under this License prior to 262 | termination shall survive termination. 263 | 264 | 6. Disclaimer of Warranty 265 | 266 | Covered Software is provided under this License on an "as is" basis, 267 | without warranty of any kind, either expressed, implied, or statutory, 268 | including, without limitation, warranties that the Covered Software is free 269 | of defects, merchantable, fit for a particular purpose or non-infringing. 270 | The entire risk as to the quality and performance of the Covered Software 271 | is with You. Should any Covered Software prove defective in any respect, 272 | You (not any Contributor) assume the cost of any necessary servicing, 273 | repair, or correction. This disclaimer of warranty constitutes an essential 274 | part of this License. No use of any Covered Software is authorized under 275 | this License except under this disclaimer. 276 | 277 | 7. 
Limitation of Liability 278 | 279 | Under no circumstances and under no legal theory, whether tort (including 280 | negligence), contract, or otherwise, shall any Contributor, or anyone who 281 | distributes Covered Software as permitted above, be liable to You for any 282 | direct, indirect, special, incidental, or consequential damages of any 283 | character including, without limitation, damages for lost profits, loss of 284 | goodwill, work stoppage, computer failure or malfunction, or any and all 285 | other commercial damages or losses, even if such party shall have been 286 | informed of the possibility of such damages. This limitation of liability 287 | shall not apply to liability for death or personal injury resulting from 288 | such party's negligence to the extent applicable law prohibits such 289 | limitation. Some jurisdictions do not allow the exclusion or limitation of 290 | incidental or consequential damages, so this exclusion and limitation may 291 | not apply to You. 292 | 293 | 8. Litigation 294 | 295 | Any litigation relating to this License may be brought only in the courts 296 | of a jurisdiction where the defendant maintains its principal place of 297 | business and such litigation shall be governed by laws of that 298 | jurisdiction, without reference to its conflict-of-law provisions. Nothing 299 | in this Section shall prevent a party's ability to bring cross-claims or 300 | counter-claims. 301 | 302 | 9. Miscellaneous 303 | 304 | This License represents the complete agreement concerning the subject 305 | matter hereof. If any provision of this License is held to be 306 | unenforceable, such provision shall be reformed only to the extent 307 | necessary to make it enforceable. Any law or regulation which provides that 308 | the language of a contract shall be construed against the drafter shall not 309 | be used to construe this License against a Contributor. 310 | 311 | 312 | 10. Versions of the License 313 | 314 | 10.1. New Versions 315 | 316 | Mozilla Foundation is the license steward. Except as provided in Section 317 | 10.3, no one other than the license steward has the right to modify or 318 | publish new versions of this License. Each version will be given a 319 | distinguishing version number. 320 | 321 | 10.2. Effect of New Versions 322 | 323 | You may distribute the Covered Software under the terms of the version 324 | of the License under which You originally received the Covered Software, 325 | or under the terms of any subsequent version published by the license 326 | steward. 327 | 328 | 10.3. Modified Versions 329 | 330 | If you create software not governed by this License, and you want to 331 | create a new license for such software, you may create and use a 332 | modified version of this License if you rename the license and remove 333 | any references to the name of the license steward (except to note that 334 | such modified license differs from this License). 335 | 336 | 10.4. Distributing Source Code Form that is Incompatible With Secondary 337 | Licenses If You choose to distribute Source Code Form that is 338 | Incompatible With Secondary Licenses under the terms of this version of 339 | the License, the notice described in Exhibit B of this License must be 340 | attached. 341 | 342 | Exhibit A - Source Code Form License Notice 343 | 344 | This Source Code Form is subject to the 345 | terms of the Mozilla Public License, v. 346 | 2.0. 
If a copy of the MPL was not 347 | distributed with this file, You can 348 | obtain one at 349 | http://mozilla.org/MPL/2.0/. 350 | 351 | If it is not possible or desirable to put the notice in a particular file, 352 | then You may include the notice in a location (such as a LICENSE file in a 353 | relevant directory) where a recipient would be likely to look for such a 354 | notice. 355 | 356 | You may add additional accurate notices of copyright ownership. 357 | 358 | Exhibit B - "Incompatible With Secondary Licenses" Notice 359 | 360 | This Source Code Form is "Incompatible 361 | With Secondary Licenses", as defined by 362 | the Mozilla Public License, v. 2.0. 363 | 364 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Autopilot Pattern MySQL 2 | 3 | MySQL designed for automated operation using the [Autopilot Pattern](http://autopilotpattern.io/). This repo serves as a blueprint demonstrating the pattern -- automatically setting up replication, backups, and failover without human intervention. 4 | 5 | [![DockerPulls](https://img.shields.io/docker/pulls/autopilotpattern/mysql.svg)](https://registry.hub.docker.com/u/autopilotpattern/mysql/) 6 | [![DockerStars](https://img.shields.io/docker/stars/autopilotpattern/mysql.svg)](https://registry.hub.docker.com/u/autopilotpattern/mysql/) 7 | 8 | - [Architecture](#architecture) 9 | - [Bootstrapping via `pre_start` handler](#bootstrapping-via-pre_start-handler) 10 | - [Maintenance via `health` handler](#maintenance-via-health-handler) 11 | - [Failover via `on_change` handler](#failover-via-on_change-handler) 12 | - [Backups in the `snapshot_task`](#backups-in-the-snapshot_task) 13 | - [Concepts](#concepts) 14 | - [Guarantees](#guarantees) 15 | - [Determining if a node is primary](#determining-if-a-node-is-primary) 16 | - [Running the cluster](#running-the-cluster) 17 | - [Configuration](#configuration) 18 | - [Upgrading the cluster](#upgrading-the-cluster) 19 | - [Where to store data](#where-to-store-data) 20 | - [Using an existing database](#using-an-existing-database) 21 | 22 | --- 23 | 24 | ## Architecture 25 | 26 | A running cluster includes the following components: 27 | 28 | - [MySQL](https://dev.mysql.com/): we're using MySQL 5.6 via [Percona Server](https://www.percona.com/software/mysql-database/percona-server), and [`xtrabackup`](https://www.percona.com/software/mysql-database/percona-xtrabackup) for running hot snapshots. 29 | - [ContainerPilot](https://www.joyent.com/containerpilot): included in our MySQL containers to orchestrate bootstrap behavior and coordinate replication using keys and checks stored in Consul in the `preStart`, `health`, and `onChange` handlers. 30 | - [Consul](https://www.consul.io/): our service catalog; it works with ContainerPilot to coordinate service discovery, replication, and failover. 31 | - [Manta](https://www.joyent.com/triton/object-storage): the Joyent object store, for securely and durably storing our MySQL snapshots. 32 | - `manage.py`: a small Python application that ContainerPilot's lifecycle hooks will call to bootstrap MySQL, perform health checks, manage replication setup, and perform coordinated failover. 33 | 34 | The lifecycle of a MySQL container is managed by four lifecycle hooks in the `manage.py` application: `pre_start`, `health`, `on_change`, and `snapshot_task`. 
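ContainerPilot wires these hooks together in `etc/containerpilot.json5`; that file is authoritative. The abridged sketch below only illustrates the shape of the wiring -- the intervals, job names, and exact fields shown here are illustrative, not copied from the repo's config:

```
// containerpilot.json5 -- abridged, illustrative sketch only
{
  consul: "{{ .CONSUL }}:8500",
  jobs: [
    // one-time setup; must exit successfully before mysqld starts
    { name: "pre_start", exec: "python /usr/local/bin/manage.py pre_start" },
    {
      name: "mysql",
      exec: "mysqld",
      port: 3306,
      when: { source: "pre_start", once: "exitSuccess" },
      // periodic health check; also completes bootstrap/replication setup
      health: { exec: "python /usr/local/bin/manage.py health", interval: 5, ttl: 25 }
    },
    // failover coordination, run each time the watched primary changes
    {
      name: "on_change",
      exec: "python /usr/local/bin/manage.py on_change",
      when: { source: "watch.mysql-primary", each: "changed" }
    },
    // recurring backup task
    { name: "snapshot_task", exec: "python /usr/local/bin/manage.py snapshot_task", when: { interval: "5m" } }
  ],
  watches: [{ name: "mysql-primary", interval: 10 }]
}
```
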
35 | 36 | ### Bootstrapping via `pre_start` handler 37 | 38 | When a container is started, ContainerPilot will run the `manage.py pre_start` function, which must exit cleanly before the MySQL server will be started. See [ContainerPilot's `preStart` action docs](https://www.joyent.com/containerpilot/docs/start-stop) for how this is triggered. 39 | 40 | Once `pre_start` has gathered its configuration from the environment, it checks whether this instance has been previously started. If not, it asks Consul whether a snapshot of the database exists on Manta. If one does, it downloads the snapshot and initializes the database from it using the Percona `xtrabackup` tool. Otherwise, we perform the initial MySQL setup via `mysql_install_db`. Note that we're assuming that the first instance is launched and allowed to initialize before we bring up other instances, but this only applies the very first time we bring up the cluster. Also note that at this time the MySQL server is not yet running, so we can't complete replication setup here. 41 | 42 | 43 | ### Maintenance via `health` handler 44 | 45 | The ContainerPilot `health` handler calls `manage.py health` periodically. The behavior of this handler depends on whether the instance is a primary, a replica, or hasn't yet been initialized as either. See [ContainerPilot's service health check docs](https://www.joyent.com/containerpilot/docs/health) for how to use this in your own application. 46 | 47 | The `health` function first checks whether this instance has been previously initialized (by checking a lock file on disk). If not, it'll check if a primary has been registered with Consul. If not, the handler will attempt to obtain a lock in Consul marking it as the primary. If the lock fails (perhaps because we're bringing up multiple hosts at once and another has obtained the lock), then we'll exit and retry on the next call of the `health` function. If the lock succeeds, this node is marked the primary and we'll bootstrap the rest of the MySQL application. This includes creating a default user, a replication user, and a default schema, as well as resetting the root password, and writing a snapshot of the initialized DB to Manta (where another instance can download it during `pre_start`). 48 | 49 | If this is the first pass through the health check and a primary has already been registered in Consul, then the handler will set up replication to the primary. Replication in this architecture uses [Global Transaction Identifiers (GTID)](https://dev.mysql.com/doc/refman/5.7/en/replication-gtids.html) so that replicas can autoconfigure their position within the binlog. 50 | 51 | If this isn't the first pass through the health check and we've already set up this node, then the health check can continue. If this node is the primary, the handler will execute a `SELECT 1` against the database to make sure it's up and then renew the session in Consul indicating it is the primary. If the node is a replica, the handler will make sure that `SHOW SLAVE STATUS` returns a value. When the handler exits successfully, ContainerPilot will send a heartbeat with TTL to Consul indicating that the node is still healthy. 52 | 53 | 54 | ### Failover via `on_change` handler 55 | 56 | The ContainerPilot configuration for replicas watches for changes to the primary. If the primary becomes unhealthy or updates its IP address, ContainerPilot will call `manage.py on_change` to perform failover. 
See [ContainerPilot's `onChange` docs](https://www.joyent.com/containerpilot/docs/lifecycle#while-the-container-is-running) for how changes are identified. 57 | 58 | All remaining instances will receive the `on_change` handler call, so a coordination process is needed to ensure that only one instance actually attempts to perform the failover. The first step is to check if there is a healthy primary! Because the `on_change` handlers are polling asynchronously, it's entirely possible for another instance to have completed failover before a given instance runs its `on_change` handler. In this case, the node will either mark itself as primary (if it was assigned to be the primary by the failover node, see below) or set up replication to the new primary. 59 | 60 | Once we've determined that the node should attempt a failover, it tries to obtain a lock in Consul. If the failover lock fails, this means another node is going to run the failover and this handler should wait for it to be completed. Once the lock has been removed, the handler will either mark itself as the new primary and reload its ContainerPilot configuration to match, or will realize it's a replica and quietly exit. 61 | 62 | If the failover lock succeeds, then this node will run the failover, and it will do so via [`mysqlrpladmin failover`](http://dev.mysql.com/doc/mysql-utilities/1.6/en/mysqlrpladmin.html). The handler gets a list of healthy MySQL instances from Consul and passes these as candidates for primary to the `mysqlrpladmin` tool. This tool will stop replication on all candidates, determine which candidate is the best one to use as the primary, ensure that all nodes have the same transactions, and then set up replication for all replicas to the new primary. See [this blog post by Percona](https://www.percona.com/blog/2014/06/27/failover-mysql-utilities-part1-mysqlrpladmin/) for more details on how this failover step works. 63 | 64 | Once the failover is complete, the failover node will release the lock after its next pass through the health check. This ensures that if the failover node marked itself as the primary, other replicas don't spuriously attempt another failover while no healthy primary has yet been registered. 65 | 66 | 67 | ### Backups in the `snapshot_task` 68 | 69 | If the node is primary, the handler will ask Consul whether the TTL key for backups has expired or whether the binlog has been rotated. If either is true, the application will create a new snapshot, upload it to Manta, and write the appropriate keys to Consul to tell replicas where to find the backup. See [ContainerPilot's periodic task docs](https://www.joyent.com/containerpilot/docs/tasks) to learn how the recurring snapshot task is configured. 70 | 71 | 72 | --- 73 | 74 | ## Concepts 75 | 76 | ### Guarantees 77 | 78 | It's very important to note that during failover, the MySQL cluster is unavailable for writes. Any client application should be using ContainerPilot or some other means to watch for changes to the `mysql-primary` service and halt writes until the failover is completed. Writes sent to the primary after it fails will be lost. 79 | 80 | The failover process described above prevents data corruption by ensuring that all replicas have the same set of transactions before continuing. But because MySQL replication is asynchronous, it cannot protect against data *loss*. It's entirely possible for the primary to fail without any replica having received its last transactions. 
This is an inherent limitation of MySQL asynchronous replication, and you must architect your application to take this into account! 81 | 82 | 83 | ### Determining if a node is primary 84 | 85 | In most of the handlers described above, there is a need to determine whether the node executing the handler thinks it is the current primary. This is determined as follows: 86 | 87 | - Ask `mysqld` for replication status. If the node is replicating from an instance then it is a replica, and if it has replicas then it is a primary. If neither, continue to the next step. 88 | - Ask Consul for a healthy instance of the `mysql-primary` service. If Consul returns an IP that matches this node, then it is primary. If Consul returns an IP that doesn't match this node, then it is a replica. If neither, then we cannot determine whether the node is primary or replica and it is marked "unassigned." 89 | 90 | Note that this determines only whether this node *thinks* it is the primary instance. During initialization (in `health` checks) an unassigned node will try to elect itself the new primary. During failover, if a node is a replica then it will also check to see if the primary that it found is actually healthy. 91 | 92 | --- 93 | 94 | ## Running the cluster 95 | 96 | Starting a new cluster is easy once you have [your `_env` file set with the configuration details](#configuration): **just run `docker-compose up -d` and in a few moments you'll have a running MySQL primary**. Both the primary and replicas are described as a single `docker-compose` service; a minimal compose sketch appears at the end of this README for reference. During startup, [ContainerPilot](http://containerpilot.io) will ask Consul if an existing primary has been created. If not, the node will initialize as a new primary, and all future nodes will self-configure replication with the primary during their first `health` check. 97 | 98 | **Run `docker-compose scale mysql=2` to add a replica (or more than one!)**. The replicas will automatically configure themselves to replicate from the primary and will register themselves in Consul as replicas once they're ready. 99 | 100 | ### Configuration 101 | 102 | Pass these variables via an `_env` file. The included `setup.sh` can be used to test your Docker and Triton environment, and to encode the Manta SSH key in the `_env` file. 103 | 104 | - `MYSQL_USER`: this user will be set up as the default non-root user on the node 105 | - `MYSQL_PASSWORD`: the password for the default non-root user on the node 106 | 107 | 108 | #### Snapshots 109 | These variables control where the database snapshots are saved. 110 | 111 | - `SNAPSHOT_BACKEND`: Select from `manta`, `minio`, or `local`. (Defaults to `manta`.) 112 | 113 | ##### Manta 114 | - `MANTA_URL`: the full Manta endpoint URL. (ex. `https://us-east.manta.joyent.com`) 115 | - `MANTA_USER`: the Manta account name. 116 | - `MANTA_SUBUSER`: the Manta subuser account name, if any. 117 | - `MANTA_ROLE`: the Manta role name, if any. 118 | - `MANTA_KEY_ID`: the MD5-format ssh key id for the Manta account/subuser (ex. `1a:b8:30:2e:57:ce:59:1d:16:f6:19:97:f2:60:2b:3d`); the included `setup.sh` will encode this automatically 119 | - `MANTA_PRIVATE_KEY`: the private ssh key for the Manta account/subuser; the included `setup.sh` will encode this automatically 120 | - `MANTA_BUCKET`: the path on Manta where backups will be stored. (ex. `/myaccount/stor/triton-mysql`); the bucket must already exist and be writeable by the `MANTA_USER`/`MANTA_PRIVATE_KEY` 121 | 122 | ##### Minio 123 | - `MINIO_ACCESS_KEY`: The S3 access key used to log in. 
124 | - `MINIO_SECRET_KEY`: The S3 secret key used to log in. 125 | - `MINIO_BUCKET`: The S3 bucket to put snapshots in. (Defaults to `backups`.) 126 | - `MINIO_LOCATION`: The region/location of the bucket. (Defaults to `us-east-1`.) 127 | - `MINIO_URL`: The URL of the Minio endpoint. (Defaults to `minio:9000`.) 128 | - `MINIO_TLS_SECURE`: Use a secure HTTPS connection to Minio. (Defaults to `false`.) 129 | 130 | ##### Local 131 | - `STORAGE_DIR`: The local directory to store snapshots in. (Defaults to `/tmp/snapshots`.) 132 | 133 | #### Optional Configs 134 | 135 | These variables are optional but you most likely want them: 136 | 137 | - `SERVICE_NAME`: the name by which this instance will register itself in Consul. If you do not provide one, it defaults to `"mysql"`. 138 | - `MYSQL_REPL_USER`: this user will be used on all instances to set up MySQL replication. If not set, then replication will not be set up on the replicas. 139 | - `MYSQL_REPL_PASSWORD`: this password will be used on all instances to set up MySQL replication. If not set, then replication will not be set up on the replicas. 140 | - `MYSQL_DATABASE`: create this database on startup if it doesn't already exist. The `MYSQL_USER` user will be granted superuser access to that DB. 141 | - `LOG_LEVEL`: will set the logging level of the `manage.py` application. It defaults to `DEBUG` and uses the Python stdlib [log levels](https://docs.python.org/2/library/logging.html#levels). The `DEBUG` log level is extremely verbose -- in production you'll want this to be at `INFO` or above. 142 | - `CONSUL` is the hostname for the Consul instance(s). (Defaults to `consul`.) 143 | 144 | #### Consul keys 145 | 146 | The following variables control the names of keys written to Consul. They are optional with sane defaults, but if you are using Consul for many other services you might need to namespace these keys: 147 | 148 | - `PRIMARY_KEY`: The key used to record a lock on which node is primary. (Defaults to `${SERVICE_NAME}-primary`.) 149 | - `BACKUP_LOCK_KEY`: The key used to record a lock on a running snapshot. (Defaults to `mysql-backup-running`.) 150 | - `LAST_BACKUP_KEY`: The key used to store the path and timestamp of the most recent backup. (Defaults to `mysql-last-backup`.) 151 | - `LAST_BINLOG_KEY`: The key used to store the filename of the most recent binlog file on the primary. (Defaults to `mysql-last-binlog`.) 152 | - `BACKUP_NAME`: The name of the backup file that's stored on Manta, with optional [strftime](https://docs.python.org/2/library/time.html#time.strftime) directives. (Defaults to `mysql-backup-%Y-%m-%dT%H-%M-%SZ`.) 153 | - `BACKUP_TTL`: Time in seconds to wait between backups. (Defaults to `86400`, or 24 hours.) 154 | - `SESSION_NAME`: The name used for session locks. (Defaults to `mysql-primary-lock`.) 155 | 156 | #### MySQL 157 | 158 | These variables *may* be passed, but doing so is not recommended. Instead, we set a one-time root password during DB initialization; the password will be written to the logs. Security can be improved by using a key management system in place of environment variables. The constructor for the `Node` class in `manage.py` would be a good place to hook in this behavior, which is out of scope for this demonstration. 159 | 160 | - `MYSQL_RANDOM_ROOT_PASSWORD`: defaults to "yes" 161 | - `MYSQL_ONETIME_PASSWORD`: defaults to "yes" 162 | - `MYSQL_ROOT_PASSWORD`: defaults to being unset 163 | 164 | These variables will be written to `/etc/my.cnf`. 
165 | 166 | - `INNODB_BUFFER_POOL_SIZE`: sets `innodb_buffer_pool_size`; if unset, it defaults to 70% of available physical memory 167 | 168 | 169 | Environment variables are expanded automatically. 170 | This allows you to [use environment variables](https://docs.docker.com/compose/compose-file/#/environment) from the machine where `docker-compose` runs. 171 | Example: 172 | 173 | ``` 174 | # local-compose.yml 175 | mysql: 176 | environment: 177 | USER: 178 | ``` 179 | 180 | ``` 181 | # _env 182 | MANTA_BUCKET=/companyaccount/stor/developers/${USER}/backups 183 | ``` 184 | 185 | 186 | ### Upgrading the cluster 187 | 188 | If you need to upgrade MySQL to a backwards-compatible version, create a new image with the new MySQL version. Bring up a new instance running that image; it will get the snapshot from Manta and make itself a replica. You can then replace all the replicas with the new image and finally execute a failover by stopping the primary instance. During the cutover, the cluster will be unavailable for writes. 189 | 190 | If you need to upgrade `manage.py`, you can use the same process so long as you're using a compatible major version of this repo. Because the container image includes MySQL, ContainerPilot, and the code in `manage.py`, the [releases](https://github.com/autopilotpattern/mysql/releases) are versioned with both the MySQL version and the release version of this repo, using semantic versioning for the latter. For example, release [5.6r2.2.0](https://github.com/autopilotpattern/mysql/releases/tag/5.6r2.2.0) is MySQL 5.6 with the 2.2.0 version of the `manage.py` code. It is not backwards compatible with [5.6r1.0.0](https://github.com/autopilotpattern/mysql/releases/tag/r1.0.0) but is backwards compatible with [5.6r2.1.0](https://github.com/autopilotpattern/mysql/releases/tag/5.6r2.1.0). Upgrading between major versions of `manage.py` currently can't be done automatically. You'll need to stand up a new cluster using the snapshot from your old cluster. 191 | 192 | 193 | ### Where to store data 194 | 195 | This pattern automates data management and makes containers effectively stateless to the Docker daemon and schedulers. This is designed to maximize convenience and reliability by minimizing the external coordination needed to manage the database. The use of external volumes (`--volumes-from`, `-v`, etc.) is not recommended. 196 | 197 | On Triton, there's no need to use data volumes because the performance hit you normally take with overlay file systems in Linux doesn't happen with ZFS. 198 | 199 | 200 | ### Using an existing database 201 | 202 | If you start your MySQL container instance with a data directory that already contains a database (specifically, a `mysql` subdirectory), the pre-existing database won't be changed in any way. 
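For reference, a minimal local deployment looks roughly like the sketch below. This is abridged and illustrative only -- `examples/compose/docker-compose.yml` in this repo is the maintained version, and the image tag and Consul flags here are placeholders:

```
# sketch of a minimal docker-compose.yml (see examples/compose/docker-compose.yml)
mysql:
    image: autopilotpattern/mysql:latest
    mem_limit: 1g
    env_file: _env            # MYSQL_USER, MYSQL_PASSWORD, snapshot settings, etc.
    environment:
        - CONSUL=consul       # hostname of the consul service below
    ports:
        - 3306
consul:
    image: consul:0.8.4
    command: agent -server -bootstrap-expect 1 -client 0.0.0.0 -ui
    ports:
        - 8500:8500
```
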
203 | -------------------------------------------------------------------------------- /bin/manage.py: -------------------------------------------------------------------------------- 1 | """ autopilotpattern/mysql ContainerPilot handlers """ 2 | from __future__ import print_function 3 | from datetime import datetime 4 | import os 5 | import socket 6 | import subprocess 7 | import sys 8 | 9 | # pylint: disable=invalid-name,no-self-use,dangerous-default-value 10 | from manager.client import MySQL, MySQLError 11 | from manager.config import ContainerPilot 12 | from manager.discovery import Consul 13 | from manager.env import PRIMARY_KEY, BACKUP_NAME 14 | from manager.network import get_ip 15 | 16 | from manager.storage.manta_stor import Manta 17 | from manager.storage.minio_stor import Minio 18 | from manager.storage.local import Local 19 | 20 | from manager.utils import log, debug, \ 21 | PRIMARY, REPLICA, UNASSIGNED, \ 22 | UnknownPrimary, WaitTimeoutError 23 | 24 | 25 | class Node(object): 26 | """ 27 | Node represents the state of our running container and carries 28 | around the MySQL config, and clients for Consul and Snapshots. 29 | """ 30 | def __init__(self, mysql=None, cp=None, consul=None, snaps=None): 31 | self.mysql = mysql 32 | self.consul = consul 33 | self.snaps = snaps 34 | self.cp = cp 35 | 36 | self.hostname = socket.gethostname() 37 | self.name = 'mysql-{}'.format(self.hostname) 38 | self.ip = get_ip() 39 | 40 | @debug(log_output=True) 41 | def is_primary(self): 42 | """ 43 | Check if this node is the primary by checking in-memory cache, 44 | then Consul, then MySQL replication status. Caches its result so 45 | the node `state` field needs to be set to UNASSIGNED if you want 46 | to force a check of Consul, etc. 47 | """ 48 | log.debug('state: %s' % self.cp.state) 49 | if self.cp.state != UNASSIGNED: 50 | return self.cp.state == PRIMARY 51 | 52 | try: 53 | # am I already replicating from somewhere else? 54 | _, primary_ip = self.mysql.get_primary() 55 | if not primary_ip: 56 | pass 57 | elif primary_ip == self.ip: 58 | self.cp.state = PRIMARY 59 | return True 60 | else: 61 | self.cp.state = REPLICA 62 | return False 63 | except (MySQLError, WaitTimeoutError, UnknownPrimary) as ex: 64 | log.debug('could not determine primary via mysqld status: %s', ex) 65 | 66 | try: 67 | # am I already reporting I'm a healthy primary to Consul? 68 | _, primary_ip = self.consul.get_primary() 69 | if not primary_ip: 70 | pass 71 | elif primary_ip == self.ip: 72 | self.cp.state = PRIMARY 73 | return True 74 | else: 75 | self.cp.state = REPLICA 76 | return False 77 | except (UnknownPrimary, ValueError) as ex: 78 | log.debug('could not determine primary via Consul: %s', ex) 79 | 80 | # am I listed in the Consul PRIMARY_KEY?? 
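# (read_lock returns the node name written by mark_as_primary; if it matches our own name we hold the primary lock even though neither mysqld nor the Consul service catalog can confirm it yet) 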
81 | _, primary_name = self.consul.read_lock(PRIMARY_KEY) 82 | log.debug('primary_name: %s' % primary_name) 83 | if primary_name == self.name: 84 | self.cp.state = PRIMARY 85 | return True 86 | 87 | self.cp.state = UNASSIGNED 88 | return False 89 | 90 | def is_replica(self): 91 | """ check if we're the replica """ 92 | return not self.is_primary() and self.cp.state != UNASSIGNED 93 | 94 | def is_snapshot_node(self): 95 | """ check if we're the node that's going to execute the snapshot """ 96 | # TODO: we want to have the replicas all do a lock on the snapshot task 97 | return self.is_primary() 98 | 99 | 100 | # --------------------------------------------------------- 101 | # Top-level functions called by ContainerPilot 102 | 103 | @debug 104 | def pre_start(node): 105 | """ 106 | the top-level ContainerPilot `preStart` handler. 107 | MySQL must be running in order to execute most of our setup behavior 108 | so we're just going to make sure the directory structures are in 109 | place and then let the first health check handler take it from there 110 | """ 111 | # make sure that if we've pulled in an external data volume that 112 | # the mysql user can read it 113 | my = node.mysql 114 | my.take_ownership() 115 | my.render() 116 | if not os.path.isdir(os.path.join(my.datadir, 'mysql')): 117 | last_backup = node.consul.has_snapshot() 118 | if last_backup: 119 | node.snaps.get_backup(last_backup) 120 | my.restore_from_snapshot(last_backup) 121 | else: 122 | if not my.initialize_db(): 123 | log.info('Skipping database setup.') 124 | 125 | @debug 126 | def health(node): 127 | """ 128 | The top-level ContainerPilot `health` handler. Runs a simple health check. 129 | Also acts as a check for whether the ContainerPilot configuration needs 130 | to be reloaded (if it's been changed externally). 131 | """ 132 | 133 | # Because we need MySQL up to finish initialization, we need to check 134 | # for each pass thru the health check that we've done so. The happy 135 | # path is to check a lock file against the node state (which has been 136 | # set above) and immediately return when we discover the lock exists. 137 | # Otherwise, we bootstrap the instance for its *current* state. 138 | assert_initialized_for_state(node) 139 | 140 | if node.is_primary(): 141 | # If this lock is allowed to expire and the health check for the 142 | # primary fails the `onChange` handlers for the replicas will try 143 | # to failover and then the primary will obtain a new lock. 144 | # If this node can update the lock but the DB fails its health check, 145 | # then the operator will need to manually intervene if they want to 146 | # force a failover. This architecture is a result of Consul not 147 | # permitting us to acquire a new lock on a health-checked session if the 148 | # health check is *currently* failing, but has the happy side-effect of 149 | # reducing the risk of flapping on a transient health check failure. 150 | node.consul.renew_session() 151 | 152 | # Simple health check; exceptions result in a non-zero exit code 153 | node.mysql.query('select 1') 154 | 155 | elif node.is_replica(): 156 | # TODO: we should make this check actual replication health 157 | # and not simply that replication has been established 158 | if not node.mysql.query('show slave status'): 159 | log.error('Replica is not replicating.') 160 | sys.exit(1) 161 | else: 162 | # If we're still somehow marked UNASSIGNED we exit now. This is a 163 | # byzantine failure mode where the end-user needs to intervene. 
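# (we only reach this branch when is_primary() and is_replica() both returned False, i.e. the node state is still UNASSIGNED) 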
164 | log.error('Cannot determine MySQL state; failing health check.') 165 | sys.exit(1) 166 | 167 | node.consul.unlock_failover() 168 | 169 | 170 | @debug 171 | def on_change(node): 172 | """ The top-level ContainerPilot onChange handler """ 173 | 174 | # first check if this node has already been set primary by a completed 175 | # call to failover and update the ContainerPilot config as needed. 176 | if node.is_primary(): 177 | log.debug('[on_change] this node is primary, no failover required.') 178 | if node.cp.update(): 179 | # we're ignoring the lock here intentionally 180 | node.consul.put(PRIMARY_KEY, node.name) 181 | node.cp.reload() 182 | return 183 | 184 | # check if another node has been set primary already and is reporting 185 | # as healthy, in which case there's no failover required. Note that 186 | # we can't simply check if we're a replica via .is_replica() b/c that 187 | # trusts mysqld's view of the world. 188 | try: 189 | node.consul.get_primary(timeout=1) 190 | log.debug('[on_change] primary is already healthy, no failover required') 191 | return 192 | except (UnknownPrimary, WaitTimeoutError) as ex: 193 | log.debug('[on_change] no primary from consul: %s', ex) 194 | 195 | if node.consul.lock_failover(node.name): 196 | try: 197 | nodes = node.consul.client.health.service(REPLICA, passing=True)[1] 198 | ips = [instance['Service']['Address'] for instance in nodes] 199 | log.info('[on_change] Executing failover with candidates: %s', ips) 200 | node.mysql.failover(ips) 201 | except Exception: 202 | # On failure we bubble-up the exception and fail the onChange. 203 | # Either another instance that didn't overlap in time will 204 | # complete failover or we'll be left w/o a primary and require 205 | # manual intervention via `mysqlrpladmin failover` 206 | node.consul.unlock_failover() 207 | raise 208 | else: 209 | log.info('[on_change] Failover in progress on another node, ' 210 | 'waiting to complete.') 211 | node.consul.wait_for_failover_lock() 212 | 213 | # need to determine replication status at this point, so make 214 | # sure we refresh .state from mysqld/Consul 215 | node.cp.state = UNASSIGNED 216 | if node.is_primary(): 217 | log.info('[on_change] node %s is primary after failover', node.name) 218 | if node.cp.update(): 219 | # we're intentionally ignoring the advisory lock here 220 | ok = node.consul.put(PRIMARY_KEY, node.name) 221 | log.debug('[on_change] %s obtained lock: %s', node.name, ok) 222 | node.cp.reload() 223 | return 224 | elif node.is_replica(): 225 | log.info('[on_change] node %s is replica after failover', node.name) 226 | 227 | if node.cp.state == UNASSIGNED: 228 | log.error('[on_change] this node is neither primary nor replica ' 229 | 'after failover; check replication status on cluster.') 230 | sys.exit(1) 231 | 232 | 233 | @debug 234 | def snapshot_task(node): 235 | """ 236 | Create a snapshot and send it to the object store if this is the 237 | snapshot node and it's time to do so. 238 | """ 239 | # bail-out early if we can avoid making a DB connection 240 | if not node.is_snapshot_node() or not node.consul.lock_snapshot(node.name): 241 | return 242 | 243 | binlog_file = node.mysql.get_binlog() 244 | if node.consul.is_snapshot_stale(binlog_file): 245 | # we'll let exceptions bubble up here. The task will fail 246 | # and be logged, and when the BACKUP_LOCK_KEY expires we can 247 | # alert on that externally. 
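# (the finally clause below releases the snapshot lock whether or not write_snapshot succeeds, so a failed backup can be retried on the task's next run) 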
248 | try: 249 | write_snapshot(node) 250 | finally: 251 | node.consul.unlock_snapshot() 252 | 253 | 254 | @debug 255 | def write_snapshot(node): 256 | """ 257 | Calls out to innobackupex to snapshot the DB, then pushes the file 258 | to Snapshot storage and writes that the work is completed in Consul. 259 | """ 260 | now = datetime.utcnow() 261 | # we don't want .isoformat() here because of URL encoding 262 | backup_id = now.strftime('{}'.format(BACKUP_NAME)) 263 | backup_time = now.isoformat() 264 | 265 | with open('/tmp/backup.tar', 'w') as f: 266 | subprocess.check_call(['/usr/bin/innobackupex', 267 | '--user={}'.format(node.mysql.repl_user), 268 | '--password={}'.format(node.mysql.repl_password), 269 | '--no-timestamp', 270 | #'--compress', 271 | '--stream=tar', 272 | '/tmp/backup'], stdout=f) 273 | log.info('snapshot completed, uploading to object store') 274 | out = node.snaps.put_backup(backup_id, '/tmp/backup.tar') 275 | log.info('snapshot uploaded to %s', out) 276 | 277 | # write the filename of the binlog to Consul so that we know if 278 | # we've rotated since the last backup. 279 | # query lets KeyError bubble up -- something's broken 280 | results = node.mysql.query('show master status') 281 | binlog_file = results[0]['File'] 282 | node.consul.record_backup(backup_id, backup_time, binlog_file) 283 | 284 | # --------------------------------------------------------- 285 | # run_as_* functions determine the top-level behavior of a node 286 | 287 | @debug(log_output=True) 288 | def assert_initialized_for_state(node): 289 | """ 290 | If the node has not yet been set up, find the correct state and 291 | initialize for that state. After the first health check we'll have 292 | written a lock file and will never hit this path again. 293 | """ 294 | LOCK_PATH = '/var/run/init.lock' 295 | try: 296 | os.mkdir(LOCK_PATH, 0700) 297 | except OSError: 298 | # the lock file exists so we've already initialized 299 | return True 300 | 301 | # the check for primary will set the state if its known. If another 302 | # instance is the primary then we'll be marked as REPLICA, so if 303 | # we can't determine after the check which we are then we're likely 304 | # the first instance (this will get safely verified later). 305 | if node.is_primary() or node.cp.state == UNASSIGNED: 306 | try: 307 | if not run_as_primary(node): 308 | log.error('Tried to mark node %s primary but primary exists, ' 309 | 'exiting for retry on next check.', node.name) 310 | os.rmdir(LOCK_PATH) 311 | sys.exit(1) 312 | except MySQLError as ex: 313 | # We've made it only partly thru setup. Setup isn't idempotent 314 | # but should be safe to retry if we can make more progress. At 315 | # worst we end up with a bunch of failure logs. 316 | log.error('Failed to set up %s as primary (%s). Exiting but will ' 317 | 'retry setup. Check logs following this line to see if ' 318 | 'setup needs reconfiguration or manual intervention to ' 319 | 'continue.', node.name, ex) 320 | os.rmdir(LOCK_PATH) 321 | sys.exit(1) 322 | if node.cp.update(): 323 | os.rmdir(LOCK_PATH) 324 | node.cp.reload() 325 | # this is racy with the SIGHUP that ContainerPilot just got 326 | # sent, but if the Consul agent shuts down quickly enough we 327 | # end up sending extra API calls to it and get a bunch of log 328 | # spam. This forces us to exit early. 329 | sys.exit(0) 330 | else: 331 | try: 332 | run_as_replica(node) 333 | except (UnknownPrimary, MySQLError) as ex: 334 | log.error('Failed to set up %s for replication (%s). 
Exiting for retry ' 335 | 'on next check.', node.name, ex) 336 | os.rmdir(LOCK_PATH) 337 | sys.exit(1) 338 | return False 339 | 340 | 341 | @debug 342 | def run_as_primary(node): 343 | """ 344 | The overall workflow here is ported and reworked from the 345 | Oracle-provided Docker image: 346 | https://github.com/mysql/mysql-docker/blob/mysql-server/5.7/docker-entrypoint.sh 347 | """ 348 | if not node.consul.mark_as_primary(node.name): 349 | return False 350 | node.cp.state = PRIMARY 351 | 352 | conn = node.mysql.wait_for_connection() 353 | my = node.mysql 354 | if conn: 355 | # if we can make a connection w/o a password then this is the 356 | # first pass. *Note: the conn is not the same as `node.conn`!* 357 | my.set_timezone_info() 358 | my.setup_root_user(conn) 359 | my.create_db(conn) 360 | my.create_default_user(conn) 361 | my.create_repl_user(conn) 362 | my.expire_root_password(conn) 363 | else: 364 | # in case this is a newly-promoted primary 365 | my.execute('STOP SLAVE') 366 | 367 | # although backups will be run from any instance, we need to first 368 | # snapshot the primary so that we can bootstrap replicas. 369 | write_snapshot(node) 370 | return True 371 | 372 | @debug 373 | def run_as_replica(node): 374 | """ 375 | Set up GTID-based replication to the primary; once this is set the 376 | replica will automatically try to catch up with the primary's last 377 | transactions. UnknownPrimary or mysqlconn.Errors are allowed to 378 | bubble up to the caller. 379 | """ 380 | log.info('Setting up replication.') 381 | node.cp.state = REPLICA 382 | _, primary_ip = node.consul.get_primary(timeout=30) 383 | node.mysql.setup_replication(primary_ip) 384 | 385 | # --------------------------------------------------------- 386 | 387 | def main(): 388 | """ 389 | Parse argument as command and execute that command with 390 | parameters containing the state of MySQL, ContainerPilot, etc. 391 | Default behavior is to run `pre_start` DB initialization. 
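Usage: manage.py [pre_start | health | on_change | snapshot_task] 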
392 | """ 393 | if len(sys.argv) == 1: 394 | consul = Consul(envs={'CONSUL': os.environ.get('CONSUL', 'consul')}) 395 | cmd = pre_start 396 | else: 397 | consul = Consul() 398 | try: 399 | cmd = globals()[sys.argv[1]] 400 | except KeyError: 401 | log.error('Invalid command: %s', sys.argv[1]) 402 | sys.exit(1) 403 | 404 | my = MySQL() 405 | 406 | snapshot_backend = os.environ.get('SNAPSHOT_BACKEND', 'manta') 407 | if snapshot_backend == 'local': 408 | snaps = Local() 409 | elif snapshot_backend == 'minio': 410 | snaps = Minio() 411 | else: 412 | snaps = Manta() 413 | 414 | cp = ContainerPilot() 415 | cp.load() 416 | node = Node(mysql=my, consul=consul, snaps=snaps, cp=cp) 417 | 418 | cmd(node) 419 | 420 | if __name__ == '__main__': 421 | main() 422 | -------------------------------------------------------------------------------- /bin/manager/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/autopilotpattern/mysql/b104eb2679f6fbd0a59a02beb061e41f72e620d8/bin/manager/__init__.py -------------------------------------------------------------------------------- /bin/manager/client.py: -------------------------------------------------------------------------------- 1 | """ autopilotpattern/mysql MySQL module """ 2 | from collections import OrderedDict 3 | import os 4 | import re 5 | import pwd 6 | import socket 7 | import subprocess 8 | import string 9 | import time 10 | 11 | from manager.env import env, to_flag 12 | from manager.network import get_ip 13 | from manager.utils import debug, log, \ 14 | WaitTimeoutError, UnknownPrimary 15 | 16 | # pylint: disable=import-error,no-self-use,invalid-name,dangerous-default-value 17 | import mysql.connector as mysqlconn 18 | from mysql.connector import Error as MySQLError 19 | 20 | class MySQL(object): 21 | """ 22 | MySQL represents the connection to and configuration of the MySQL 23 | process and its clients. 24 | """ 25 | def __init__(self, envs=os.environ): 26 | self.mysql_db = env('MYSQL_DATABASE', None, envs) 27 | self.mysql_user = env('MYSQL_USER', None, envs) 28 | self.mysql_password = env('MYSQL_PASSWORD', None, envs) 29 | self.mysql_root_password = env('MYSQL_ROOT_PASSWORD', '', envs) 30 | self.mysql_random_root_password = env('MYSQL_RANDOM_ROOT_PASSWORD', 31 | True, envs, to_flag) 32 | self.mysql_onetime_password = env('MYSQL_ONETIME_PASSWORD', 33 | False, envs, to_flag) 34 | self.repl_user = env('MYSQL_REPL_USER', None, envs) 35 | self.repl_password = env('MYSQL_REPL_PASSWORD', None, envs) 36 | self.datadir = env('MYSQL_DATADIR', '/var/lib/mysql', envs) 37 | self.pool_size = env('INNODB_BUFFER_POOL_SIZE', 0, envs, fn=int) 38 | 39 | # state 40 | self.ip = get_ip() 41 | self._conn = None 42 | self._query_buffer = OrderedDict() 43 | 44 | def render(self, src='/etc/my.cnf.tmpl', dest='/etc/my.cnf'): 45 | """ 46 | Writes-out config files, even if we've previously initialized the DB, 47 | so that we can account for changed hostnames, resized containers, etc. 
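The template is a string.Template with $buffer, $server_id, and $hostname placeholders. 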
48 | """ 49 | pool_size = self._get_innodb_buffer_pool_size() 50 | with open(src, 'r') as f: 51 | template = string.Template(f.read()) 52 | rendered = template.substitute(buffer=pool_size, 53 | server_id=self.server_id, 54 | hostname=self.ip) 55 | with open(dest, 'w') as f: 56 | f.write(rendered) 57 | 58 | @property 59 | def server_id(self): 60 | """ replace server-id with ID derived from hostname """ 61 | _hostname = socket.gethostname() 62 | return int(str(_hostname)[:4], 16) 63 | 64 | def _get_innodb_buffer_pool_size(self): 65 | """ 66 | replace innodb_buffer_pool_size value from environment 67 | or use a sensible default (70% of available physical memory) 68 | """ 69 | if not self.pool_size: 70 | with open('/proc/meminfo', 'r') as memInfoFile: 71 | memInfo = memInfoFile.read() 72 | base = re.search(r'^MemTotal: *(\d+)', memInfo).group(1) 73 | self.pool_size = int((int(base) / 1024) * 0.7) 74 | return self.pool_size 75 | 76 | @property 77 | def conn(self): 78 | """ 79 | Convenience method for setting up a cached connection 80 | with the replication manager user. 81 | """ 82 | if self._conn: 83 | return self._conn 84 | ctx = dict(user=self.repl_user, 85 | password=self.repl_password, 86 | timeout=25) # derived from ContainerPilot config ttl 87 | self._conn = self.wait_for_connection(**ctx) 88 | return self._conn 89 | 90 | @debug() 91 | def wait_for_connection(self, user='root', password=None, database=None, 92 | timeout=10): 93 | """ 94 | Polls mysqld socket until we get a connection or the timeout 95 | expires (raise WaitTimeoutError). Defaults to root empty/password. 96 | """ 97 | while timeout > 0: 98 | try: 99 | sock = '/var/run/mysqld/mysqld.sock' 100 | return mysqlconn.connect(unix_socket=sock, 101 | user=user, 102 | password=password, 103 | database=database, 104 | charset='utf8', 105 | connection_timeout=timeout) 106 | except MySQLError as ex: 107 | timeout = timeout - 1 108 | if timeout == 0: 109 | raise WaitTimeoutError(ex) 110 | time.sleep(1) 111 | 112 | def add(self, stmt, params=()): 113 | """ Adds a new SQL statement to an internal query buffer """ 114 | self._query_buffer[stmt] = params 115 | 116 | @debug() 117 | def execute(self, sql, params=(), conn=None): 118 | """ Execute and commit a SQL statement with parameters """ 119 | self.add(sql, params) 120 | self._execute(conn, discard_results=True) 121 | 122 | @debug() 123 | def execute_many(self, conn=None): 124 | """ 125 | Execute and commit all previously `add`ed statements 126 | in the query buffer 127 | """ 128 | self._execute(conn, discard_results=True) 129 | 130 | @debug(log_output=True) 131 | def query(self, sql, params=(), conn=None): 132 | """ Execute a SQL query with params and return results. """ 133 | self.add(sql, params) 134 | return self._execute(conn=conn) 135 | 136 | def _execute(self, conn=None, discard_results=False): 137 | """ 138 | Execute and commit all composed statements and flushes the buffer 139 | """ 140 | try: 141 | if not conn: 142 | conn = self.conn 143 | except (WaitTimeoutError, MySQLError): 144 | raise # unrecoverable 145 | 146 | try: 147 | cur = conn.cursor(dictionary=True, buffered=True) 148 | for stmt, params in self._query_buffer.items(): 149 | log.debug('%s %s', stmt, params) 150 | cur.execute(stmt, params=params) 151 | if not discard_results: 152 | return cur.fetchall() 153 | 154 | # we discard results from writes 155 | conn.commit() 156 | try: 157 | cur.fetchall() 158 | except MySQLError: 159 | # Will get "InternalError: No result set to fetch from." 160 | # for SET statements. 
We can safely let this slide if the 161 | # `execute` call passes 162 | pass 163 | finally: 164 | # exceptions are an unrecoverable situation 165 | self._query_buffer.clear() 166 | cur.close() 167 | 168 | @debug(log_output=True) 169 | def initialize_db(self): 170 | """ 171 | post-installation run to set up data directories 172 | and install mysql.user tables 173 | """ 174 | self.make_datadir() 175 | log.info('Initializing database...') 176 | try: 177 | subprocess.check_call(['/usr/bin/mysql_install_db', 178 | '--user=mysql', 179 | '--datadir={}'.format(self.datadir)]) 180 | log.info('Database initialized.') 181 | return True 182 | except subprocess.CalledProcessError: 183 | log.warn('Database was previously initialized.') 184 | return False 185 | 186 | def make_datadir(self): 187 | """ Create the data dir if it doesn't already exist""" 188 | try: 189 | os.mkdir(self.datadir, 0770) 190 | self.take_ownership() 191 | except OSError: 192 | pass 193 | 194 | def take_ownership(self, owner='mysql'): 195 | """ 196 | Set ownership of all directories and files under config.datadir 197 | to `owner`'s UID and GID. Defaults to setting ownership for 198 | mysql user. 199 | """ 200 | directory = self.datadir 201 | user = pwd.getpwnam(owner) 202 | os.chown(directory, user.pw_uid, user.pw_gid) 203 | for root, dirs, files in os.walk(self.datadir): 204 | for di in dirs: 205 | os.chown(os.path.join(root, di), user.pw_uid, user.pw_gid) 206 | for fi in files: 207 | os.chown(os.path.join(root, fi), user.pw_uid, user.pw_gid) 208 | 209 | def setup_root_user(self, conn): 210 | """ 211 | Create the root user and optionally give it a random root password 212 | """ 213 | if self.mysql_random_root_password: 214 | # we could use --random-passwords in our call to `mysql_install_db` 215 | # instead here but we want to have the root password available 216 | # until we're done with this setup. 217 | chars = string.ascii_letters + string.digits + '!@#$%&^*()' 218 | passwd = ''.join([chars[int(os.urandom(1).encode('hex'), 16) % len(chars)] 219 | for _ in range(20)]) 220 | self.mysql_root_password = passwd 221 | log.info('Generated root password: %s', self.mysql_root_password) 222 | 223 | self.add('SET @@SESSION.SQL_LOG_BIN=0;') 224 | self.add('DELETE FROM `mysql`.`user` where user != \'mysql.sys\';') 225 | self.add('CREATE USER `root`@`%` IDENTIFIED BY %s ;', 226 | (self.mysql_root_password,)) 227 | self.add('GRANT ALL ON *.* TO `root`@`%` WITH GRANT OPTION ;') 228 | self.add('DROP DATABASE IF EXISTS test ;') 229 | self.add('FLUSH PRIVILEGES ;') 230 | self.execute_many(conn=conn) 231 | 232 | def expire_root_password(self, conn): 233 | """ optionally expire the root password """ 234 | if self.mysql_onetime_password: 235 | self.execute('ALTER USER `root`@`%` PASSWORD EXPIRE', conn=conn) 236 | 237 | def create_db(self, conn): 238 | """ this optional schema will be used by the application """ 239 | if not self.mysql_db: 240 | log.warn('No default database configured.') 241 | return 242 | sql = 'CREATE DATABASE IF NOT EXISTS `{}`;'.format(self.mysql_db) 243 | self.execute(sql, conn=conn) 244 | 245 | def create_default_user(self, conn): 246 | """ this optional user will be used by the application """ 247 | if not self.mysql_user or not self.mysql_password: 248 | log.warn('No default user/password configured.') 249 | return 250 | 251 | # there's some kind of annoying encoding bug in the lib here 252 | # so we have to format the string rather than passing it as 253 | # a param. totally safe, I bet. 
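        # For example, with MYSQL_USER=app (a hypothetical value) the
        # statement sent to the server is:
        #   CREATE USER `app`@`%` IDENTIFIED BY %s;
        # with the password still bound as a real query parameter.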
254 | self.add('CREATE USER `{}`@`%` IDENTIFIED BY %s;'
255 | .format(self.mysql_user), (self.mysql_password,))
256 | if self.mysql_db:
257 | self.add('GRANT ALL ON `{}`.* TO `{}`@`%`;'
258 | .format(self.mysql_db, self.mysql_user))
259 | self.add('FLUSH PRIVILEGES;')
260 | self.execute_many(conn=conn)
261 |
262 | def create_repl_user(self, conn):
263 | """ this user will be used for both replication and backups """
264 | if not self.repl_user or not self.repl_password:
265 | log.error('No replication user/password configured.')
266 | return
267 |
268 | self.add('CREATE USER `{}`@`%` IDENTIFIED BY %s; '
269 | .format(self.repl_user), (self.repl_password,))
270 | self.add('GRANT SUPER, SELECT, INSERT, REPLICATION SLAVE, RELOAD'
271 | ', LOCK TABLES, GRANT OPTION, REPLICATION CLIENT'
272 | ', DROP, CREATE '
273 | 'ON *.* TO `{}`@`%`; '
274 | .format(self.repl_user))
275 | self.add('FLUSH PRIVILEGES;')
276 | self.execute_many(conn=conn)
277 |
278 | def set_timezone_info(self):
279 | """
280 | Write TZ data to mysqld by piping mysql_tzinfo_to_sql to the mysql
281 | client. This is kinda gross but piping it avoids having to parse the
282 | output for a bulk insert with the Connector/MySQL client.
283 | """
284 | try:
285 | subprocess.check_output(
286 | '/usr/bin/mysql_tzinfo_to_sql /usr/share/zoneinfo | '
287 | '/usr/bin/mysql -uroot --protocol=socket '
288 | '--socket=/var/run/mysqld/mysqld.sock', shell=True)
289 | except (subprocess.CalledProcessError, OSError) as ex:
290 | log.error('mysql_tzinfo_to_sql returned error: %s', ex)
291 |
292 | def restore_from_snapshot(self, filename):
293 | """
294 | Use innobackupex to restore from a snapshot.
295 | """
296 | self.make_datadir()
297 | infile = '/tmp/backup/{}'.format(filename)
298 | subprocess.check_call(['tar', '-xif', infile, '-C', '/tmp/backup'])
299 | subprocess.check_call(['/usr/bin/innobackupex',
300 | '--force-non-empty-directories',
301 | '--copy-back',
302 | '/tmp/backup'])
303 | self.take_ownership()
304 |
305 | @debug(log_output=True)
306 | def get_primary(self):
307 | """
308 | Returns the server-id and hostname of the primary as known to MySQL
309 | """
310 | result = self.query('show slave status')
311 | if result:
312 | return result[0]['Master_Server_Id'], result[0]['Master_Host']
313 |
314 | result = self.query('show slave hosts')
315 | if not result:
316 | raise UnknownPrimary('no prior replication setup found')
317 | return result[0]['Master_id'], self.ip
318 |
319 | @debug()
320 | def setup_replication(self, primary_ip):
321 | """
322 | Set up GTID-based replication to the primary; once this is set the
323 | replica will automatically try to catch up with the primary's last
324 | transactions.
325 | """
326 | self.add('CHANGE MASTER TO '
327 | 'MASTER_HOST = %s, '
328 | 'MASTER_USER = %s, '
329 | 'MASTER_PASSWORD = %s, '
330 | 'MASTER_PORT = 3306, '
331 | 'MASTER_CONNECT_RETRY = 60, '
332 | 'MASTER_AUTO_POSITION = 1, '
333 | 'MASTER_SSL = 0; ',
334 | (primary_ip, self.repl_user, self.repl_password))
335 | self.add('START SLAVE;')
336 | self.execute_many()
337 |
338 | @debug()
339 | def failover(self, ips):
340 | """
341 | Call external `mysqlrpladmin failover`. This will determine the
342 | best primary candidate, set up replication for all candidates
343 | to the new primary, and catch up stale replicas.
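
        A sketch of the argv this builds, with a hypothetical user 'repl',
        password 'p', and two replica IPs:

            ['mysqlrpladmin',
             "--slaves=repl:'p'@192.168.1.101,repl:'p'@192.168.1.102",
             "--candidates=repl:'p'@192.168.1.101,repl:'p'@192.168.1.102",
             '--rpl-user=repl:p',
             'failover']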
344 | """
345 | user = self.repl_user
346 | passwd = self.repl_password
347 | candidates = ','.join(
348 | ["{}:'{}'@{}".format(user, passwd, ip) for ip in ips]
349 | )
350 | return subprocess.check_call(
351 | ['mysqlrpladmin',
352 | '--slaves={}'.format(candidates),
353 | '--candidates={}'.format(candidates),
354 | '--rpl-user={}:{}'.format(user, passwd),
355 | 'failover']
356 | )
357 |
358 | @debug()
359 | def get_binlog(self):
360 | """ Gets the current binlog file name """
361 | results = self.query('show master status')
362 | binlog_file = results[0]['File']
363 | return binlog_file
364 |
-------------------------------------------------------------------------------- /bin/manager/config.py: --------------------------------------------------------------------------------
1 | """ autopilotpattern/mysql ContainerPilot configuration wrapper """
2 | import os
3 | import signal
4 | import subprocess
5 |
6 | import json5
7 |
8 | from manager.env import env, to_flag
9 | from manager.utils import debug, log, UNASSIGNED
10 |
11 |
12 | # pylint: disable=invalid-name,no-self-use,dangerous-default-value
13 |
14 | class ContainerPilot(object):
15 | """
16 | ContainerPilot config is where we rewrite ContainerPilot's own config
17 | so that we can dynamically alter what service we advertise.
18 | """
19 |
20 | def __init__(self):
21 | self.state = UNASSIGNED
22 | self.path = None
23 | self.config = None
24 |
25 | def load(self, envs=os.environ):
26 | """
27 | Fetches the ContainerPilot config file and asks ContainerPilot
28 | to render it out so that all environment variables have been
29 | interpolated.
30 | """
31 | self.path = env('CONTAINERPILOT', None, envs)
32 | try:
33 | cfg = subprocess.check_output(['containerpilot', '-config',
34 | self.path, '-template'],
35 | env=envs.copy())
36 | except (subprocess.CalledProcessError, OSError) as ex:
37 | log.error('containerpilot -template returned error: %s', ex)
38 | raise(ex)
39 |
40 | config = json5.loads(cfg)
41 | self.config = config
42 |
43 | @debug(log_output=True)
44 | def update(self):
45 | """
46 | Update the on-disk config file if it is stale. Returns a
47 | bool indicating whether an update was made.
48 | """
49 | if self.state == UNASSIGNED:
50 | return False
51 | if self.state and self.config['jobs'][1]['name'] != self.state:
52 | self.config['jobs'][1]['name'] = self.state
53 | self._render()
54 | return True
55 | return False
56 |
57 | @debug
58 | def _render(self):
59 | """ Writes the current config to file. """
60 | new_config = json5.dumps(self.config)
61 | with open(self.path, 'w') as f:
62 | log.info('rewriting ContainerPilot config: %s', new_config)
63 | f.write(new_config)
64 |
65 | def reload(self):
66 | """ Force ContainerPilot to reload its configuration """
67 | log.info('Reloading ContainerPilot configuration.')
68 | try:
69 | subprocess.check_output(['containerpilot', '-reload'])
70 | except subprocess.CalledProcessError:
71 | log.info("call to 'containerpilot -reload' failed")
72 |
-------------------------------------------------------------------------------- /bin/manager/discovery.py: --------------------------------------------------------------------------------
1 | """ Module for Consul client wrapper and related tooling.
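A sketch of typical use, assuming a reachable Consul agent and the
environment variables read below:

    from manager.discovery import Consul
    consul = Consul()
    consul.put('mysql-last-binlog', 'mysql.001')
    binlog_file = consul.get('mysql-last-binlog')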
""" 2 | from datetime import datetime, timedelta 3 | import fcntl 4 | import json 5 | import os 6 | import time 7 | 8 | from manager.env import env, to_flag, \ 9 | PRIMARY_KEY, LAST_BACKUP_KEY, \ 10 | BACKUP_TTL, BACKUP_LOCK_KEY, LAST_BINLOG_KEY 11 | from manager.utils import debug, log, \ 12 | WaitTimeoutError, UnknownPrimary 13 | 14 | # pylint: disable=import-error,invalid-name,dangerous-default-value 15 | import consul as pyconsul 16 | 17 | SESSION_CACHE_FILE = env('SESSION_CACHE_FILE', '/tmp/mysql-session') 18 | SESSION_NAME = env('SESSION_NAME', 'mysql-primary-lock') 19 | SESSION_TTL = env('SESSION_TTL', 25, fn=int) 20 | FAILOVER_KEY = env('FAILOVER_IN_PROGRESS', 'FAILOVER_IN_PROGRESS') 21 | FAILOVER_SESSION_FILE = env('FAILOVER_SESSION_FILE', '/tmp/failover-session') 22 | MAX_SESSION=3600 23 | 24 | class Consul(object): 25 | """ Consul represents the Consul instance this node talks to """ 26 | 27 | def __init__(self, envs=os.environ): 28 | """ 29 | Figures out the Consul client hostname based on whether or 30 | not we're using a local Consul agent. 31 | """ 32 | if env('CONSUL_AGENT', False, envs, fn=to_flag): 33 | self.host = 'localhost' 34 | else: 35 | self.host = env('CONSUL', 'consul', envs) 36 | self.client = pyconsul.Consul(host=self.host) 37 | 38 | def get(self, key): 39 | """ 40 | Return the Value field for a given Consul key. 41 | Handles None results safely but lets all other exceptions 42 | just bubble up. 43 | """ 44 | result = self.client.kv.get(key) 45 | if result[1]: 46 | return result[1]['Value'] 47 | return None 48 | 49 | def put(self, key, value): 50 | """ Puts a value for the key; allows all exceptions to bubble up """ 51 | return self.client.kv.put(key, value) 52 | 53 | @debug(log_output=True) 54 | def get_session(self, key=SESSION_NAME, ttl=SESSION_TTL, 55 | on_disk=SESSION_CACHE_FILE, cached=True): 56 | """ 57 | Gets a Consul session ID from the on-disk cache or calls into 58 | `create_session` to generate a new one. 59 | We can't rely on storing Consul session IDs in memory because 60 | handler calls happen in subsequent processes. Here we create a 61 | session on Consul and cache the session ID to disk. 62 | Returns the session ID. 
63 | """ 64 | if not cached: 65 | return self.create_session(key, ttl) 66 | try: 67 | with open(on_disk, 'r+') as f: 68 | session_id = f.read() 69 | except IOError: 70 | session_id = self.create_session(key, ttl) 71 | if cached: 72 | with open(on_disk, 'w') as f: 73 | f.write(session_id) 74 | 75 | return session_id 76 | 77 | @debug(log_output=True) 78 | def create_session(self, key, ttl=120): 79 | """ Create a session on Consul and return the session ID """ 80 | return self.client.session.create(name=key, 81 | behavior='release', 82 | ttl=ttl) 83 | 84 | @debug(log_output=True) 85 | def renew_session(self, session_id=None): 86 | """ Renews the session TTL on Consul """ 87 | if not session_id: 88 | session_id = self.get_session() 89 | self.client.session.renew(session_id) 90 | 91 | @debug(log_output=True) 92 | def lock(self, key, value, session_id): 93 | """ Puts a key to Consul with an advisory lock """ 94 | return self.client.kv.put(key, value, acquire=session_id) 95 | 96 | @debug 97 | def unlock(self, key, session_id): 98 | """ Clears a key in Consul and its advisory lock """ 99 | return self.client.kv.put(key, "", release=session_id) 100 | 101 | @debug(log_output=True) 102 | def is_locked(self, key): 103 | """ 104 | Checks a lock in Consul and returns the session_id if the 105 | lock is still valid, otherwise False 106 | """ 107 | lock = self.client.kv.get(key) 108 | try: 109 | session_lock = lock[1]['Session'] 110 | return session_lock 111 | except KeyError: 112 | return False 113 | 114 | @debug(log_output=True) 115 | def read_lock(self, key): 116 | """ 117 | Checks a lock in Consul and returns the (session_id, value) if the 118 | lock is still valid, otherwise (None, None) 119 | """ 120 | lock = self.client.kv.get(key) 121 | try: 122 | if not lock[1]: 123 | raise KeyError 124 | session_lock = lock[1]['Session'] 125 | value = lock[1]['Value'] 126 | return session_lock, value 127 | except KeyError: 128 | return None, None 129 | 130 | @debug(log_output=True) 131 | def get_primary(self, timeout=10): 132 | """ 133 | Returns the (name, IP) tuple for the instance that Consul thinks 134 | is the healthy primary. 135 | """ 136 | while timeout > 0: 137 | try: 138 | nodes = self.client.health.service(PRIMARY_KEY, passing=True)[1] 139 | log.debug(nodes) 140 | instances = [service['Service'] for service in nodes] 141 | if len(instances) > 1: 142 | raise UnknownPrimary('Multiple primaries detected! %s', instances) 143 | return instances[0]['ID'], instances[0]['Address'] 144 | except pyconsul.ConsulException as ex: 145 | log.debug(ex) 146 | timeout = timeout - 1 147 | time.sleep(1) 148 | except (IndexError, KeyError): 149 | raise UnknownPrimary('No primary found') 150 | raise WaitTimeoutError('Could not find primary before timeout.') 151 | 152 | @debug 153 | def mark_as_primary(self, name): 154 | """ Write flag to Consul to mark this node as primary """ 155 | session_id = self.get_session() 156 | if not self.lock(PRIMARY_KEY, name, session_id): 157 | return False 158 | return session_id 159 | 160 | @debug 161 | def lock_failover(self, hostname): 162 | """ 163 | Lock a session in Consul for the failover and cache the 164 | session as a file on disk. 165 | """ 166 | session_id = self.get_session(FAILOVER_KEY, ttl=120, 167 | on_disk=FAILOVER_SESSION_FILE) 168 | return self.lock(FAILOVER_KEY, hostname, session_id) 169 | 170 | @debug 171 | def wait_for_failover_lock(self): 172 | """ 173 | Block forever waiting on the session lock on the 174 | failover to complete. 
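
        A sketch of how the session and lock helpers above fit together
        in a caller (hypothetical code; get_session/renew_session mirror
        the health-check flow, the rest mirrors what manage.on_change
        appears to do):

            session_id = node.consul.get_session()  # created once, cached on disk
            node.consul.renew_session(session_id)   # renewed on each health pass
            if node.consul.lock_failover(node.name):
                try:
                    node.mysql.failover(ips)
                except Exception:
                    node.consul.unlock_failover()
                    raise
            else:
                node.consul.wait_for_failover_lock()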
175 | """
176 | while True:
177 | if not self.is_locked(FAILOVER_KEY):
178 | break
179 | time.sleep(3)
180 |
181 | @debug
182 | def unlock_failover(self):
183 | """
184 | If we've previously locked a session for failover and a new
185 | primary has registered as healthy, unlock the session and
186 | remove the session file.
187 | """
188 | try:
189 | with open(FAILOVER_SESSION_FILE, 'r') as f:
190 | session_id = f.read()
191 | if self.get_primary():
192 | self.unlock(FAILOVER_KEY, session_id)
193 | os.remove(FAILOVER_SESSION_FILE)
194 | except (IOError, OSError):
195 | # we don't have a session file so just move on
196 | pass
197 | except (UnknownPrimary, WaitTimeoutError):
198 | # the primary isn't ready yet so we'll try
199 | # to unlock again on the next pass
200 | log.debug('failover session lock (%s) not removed because '
201 | 'primary has not reported as healthy', session_id)
202 |
203 |
204 |
205 |
206 | @debug(log_output=True)
207 | def has_snapshot(self, timeout=600):
208 | """ Ask Consul for 'last backup' key. """
209 | while timeout > 0:
210 | try:
211 | result = self.client.kv.get(LAST_BACKUP_KEY)
212 | if result[1]:
213 | return json.loads(result[1]['Value'])['id']
214 | return None
215 | except pyconsul.ConsulException:
216 | # Consul isn't up yet
217 | timeout -= 1
218 | time.sleep(1)
219 | except (KeyError, TypeError, ValueError):
220 | raise # unexpected value / invalid JSON in Consul
221 | raise WaitTimeoutError('Could not contact Consul to check '
222 | 'for snapshot after %s seconds', timeout)
223 |
224 |
225 | @debug
226 | def lock_snapshot(self, hostname):
227 | """
228 | Lock a session in Consul for the snapshot and cache the
229 | session as a file on disk. Prevents more than one attempt
230 | to lock the session in Consul by using a lock on the local
231 | session file too.
232 | """
233 | lock_filename = '/tmp/' + BACKUP_LOCK_KEY
234 | session_id = self.get_session(BACKUP_LOCK_KEY, ttl=MAX_SESSION,
235 | on_disk=lock_filename)
236 | try:
237 | lock_file = open(lock_filename, 'r+')
238 | fcntl.flock(lock_file, fcntl.LOCK_EX|fcntl.LOCK_NB)
239 | return self.lock(BACKUP_LOCK_KEY, hostname, session_id)
240 | except IOError:
241 | # couldn't obtain local file lock
242 | return False
243 |
244 | @debug
245 | def unlock_snapshot(self):
246 | """
247 | If we've previously locked a session for snapshot, unlock
248 | the session and remove the session file.
249 | """
250 | lock_filename = '/tmp/' + BACKUP_LOCK_KEY
251 | try:
252 | with open(lock_filename, 'r+') as f:
253 | session_id = f.read()
254 | self.unlock(BACKUP_LOCK_KEY, session_id)
255 | fcntl.flock(f, fcntl.LOCK_UN)
256 | os.remove(lock_filename)
257 | except (IOError, OSError):
258 | # we don't have a session file so just move on
259 | pass
260 |
261 | @debug
262 | def record_backup(self, backup_id, backup_time, binlog_file):
263 | backup_val = {'id': backup_id, 'dt': backup_time}
264 | self.put(LAST_BACKUP_KEY, json.dumps(backup_val))
265 | self.put(LAST_BINLOG_KEY, binlog_file)
266 |
267 | @debug
268 | def is_snapshot_stale(self, binlog_file):
269 | """ Check if it's time to do a snapshot """
270 | if self._is_binlog_stale(binlog_file):
271 | return True
272 |
273 | result = self.get(LAST_BACKUP_KEY)
274 | try:
275 | dt = json.loads(result)['dt']
276 | except (KeyError, TypeError, ValueError):
277 | # TODO: should we log this and return True so we recover?
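# For context, the value stored by record_backup() above looks like
# this (hypothetical id and timestamp):
#   {"id": "mysql-backup-2017-06-01T00-00-00Z",
#    "dt": "2017-06-01T00:00:00.000000"}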
278 | raise # unexpected value / invalid JSON in Consul 279 | 280 | parsed_dt = datetime.strptime(dt, "%Y-%m-%dT%H:%M:%S.%f") 281 | yesterday = datetime.utcnow() - timedelta(days=1) 282 | if parsed_dt < yesterday: 283 | return True 284 | 285 | return False 286 | 287 | @debug 288 | def _is_binlog_stale(self, binlog_file): 289 | """ Compare current binlog to that recorded w/ Consul """ 290 | try: 291 | last_binlog_file = self.get(LAST_BINLOG_KEY) 292 | except (IndexError, KeyError): 293 | return True 294 | return binlog_file != last_binlog_file 295 | -------------------------------------------------------------------------------- /bin/manager/env.py: -------------------------------------------------------------------------------- 1 | """ environment functions """ 2 | import os 3 | 4 | # pylint: disable=invalid-name,no-self-use,dangerous-default-value 5 | 6 | # --------------------------------------------------------- 7 | # misc utility functions for setting up environment 8 | 9 | def env(key, default, environ=os.environ, fn=None): 10 | """ 11 | Gets an environment variable, trims away comments and whitespace, 12 | and expands other environment variables. 13 | """ 14 | val = environ.get(key, default) 15 | try: 16 | val = val.split('#')[0] 17 | val = val.strip() 18 | val = os.path.expandvars(val) 19 | except (AttributeError, IndexError): 20 | # just swallow AttributeErrors for non-strings 21 | pass 22 | if fn: # transformation function 23 | val = fn(val) 24 | return val 25 | 26 | def to_flag(val): 27 | """ 28 | Parse environment variable strings like "yes/no", "on/off", 29 | "true/false", "1/0" into a bool. 30 | """ 31 | try: 32 | return bool(int(val)) 33 | except ValueError: 34 | val = val.lower() 35 | if val in ('false', 'off', 'no', 'n'): 36 | return False 37 | # non-"1" or "0" string, we'll treat as truthy 38 | return bool(val) 39 | 40 | 41 | # env values for keys 42 | PRIMARY_KEY = env('PRIMARY_KEY', env('SERVICE_NAME','mysql')+'-primary') 43 | LAST_BACKUP_KEY = env('LAST_BACKUP_KEY', 'mysql-last-backup') 44 | BACKUP_LOCK_KEY = env('BACKUP_LOCK_KEY', 'mysql-backup-running') 45 | LAST_BINLOG_KEY = env('LAST_BINLOG_KEY', 'mysql-last-binlog') 46 | BACKUP_NAME = env('BACKUP_NAME', 'mysql-backup-%Y-%m-%dT%H-%M-%SZ') 47 | BACKUP_TTL = env('BACKUP_TTL', 86400, fn='{}s'.format) # every 24 hours 48 | -------------------------------------------------------------------------------- /bin/manager/network.py: -------------------------------------------------------------------------------- 1 | """ network functions """ 2 | import fcntl 3 | import socket 4 | import struct 5 | 6 | 7 | def get_ip(iface='eth0'): 8 | """ 9 | Use Linux SIOCGIFADDR ioctl to get the IP for the interface. 10 | ref http://code.activestate.com/recipes/439094-get-the-ip-address\ 11 | -associated-with-a-network-inter/ 12 | """ 13 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 14 | return socket.inet_ntoa(fcntl.ioctl( 15 | sock.fileno(), 16 | 0x8915, # SIOCGIFADDR 17 | struct.pack('256s', iface[:15]) 18 | )[20:24]) 19 | -------------------------------------------------------------------------------- /bin/manager/storage/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from manager.utils import debug 3 | 4 | 5 | class SnapshotBackup(object): 6 | """ 7 | The SnapshotBackup class defines an expected interface to the 8 | backup storage, where we'll put our MySQL snapshots. 
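
    A new backend needs only these two methods; a minimal sketch
    (hypothetical, mirroring the Local/Manta/Minio classes that follow):

        class NullStore(object):
            def __init__(self, envs=os.environ):
                self.dir = envs.get('STORAGE_DIR', '/tmp/snapshots')
            def get_backup(self, backup_id):
                pass            # fetch to /tmp/backup/<backup_id>
            def put_backup(self, backup_id, infile):
                return infile   # store and return the storage path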
9 | """
10 | def __init__(self, envs=os.environ):
11 | raise NotImplementedError
12 |
13 | @debug
14 | def get_backup(self, backup_id):
15 | """
16 | fetch the snapshot file from the storage location, allowing
17 | exceptions to bubble up to the caller
18 | """
19 | raise NotImplementedError
20 |
21 | @debug
22 | def put_backup(self, backup_id, infile):
23 | """
24 | store the snapshot file to the expected path, allowing
25 | exceptions to bubble up to the caller.
26 | """
27 | raise NotImplementedError
28 |
-------------------------------------------------------------------------------- /bin/manager/storage/local.py: --------------------------------------------------------------------------------
1 | """ Module for storing snapshots in shared local disk """
2 | import os
3 | from shutil import copyfile
4 |
5 | from manager.env import env
6 | from manager.utils import debug
7 |
8 | class Local(object):
9 | """
10 |
11 | The Local class wraps access to shared local disk storage, where we'll
12 | put our MySQL backups.
13 | """
14 | def __init__(self, envs=os.environ):
15 | self.dir = env('STORAGE_DIR', '/tmp/snapshots', envs)
16 |
17 | @debug
18 | def get_backup(self, backup_id):
19 | """
20 | copies snapshot from 'STORAGE_DIR' location to a working
21 | directory so it can be loaded into the DB without worrying
22 | about other processes writing to the snapshot.
23 | """
24 | try:
25 | os.mkdir(self.dir, 0770)
26 | except OSError:
27 | pass
28 | try:
29 | os.mkdir('/tmp/backup', 0770)
30 | except OSError:
31 | pass
32 |
33 | dst = '/tmp/backup/{}'.format(backup_id)
34 | src = '{}/{}'.format(self.dir, backup_id)
35 | copyfile(src, dst)
36 |
37 | def put_backup(self, backup_id, src):
38 | """
39 | copies snapshot to 'STORAGE_DIR'
40 | """
41 | dst = '{}/{}'.format(self.dir, backup_id)
42 | copyfile(src, dst)
43 | return dst
44 |
-------------------------------------------------------------------------------- /bin/manager/storage/manta_stor.py: --------------------------------------------------------------------------------
1 | """ Module for Manta client wrapper and related tooling. """
2 | import logging
3 | import os
4 |
5 | from manager.env import env, to_flag
6 | from manager.utils import debug
7 |
8 | # pylint: disable=import-error,dangerous-default-value,invalid-name
9 | import manta as pymanta
10 |
11 | # the Manta client barfs if we log the body of binary data
12 | logging.getLogger('manta').setLevel(logging.INFO)
13 |
14 | class Manta(object):
15 | """
16 | The Manta class wraps access to the Manta object store, where we'll put
17 | our MySQL backups.
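
    A sketch of the environment it expects (placeholder values); note
    that MANTA_PRIVATE_KEY carries its newlines as '#' characters so the
    key survives being passed through a single environment variable:

        store = Manta(envs={
            'MANTA_USER': 'myaccount',
            'MANTA_KEY_ID': 'aa:bb:cc:dd:ee:ff:00:11:22:33:44:55:66:77:88:99',
            'MANTA_PRIVATE_KEY': '-----BEGIN RSA PRIVATE KEY-----#...',
            'MANTA_BUCKET': '/myaccount/stor/mysql',
        })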
18 | """ 19 | def __init__(self, envs=os.environ): 20 | self.account = env('MANTA_USER', None, envs) 21 | self.user = env('MANTA_SUBUSER', None, envs) 22 | self.role = env('MANTA_ROLE', None, envs) 23 | self.key_id = env('MANTA_KEY_ID', None, envs) 24 | self.url = env('MANTA_URL', 'https://us-east.manta.joyent.com', envs) 25 | self.bucket = env('MANTA_BUCKET', '/{}/stor'.format(self.account), envs) 26 | is_tls = env('MANTA_TLS_INSECURE', False, envs, fn=to_flag) 27 | 28 | # we don't want to use `env` here because we have a different 29 | # de-munging to do 30 | self.private_key = envs.get('MANTA_PRIVATE_KEY', '').replace('#', '\n') 31 | self.signer = pymanta.PrivateKeySigner(self.key_id, self.private_key) 32 | self.client = pymanta.MantaClient( 33 | self.url, 34 | self.account, 35 | subuser=self.user, 36 | role=self.role, 37 | disable_ssl_certificate_validation=is_tls, 38 | signer=self.signer) 39 | 40 | @debug 41 | def get_backup(self, backup_id): 42 | """ Download file from Manta, allowing exceptions to bubble up """ 43 | try: 44 | os.mkdir('/tmp/backup', 0770) 45 | except OSError: 46 | pass 47 | outfile = '/tmp/backup/{}'.format(backup_id) 48 | mpath = '{}/{}'.format(self.bucket, backup_id) 49 | data = self.client.get_object(mpath) 50 | with open(outfile, 'w') as f: 51 | f.write(data) 52 | 53 | def put_backup(self, backup_id, infile): 54 | """ Upload the backup file to the expected path """ 55 | # TODO: stream this backup once python-manta supports it: 56 | # ref https://github.com/joyent/python-manta/issues/6 57 | mpath = '{}/{}'.format(self.bucket, backup_id) 58 | with open(infile, 'r') as f: 59 | self.client.put_object(mpath, file=f) 60 | return mpath 61 | -------------------------------------------------------------------------------- /bin/manager/storage/minio_stor.py: -------------------------------------------------------------------------------- 1 | """ Module for storing snapshots in shared local disk """ 2 | import logging 3 | import os 4 | from shutil import copyfile 5 | 6 | from manager.env import env, to_flag 7 | from manager.utils import debug 8 | from minio import Minio as pyminio, error as minioerror 9 | 10 | logging.getLogger('manta').setLevel(logging.INFO) 11 | 12 | class Minio(object): 13 | """ 14 | 15 | The Minio class wraps access to the Minio object store, where we'll put 16 | our MySQL backups. 17 | """ 18 | def __init__(self, envs=os.environ): 19 | self.access_key = env('MINIO_ACCESS_KEY', None, envs) 20 | self.secret_key = env('MINIO_SECRET_KEY', None, envs) 21 | self.bucket = env('MINIO_BUCKET', 'backups', envs) 22 | self.location = env('MINIO_LOCATION', 'us-east-1', envs) 23 | self.url = env('MINIO_URL', 'minio:9000') 24 | is_tls = env('MINIO_TLS_SECURE', False, envs, fn=to_flag) 25 | 26 | self.client = pyminio( 27 | self.url, 28 | access_key=self.access_key, 29 | secret_key=self.secret_key, 30 | secure=is_tls) 31 | try: 32 | self.client.make_bucket(self.bucket, location=self.location) 33 | except minioerror.BucketAlreadyOwnedByYou: 34 | pass 35 | 36 | @debug 37 | def get_backup(self, backup_id): 38 | """ 39 | Download file from Minio, allowing exceptions to bubble up. 40 | """ 41 | try: 42 | os.mkdir('/tmp/backup', 0770) 43 | except OSError: 44 | pass 45 | outfile = '/tmp/backup/{}'.format(backup_id) 46 | self.client.fget_object(self.bucket, backup_id, outfile) 47 | 48 | def put_backup(self, backup_id, infile): 49 | """ 50 | Upload the backup file to the expected path. 
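
        For example (hypothetical backup id), assuming the credentials
        and MINIO_BUCKET picked up in __init__ above:

            store = Minio()
            store.put_backup('mysql-backup-2017-06-01T00-00-00Z',
                             '/tmp/backup/mysql-backup-2017-06-01T00-00-00Z')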
51 | """ 52 | self.client.fput_object(self.bucket, backup_id, infile) 53 | return backup_id 54 | -------------------------------------------------------------------------------- /bin/manager/utils.py: -------------------------------------------------------------------------------- 1 | """ utility functions """ 2 | from functools import wraps 3 | import logging 4 | import os 5 | import sys 6 | 7 | # pylint: disable=invalid-name,no-self-use,dangerous-default-value 8 | 9 | # --------------------------------------------------------- 10 | # common consts 11 | 12 | PRIMARY = 'mysql-primary' 13 | REPLICA = 'mysql' 14 | UNASSIGNED = 'UNASSIGNED' 15 | 16 | # --------------------------------------------------------- 17 | # logging setup 18 | 19 | logging.basicConfig(format='%(levelname)s manage %(message)s', 20 | stream=sys.stdout, 21 | level=logging.getLevelName( 22 | os.environ.get('LOG_LEVEL', 'INFO'))) 23 | log = logging.getLogger() 24 | 25 | # reduce noise from requests logger 26 | logging.getLogger('requests').setLevel(logging.WARN) 27 | 28 | 29 | # --------------------------------------------------------- 30 | # errors and debugging setup 31 | 32 | class WaitTimeoutError(Exception): 33 | """ Exception raised when a timeout occurs. """ 34 | pass 35 | 36 | class UnknownPrimary(Exception): 37 | """ Exception raised when we can't figure out which node is primary """ 38 | pass 39 | 40 | def debug(fn=None, log_output=False): 41 | """ 42 | Function/method decorator to trace calls via debug logging. Acts as 43 | pass-thru if not at LOG_LEVEL=DEBUG. Normally this would kill perf but 44 | this application doesn't have significant throughput. 45 | """ 46 | def _decorate(fn, *args, **kwargs): 47 | @wraps(fn) 48 | def wrapper(*args, **kwargs): 49 | try: 50 | # because we have concurrent processes running we want 51 | # to tag each stack with an identifier for that process 52 | msg = "[{}]".format(sys.argv[1]) 53 | except IndexError: 54 | msg = "[pre_start]" 55 | if len(args) > 0: 56 | cls_name = args[0].__class__.__name__.lower() 57 | name = '{}.{}'.format(cls_name, fn.__name__) 58 | else: 59 | name = fn.__name__ 60 | log.debug('%s %s start', msg, name) 61 | out = apply(fn, args, kwargs) 62 | if log_output: # useful for checking status flags 63 | log.debug('%s %s end: %s', msg, name, out) 64 | else: 65 | log.debug('%s %s end', msg, name) 66 | return out 67 | return wrapper 68 | if fn: 69 | return _decorate(fn) 70 | return _decorate 71 | -------------------------------------------------------------------------------- /bin/test.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | import fcntl 3 | import logging 4 | import os 5 | import tempfile 6 | import unittest 7 | 8 | # pylint: disable=import-error 9 | import consul as pyconsul 10 | import json5 11 | import mock 12 | 13 | import manage 14 | # pylint: disable=invalid-name,no-self-use,dangerous-default-value 15 | from manager.client import MySQL 16 | from manager.config import ContainerPilot 17 | from manager.discovery import Consul 18 | from manager.env import * 19 | from manager.network import * 20 | from manager.storage.manta_stor import Manta 21 | from manager.utils import * 22 | 23 | 24 | class TestPreStart(unittest.TestCase): 25 | 26 | def setUp(self): 27 | logging.getLogger().setLevel(logging.WARN) 28 | consul = mock.MagicMock() 29 | manta = mock.MagicMock() 30 | my = mock.MagicMock() 31 | my.datadir = tempfile.mkdtemp() 32 | self.node = manage.Node(consul=consul, 
snaps=manta, mysql=my) 33 | 34 | def tearDown(self): 35 | logging.getLogger().setLevel(logging.DEBUG) 36 | 37 | def test_pre_start_first_node(self): 38 | """ 39 | The first node will not attempt to download a snapshot from Manta. 40 | """ 41 | self.node.consul.has_snapshot.return_value = False 42 | manage.pre_start(self.node) 43 | self.node.consul.has_snapshot.assert_called_once() 44 | self.node.mysql.initialize_db.assert_called_once() 45 | self.assertFalse(self.node.snaps.get_backup.called) 46 | self.assertFalse(self.node.mysql.restore_from_snapshot.called) 47 | 48 | def test_pre_start_snapshot_complete(self): 49 | """ 50 | Given a successful snapshot by the first node, a new node will 51 | download the snapshot from Manta 52 | """ 53 | self.node.consul.has_snapshot.return_value = True 54 | manage.pre_start(self.node) 55 | self.node.consul.has_snapshot.assert_called_once() 56 | self.node.snaps.get_backup.assert_called_once() 57 | self.node.mysql.restore_from_snapshot.assert_called_once() 58 | self.assertFalse(self.node.mysql.initialize_db.called) 59 | 60 | def test_pre_start_no_reinitialization(self): 61 | """ 62 | Given a node that's restarted, pre_start should not try 63 | to re-initialize the node. 64 | """ 65 | os.mkdir(os.path.join(self.node.mysql.datadir, 'mysql')) 66 | self.node.consul.has_snapshot.return_value = True 67 | manage.pre_start(self.node) 68 | self.assertFalse(self.node.consul.has_snapshot.called) 69 | 70 | def test_pre_start_snapshot_incomplete(self): 71 | """ 72 | Given a snapshot that has been marked successful but not 73 | completed, a new node will wait and not crash. 74 | """ 75 | self.node.consul = Consul(get_environ()) 76 | self.node.consul.client = mock.MagicMock() 77 | 78 | def kv_gets(*args, **kwargs): 79 | yield pyconsul.ConsulException() 80 | yield [0, {'Value': '{"id": "xxxx", "dt": "yyyyy"}'}] 81 | 82 | self.node.consul.client.kv.get.side_effect = kv_gets() 83 | 84 | manage.pre_start(self.node) 85 | self.node.snaps.get_backup.assert_called_once() 86 | self.assertEqual(self.node.consul.client.kv.get.call_count, 2) 87 | self.node.mysql.restore_from_snapshot.assert_called_once() 88 | self.assertFalse(self.node.mysql.initialize_db.called) 89 | 90 | 91 | class TestHealth(unittest.TestCase): 92 | 93 | LOCK_PATH = '/var/run/init.lock' 94 | 95 | def setUp(self): 96 | logging.getLogger().setLevel(logging.WARN) 97 | consul = mock.MagicMock() 98 | my = mock.MagicMock() 99 | cp = ContainerPilot() 100 | cp.load(get_environ()) 101 | temp_file = tempfile.NamedTemporaryFile() 102 | cp.path = temp_file.name 103 | my.datadir = tempfile.mkdtemp() 104 | self.node = manage.Node(consul=consul, cp=cp, mysql=my) 105 | self.node.ip = '192.168.1.101' 106 | self.node.name = 'node1' 107 | 108 | def tearDown(self): 109 | logging.getLogger().setLevel(logging.DEBUG) 110 | try: 111 | os.rmdir(self.LOCK_PATH) 112 | except: 113 | pass 114 | 115 | def test_primary_first_pass(self): 116 | """ 117 | Given uninitialized node w/ no other instances running, 118 | set up for running as the primary. 
119 | """ 120 | self.node.mysql.wait_for_connection.return_value = True 121 | self.node.mysql.get_primary.side_effect = UnknownPrimary() 122 | 123 | self.node.consul = Consul(envs=get_environ()) 124 | self.node.consul.client = mock.MagicMock() 125 | self.node.consul.mark_as_primary = mock.MagicMock(return_value=True) 126 | self.node.consul.renew_session = mock.MagicMock() 127 | manage.write_snapshot = mock.MagicMock(return_value=True) 128 | self.node.consul.client.health.service.return_value = () 129 | 130 | try: 131 | manage.health(self.node) 132 | self.fail('Should have exited but did not.') 133 | except SystemExit: 134 | pass 135 | 136 | calls = [ 137 | mock.call.setup_root_user(True), 138 | mock.call.create_db(True), 139 | mock.call.create_default_user(True), 140 | mock.call.create_repl_user(True), 141 | mock.call.expire_root_password(True) 142 | ] 143 | self.node.mysql.assert_has_calls(calls) 144 | manage.write_snapshot.assert_called_once() 145 | self.assertEqual(self.node.cp.state, PRIMARY) 146 | 147 | def test_primary_typical(self): 148 | """ Typical health check for primary with established replication """ 149 | os.mkdir(self.LOCK_PATH, 0700) 150 | self.node.mysql.get_primary.return_value = ('node1', '192.168.1.101') 151 | manage.health(self.node) 152 | self.node.consul.renew_session.assert_called_once() 153 | self.node.mysql.query.assert_called_once() # just the select 1 154 | self.assertEqual(self.node.cp.state, PRIMARY) 155 | 156 | def test_primary_no_replicas(self): 157 | """ Health check if previously initialized but with no replicas """ 158 | os.mkdir(self.LOCK_PATH, 0700) 159 | self.node.mysql = MySQL(envs=get_environ()) 160 | self.node.mysql._conn = mock.MagicMock() 161 | self.node.mysql.query = mock.MagicMock(return_value=()) 162 | 163 | self.node.consul = Consul(envs=get_environ()) 164 | self.node.consul.client = mock.MagicMock() 165 | self.node.consul.renew_session = mock.MagicMock() 166 | self.node.consul.client.health.service.return_value = [0, [{ 167 | 'Service' : {'ID': 'node1', 'Address': '192.168.1.101'}, 168 | }]] 169 | 170 | manage.health(self.node) 171 | calls = [ 172 | mock.call.query('show slave status'), 173 | mock.call.query('show slave hosts'), 174 | mock.call.query('select 1') 175 | ] 176 | self.node.mysql.query.assert_has_calls(calls) 177 | self.node.consul.client.health.service.assert_called_once() 178 | self.node.consul.renew_session.assert_called_once() 179 | self.assertEqual(self.node.cp.state, PRIMARY) 180 | 181 | def test_primary_no_replicas_no_consul_state_fails(self): 182 | """ 183 | Health check if previously initialized but with no replicas 184 | and no Consul state so we'll remain marked UNASSIGNED which 185 | needs to be a failing health check. 
186 | """
187 | os.mkdir(self.LOCK_PATH, 0700)
188 | self.node.mysql = MySQL(envs=get_environ())
189 | self.node.mysql._conn = mock.MagicMock()
190 | self.node.mysql.query = mock.MagicMock(return_value=())
191 |
192 | self.node.consul = Consul(envs=get_environ())
193 | self.node.consul.client = mock.MagicMock()
194 | self.node.consul.renew_session = mock.MagicMock()
195 | self.node.consul.client.health.service.return_value = []
196 |
197 | try:
198 | logging.getLogger().setLevel(logging.CRITICAL) # noisy
199 | manage.health(self.node)
200 | self.fail('Should have exited but did not.')
201 | except SystemExit:
202 | pass
203 | calls = [
204 | mock.call.query('show slave status'),
205 | mock.call.query('show slave hosts'),
206 | ]
207 | self.node.mysql.query.assert_has_calls(calls)
208 | self.assertEqual(self.node.consul.client.health.service.call_count, 2)
209 | self.assertEqual(self.node.cp.state, UNASSIGNED)
210 |
211 | def test_replica_typical(self):
212 | """
213 | Typical health check for replica with established replication
214 | """
215 | os.mkdir(self.LOCK_PATH, 0700)
216 | self.node.mysql = MySQL(envs=get_environ())
217 | self.node.mysql._conn = mock.MagicMock()
218 | self.node.mysql.query = mock.MagicMock(return_value=[
219 | {'Master_Server_Id': 'node2', 'Master_Host': '192.168.1.102'}])
220 |
221 | manage.health(self.node)
222 | self.assertFalse(self.node.consul.renew_session.called)
223 | calls = [
224 | mock.call.query('show slave status'),
225 | mock.call.query('show slave status')
226 | ]
227 | self.node.mysql.query.assert_has_calls(calls)
228 | self.assertEqual(self.node.cp.state, REPLICA)
229 |
230 | def test_replica_no_replication(self):
231 | """
232 | Health check for failure mode where initial replication setup
233 | failed but a primary already exists in Consul.
234 | """
235 | os.mkdir(self.LOCK_PATH, 0700)
236 | self.node.mysql = MySQL(envs=get_environ())
237 | self.node.mysql._conn = mock.MagicMock()
238 | self.node.mysql.query = mock.MagicMock(return_value=())
239 | self.node.consul = Consul(envs=get_environ())
240 | self.node.consul.client = mock.MagicMock()
241 | self.node.consul.renew_session = mock.MagicMock()
242 | self.node.consul.client.health.service.return_value = [0, [{
243 | 'Service' : {'ID': 'node2', 'Address': '192.168.1.102'},
244 | }]]
245 |
246 | try:
247 | logging.getLogger().setLevel(logging.CRITICAL) # noisy
248 | manage.health(self.node)
249 | self.fail('Should have exited but did not.')
250 | except SystemExit:
251 | pass
252 | calls = [
253 | mock.call.query('show slave status'),
254 | mock.call.query('show slave hosts'),
255 | mock.call.query('show slave status')
256 | ]
257 | self.node.mysql.query.assert_has_calls(calls)
258 | self.assertFalse(self.node.consul.renew_session.called)
259 | self.assertEqual(self.node.cp.state, REPLICA)
260 |
261 | def test_replica_first_pass(self):
262 | """
263 | Given an uninitialized node w/ a healthy primary, set up replication.
264 | """
265 | self.node.mysql = MySQL(envs=get_environ())
266 | self.node.mysql._conn = mock.MagicMock()
267 | self.node.mysql.query = mock.MagicMock()
268 |
269 | def query_results(*args, **kwargs):
270 | yield ()
271 | yield () # and after two hits we've set up replication
272 | yield [{'Master_Server_Id': 'node2', 'Master_Host': '192.168.1.102'}]
273 |
274 | self.node.mysql.query.side_effect = query_results()
275 | self.node.mysql.wait_for_connection = mock.MagicMock(return_value=True)
276 | self.node.mysql.setup_replication = mock.MagicMock(return_value=True)
277 |
278 | self.node.consul = Consul(envs=get_environ())
279 | self.node.consul.client = mock.MagicMock()
280 | self.node.consul.client.health.service.return_value = [0, [{
281 | 'Service' : {'ID': 'node2', 'Address': '192.168.1.102'},
282 | }]]
283 |
284 | manage.health(self.node)
285 | calls = [
286 | mock.call.query('show slave status'),
287 | mock.call.query('show slave hosts'),
288 | mock.call.query('show slave status')
289 | ]
290 | self.node.mysql.query.assert_has_calls(calls)
291 | self.assertEqual(self.node.consul.client.health.service.call_count, 2)
292 | manage.write_snapshot.assert_called_once()
293 | self.assertEqual(self.node.cp.state, REPLICA)
294 |
295 | def test_replica_first_pass_replication_setup_fails(self):
296 | """
297 | Given uninitialized node w/ failed replication setup, fail
298 | """
299 | self.node.mysql = MySQL(envs=get_environ())
300 | self.node.mysql._conn = mock.MagicMock()
301 | self.node.mysql.query = mock.MagicMock(return_value=())
302 | self.node.mysql.wait_for_connection = mock.MagicMock(return_value=True)
303 | self.node.mysql.setup_replication = mock.MagicMock(return_value=True)
304 |
305 | self.node.consul = Consul(envs=get_environ())
306 | self.node.consul.client = mock.MagicMock()
307 | self.node.consul.client.health.service.return_value = [0, [{
308 | 'Service' : {'ID': 'node2', 'Address': '192.168.1.102'},
309 | }]]
310 | try:
311 | logging.getLogger().setLevel(logging.CRITICAL) # noisy
312 | manage.health(self.node)
313 | self.fail('Should have exited but did not.')
314 | except SystemExit:
315 | pass
316 | calls = [
317 | mock.call.query('show slave status'),
318 | mock.call.query('show slave hosts'),
319 | mock.call.query('show slave status')
320 | ]
321 | self.node.mysql.query.assert_has_calls(calls)
322 | self.assertEqual(self.node.consul.client.health.service.call_count, 2)
323 | manage.write_snapshot.assert_called_once()
324 | self.assertEqual(self.node.cp.state, REPLICA)
325 |
326 | def test_replica_first_pass_primary_lockout(self):
327 | """
328 | Given an uninitialized node w/ no primary, then a healthy primary;
329 | retry setting up as a replica
330 | """
331 | self.node.mysql.wait_for_connection.return_value = True
332 | self.node.mysql.get_primary.side_effect = UnknownPrimary()
333 |
334 | self.node.consul = Consul(envs=get_environ())
335 | self.node.consul.client = mock.MagicMock()
336 | self.node.consul.mark_as_primary = mock.MagicMock(return_value=False)
337 | self.node.consul.client.health.service.return_value = ()
338 | try:
339 | logging.getLogger().setLevel(logging.CRITICAL) # noisy
340 | manage.health(self.node)
341 | self.fail('Should have exited but did not.')
342 | except SystemExit:
343 | pass
344 |
345 | self.assertEqual(self.node.cp.state, UNASSIGNED)
346 |
347 |
348 |
349 | class TestOnChange(unittest.TestCase):
350 |
351 | def setUp(self):
352 | logging.getLogger().setLevel(logging.WARN)
353 | consul = mock.MagicMock()
354 | my = mock.MagicMock()
355 | cp = 
ContainerPilot() 356 | cp.load(get_environ()) 357 | temp_file = tempfile.NamedTemporaryFile() 358 | cp.path = temp_file.name 359 | cp.reload = mock.MagicMock(return_value=True) 360 | 361 | self.node = manage.Node(consul=consul, cp=cp, mysql=my) 362 | self.node.ip = '192.168.1.101' 363 | self.node.name = 'node1' 364 | 365 | def tearDown(self): 366 | logging.getLogger().setLevel(logging.DEBUG) 367 | 368 | def test_this_node_already_set_primary(self): 369 | """ 370 | Given that another node has run the failover and set this node 371 | as primary, then this node will be primary and updates its 372 | ContainerPilot config as required 373 | """ 374 | self.node.mysql.get_primary.return_value = ('node1', '192.168.1.101') 375 | manage.on_change(self.node) 376 | 377 | self.node.consul.put.assert_called_once() 378 | self.node.cp.reload.assert_called_once() 379 | self.assertEqual(self.node.cp.state, PRIMARY) 380 | 381 | def test_another_node_already_set_primary(self): 382 | """ 383 | Given that another node has run the failover and set some other 384 | node as primary, then this node will not be primary and needs to 385 | do nothing. 386 | """ 387 | self.node.mysql.get_primary.return_value = ('node1', '192.168.1.102') 388 | manage.on_change(self.node) 389 | 390 | self.assertFalse(self.node.consul.put.called) 391 | self.node.consul.get_primary.assert_called_once() 392 | self.assertFalse(self.node.cp.reload.called) 393 | self.assertEqual(self.node.cp.state, REPLICA) 394 | 395 | def test_failover_runs_this_node_is_primary(self): 396 | """ 397 | Given a successful failover where this node is marked primary, 398 | the node will update its ContainerPilot config as required 399 | """ 400 | def query_results(*args, **kwargs): 401 | yield () 402 | yield () # and after two hits we've set up replication 403 | yield [{'Master_Server_Id': 'node1', 'Master_Host': '192.168.1.101'}] 404 | 405 | self.node.mysql = MySQL(envs=get_environ()) 406 | self.node.mysql._conn = mock.MagicMock() 407 | self.node.mysql.query = mock.MagicMock(side_effect=query_results()) 408 | self.node.mysql.failover = mock.MagicMock() 409 | 410 | def consul_get_primary_results(*args, **kwargs): 411 | yield UnknownPrimary() 412 | yield UnknownPrimary() 413 | yield ('node1', '192.168.1.101') 414 | 415 | self.node.consul.get_primary.side_effect = consul_get_primary_results() 416 | self.node.consul.lock.return_value = True 417 | self.node.consul.read_lock.return_value = None, None 418 | self.node.consul.client.health.service.return_value = [0, [ 419 | {'Service' : {'ID': 'node1', 'Address': '192.168.1.101'}}, 420 | {'Service' : {'ID': 'node3', 'Address': '192.168.1.103'}} 421 | ]] 422 | 423 | manage.on_change(self.node) 424 | 425 | self.assertEqual(self.node.consul.get_primary.call_count, 2) 426 | self.node.consul.lock_failover.assert_called_once() 427 | self.node.consul.client.health.service.assert_called_once() 428 | self.assertFalse(self.node.consul.unlock_failover.called) 429 | self.node.consul.put.assert_called_once() 430 | self.node.cp.reload.assert_called_once() 431 | self.assertEqual(self.node.cp.state, PRIMARY) 432 | 433 | def test_failover_runs_another_node_is_primary(self): 434 | """ 435 | Given a successful failover where another node is marked primary, 436 | the node will not update its ContainerPilot config 437 | """ 438 | def query_results(*args, **kwargs): 439 | yield () 440 | yield () # and after two hits we've set up replication 441 | yield [{'Master_Server_Id': 'node1', 'Master_Host': '192.168.1.102'}] 442 | 443 | self.node.mysql = 
MySQL(envs=get_environ())
444 | self.node.mysql._conn = mock.MagicMock()
445 | self.node.mysql.query = mock.MagicMock(side_effect=query_results())
446 | self.node.mysql.failover = mock.MagicMock()
447 |
448 | def consul_get_primary_results(*args, **kwargs):
449 | yield UnknownPrimary()
450 | yield UnknownPrimary()
451 | yield ('node1', '192.168.1.102')
452 |
453 | self.node.consul.get_primary.side_effect = consul_get_primary_results()
454 | self.node.consul.lock_failover.return_value = True
455 | self.node.consul.read_lock.return_value = None, None
456 | self.node.consul.client.health.service.return_value = [0, [
457 | {'Service' : {'ID': 'node1', 'Address': '192.168.1.101'}},
458 | {'Service' : {'ID': 'node3', 'Address': '192.168.1.102'}}
459 | ]]
460 |
461 | manage.on_change(self.node)
462 |
463 | self.assertEqual(self.node.consul.get_primary.call_count, 2)
464 | self.node.consul.lock_failover.assert_called_once()
465 | self.node.consul.client.health.service.assert_called_once()
466 | self.assertFalse(self.node.consul.unlock_failover.called)
467 | self.assertFalse(self.node.cp.reload.called)
468 | self.assertEqual(self.node.cp.state, REPLICA)
469 |
470 | def test_failover_fails(self):
471 | """
472 | Given a failed failover, ensure we unlock the failover lock
473 | but exit with an unhandled exception without trying to set
474 | status.
475 | """
476 | self.node.mysql = MySQL(envs=get_environ())
477 | self.node.mysql._conn = mock.MagicMock()
478 | self.node.mysql.query = mock.MagicMock(return_value=())
479 | self.node.mysql.failover = mock.MagicMock(side_effect=Exception('fail'))
480 |
481 | self.node.consul.get_primary.side_effect = UnknownPrimary()
482 | self.node.consul.lock_failover.return_value = True
483 | self.node.consul.read_lock.return_value = None, None
484 | self.node.consul.client.health.service.return_value = [0, [
485 | {'Service' : {'ID': 'node1', 'Address': '192.168.1.101'}},
486 | {'Service' : {'ID': 'node3', 'Address': '192.168.1.102'}}
487 | ]]
488 |
489 | try:
490 | manage.on_change(self.node)
491 | self.fail('Expected unhandled exception but did not.')
492 | except Exception as ex:
493 | self.assertEqual(ex.message, 'fail')
494 |
495 | self.assertEqual(self.node.consul.get_primary.call_count, 2)
496 | self.node.consul.lock_failover.assert_called_once()
497 | self.node.consul.client.health.service.assert_called_once()
498 | self.node.consul.unlock_failover.assert_called_once()
499 | self.assertFalse(self.node.cp.reload.called)
500 | self.assertEqual(self.node.cp.state, UNASSIGNED)
501 |
502 |
503 | def test_failover_locked_this_node_is_primary(self):
504 | """
505 | Given another node is running a failover, wait for that failover.
506 | Given that this node is marked primary, the node will update its
507 | ContainerPilot config as required.
508 | """
509 | def query_results(*args, **kwargs):
510 | yield ()
511 | yield () # and after two hits we've set up replication
512 | yield [{'Master_Server_Id': 'node1', 'Master_Host': '192.168.1.101'}]
513 |
514 | self.node.mysql = MySQL(envs=get_environ())
515 | self.node.mysql._conn = mock.MagicMock()
516 | self.node.mysql.query = mock.MagicMock(side_effect=query_results())
517 | self.node.mysql.failover = mock.MagicMock()
518 |
519 | def consul_get_primary_results(*args, **kwargs):
520 | yield UnknownPrimary()
521 | yield UnknownPrimary()
522 | yield ('node1', '192.168.1.101')
523 |
524 | def lock_sequence(*args, **kwargs):
525 | yield True
526 | yield False
527 |
528 | self.node.consul = Consul(envs=get_environ())
529 | self.node.consul.client = mock.MagicMock()
530 | self.node.consul.put = mock.MagicMock()
531 | self.node.consul.get_primary = mock.MagicMock(
532 | side_effect=consul_get_primary_results())
533 | self.node.consul.lock_failover = mock.MagicMock(return_value=False)
534 | self.node.consul.unlock_failover = mock.MagicMock()
535 | self.node.consul.is_locked = mock.MagicMock(side_effect=lock_sequence())
536 |
537 | with mock.patch('time.sleep'): # cuts 3 sec from test run
538 | manage.on_change(self.node)
539 |
540 | self.assertEqual(self.node.consul.get_primary.call_count, 2)
541 | self.node.consul.lock_failover.assert_called_once()
542 | self.assertFalse(self.node.consul.client.health.service.called)
543 | self.assertFalse(self.node.consul.unlock_failover.called)
544 | self.node.consul.put.assert_called_once()
545 | self.node.cp.reload.assert_called_once()
546 | self.assertEqual(self.node.cp.state, PRIMARY)
547 |
548 |
549 | def test_failover_locked_another_node_is_primary(self):
550 | """
551 | Given another node is running a failover, wait for that failover.
552 | Given that this node is not marked primary, the node will not
553 | update its ContainerPilot config.
554 | """
555 | def query_results(*args, **kwargs):
556 | yield ()
557 | yield () # and after two hits we've set up replication
558 | yield [{'Master_Server_Id': 'node2', 'Master_Host': '192.168.1.102'}]
559 |
560 | self.node.mysql = MySQL(envs=get_environ())
561 | self.node.mysql._conn = mock.MagicMock()
562 | self.node.mysql.query = mock.MagicMock(side_effect=query_results())
563 | self.node.mysql.failover = mock.MagicMock()
564 |
565 | def consul_get_primary_results(*args, **kwargs):
566 | yield UnknownPrimary()
567 | yield UnknownPrimary()
568 | yield ('node2', '192.168.1.102')
569 |
570 | def lock_sequence(*args, **kwargs):
571 | yield True
572 | yield False
573 |
574 | self.node.consul = Consul(envs=get_environ())
575 | self.node.consul.client = mock.MagicMock()
576 | self.node.consul.put = mock.MagicMock()
577 | self.node.consul.get_primary = mock.MagicMock(
578 | side_effect=consul_get_primary_results())
579 | self.node.consul.lock_failover = mock.MagicMock(return_value=False)
580 | self.node.consul.unlock_failover = mock.MagicMock()
581 | self.node.consul.is_locked = mock.MagicMock(side_effect=lock_sequence())
582 |
583 | with mock.patch('time.sleep'): # cuts 3 sec from test run
584 | manage.on_change(self.node)
585 |
586 | self.assertEqual(self.node.consul.get_primary.call_count, 2)
587 | self.node.consul.lock_failover.assert_called_once()
588 | self.assertFalse(self.node.consul.client.health.service.called)
589 | self.assertFalse(self.node.consul.unlock_failover.called)
590 | self.assertFalse(self.node.consul.put.called)
591 | self.assertFalse(self.node.cp.reload.called)
592 | self.assertEqual(self.node.cp.state, REPLICA)
593 |
594 |
595 | class TestSnapshotTask(unittest.TestCase):
596 |
597 | def setUp(self):
598 | logging.getLogger().setLevel(logging.WARN)
599 | consul = mock.MagicMock()
600 | manta = mock.MagicMock()
601 | my = mock.MagicMock()
602 | cp = ContainerPilot()
603 | cp.load(get_environ())
604 | my.datadir = tempfile.mkdtemp()
605 | cp.state = PRIMARY
606 |
607 | self.node = manage.Node(consul=consul, cp=cp, snaps=manta, mysql=my)
608 |
609 | def tearDown(self):
610 | logging.getLogger().setLevel(logging.DEBUG)
611 | try:
612 | os.remove('/tmp/mysql-backup-run')
613 | except OSError:
614 | pass
615 |
616 | def test_not_snapshot_node(self):
617 | """ Don't snapshot if this isn't the snapshot node """
618 | # TODO update when this logic changes
619 | self.node.cp.state = REPLICA
620 | manage.snapshot_task(self.node)
621 | self.assertFalse(self.node.mysql.query.called)
622 |
623 | def test_binlog_stale(self):
624 | """ Snapshot if the binlog is stale even if it's not time to do so """
625 | consul = Consul(envs=get_environ())
626 | binlog_file = 'mysql.002'
627 | now = datetime.utcnow().isoformat()
628 | consul_values = {
629 | LAST_BACKUP_KEY: '{{"id": "xxxx", "dt": "{}"}}'.format(now),
630 | LAST_BINLOG_KEY: 'mysql.001',
631 | }
632 | consul.get = consul_values.__getitem__
633 | self.assertTrue(consul.is_snapshot_stale(binlog_file))
634 |
635 | def test_is_snapshot_stale_invalid(self):
636 | """ Snapshot if the timer has elapsed even if the binlog isn't stale"""
637 | consul = Consul(envs=get_environ())
638 | binlog_file = 'mysql.001'
639 |
640 | consul_values = {
641 | LAST_BACKUP_KEY: '{"id": "xxxx", "dt": "yyyyy"}',
642 | LAST_BINLOG_KEY: 'mysql.001',
643 | }
644 | consul.get = consul_values.__getitem__
645 | try:
646 | self.assertTrue(consul.is_snapshot_stale(binlog_file))
647 | self.fail('Expected ValueError with invalid data in
Consul')
648 | except ValueError:
649 | pass
650 |
651 | # not stale
652 | now = datetime.utcnow().isoformat()
653 | consul_values = {
654 | LAST_BACKUP_KEY: '{{"id": "xxxx", "dt": "{}"}}'.format(now),
655 | LAST_BINLOG_KEY: 'mysql.001',
656 | }
657 | consul.get = consul_values.__getitem__
658 | self.assertFalse(consul.is_snapshot_stale(binlog_file))
659 |
660 | # stale
661 | then = (datetime.utcnow() - timedelta(hours=25)).isoformat()
662 | consul_values = {
663 | LAST_BACKUP_KEY: '{{"id": "xxxx", "dt": "{}"}}'.format(then),
664 | LAST_BINLOG_KEY: 'mysql.001',
665 | }
666 | consul.get = consul_values.__getitem__
667 | self.assertTrue(consul.is_snapshot_stale(binlog_file))
668 |
669 | def test_backup_already_running(self):
670 | """ Don't snapshot if there's already a snapshot running """
671 | self.node.consul = Consul(envs=get_environ())
672 | self.node.consul.client = mock.MagicMock()
673 | self.node.consul.client.session.create.return_value = 'xyzzy'
674 |
675 | with mock.patch('manage.write_snapshot') as ws:
676 | lockfile_name = '/tmp/' + BACKUP_LOCK_KEY
677 | try:
678 | backup_lock = open(lockfile_name, 'w')
679 | fcntl.flock(backup_lock, fcntl.LOCK_EX|fcntl.LOCK_NB)
680 | manage.snapshot_task(self.node)
681 | finally:
682 | fcntl.flock(backup_lock, fcntl.LOCK_UN)
683 | backup_lock.close()
684 | self.assertFalse(ws.called)
685 |
686 | def test_backup_unlocked(self):
687 | """
688 | Make sure that we unlock correctly after a snapshot has run.
689 | """
690 | self.node.consul = Consul(envs=get_environ())
691 | self.node.consul.client = mock.MagicMock()
692 | self.node.consul.client.session.create.return_value = 'xyzzy'
693 | with mock.patch('manage.write_snapshot') as ws:
694 | lockfile_name = '/tmp/' + BACKUP_LOCK_KEY
695 | try:
696 | backup_lock = open(lockfile_name, 'w')
697 | fcntl.flock(backup_lock, fcntl.LOCK_EX|fcntl.LOCK_NB)
698 | manage.snapshot_task(self.node)
699 | finally:
700 | fcntl.flock(backup_lock, fcntl.LOCK_UN)
701 | backup_lock.close()
702 | manage.snapshot_task(self.node)
703 | self.assertTrue(ws.called)
704 |
705 |
706 | class TestMySQL(unittest.TestCase):
707 |
708 | def setUp(self):
709 | logging.getLogger().setLevel(logging.WARN)
710 | self.environ = get_environ()
711 | self.my = MySQL(self.environ)
712 | self.my._conn = mock.MagicMock()
713 |
714 | def tearDown(self):
715 | logging.getLogger().setLevel(logging.DEBUG)
716 |
717 | def test_parse(self):
718 | self.assertEqual(self.my.mysql_db, 'test_mydb')
719 | self.assertEqual(self.my.mysql_user, 'test_me')
720 | self.assertEqual(self.my.mysql_password, 'test_pass')
721 | self.assertEqual(self.my.mysql_root_password, 'test_root_pass')
722 | self.assertEqual(self.my.mysql_random_root_password, True)
723 | self.assertEqual(self.my.mysql_onetime_password, True)
724 | self.assertEqual(self.my.repl_user, 'test_repl_user')
725 | self.assertEqual(self.my.repl_password, 'test_repl_pass')
726 | self.assertEqual(self.my.datadir, '/var/lib/mysql')
727 | self.assertEqual(self.my.pool_size, 100)
728 | self.assertIsNotNone(self.my.ip)
729 |
730 | def test_query_buffer_execute_should_flush(self):
731 | self.my.add('query 1', ())
732 | self.assertEqual(len(self.my._query_buffer.items()), 1)
733 | self.assertEqual(len(self.my._conn.mock_calls), 0)
734 | self.my.execute('query 2', ())
735 | self.assertEqual(len(self.my._query_buffer.items()), 0)
736 | exec_calls = [
737 | mock.call.cursor().execute('query 1', params=()),
738 | mock.call.commit(),
739 | mock.call.cursor().fetchall(),
740 | mock.call.cursor().execute('query 2',
params=()), 741 | mock.call.commit(), 742 | mock.call.cursor().fetchall(), 743 | mock.call.cursor().close() 744 | ] 745 | self.assertEqual(self.my._conn.mock_calls[2:], exec_calls) 746 | 747 | def test_query_buffer_execute_many_should_flush(self): 748 | self.my.add('query 3', ()) 749 | self.my.add('query 4', ()) 750 | self.my.add('query 5', ()) 751 | self.my.execute_many() 752 | self.assertEqual(len(self.my._query_buffer.items()), 0) 753 | exec_many_calls = [ 754 | mock.call.cursor().execute('query 3', params=()), 755 | mock.call.commit(), 756 | mock.call.cursor().fetchall(), 757 | mock.call.cursor().execute('query 4', params=()), 758 | mock.call.commit(), 759 | mock.call.cursor().fetchall(), 760 | mock.call.cursor().execute('query 5', params=()), 761 | mock.call.commit(), 762 | mock.call.cursor().fetchall(), 763 | mock.call.cursor().close() 764 | ] 765 | self.assertEqual(self.my._conn.mock_calls[2:], exec_many_calls) 766 | 767 | def test_query_buffer_query_should_flush(self): 768 | self.my.query('query 6', ()) 769 | self.assertEqual(len(self.my._query_buffer.items()), 0) 770 | query_calls = [ 771 | mock.call.cursor().execute('query 6', params=()), 772 | mock.call.cursor().fetchall(), 773 | mock.call.cursor().close() 774 | ] 775 | self.assertEqual(self.my._conn.mock_calls[2:], query_calls) 776 | 777 | def test_expected_setup_statements(self): 778 | conn = mock.MagicMock() 779 | self.my.setup_root_user(conn) 780 | self.my.create_db(conn) 781 | self.my.create_default_user(conn) 782 | self.my.create_repl_user(conn) 783 | self.my.expire_root_password(conn) 784 | self.assertEqual(len(self.my._conn.mock_calls), 0) # use param, not attr 785 | statements = [args[0] for (name, args, _) 786 | in conn.mock_calls if name == 'cursor().execute'] 787 | expected = [ 788 | 'SET @@SESSION.SQL_LOG_BIN=0;', 789 | "DELETE FROM `mysql`.`user` where user != 'mysql.sys';", 790 | 'CREATE USER `root`@`%` IDENTIFIED BY %s ;', 791 | 'GRANT ALL ON *.* TO `root`@`%` WITH GRANT OPTION ;', 792 | 'DROP DATABASE IF EXISTS test ;', 793 | 'FLUSH PRIVILEGES ;', 794 | 'CREATE DATABASE IF NOT EXISTS `test_mydb`;', 795 | 'CREATE USER `test_me`@`%` IDENTIFIED BY %s;', 796 | 'GRANT ALL ON `test_mydb`.* TO `test_me`@`%`;', 797 | 'FLUSH PRIVILEGES;', 798 | 'CREATE USER `test_repl_user`@`%` IDENTIFIED BY %s; ', 799 | ('GRANT SUPER, SELECT, INSERT, REPLICATION SLAVE, RELOAD,' 800 | ' LOCK TABLES, GRANT OPTION, REPLICATION CLIENT, RELOAD,' 801 | ' DROP, CREATE ON *.* TO `test_repl_user`@`%`; '), 802 | 'FLUSH PRIVILEGES;', 803 | 'ALTER USER `root`@`%` PASSWORD EXPIRE'] 804 | self.assertEqual(statements, expected) 805 | 806 | 807 | class TestConsul(unittest.TestCase): 808 | 809 | def setUp(self): 810 | self.environ = get_environ() 811 | 812 | def test_parse_with_consul_agent(self): 813 | self.environ['CONSUL_AGENT'] = '1' 814 | consul = Consul(self.environ) 815 | self.assertEqual(consul.host, 'localhost') 816 | 817 | def test_parse_without_consul_agent(self): 818 | self.environ['CONSUL_AGENT'] = '0' 819 | consul = Consul(self.environ) 820 | self.assertEqual(consul.host, 'my.consul.example.com') 821 | 822 | self.environ['CONSUL_AGENT'] = '' 823 | consul = Consul(self.environ) 824 | self.assertEqual(consul.host, 'my.consul.example.com') 825 | 826 | 827 | 828 | class TestContainerPilotConfig(unittest.TestCase): 829 | 830 | def setUp(self): 831 | logging.getLogger().setLevel(logging.WARN) 832 | self.environ = get_environ() 833 | 834 | def tearDown(self): 835 | logging.getLogger().setLevel(logging.DEBUG) 836 | 837 | def 
test_parse_with_consul_agent(self):
838 | self.environ['CONSUL_AGENT'] = '1'
839 | cp = ContainerPilot()
840 | cp.load(envs=self.environ)
841 |
842 | self.assertEqual(cp.config['consul'], 'localhost:8500')
843 | health_check_exec = cp.config['jobs'][4]['health']['exec']
844 | self.assertIn('my.consul.example.com', health_check_exec)
845 | self.assertEqual(cp.state, UNASSIGNED)
846 |
847 | def test_parse_without_consul_agent(self):
848 | self.environ['CONSUL_AGENT'] = ''
849 | cp = ContainerPilot()
850 | cp.load(envs=self.environ)
851 | self.assertEqual(cp.config['consul'], 'my.consul.example.com:8500')
852 | self.assertFalse('consul-agent' in
853 | [job['name'] for job in cp.config['jobs']])
854 | self.assertEqual(cp.state, UNASSIGNED)
855 |
856 | def test_update(self):
857 | self.environ['CONSUL_AGENT'] = '1'
858 | cp = ContainerPilot()
859 | cp.state = REPLICA
860 | cp.load(envs=self.environ)
861 | temp_file = tempfile.NamedTemporaryFile()
862 | cp.path = temp_file.name
863 |
864 | # no update expected
865 | cp.update()
866 | with open(temp_file.name, 'r') as updated:
867 | self.assertEqual(updated.read(), '')
868 |
869 | # force an update
870 | cp.state = PRIMARY
871 | cp.update()
872 | with open(temp_file.name, 'r') as updated:
873 | config = json5.loads(updated.read())
874 | self.assertEqual(config['consul'], 'localhost:8500')
875 | health_check_exec = config['jobs'][4]['health']['exec']
876 | self.assertIn('my.consul.example.com', health_check_exec)
877 |
878 |
879 | class TestMantaConfig(unittest.TestCase):
880 |
881 | def setUp(self):
882 | self.environ = get_environ()
883 |
884 | def test_parse(self):
885 | manta = Manta(self.environ)
886 | self.assertEqual(manta.account, 'test_manta_account')
887 | self.assertEqual(manta.user, 'test_manta_subuser')
888 | self.assertEqual(manta.role, 'test_manta_role')
889 | self.assertEqual(manta.bucket, '/test_manta_account/stor')
890 | self.assertEqual(manta.url, 'https://us-east.manta.joyent.com')
891 | self.assertEqual(
892 | manta.private_key,
893 | ('-----BEGIN RSA PRIVATE KEY-----\n'
894 | 'MIIEowIBAAKCAQEAvvljJQt2V3jJoM1SC9FiaBaw5AjVR40v5wKCVaONSz+FWm\n'
895 | 'pc91hUJHQClaxXDlf1p5kf3Oqu5qjM6w8oD7uPkzj++qPnCkzt+JGPfUBxpzul\n'
896 | '80J0GLHpqQ2YUBXfJ6pCb0g7z/hkdsSwJt7DS+keWCtWpVYswj2Ln8CwNlZlye\n'
897 | 'qAmNE2ePZg8AzfpFmDROljU3GHhKaAviiLyxOklbwSbySbTmdNLHHxu22+ciW9\n'
898 | '-----END RSA PRIVATE KEY-----'))
899 | self.assertEqual(manta.key_id,
900 | '49:d5:1f:09:5e:46:92:14:c0:46:8e:48:33:75:10:bc')
901 |
902 |
903 | class TestUtilsEnvironment(unittest.TestCase):
904 |
905 | def test_to_flag(self):
906 | self.assertEqual(to_flag('yes'), True)
907 | self.assertEqual(to_flag('Y'), True)
908 | self.assertEqual(to_flag('no'), False)
909 | self.assertEqual(to_flag('N'), False)
910 | self.assertEqual(to_flag('1'), True)
911 | self.assertEqual(to_flag('xxxxx'), True)
912 | self.assertEqual(to_flag('0'), False)
914 | self.assertEqual(to_flag(1), True)
915 | self.assertEqual(to_flag(0), False)
916 |
917 | def test_env_parse(self):
918 |
919 | os.environ['TestUtilsEnvironment'] = 'PASS'
920 | environ = {
921 | 'A': '$TestUtilsEnvironment',
922 | 'B': 'PASS ',
923 | 'C': 'PASS # SOME COMMENT'
924 | }
925 | self.assertEqual(env('A', '', environ), 'PASS')
926 | self.assertEqual(env('B', '', environ), 'PASS')
927 | self.assertEqual(env('C', '', environ), 'PASS')
928 | self.assertEqual(env('D', 'PASS', environ), 'PASS')
929 |
930 |
931 | TEST_ENVIRON = {
932 | 'CONSUL': 'my.consul.example.com',
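# Editorial note (illustrative): the fixture keys in this dict mirror the
# _env variables parsed in the tests above: CONSUL and CONSUL_AGENT by the
# Consul and ContainerPilot classes, the MYSQL_* keys by MySQL, and the
# MANTA_* keys by Manta.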
933 | 'CONSUL_AGENT': '1',
934 |
935 | 'CONTAINERPILOT': '/etc/containerpilot.json5',
936 |
937 | 'MYSQL_DATABASE': 'test_mydb',
938 | 'MYSQL_USER': 'test_me',
939 | 'MYSQL_PASSWORD': 'test_pass',
940 | 'MYSQL_ROOT_PASSWORD': 'test_root_pass',
941 | 'MYSQL_RANDOM_ROOT_PASSWORD': 'Y',
942 | 'MYSQL_ONETIME_PASSWORD': '1',
943 | 'MYSQL_REPL_USER': 'test_repl_user',
944 | 'MYSQL_REPL_PASSWORD': 'test_repl_pass',
945 | 'INNODB_BUFFER_POOL_SIZE': '100',
946 |
947 | 'MANTA_USER': 'test_manta_account',
948 | 'MANTA_SUBUSER': 'test_manta_subuser',
949 | 'MANTA_ROLE': 'test_manta_role',
950 | 'MANTA_KEY_ID': '49:d5:1f:09:5e:46:92:14:c0:46:8e:48:33:75:10:bc',
951 | 'MANTA_PRIVATE_KEY': (
952 | '-----BEGIN RSA PRIVATE KEY-----#'
953 | 'MIIEowIBAAKCAQEAvvljJQt2V3jJoM1SC9FiaBaw5AjVR40v5wKCVaONSz+FWm#'
954 | 'pc91hUJHQClaxXDlf1p5kf3Oqu5qjM6w8oD7uPkzj++qPnCkzt+JGPfUBxpzul#'
955 | '80J0GLHpqQ2YUBXfJ6pCb0g7z/hkdsSwJt7DS+keWCtWpVYswj2Ln8CwNlZlye#'
956 | 'qAmNE2ePZg8AzfpFmDROljU3GHhKaAviiLyxOklbwSbySbTmdNLHHxu22+ciW9#'
957 | '-----END RSA PRIVATE KEY-----')
958 | }
959 |
960 | def get_environ():
961 | return TEST_ENVIRON.copy()
962 |
963 |
964 | if __name__ == '__main__':
965 | unittest.main()
966 |
--------------------------------------------------------------------------------
/etc/containerpilot.json5:
--------------------------------------------------------------------------------
1 | {
2 | consul: '{{ if .CONSUL_AGENT }}localhost{{ else }}{{ .CONSUL | default "consul"}}{{ end }}:8500',
3 | logging: {
4 | level: '{{ .LOG_LEVEL | default "INFO" }}'
5 | },
6 | jobs: [
7 | {
8 | name: "preStart",
9 | exec: "python /usr/local/bin/manage.py",
10 | {{ if .CONSUL_AGENT }}when: {
11 | source: "consul-agent",
12 | once: "healthy"
13 | }{{ end }}
14 | },
15 | {
16 | name: '{{ .SERVICE_NAME | default "mysql" }}',
17 | exec: [
18 | "mysqld",
19 | "--console",
20 | "--log-bin=mysql-bin",
21 | "--log_slave_updates=ON",
22 | "--gtid-mode=ON",
23 | "--enforce-gtid-consistency=ON"
24 | ],
25 | port: 3306,
26 | when: {
27 | source: "preStart",
28 | once: "exitSuccess"
29 | },
30 | health: {
31 | exec: "python /usr/local/bin/manage.py health",
32 | interval: 5,
33 | ttl: 25
34 | }
35 | },
36 | {
37 | name: "onChange",
38 | exec: "python /usr/local/bin/manage.py on_change",
39 | when: {
40 | source: 'watch.{{ .SERVICE_NAME | default "mysql" }}-primary',
41 | each: "changed"
42 | }
43 | },
44 | {
45 | name: "snapshot-check",
46 | exec: "python /usr/local/bin/manage.py snapshot_task",
47 | timeout: "10m",
48 | when: {
49 | interval: "5m"
50 | },
51 | },
52 | {{ if .CONSUL_AGENT }}{
53 | name: "consul-agent",
54 | restarts: "unlimited",
55 | exec: [
56 | "/usr/local/bin/consul", "agent",
57 | "-data-dir=/data",
58 | "-config-dir=/config"
59 | ],
60 | health: {
61 | exec: 'consul join {{ .CONSUL | default "consul"}}',
62 | interval: 5,
63 | ttl: 10
64 | }
65 | }{{ end }}
66 | ],
67 | watches: [
68 | {
69 | name: '{{ .SERVICE_NAME | default "mysql" }}-primary',
70 | interval: 10
71 | }
72 | ]
73 | }
74 |
--------------------------------------------------------------------------------
/etc/my.cnf.tmpl:
--------------------------------------------------------------------------------
1 | # For advice on how to change settings please see
2 | # http://dev.mysql.com/doc/refman/5.7/en/server-configuration-defaults.html
3 |
4 | [mysqld]
5 |
6 | # The ContainerPilot `preStart` job will overwrite this with the value of
7 | # MYSQL_INNODB_BUFFER_POOL_SIZE so that we can set it relative to the
8 | # container size. Set to the amount of RAM for the most important data
9 | # cache in MySQL. Start at 70% of total RAM for a dedicated server, else 10%.
10 |
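# Editorial example (assumes the preStart handler substitutes ${buffer} from
# the INNODB_BUFFER_POOL_SIZE variable): a container started with
# INNODB_BUFFER_POOL_SIZE=100 would have the line below rendered as
# innodb_buffer_pool_size = 100M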
11 | innodb_buffer_pool_size = ${buffer}M
12 |
13 | # These GTID options are passed on the mysqld command line by ContainerPilot
14 | # (see etc/containerpilot.json5); remove the leading # to set them here instead.
15 |
16 | #gtid-mode=ON
17 | #enforce-gtid-consistency=ON
18 |
19 | # Required for MySQL replication. The ContainerPilot `preStart` job will
20 | # overwrite report-host with the container hostname and server-id with a
21 | # numerical ID derived from the container hostname.
22 | server-id=$server_id
23 | report-host=$hostname
24 |
25 | # We have intentionally not set log_bin here so that we're forced to pass
26 | # it in on the command line. This lets us bootstrap the DB without
27 | # causing binary log entries that will prevent replicas from coming online
28 | # due to conflicts over the system `mysql` database.
29 | #log_bin=mysql-bin
30 | #log_slave_updates=ON
31 |
32 | # Remove leading # to set options mainly useful for reporting servers.
33 | # The server defaults are faster for transactions and fast SELECTs.
34 | # Adjust sizes as needed, experiment to find the optimal values.
35 | # join_buffer_size = 128M
36 | # sort_buffer_size = 2M
37 | # read_rnd_buffer_size = 2M
38 | # query_cache_size = 32M
39 | # query_cache_type = ON
40 |
41 | tmp_table_size = 128M
42 | max_heap_table_size = 128M
43 |
44 |
45 | skip-host-cache
46 | #skip-name-resolve
47 | datadir=/var/lib/mysql
48 | #socket=/var/lib/mysql/mysql.sock
49 | secure-file-priv=/var/lib/mysql
50 | user=mysql
51 |
52 | # Disabling symbolic-links is recommended to prevent assorted security risks
53 | symbolic-links=0
54 |
55 | # Disable log-error so that MySQL sends its errors to stderr and therefore
56 | # to the Docker log driver
57 | # log-error=/var/log/mysqld.log
58 |
59 | pid-file=/var/run/mysqld/mysqld.pid
60 |
--------------------------------------------------------------------------------
/examples/compose/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '2.1'
2 |
3 | services:
4 | mysql:
5 | image: autopilotpattern/mysql:${TAG:-latest}
6 | mem_limit: 512m
7 | restart: always
8 | expose:
9 | - 3306
10 | network_mode: bridge
11 | environment:
12 | - MYSQL_USER=dbuser
13 | - MYSQL_PASSWORD=seekretPassword
14 | - MYSQL_REPL_USER=repluser
15 | - MYSQL_REPL_PASSWORD=seekretReplPassword
16 | - MYSQL_DATABASE=demodb
17 | - BACKUP_TTL=120
18 | - LOG_LEVEL=DEBUG
19 | - CONSUL=consul
20 | - SNAPSHOT_BACKEND=minio
21 | - MINIO_ACCESS_KEY=supersecretaccesskey
22 | - MINIO_SECRET_KEY=supersecretsecretkey
23 | volumes:
24 | # shared storage location for snapshots
25 | - ${WORK_DIR:-../..}/tmp:/tmp/snapshots
26 | links:
27 | - consul:consul
28 | - minio:minio
29 |
30 | consul:
31 | image: consul:0.8.4
32 | command: >
33 | agent -server -client=0.0.0.0 -dev -ui
34 | restart: always
35 | mem_limit: 128m
36 | ports:
37 | - 8500:8500
38 | expose:
39 | - 53
40 | - 8300
41 | - 8301
42 | - 8302
43 | - 8400
44 | - 8500
45 | network_mode: bridge
46 | dns:
47 | - 127.0.0.1
48 |
49 | minio:
50 | image: minio/minio
51 | command: server /export
52 | restart: always
53 | expose:
54 | - 9000
55 | network_mode: bridge
56 | environment:
57 | - MINIO_ACCESS_KEY=supersecretaccesskey
58 | - MINIO_SECRET_KEY=supersecretsecretkey
59 |
--------------------------------------------------------------------------------
/examples/triton/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '2.1'
2 |
3 | services:
4 | mysql:
5 | image: autopilotpattern/mysql:${TAG:-latest}
6 | mem_limit: 4g
7 | restart: always
8 | # expose for linking, but each container gets a private IP for
9 | # internal use as well
10 | expose:
11 | - 3306
12 | labels:
13 | - triton.cns.services=mysql
14 | env_file: _env
15 | network_mode: bridge
16 | environment:
17 | - CONSUL_AGENT=1
18 | - LOG_LEVEL=DEBUG
19 | - SERVICE_NAME=mysql
20 | - CONSUL=mc.svc.${TRITON_CNS_SEARCH_DOMAIN_PRIVATE}
21 |
22 | consul:
23 | image: consul:0.8.4
24 | command: >
25 | agent -server -client=0.0.0.0 -bootstrap -ui
26 | restart: always
27 | mem_limit: 128m
28 | ports:
29 | - 8500
30 | expose:
31 | - 53
32 | - 8300
33 | - 8301
34 | - 8302
35 | - 8400
36 | - 8500
37 | network_mode: bridge
38 | dns:
39 | - 127.0.0.1
40 | labels:
41 | - triton.cns.services=mc
42 |
--------------------------------------------------------------------------------
/examples/triton/setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e -o pipefail
3 |
4 | help() {
5 | echo
6 | echo 'Usage: ./setup.sh ~/path/to/MANTA_PRIVATE_KEY'
7 | echo
8 | echo 'Checks that your Triton and Docker environment is sane and configures'
9 | echo 'an environment file to use.'
10 | echo
11 | echo 'MANTA_PRIVATE_KEY is the filesystem path to an SSH private key'
12 | echo 'used to connect to Manta for the database backups.'
13 | echo
14 | echo 'Additional details must be configured in the _env file, but this script will properly'
15 | echo 'encode the SSH key details for use with this MySQL image.'
16 | echo
17 | }
18 |
19 |
20 | # populated by the `envcheck` function whenever we're using Triton
21 | TRITON_USER=
22 | TRITON_DC=
23 |
24 | # ---------------------------------------------------
25 | # Top-level commands
26 |
27 | # Check for correct configuration and setup _env file
28 | envcheck() {
29 |
30 | if [ -z "$1" ]; then
31 | tput rev # reverse
32 | tput bold # bold
33 | echo 'Please provide a path to an SSH private key to access Manta.'
34 | tput sgr0 # clear
35 |
36 | help
37 | exit 1
38 | fi
39 |
40 | if [ ! -f "$1" ]; then
41 | tput rev # reverse
42 | tput bold # bold
43 | echo 'SSH private key for Manta is unreadable.'
44 | tput sgr0 # clear
45 |
46 | help
47 | exit 1
48 | fi
49 |
50 | # Assign args to named vars
51 | MANTA_PRIVATE_KEY_PATH=$1
52 |
53 | command -v docker >/dev/null 2>&1 || {
54 | echo
55 | tput rev # reverse
56 | tput bold # bold
57 | echo 'Docker is required, but does not appear to be installed.'
58 | tput sgr0 # clear
59 | echo 'See https://docs.joyent.com/public-cloud/api-access/docker'
60 | exit 1
61 | }
62 | command -v json >/dev/null 2>&1 || {
63 | echo
64 | tput rev # reverse
65 | tput bold # bold
66 | echo 'Error! JSON CLI tool is required, but does not appear to be installed.'
67 | tput sgr0 # clear
68 | echo 'See https://apidocs.joyent.com/cloudapi/#getting-started'
69 | exit 1
70 | }
71 |
72 | command -v triton >/dev/null 2>&1 || {
73 | echo
74 | tput rev # reverse
75 | tput bold # bold
76 | echo 'Error! Joyent Triton CLI is required, but does not appear to be installed.'
77 | tput sgr0 # clear 78 | echo 'See https://www.joyent.com/blog/introducing-the-triton-command-line-tool' 79 | exit 1 80 | } 81 | 82 | # make sure Docker client is pointed to the same place as the Triton client 83 | local docker_user=$(docker info 2>&1 | awk -F": " '/SDCAccount:/{print $2}') 84 | local docker_dc=$(echo $DOCKER_HOST | awk -F"/" '{print $3}' | awk -F'.' '{print $1}') 85 | TRITON_USER=$(triton profile get | awk -F": " '/account:/{print $2}') 86 | TRITON_DC=$(triton profile get | awk -F"/" '/url:/{print $3}' | awk -F'.' '{print $1}') 87 | if [ ! "$docker_user" = "$TRITON_USER" ] || [ ! "$docker_dc" = "$TRITON_DC" ]; then 88 | echo 89 | tput rev # reverse 90 | tput bold # bold 91 | echo 'Error! The Triton CLI configuration does not match the Docker CLI configuration.' 92 | tput sgr0 # clear 93 | echo 94 | echo "Docker user: ${docker_user}" 95 | echo "Triton user: ${TRITON_USER}" 96 | echo "Docker data center: ${docker_dc}" 97 | echo "Triton data center: ${TRITON_DC}" 98 | exit 1 99 | fi 100 | 101 | local triton_cns_enabled=$(triton account get | awk -F": " '/cns/{print $2}') 102 | if [ ! "true" == "$triton_cns_enabled" ]; then 103 | echo 104 | tput rev # reverse 105 | tput bold # bold 106 | echo 'Error! Triton CNS is required and not enabled.' 107 | tput sgr0 # clear 108 | echo 109 | exit 1 110 | fi 111 | 112 | # setup environment file 113 | if [ ! -f "_env" ]; then 114 | echo '# Environment variables for MySQL service' > _env 115 | echo 'MYSQL_USER=dbuser' >> _env 116 | echo 'MYSQL_PASSWORD='$(cat /dev/urandom | LC_ALL=C tr -dc 'A-Za-z0-9' | head -c 7) >> _env 117 | echo 'MYSQL_REPL_USER=repluser' >> _env 118 | echo 'MYSQL_REPL_PASSWORD='$(cat /dev/urandom | LC_ALL=C tr -dc 'A-Za-z0-9' | head -c 7) >> _env 119 | echo 'MYSQL_DATABASE=demodb' >> _env 120 | echo >> _env 121 | 122 | echo '# Environment variables for backups to Manta' >> _env 123 | echo 'MANTA_URL=https://us-east.manta.joyent.com' >> _env 124 | echo 'MANTA_BUCKET= # an existing Manta bucket' >> _env 125 | echo 'MANTA_USER= # a user with access to that bucket' >> _env 126 | echo 'MANTA_SUBUSER=' >> _env 127 | echo 'MANTA_ROLE=' >> _env 128 | 129 | # MANTA_KEY_ID must be the md5 formatted key fingerprint. A SHA256 will result in errors. 130 | set +o pipefail 131 | # The -E option was added to ssh-keygen recently; if it doesn't work, then 132 | # assume we're using an older version of ssh-keygen that only outputs MD5 fingerprints 133 | ssh-keygen -yl -E md5 -f ${MANTA_PRIVATE_KEY_PATH} > /dev/null 2>&1 134 | if [ $? 
-eq 0 ]; then 135 | echo MANTA_KEY_ID=$(ssh-keygen -yl -E md5 -f ${MANTA_PRIVATE_KEY_PATH} | awk '{print substr($2,5)}') >> _env 136 | else 137 | echo MANTA_KEY_ID=$(ssh-keygen -yl -f ${MANTA_PRIVATE_KEY_PATH} | awk '{print $2}') >> _env 138 | fi 139 | set -o pipefail 140 | 141 | # munge the private key so that we can pass it into an env var sanely 142 | # and then unmunge it in our startup script 143 | echo MANTA_PRIVATE_KEY=$(cat ${MANTA_PRIVATE_KEY_PATH} | tr '\n' '#') >> _env 144 | echo >> _env 145 | 146 | echo 'Edit the _env file with your desired MYSQL_* and MANTA_* config' 147 | else 148 | echo 'Existing _env file found, exiting' 149 | exit 150 | fi 151 | } 152 | 153 | get_root_password() { 154 | echo $(docker logs ${COMPOSE_PROJECT_NAME:-mysql}_mysql_1 2>&1 | \ 155 | awk '/Generated root password/{print $NF}' | \ 156 | awk '{$1=$1};1' 157 | ) | pbcopy 158 | } 159 | 160 | 161 | 162 | # --------------------------------------------------- 163 | # parse arguments 164 | 165 | # Get function list 166 | funcs=($(declare -F -p | cut -d " " -f 3)) 167 | 168 | until 169 | if [ ! -z "$1" ]; then 170 | # check if the first arg is a function in this file, or use a default 171 | if [[ " ${funcs[@]} " =~ " $1 " ]]; then 172 | cmd=$1 173 | shift 1 174 | else 175 | cmd="envcheck" 176 | fi 177 | 178 | $cmd "$@" 179 | if [ $? == 127 ]; then 180 | help 181 | fi 182 | 183 | exit 184 | else 185 | help 186 | fi 187 | do 188 | echo 189 | done 190 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | # Makefile for shipping the container image and setting up 2 | # permissions in Manta. Building with the docker-compose file 3 | # directly works just fine without this. 4 | 5 | MAKEFLAGS += --warn-undefined-variables 6 | .DEFAULT_GOAL := build 7 | .PHONY: * 8 | 9 | # we get these from CI environment if available, otherwise from git 10 | GIT_COMMIT ?= $(shell git rev-parse --short HEAD) 11 | GIT_BRANCH ?= $(shell git rev-parse --abbrev-ref HEAD) 12 | 13 | namespace ?= autopilotpattern 14 | tag := branch-$(shell basename $(GIT_BRANCH)) 15 | image := $(namespace)/mysql 16 | testImage := $(namespace)/mysql-testrunner 17 | 18 | ## Display this help message 19 | help: 20 | @awk '/^##.*$$/,/[a-zA-Z_-]+:/' $(MAKEFILE_LIST) | awk '!(NR%2){print $$0p}{p=$$0}' | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}' | sort 21 | 22 | # ------------------------------------------------ 23 | # Target environment configuration 24 | 25 | # if you pass `TRACE=1` into the call to `make` then the Python tests will 26 | # run under the `trace` module (provides detailed call logging) 27 | ifndef TRACE 28 | python := python 29 | else 30 | python := python -m trace 31 | endif 32 | 33 | # ------------------------------------------------ 34 | # Container builds 35 | 36 | ## Builds the application container image locally 37 | build: build/tester 38 | docker build -t=$(image):$(tag) . 39 | 40 | ## Build the test running container 41 | build/tester: 42 | docker build -f tests/Dockerfile -t=$(testImage):$(tag) . 
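# Example workflow (illustrative; assumes push rights to the $(namespace)
# org on the Docker Hub):
#   make build       # build the app and test-runner images tagged branch-<name>
#   make test/unit   # run the Python unit tests inside the app image
#   make push        # push both images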
43 |
44 | ## Push the current application container images to the Docker Hub
45 | push:
46 | docker push $(image):$(tag)
47 | docker push $(testImage):$(tag)
48 |
49 | ## Tag the current images as 'latest' and push them to the Docker Hub
50 | ship:
51 | docker tag $(image):$(tag) $(image):latest
52 | docker tag $(testImage):$(tag) $(testImage):latest
53 | docker push $(testImage):latest
54 | docker push $(image):$(tag)
55 | docker push $(image):latest
56 |
57 |
58 | # ------------------------------------------------
59 | # Run the example stack
60 |
61 | ## Run the stack under local Compose
62 | run/compose:
63 | cd examples/compose && TAG=$(tag) docker-compose -p my up -d
64 | cd examples/compose && TAG=$(tag) docker-compose -p my logs -f mysql
65 |
66 | ## Scale up the local Compose stack
67 | run/compose/scale:
68 | cd examples/compose && TAG=$(tag) docker-compose -p my scale mysql=2
69 |
70 | # ------------------------------------------------
71 | # Test running
72 |
73 | ## Pull the container images from the Docker Hub
74 | pull:
75 | docker pull $(image):$(tag)
76 |
77 | ## Run all tests
78 | test: test/unit test/triton # test/compose
79 |
80 | ## Run the unit tests inside the mysql container
81 | test/unit:
82 | docker run --rm -w /usr/local/bin \
83 | -e LOG_LEVEL=DEBUG \
84 | $(image):$(tag) \
85 | $(python) test.py
86 |
87 | ## Run the unit tests with source mounted to the container for local dev
88 | test/unit-src:
89 | docker run --rm -w /usr/local/bin \
90 | -v $(shell pwd)/bin/manager:/usr/local/bin/manager \
91 | -v $(shell pwd)/bin/manage.py:/usr/local/bin/manage.py \
92 | -v $(shell pwd)/bin/test.py:/usr/local/bin/test.py \
93 | -e LOG_LEVEL=DEBUG \
94 | $(image):$(tag) \
95 | $(python) test.py
96 |
97 | ## Run the integration test runner against Compose locally.
98 | test/compose:
99 | docker run --rm \
100 | -e TAG=$(tag) \
101 | -e GIT_BRANCH=$(GIT_BRANCH) \
102 | -e WORK_DIR=/src \
103 | --network=bridge \
104 | -v /var/run/docker.sock:/var/run/docker.sock \
105 | -v $(shell pwd)/tests/compose.sh:/src/compose.sh \
106 | -w /src \
107 | $(testImage):$(tag) /src/compose.sh
108 |
109 |
110 | test/shell:
111 | docker run --rm -it \
112 | -e TAG=$(tag) \
113 | -e GIT_BRANCH=$(GIT_BRANCH) \
114 | -e WORK_DIR=/src \
115 | --network=bridge \
116 | -v /var/run/docker.sock:/var/run/docker.sock \
117 | -v $(shell pwd)/tests/compose.sh:/src/compose.sh \
118 | -w /src \
119 | $(testImage):$(tag) /bin/bash
120 |
121 | ## Run the integration test runner. Runs locally but targets Triton.
122 | test/triton:
123 | $(call check_var, TRITON_PROFILE MANTA_USER MANTA_KEY_ID, \
124 | required to run integration tests on Triton.)
125 | docker run --rm \
126 | -e TAG=$(tag) \
127 | -e TRITON_PROFILE=$(TRITON_PROFILE) \
128 | -e MANTA_USER=$(MANTA_USER) \
129 | -e MANTA_KEY_ID=$(MANTA_KEY_ID) \
130 | -e GIT_BRANCH=$(GIT_BRANCH) \
131 | -v ~/.ssh:/root/.ssh:ro \
132 | -v ~/.triton/profiles.d:/root/.triton/profiles.d:ro \
133 | -w /src \
134 | $(testImage):$(tag) /src/triton.sh
135 |
136 | # -------------------------------------------------------
137 |
138 | ## Tear down all project containers
139 | teardown:
140 | docker-compose -p my stop
141 | docker-compose -p my rm -f
142 |
143 | ## Dump logs for each container to local disk
144 | logs:
145 | docker logs my_consul_1 > consul1.log 2>&1
146 | docker logs my_mysql_1 > mysql1.log 2>&1
147 | docker logs my_mysql_2 > mysql2.log 2>&1
148 | docker logs my_mysql_3 > mysql3.log 2>&1
149 |
150 | # -------------------------------------------------------
151 |
152 | MANTA_URL ?= https://us-east.manta.joyent.com
153 | MANTA_USER ?= triton_mysql
154 | MANTA_SUBUSER ?= triton_mysql
155 | MANTA_LOGIN ?= triton_mysql
156 | MANTA_ROLE ?= triton_mysql
157 | MANTA_POLICY ?= triton_mysql
158 |
159 | ## Create user and policies for Manta backups
160 | manta:
161 | # you need to have your SDC_ACCOUNT set
162 | # usage:
163 | # make manta EMAIL=example@example.com PASSWORD=strongpassword
164 | $(call check_var, EMAIL PASSWORD SDC_ACCOUNT, \
165 | Required to create a Manta login.)
166 |
167 | ssh-keygen -t rsa -b 4096 -C "${EMAIL}" -f manta
168 | sdc-user create --login=${MANTA_LOGIN} --password=${PASSWORD} --email=${EMAIL}
169 | sdc-user upload-key $$(ssh-keygen -E md5 -lf ./manta | awk -F' ' '{gsub("MD5:","");{print $$2}}') --name=${MANTA_LOGIN}-key ${MANTA_LOGIN} ./manta.pub
170 | sdc-policy create --name=${MANTA_POLICY} \
171 | --rules='CAN getobject' \
172 | --rules='CAN putobject' \
173 | --rules='CAN putmetadata' \
174 | --rules='CAN putsnaplink' \
175 | --rules='CAN getdirectory' \
176 | --rules='CAN putdirectory'
177 | sdc-role create --name=${MANTA_ROLE} \
178 | --policies=${MANTA_POLICY} \
179 | --members=${MANTA_LOGIN}
180 | mmkdir /${SDC_ACCOUNT}/stor/${MANTA_LOGIN}
181 | mchmod -- +triton_mysql /${SDC_ACCOUNT}/stor/${MANTA_LOGIN}
182 |
183 | ## Cleans out Manta backups
184 | cleanup:
185 | $(call check_var, SDC_ACCOUNT, Required to cleanup Manta.)
186 | -mrm -r /${SDC_ACCOUNT}/stor/triton-mysql/
187 | mmkdir /${SDC_ACCOUNT}/stor/triton-mysql
188 | mchmod -- +triton_mysql /${SDC_ACCOUNT}/stor/triton-mysql
189 |
190 |
191 | # -------------------------------------------------------
192 | # helper functions for testing if variables are defined
193 |
194 | ## Cleanup local backups and log debris
195 | clean:
196 | rm -rf tmp/
197 | find . -name '*.log' -delete
198 |
199 | ## Print environment for build debugging
200 | debug:
201 | @echo GIT_COMMIT=$(GIT_COMMIT)
202 | @echo GIT_BRANCH=$(GIT_BRANCH)
203 | @echo namespace=$(namespace)
204 | @echo tag=$(tag)
205 | @echo image=$(image)
206 | @echo testImage=$(testImage)
207 |
208 | check_var = $(foreach 1,$1,$(__check_var))
209 | __check_var = $(if $(value $1),,\
210 | $(error Missing $1 $(if $(value 2),$(strip $2))))
211 |
--------------------------------------------------------------------------------
/tests/Dockerfile:
--------------------------------------------------------------------------------
1 | # NOTE: this Dockerfile needs to be run from one-level up so that
2 | # we get the examples docker-compose.yml files.
Use 'make build/tester'
3 | # in the makefile at the root of this repo and everything will work
4 |
5 | FROM alpine:3.5
6 |
7 | RUN apk update \
8 | && apk add nodejs python3 openssl bash curl docker
9 | RUN npm install -g triton manta json
10 |
11 | # the Compose package in the public releases doesn't work on Alpine
12 | RUN pip3 install docker-compose==1.10.0
13 |
14 | # install specific version of Docker and Compose client
15 | COPY tests/triton-docker-cli/triton-docker /usr/local/bin/triton-docker
16 | RUN sed -i 's/1.9.0/1.10.0/' /usr/local/bin/triton-docker \
17 | && ln -s /usr/local/bin/triton-docker /usr/local/bin/triton-compose \
18 | && ln -s /usr/local/bin/triton-docker /usr/local/bin/triton-docker-install \
19 | && /usr/local/bin/triton-docker-install \
20 | && rm /usr/local/bin/triton-compose-helper \
21 | && ln -s /usr/bin/docker-compose /usr/local/bin/triton-compose-helper
22 |
23 |
24 | # install test targets
25 | COPY examples/triton/docker-compose.yml /src/examples/triton/docker-compose.yml
26 | COPY examples/triton/setup.sh /src/examples/triton/setup.sh
27 | COPY examples/compose/docker-compose.yml /src/examples/compose/docker-compose.yml
28 | #COPY examples/compose/setup.sh /src/examples/compose/setup.sh
29 |
30 | # install test code
31 | COPY tests/triton.sh /src/triton.sh
32 | COPY tests/compose.sh /src/compose.sh
33 |
--------------------------------------------------------------------------------
/tests/compose.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | export GIT_BRANCH="${GIT_BRANCH:-$(git rev-parse --abbrev-ref HEAD)}"
5 | export TAG="${TAG:-branch-$(basename "$GIT_BRANCH")}"
6 | export COMPOSE_PROJECT="${COMPOSE_PROJECT_NAME:-my}"
7 | export COMPOSE_FILE="${COMPOSE_FILE:-./examples/compose/docker-compose.yml}"
8 |
9 | user=${MYSQL_USER:-dbuser}
10 | passwd=${MYSQL_PASSWORD:-seekretPassword}
11 | db=${MYSQL_DATABASE:-demodb}
12 |
13 | project="$COMPOSE_PROJECT"
14 | manifest="$COMPOSE_FILE"
15 |
16 |
17 | fail() {
18 | echo
19 | echo '------------------------------------------------'
20 | echo 'FAILED: dumping logs'
21 | echo '------------------------------------------------'
22 | docker-compose -p "$project" -f "$manifest" ps
23 | docker-compose -p "$project" -f "$manifest" logs > compose.log
24 | echo '------------------------------------------------'
25 | echo 'FAILED'
26 | echo "$1"
27 | echo '------------------------------------------------'
28 | exit 1
29 | }
30 |
31 | pass() {
32 | teardown
33 | echo
34 | echo '------------------------------------------------'
35 | echo 'PASSED!'
36 | echo
37 | exit 0
38 | }
39 |
40 | function finish {
41 | result=$?
42 | if [ $result -ne 0 ]; then fail "unexpected error"; fi
43 | pass
44 | }
45 | trap finish EXIT
46 |
47 |
48 |
49 | # --------------------------------------------------------------------
50 | # Helpers
51 |
52 | # asserts that 'count' MySQL instances are running and marked as Up
53 | # by Docker. fails after the timeout.
54 | wait_for_containers() {
55 | local count timeout i got
56 | count="$1"
57 | timeout="${2:-120}" # default 120sec
58 | i=0
59 | echo "waiting for $count MySQL containers to be Up..."
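# poll 'docker-compose ps' once per second, counting containers that report Up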
60 | while [ $i -lt "$timeout" ]; do
61 | got=$(docker-compose -p "$project" -f "$manifest" ps mysql)
62 | got=$(echo "$got" | grep -c "Up")
63 | if [ "$got" -eq "$count" ]; then
64 | echo "$count instances reported Up in <= $i seconds"
65 | return
66 | fi
67 | i=$((i+1))
68 | sleep 1
69 | done
70 | fail "$count instances did not report Up within $timeout seconds"
71 | }
72 |
73 | # asserts that the application has registered at least n instances with
74 | # Consul. fails after the timeout.
75 | wait_for_service() {
76 | local service count timeout i got consul_ip
77 | service="$1"
78 | count="$2"
79 | timeout="${3:-300}" # default 300sec
80 | i=0
81 | echo "waiting for $count instances of $service to be registered with Consul..."
82 | consul_ip=$(docker inspect "${project}_consul_1" | json -a NetworkSettings.IPAddress)
83 | while [ $i -lt "$timeout" ]; do
84 | got=$(curl -s "http://${consul_ip}:8500/v1/health/service/${service}?passing" \
85 | | json -a Service.Address | wc -l | tr -d ' ')
86 | if [ "$got" -eq "$count" ]; then
87 | echo "$service registered in <= $i seconds"
88 | return
89 | fi
90 | i=$((i+1))
91 | sleep 1
92 | done
93 | fail "waited for service $service for $timeout seconds but it was not registered with Consul"
94 | }
95 |
96 | # gets the container that's currently primary in Consul
97 | get_primary() {
98 | local got consul_ip
99 | consul_ip=$(docker inspect "${project}_consul_1" | json -a NetworkSettings.IPAddress)
100 | got=$(curl -s "http://${consul_ip}:8500/v1/health/service/mysql-primary?passing" \
101 | | json -a Node.Address)
102 | echo "$got"
103 | }
104 |
105 | # gets a container that's currently a replica in Consul
106 | get_replica() {
107 | local got consul_ip
108 | consul_ip=$(docker inspect "${project}_consul_1" | json -a NetworkSettings.IPAddress)
109 | got=$(curl -s "http://${consul_ip}:8500/v1/health/service/mysql?passing" \
110 | | json -a Node.Address)
111 | echo "$got"
112 | }
113 |
114 | # creates a table on the first instance, which will be replicated to
115 | # the other nodes
116 | create_table() {
117 | echo 'creating test table'
118 | exec_query "${project}_mysql_1" 'CREATE TABLE tbl1 (field1 INT, field2 VARCHAR(36));'
119 | }
120 |
121 | check_replication() {
122 | echo 'checking replication'
123 | local primary="$1"
124 | local replica="$2"
125 | local testkey="$3"
126 | local testval="$4"
127 | echo "writing to $primary"
128 | exec_query "$primary" "INSERT INTO tbl1 (field1, field2) VALUES ($testkey, \"$testval\");"
129 |
130 | # check the replica, giving it a few seconds to catch up
131 | local timeout i
132 | timeout=5
133 | i=0
134 | echo "checking read from $replica"
135 | while [ $i -lt "$timeout" ]; do
136 | got=$(exec_query "$replica" "SELECT * FROM tbl1 WHERE field1=$testkey;")
137 | got=$(echo "$got" | grep -c "$testkey: $testval")
138 | if [ "$got" -eq 1 ]; then
139 | return
140 | fi
141 | i=$((i+1))
142 | sleep 1
143 | done
144 | fail "failed to replicate write from $primary to $replica; query got $got"
145 | }
146 |
147 | # runs a SQL statement on the node via docker exec. normally this method
148 | # would be subject to SQL injection but we control all inputs and we don't
149 | # want to have to ship a mysql client in this test rig.
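# usage (illustrative): exec_query "${project}_mysql_1" 'SELECT 1;'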
150 | exec_query() {
151 | local node="$1"
152 | local query="$2"
153 | echo "$node"
154 | out=$(docker exec -i "$node" \
155 | mysql -u "$user" "-p${passwd}" --vertical -e "$query" "$db")
156 | echo "$out"
157 | }
158 |
159 | restart() {
160 | node="${project}_$1"
161 | docker restart "$node"
162 | }
163 |
164 | stop() {
165 | node="${project}_$1"
166 | docker stop "$node"
167 | }
168 |
169 | run() {
170 | echo
171 | echo '* cleaning up previous test run'
172 | echo
173 | docker-compose -p "$project" -f "$manifest" stop
174 | docker-compose -p "$project" -f "$manifest" rm -f
175 |
176 | echo
177 | echo '* standing up initial test targets'
178 | echo
179 | docker-compose -p "$project" -f "$manifest" up -d
180 | }
181 |
182 | teardown() {
183 | echo
184 | echo '* tearing down containers'
185 | echo
186 | docker-compose -p "$project" -f "$manifest" stop
187 | docker-compose -p "$project" -f "$manifest" rm -f
188 | }
189 |
190 | scale() {
191 | count="$1"
192 | echo
193 | echo '* scaling up cluster'
194 | echo
195 | docker-compose -p "$project" -f "$manifest" scale mysql="$count"
196 | }
197 |
198 |
199 | # --------------------------------------------------------------------
200 | # Test sections
201 |
202 | test-failover() {
203 | echo
204 | echo '------------------------------------------------'
205 | echo 'executing failover test'
206 | echo '------------------------------------------------'
207 |
208 | # stand up and setup
209 | run
210 | wait_for_containers 1
211 | wait_for_service 'mysql-primary' 1
212 | scale 3
213 | wait_for_containers 3
214 | wait_for_service 'mysql' 2
215 | create_table
216 |
217 | # verify working
218 | check_replication "${project}_mysql_1" "${project}_mysql_2" "1" "a"
219 |
220 | sleep 15
221 |
222 | # force failover and verify again
223 | stop "mysql_1"
224 | wait_for_containers 2
225 | wait_for_service 'mysql-primary' 1
226 | wait_for_service 'mysql' 1
227 |
228 | local primary replica
229 | primary=$(get_primary)
230 | replica=$(get_replica)
231 | check_replication "$primary" "$replica" "2" "b"
232 | }
233 |
234 | # --------------------------------------------------------------------
235 | # Main loop
236 |
237 | test-failover
238 |
--------------------------------------------------------------------------------
/tests/tests.py:
--------------------------------------------------------------------------------
1 | """
2 | Integration tests for autopilotpattern/mysql. These tests are executed
3 | inside a test-running container based on autopilotpattern/testing.
4 | """
5 | from __future__ import print_function
6 | import os
7 | from os.path import expanduser
8 | import random
9 | import subprocess
10 | import string
11 | import sys
12 | import time
13 | import unittest
14 | import uuid
15 |
16 | from testcases import AutopilotPatternTest, WaitTimeoutError, dump_environment_to_file
17 |
18 |
19 | class MySQLStackTest(AutopilotPatternTest):
20 |
21 | project_name = 'my'
22 |
23 | def setUp(self):
24 | """
25 | autopilotpattern/mysql setup.sh writes an _env file with a CNS
26 | entry and account info for Manta. If this has been mounted from
27 | the test environment, we'll use that, otherwise we have to
28 | generate it from the environment.
29 | """
30 | if not os.path.isfile('_env'):
31 | print('generating _env')
32 | os.environ['MYSQL_USER'] = self.user = 'mytestuser'
33 | os.environ['MYSQL_PASSWORD'] = self.passwd = gen_password()
34 | os.environ['MYSQL_DATABASE'] = self.db = 'mytestdb'
35 | os.environ['MYSQL_REPL_USER'] = self.repl_user = 'myrepluser'
36 | os.environ['MYSQL_REPL_PASSWORD'] = self.repl_passwd = gen_password()
37 | with open(os.environ['DOCKER_CERT_PATH'] + '/key.pem') as key_file:
38 | manta_key = '#'.join([line.strip() for line in key_file])
39 | os.environ['MANTA_PRIVATE_KEY'] = manta_key
40 |
41 | dump_environment_to_file('_env')
42 |
43 | def test_replication_and_failover(self):
44 | """
45 | Given the MySQL stack, when we scale up MySQL instances they should:
46 | - become a new replica
47 | - with working replication
48 | When we stop the MySQL primary:
49 | - one of the replicas should become the new primary
50 | - the other replica should replicate from it
51 | """
52 | # wait until the first instance has configured itself as the
53 | # primary; we need a very long timeout b/c of provisioning
54 | self.settle('mysql-primary', 1, timeout=600)
55 |
56 | # scale up, make sure we have 2 working replica instances
57 | self.compose_scale('mysql', 3)
58 | self.settle('mysql', 2, timeout=600)
59 |
60 | # create a table
61 | create_table = 'CREATE TABLE tbl1 (field1 INT, field2 VARCHAR(36));'
62 | self.exec_query('mysql_1', create_table)
63 |
64 | # check replication is working by writing rows to the primary
65 | # and verifying they show up in the replicas
66 |
67 | insert_row = 'INSERT INTO tbl1 (field1, field2) VALUES ({}, "{}");'
68 | vals = [str(uuid.uuid4()),
69 | str(uuid.uuid4()),
70 | str(uuid.uuid4())]
71 |
72 | self.exec_query('mysql_1', insert_row.format(1, vals[0]))
73 | self.exec_query('mysql_1', insert_row.format(1, vals[1]))
74 | self.assert_good_replication(vals[:2])
75 |
76 | # kill the primary, make sure we get a new primary
77 | self.docker_stop('mysql_1')
78 | self.settle('mysql-primary', 1, timeout=300)
79 | self.settle('mysql', 1)
80 |
81 | # check replication is still working
82 | primary = self.get_service_instances_from_consul('mysql-primary')[0]
83 | self.exec_query(primary, insert_row.format(1, vals[2]))
84 | self.assert_good_replication(vals)
85 |
86 | def settle(self, service, count, timeout=60):
87 | """
88 | Wait for the service to appear healthy and registered correctly
89 | in Consul
90 | """
91 | try:
92 | nodes = self.instrument(self.wait_for_service,
93 | service, count, timeout=timeout)
94 | if len(nodes) < count:
95 | raise WaitTimeoutError()
96 | self.instrument(self.assert_consul_correctness)
97 | except WaitTimeoutError:
98 | self.fail('Failed to scale {} to {} instances'
99 | .format(service, count))
100 |
101 | def assert_consul_correctness(self):
102 | """ Verify that Consul addresses match container addresses """
103 | try:
104 | primary = self.get_primary_ip()
105 | replicas = self.get_replica_ips()
106 | expected = [str(ip) for ip in
107 | self.get_service_ips('mysql', ignore_errors=True)[1]]
108 | except subprocess.CalledProcessError as ex:
109 | self.fail('subprocess.CalledProcessError: {}'.format(ex.output))
110 | expected.remove(primary)
111 | expected.sort()
112 | self.assertEqual(replicas, expected,
113 | 'Consul replica addresses {} did not match actual IPs {}'
114 | .format(replicas, expected))
115 |
116 | def assert_good_replication(self, expected_vals):
117 | """
118 | Checks each replica to make sure it has the recently written
119 | field2 values passed in as the
`expected_vals` param.
120 | """
121 | check_row = 'SELECT * FROM tbl1 WHERE `field1`=1;'
122 |
123 | def check_replica(replica):
124 | timeout = 15
125 | while timeout > 0:
126 | # we'll give the replica a couple chances to catch up
127 | results = self.exec_query(replica, check_row).splitlines()
128 | got_vals = []
129 | for line in results:
130 | if line.startswith('field2:'):
131 | got_vals.append(line.replace('field2: ', '', 1))
132 | if not set(expected_vals) - set(got_vals):
133 | return None # all values replicated
134 |
135 | # we're missing a value
136 | timeout -= 1
137 | return got_vals
138 |
139 | replicas = self.get_replica_containers()
140 | for replica in replicas:
141 | got_vals = check_replica(replica)
142 | if got_vals:
143 | self.fail('Replica {} is missing values {}; got {}'
144 | .format(replica, expected_vals, got_vals))
145 |
146 | def get_primary_ip(self):
147 | """ Get the IP for the primary from Consul. """
148 | try:
149 | node = self.get_service_addresses_from_consul('mysql-primary')[0]
150 | return node
151 | except IndexError:
152 | self.fail('mysql-primary does not exist in Consul.')
153 |
154 | def get_replica_ips(self):
155 | """ Get the IPs for the replica(s) from Consul. """
156 | nodes = self.get_service_addresses_from_consul('mysql')
157 | nodes.sort()
158 | return nodes
159 |
160 | def get_primary_container(self):
161 | """ Get the container name for the primary from Consul """
162 | try:
163 | node = self.get_service_instances_from_consul('mysql-primary')[0]
164 | return node
165 | except IndexError:
166 | self.fail('mysql-primary does not exist in Consul.')
167 |
168 | def get_replica_containers(self):
169 | """ Get the container names for the replica(s) from Consul. """
170 | nodes = self.get_service_instances_from_consul('mysql')
171 | nodes.sort()
172 | return nodes
173 |
174 | def exec_query(self, container, query, user=None, passwd=None):
175 | """
176 | Runs SQL statement via docker exec. Normally this method would
177 | be subject to SQL injection but we control all inputs and we
178 | don't want to have to ship a mysql client in the test rig.
179 | """
180 | if not user:
181 | user = self.user
182 | if not passwd:
183 | passwd = self.passwd
184 | cmd = ['mysql', '-u', user,
185 | '-p{}'.format(passwd),
186 | '--vertical', # makes parsing easier
187 | '-e', query, self.db]
188 | try:
189 | out = self.docker_exec(container, cmd)
190 | except subprocess.CalledProcessError as ex:
191 | self.fail('subprocess.CalledProcessError in {} for command {}:\n{}'
192 | .format(container, cmd, ex.output))
193 | return out
194 |
195 |
196 | # ------------------------------------------------
197 | # helper functions
198 |
199 | def gen_password():
200 | """
201 | When we run the tests on Shippable the setup.sh fails silently
202 | and we end up with blank (unworkable) passwords.
This appears
203 | to be specific to Shippable and not other Docker/Triton envs.
204 | """
205 | return ''.join(random.choice(
206 | string.ascii_uppercase + string.digits) for _ in range(10))
207 |
208 |
209 | if __name__ == "__main__":
210 | unittest.main(failfast=True)
211 |
--------------------------------------------------------------------------------
/tests/triton.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | export GIT_BRANCH="${GIT_BRANCH:-$(git rev-parse --abbrev-ref HEAD)}"
5 | export TAG="${TAG:-branch-$(basename "$GIT_BRANCH")}"
6 | export COMPOSE_PROJECT="${COMPOSE_PROJECT_NAME:-my}"
7 | export COMPOSE_FILE="${COMPOSE_FILE:-./examples/triton/docker-compose.yml}"
8 |
9 | user=${MYSQL_USER:-mytestuser}
10 | passwd=${MYSQL_PASSWORD:-password1}
11 | db=${MYSQL_DATABASE:-mytestdb}
12 | repl_user=${MYSQL_REPL_USER:-myrepluser}
13 | repl_passwd=${MYSQL_REPL_PASSWORD:-password2}
14 |
15 | manta_url=${MANTA_URL:-https://us-east.manta.joyent.com}
16 | manta_user=${MANTA_USER:-triton_mysql}
17 | manta_subuser=${MANTA_SUBUSER:-triton_mysql}
18 | manta_role=${MANTA_ROLE:-triton_mysql}
19 | manta_bucket=${MANTA_BUCKET:-"/${manta_user}/stor/triton_mysql"}
20 | manta_key_id=${MANTA_KEY_ID}
21 |
22 | project="$COMPOSE_PROJECT"
23 | manifest="$COMPOSE_FILE"
24 |
25 | fail() {
26 | echo
27 | echo '------------------------------------------------'
28 | echo 'FAILED: dumping logs'
29 | echo '------------------------------------------------'
30 | triton-compose -p "$project" -f "$manifest" ps
31 | triton-compose -p "$project" -f "$manifest" logs
32 | echo '------------------------------------------------'
33 | echo 'FAILED'
34 | echo "$1"
35 | echo '------------------------------------------------'
36 | exit 1
37 | }
38 |
39 | pass() {
40 | teardown
41 | echo
42 | echo '------------------------------------------------'
43 | echo 'PASSED!'
44 | echo
45 | exit 0
46 | }
47 |
48 | function finish {
49 | result=$?
50 | if [ $result -ne 0 ]; then fail "unexpected error"; fi
51 | pass
52 | }
53 | trap finish EXIT
54 |
55 |
56 |
57 | # --------------------------------------------------------------------
58 | # Helpers
59 |
60 | profile() {
61 | echo
62 | echo '------------------------------------------------'
63 | echo 'setting up profile for tests'
64 | echo '------------------------------------------------'
65 | echo
66 | export TRITON_PROFILE="${TRITON_PROFILE:-us-east-1}"
67 | set +e
68 | # if we're already set up for Docker this would fail noisily, so silence it
69 | triton profile docker-setup -y "$TRITON_PROFILE" > /dev/null 2>&1
70 | set -e
71 | triton profile set-current "$TRITON_PROFILE"
72 | eval "$(triton env)"
73 |
74 | # print out for profile debugging
75 | env | grep DOCKER
76 | env | grep SDC
77 | env | grep TRITON
78 |
79 | local manta_key
80 | manta_key=$(tr '\n' '#' < "${DOCKER_CERT_PATH}/key.pem")
81 | {
82 | echo "MYSQL_USER=${user}"
83 | echo "MYSQL_PASSWORD=${passwd}"
84 | echo "MYSQL_REPL_USER=${repl_user}"
85 | echo "MYSQL_REPL_PASSWORD=$repl_passwd"
86 | echo "MYSQL_DATABASE=$db"
87 |
88 | echo "MANTA_URL=$manta_url"
89 | echo "MANTA_BUCKET=$manta_bucket"
90 | echo "MANTA_USER=$manta_user"
91 | echo "MANTA_SUBUSER=$manta_subuser"
92 | echo "MANTA_ROLE=$manta_role"
93 | echo "MANTA_PRIVATE_KEY=$manta_key"
94 | echo "MANTA_KEY_ID=$manta_key_id"
95 | } > ./examples/triton/_env
96 | }
97 |
98 | # asserts that 'count' MySQL instances are running and marked as Up
99 | # by Docker. fails after the timeout.
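# usage (illustrative): wait_for_containers 3   # expect three containers Up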
100 | wait_for_containers() {
101 | local count timeout i got
102 | count="$1"
103 | timeout="${2:-120}" # default 120sec
104 | i=0
105 | echo "waiting for $count MySQL containers to be Up..."
106 | while [ $i -lt "$timeout" ]; do
107 | got=$(triton-compose -p "$project" -f "$manifest" ps mysql)
108 | got=$(echo "$got" | grep -c "Up")
109 | if [ "$got" -eq "$count" ]; then
110 | echo "$count instances reported Up in <= $i seconds"
111 | return
112 | fi
113 | i=$((i+1))
114 | sleep 1
115 | done
116 | fail "$count instances did not report Up within $timeout seconds"
117 | }
118 |
119 | # asserts that the application has registered at least n instances with
120 | # Consul. fails after the timeout.
121 | wait_for_service() {
122 | local service count timeout i got consul_ip
123 | service="$1"
124 | count="$2"
125 | timeout="${3:-300}" # default 300sec
126 | i=0
127 | echo "waiting for $count instances of $service to be registered with Consul..."
128 | consul_ip=$(triton ip "${project}_consul_1")
129 | while [ $i -lt "$timeout" ]; do
130 | got=$(curl -s "http://${consul_ip}:8500/v1/health/service/${service}?passing" \
131 | | json -a Service.Address | wc -l | tr -d ' ')
132 | if [ "$got" -eq "$count" ]; then
133 | echo "$service registered in <= $i seconds"
134 | return
135 | fi
136 | i=$((i+1))
137 | sleep 1
138 | done
139 | fail "waited for service $service for $timeout seconds but it was not registered with Consul"
140 | }
141 |
142 | # gets the container that's currently primary in Consul
143 | get_primary() {
144 | local got consul_ip
145 | consul_ip=$(triton ip "${project}_consul_1")
146 | got=$(curl -s "http://${consul_ip}:8500/v1/health/service/mysql-primary?passing" \
147 | | json -a Node.Address)
148 | echo "$got"
149 | }
150 |
151 | # gets a container that's currently a replica in Consul
152 | get_replica() {
153 | local got consul_ip
154 | consul_ip=$(triton ip "${project}_consul_1")
155 | got=$(curl -s "http://${consul_ip}:8500/v1/health/service/mysql?passing" \
156 | | json -a Node.Address)
157 | echo "$got"
158 | }
159 |
160 | # creates a table on the first instance, which will be replicated to
161 | # the other nodes
162 | create_table() {
163 | echo 'creating test table'
164 | exec_query "${project}_mysql_1" 'CREATE TABLE tbl1 (field1 INT, field2 VARCHAR(36));'
165 | }
166 |
167 | check_replication() {
168 | echo 'checking replication'
169 | local primary="$1"
170 | local replica="$2"
171 | local testkey="$3"
172 | local testval="$4"
173 | exec_query "$primary" "INSERT INTO tbl1 (field1, field2) VALUES ($testkey, \"$testval\");"
174 |
175 | # check the replica, giving it a few seconds to catch up
176 | local timeout i
177 | timeout=5
178 | i=0
179 | while [ $i -lt "$timeout" ]; do
180 | got=$(exec_query "$replica" "SELECT * FROM tbl1 WHERE field1=$testkey;")
181 | got=$(echo "$got" | grep -c "$testkey: $testval")
182 | if [ "$got" -eq 1 ]; then
183 | return
184 | fi
185 | i=$((i+1))
186 | sleep 1
187 | done
188 | fail "failed to replicate write from $primary to $replica; query got $got"
189 | }
190 |
191 | # runs a SQL statement on the node via docker exec. normally this method
192 | # would be subject to SQL injection but we control all inputs and we don't
193 | # want to have to ship a mysql client in this test rig.
194 | exec_query() {
195 | local node="$1"
196 | local query="$2"
197 | echo "$node"
198 | out=$(triton-docker exec -i "$node" \
199 | mysql -u "$user" "-p${passwd}" --vertical -e "$query" "$db")
200 | echo "$out"
201 | }
202 |
203 | restart() {
204 | node="${project}_$1"
205 | triton-docker restart "$node"
206 | }
207 |
208 | stop() {
209 | node="${project}_$1"
210 | triton-docker stop "$node"
211 | }
212 |
213 | run() {
214 | echo
215 | echo '* cleaning up previous test run'
216 | echo
217 | triton-compose -p "$project" -f "$manifest" stop
218 | triton-compose -p "$project" -f "$manifest" rm -f
219 |
220 | echo
221 | echo '* standing up initial test targets'
222 | echo
223 | triton-compose -p "$project" -f "$manifest" up -d
224 | }
225 |
226 | teardown() {
227 | echo
228 | echo '* tearing down containers'
229 | echo
230 | triton-compose -p "$project" -f "$manifest" stop
231 | triton-compose -p "$project" -f "$manifest" rm -f
232 |
233 | # TODO: cleanup Manta directory too
234 | # echo '* cleaning up Manta directory'
235 | # mrm ...?
236 | }
237 |
238 | scale() {
239 | count="$1"
240 | echo
241 | echo '* scaling up cluster'
242 | echo
243 | triton-compose -p "$project" -f "$manifest" scale mysql="$count"
244 | }
245 |
246 |
247 | # --------------------------------------------------------------------
248 | # Test sections
249 |
250 | test-failover() {
251 | echo
252 | echo '------------------------------------------------'
253 | echo 'executing failover test'
254 | echo '------------------------------------------------'
255 |
256 | # stand up and setup
257 | run
258 | wait_for_containers 1
259 | wait_for_service 'mysql-primary' 1
260 | scale 3
261 | wait_for_containers 3
262 | wait_for_service 'mysql' 2
263 | create_table
264 |
265 | # verify working
266 | check_replication "${project}_mysql_1" "${project}_mysql_2" "1" "a"
267 |
268 | # force failover and verify again
269 | stop "mysql_1"
270 | wait_for_containers 2
271 | wait_for_service 'mysql-primary' 1
272 | wait_for_service 'mysql' 1
273 |
274 | local primary replica
275 | primary=$(get_primary)
276 | replica=$(get_replica)
277 | check_replication "$primary" "$replica" "2" "b"
278 |
279 | # --------------------------------------------------------------------
280 | # Main loop
281 |
282 | profile
283 | test-failover
284 |
--------------------------------------------------------------------------------