├── .gitignore ├── .travis.yml ├── 3rdparty └── README.md ├── LICENSE ├── README.md ├── RETIRED.txt ├── Vagrantfile ├── docs ├── README.md └── user-guide.md ├── mysos ├── __init__.py ├── common │ ├── __init__.py │ ├── cluster.py │ ├── decorators.py │ ├── fetcher.py │ ├── hdfs.py │ ├── pkgutil.py │ ├── testing.py │ └── zookeeper.py ├── executor │ ├── __init__.py │ ├── backup.py │ ├── executor.py │ ├── files │ │ └── bin │ │ │ └── mysql │ │ │ └── scripts │ │ │ ├── mysos_install_db.sh │ │ │ ├── mysos_launch_mysqld.sh │ │ │ ├── mysos_log_position.sh │ │ │ ├── mysos_promote_master.sh │ │ │ ├── mysos_reparent.sh │ │ │ └── mysos_wait_for_mysqld.sh │ ├── installer.py │ ├── mysos_task_runner.py │ ├── mysql_task_control.py │ ├── noop_installer.py │ ├── sandbox.py │ ├── shell_utils.py │ ├── state.py │ ├── task_control.py │ ├── task_runner.py │ └── testing │ │ ├── __init__.py │ │ ├── fake.py │ │ ├── fake_mysos_executor.py │ │ └── vagrant_mysos_executor.py ├── scheduler │ ├── __init__.py │ ├── assets │ │ ├── static │ │ │ ├── bootstrap.min.css │ │ │ ├── bootstrap.min.js │ │ │ └── jquery.min.js │ │ └── templates │ │ │ └── clusters.html │ ├── elector.py │ ├── http.py │ ├── launcher.py │ ├── mysos_scheduler.py │ ├── password.py │ ├── scheduler.py │ ├── state.py │ └── zk_state.py └── testing │ ├── __init__.py │ └── mysos_test_client.py ├── setup.py ├── tests ├── common │ ├── test_cluster.py │ └── test_zookeeper.py ├── executor │ └── test_mysos_task_runner.py └── scheduler │ ├── test_elector.py │ ├── test_http.py │ ├── test_launcher.py │ ├── test_mysos_scheduler.py │ ├── test_scheduler.py │ ├── test_state.py │ └── test_zk_state.py ├── tox.ini └── vagrant ├── bin ├── mysos_executor.sh └── mysos_scheduler.sh ├── etc ├── admin_keyfile.yml ├── framework_keys.txt ├── fw_auth_keyfile.yml └── scheduler_keyfile.txt ├── provision-dev-cluster.sh ├── test.sh └── upstart ├── mesos-master.conf ├── mesos-slave.conf ├── mysos.conf └── zookeeper.conf /.gitignore: 
-------------------------------------------------------------------------------- 1 | *.pyc 2 | .tox 3 | *.egg-info 4 | .vagrant 5 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 2.7 3 | env: 4 | - TOXENV=py27 5 | install: 6 | - pip install tox 7 | script: 8 | - tox -v 9 | - tox -v -e style 10 | 11 | -------------------------------------------------------------------------------- /3rdparty/README.md: -------------------------------------------------------------------------------- 1 | Place any bleeding edge versions of packages here so that they may be picked up by tox during 2 | testing. 3 | 4 | This is also the place for the required `mesos.native` packages because they are platform specific. 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Apache Cotton 2 | Apache Cotton (previously named [Mysos](https://blog.twitter.com/2015/another-look-at-mysql-at-twitter-and-incubating-mysos)) 3 | is an Apache Mesos framework for running MySQL instances. 
It dramatically simplifies the management 4 | of a MySQL cluster and is designed to offer: 5 | 6 | * Efficient hardware utilization through multi-tenancy (in performance-isolated containers) 7 | * High reliability through preserving the MySQL state during failure and automatic backing up to/restoring from HDFS 8 | * An automated self-service option for bringing up new MySQL clusters 9 | * High availability through automatic MySQL master failover 10 | * An elastic solution that allows users to easily scale up and down a MySQL cluster by changing the number of slave instances 11 | 12 | Cotton has been [accepted into the Apache Incubator](http://incubator.apache.org/projects/cotton.html). 13 | 14 | ## Documentation 15 | A [user guide](docs/user-guide.md) is available. Documentation improvements are always welcome, so please send patches our way. 16 | 17 | ## Getting Involved 18 | Please check out the source code from Apache's git repository: 19 | 20 | git clone https://git-wip-us.apache.org/repos/asf/incubator-cotton.git 21 | 22 | or if you prefer GitHub, use the [GitHub mirror](https://github.com/apache/incubator-cotton): 23 | 24 | git clone https://github.com/apache/incubator-cotton.git 25 | 26 | The Cotton community maintains the following project supporting services: 27 | 28 | - IRC channel: `#cotton` on irc.freenode.net 29 | - Reporting issues: [JIRA issue tracker](https://issues.apache.org/jira/browse/COTTON). 30 | - Submitting patches: [Review board](https://reviews.apache.org/groups/cotton/). 
31 | - [Development Mailing List](mailto:dev-subscribe@cotton.incubator.apache.org) 32 | ([Archives](http://www.mail-archive.com/dev@cotton.incubator.apache.org/)) 33 | 34 | ## License 35 | Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0 36 | 37 | ## Requirements 38 | * Python 2.7 39 | * Mesos Python bindings 40 | 41 | ## Building 42 | [![Build status on Travis CI](https://api.travis-ci.org/apache/incubator-cotton.svg)](https://travis-ci.org/apache/incubator-cotton) 43 | 44 | ### Building/Downloading Mesos Python Bindings 45 | Cotton uses Mesos Python bindings which consist of two Python packages. `mesos.interface` is on PyPI 46 | and gets automatically installed but `mesos.native` is platform dependent. You need to either build 47 | the package on your machine ([instructions](http://mesos.apache.org/gettingstarted/)) or download a 48 | compiled one for your platform (e.g. Mesosphere hosts 49 | [the eggs for some Linux platforms](https://mesosphere.com/downloads/)). 50 | 51 | Since `pip` doesn't support eggs, you need to convert eggs into wheels using `wheel convert`, then 52 | drop them into the `3rdparty` folder. See the [README file](3rdparty/README.md) for more 53 | information. 54 | 55 | ### Building Cotton 56 | Cotton mainly consists of two components that are built and deployed separately. 57 | 58 | - `mysos_scheduler`: The scheduler that connects to Mesos master and manages the MySQL clusters. 59 | - `mysos_executor`: The executor that is launched by Mesos slave (upon `mysos_scheduler`'s request) 60 | to carry out MySQL tasks. 61 | 62 | One way to package these components and their dependencies into a self-contained executable is to 63 | use [PEX](https://pex.readthedocs.org/en/latest/). This allows Cotton components to be launched 64 | quickly and reliably. See 65 | [End-to-end test using PEX](#end-to-end-test-on-a-local-mesos-cluster-and-pex) for an example of 66 | packaging and deploying the executor using PEX. 
67 | 68 | ## Testing 69 | ### Unit Tests 70 | Make sure [tox](https://tox.readthedocs.org/en/latest/) is installed and just run: 71 | 72 | tox 73 | 74 | The unit tests don't require the `mesos.native` package to be available in `3rdparty`. Tox also 75 | builds the Cotton source package and drops it in `.tox/dist`. 76 | 77 | ### End-to-end Test on a Local Mesos Cluster and PEX 78 | Build/download the `mesos.native` package and put it in `3rdparty` and then run: 79 | 80 | tox -e pex 81 | 82 | This test demonstrates how to package a PEX executor and use it to launch a *fake* MySQL cluster on 83 | a *local* Mesos cluster. 84 | 85 | ### End-to-end Test on a Real Mesos Cluster in a Vagrant VM 86 | The Vagrant test uses the `sdist` Cotton package in `.tox/dist` so be sure to run `tox` first. Then: 87 | 88 | vagrant up 89 | 90 | # Wait for the VM and Cotton API endpoint to come up (http://192.168.33.17:55001 becomes available). 91 | 92 | tox -e vagrant 93 | 94 | `test.sh` verifies that Cotton successfully creates a MySQL cluster and then deletes it. 95 | -------------------------------------------------------------------------------- /RETIRED.txt: -------------------------------------------------------------------------------- 1 | This podling has been retired, please see: 2 | 3 | http://incubator.apache.org/projects/index.html#cotton 4 | -------------------------------------------------------------------------------- /Vagrantfile: -------------------------------------------------------------------------------- 1 | # -*- mode: ruby -*- 2 | # vi: set ft=ruby : 3 | 4 | # Vagrantfile API/syntax version. Don't touch unless you know what you're doing! 5 | VAGRANTFILE_API_VERSION = "2" 6 | 7 | # 1.5.0 is required to use vagrant cloud images. 
8 | # https://www.vagrantup.com/blog/vagrant-1-5-and-vagrant-cloud.html 9 | Vagrant.require_version ">= 1.5.0" 10 | 11 | Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| 12 | config.vm.box = "ubuntu/trusty64" 13 | 14 | config.vm.define "devcluster" do |dev| 15 | dev.vm.network :private_network, ip: "192.168.33.17" 16 | dev.vm.provider :virtualbox do |vb| 17 | vb.customize ["modifyvm", :id, "--memory", "2048"] 18 | end 19 | dev.vm.provision "shell", path: "vagrant/provision-dev-cluster.sh" 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Mysos 2 | Mysos is an Apache Mesos framework for running MySQL instances. It dramatically simplifies the management of a MySQL cluster and is designed to offer: 3 | 4 | * Efficient hardware utilization through multi-tenancy (in performance-isolated containers) 5 | * High reliability through preserving the MySQL state during failure and automatic backing up to/restoring from HDFS 6 | * An automated self-service option for bringing up new MySQL clusters 7 | * High availability through automatic MySQL master failover 8 | * An elastic solution that allows users to easily scale up and down a MySQL cluster by changing the number of slave instances 9 | 10 | Mysos has been [accepted into the Apache Incubator](http://incubator.apache.org/projects/mysos.html). 11 | -------------------------------------------------------------------------------- /docs/user-guide.md: -------------------------------------------------------------------------------- 1 | # Using Mysos 2 | 3 | Mysos provides a REST API for creating and managing MySQL clusters on Mesos. 4 | 5 | ## Dashboard 6 | - HTTP Method: `GET` 7 | - Path: `/` 8 | 9 | The root of API endpoint is a web page that lists the managed MySQL clusters. 
10 | 11 | ## Creating a MySQL cluster 12 | - HTTP Method: `POST` 13 | - Path: `/clusters/<cluster_name>` 14 | 15 | 16 | ### Parameters 17 | - `cluster_name`: Required. Name of the cluster. 18 | - `cluster_user`: Required. The user account for all MySQL instances in the cluster which has full 19 | admin privileges. 20 | - `num_nodes`: Number of nodes in the cluster. [default: 1] 21 | - `size`: The size of instances in the cluster as a JSON dictionary of `cpus`, `mem` and `disk`. 22 | `mem` and `disk` are specified with standard data size units such as `mb`, `gb`, `tb`, etc. (no 23 | spaces, see the default for an example) [default: `{"mem": "512mb", "disk": "2gb", "cpus": 1.0}`] 24 | - `backup_id`: An ID for the MySQL backup to restore from when the MySQL instance starts. If not 25 | specified, Mysos will start an empty MySQL instance. The format and meaning of `backup_id` is 26 | specific to the implementation of `BackupStore` that the Mysos cluster uses. 27 | - `cluster_password`: The password used for accessing MySQL instances in the cluster as well as 28 | deleting the cluster from Mysos. If unspecified then Mysos generates one for the cluster. In either 29 | case the password is sent back as part of the response. 30 | 31 | `cluster_name` is part of the path and the rest of the parameters are specified as form fields. 32 | 33 | 34 | ### Response 35 | A JSON object with the following fields: 36 | 37 | - `cluster_password`: The password for accessing the MySQL instance (associated with 38 | `cluster_user`). 39 | - `cluster_url`: A URL to the ZooKeeper group for discovering the MySQL instances of this cluster. 40 | See the *Service Discovery* section below. 41 | 42 | 43 | ### Example 44 | 45 | # Create a cluster named 'test_cluster3' and restore from the backup 'foo/bar:201503122000'. 
46 | curl -X POST 192.168.33.7/clusters/test_cluster3 --form "cluster_user=mysos" \ 47 | --form "num_nodes=2" --form "backup_id=foo/bar:201503122000" \ 48 | --form 'size={"mem": "512mb", "disk": "3gb", "cpus": 1.0}' 49 | 50 | # Response 51 | {"cluster_password": "w9gMCkecsMh6sWsRdxNTa", "cluster_url": "zk://192.168.33.7:2181/mysos/discover/test_cluster3"} 52 | 53 | 54 | ### Notes 55 | - Cluster creation is asynchronous. The API call returns (with status 200) as soon as the Mysos 56 | scheduler has accepted the request. The same goes for cluster deletion. 57 | - ZooKeeper `/master` sub-group has at most one ZNode which is the master of the MySQL 58 | cluster. 59 | - ZooKeeper `/slaves` sub-group can have multiple ZNodes which are the slaves of the 60 | MySQL cluster. 61 | - A ZNode is added to the ZooKeeper group when the instance becomes available and ready to serve 62 | traffic. 63 | 64 | ## Removing a MySQL cluster 65 | - HTTP Method: `DELETE` 66 | - Path: `/clusters/<cluster_name>` 67 | 68 | ### Parameters 69 | - `cluster_name`: Name of the cluster. 70 | - `password`: The password for the cluster returned by cluster creation call. 71 | 72 | ### Response 73 | A JSON object with: 74 | 75 | - `cluster_url`: A URL to the ZooKeeper group to watch for the termination of the cluster. The group 76 | ZNode is removed from ZooKeeper when the MySQL cluster is removed/terminated. 77 | 78 | ### Example 79 | ``` 80 | # Remove a cluster named 'test_cluster3' 81 | curl -X DELETE 192.168.33.7/clusters/test_cluster3 --form "password=w9gMCkecsMh6sWsRdxNTa" 82 | # Response 83 | {"cluster_url": "zk://mysos:mysos@192.168.33.7:2181/mysos/discover/test_cluster3"} 84 | ``` 85 | 86 | ## Service Discovery 87 | Mysos' service discovery with ZooKeeper conforms to the ServerSet protocol. Each MySQL instance is 88 | represented by a ZNode with its data being a 89 | [ServiceInstance](https://github.com/twitter/commons/blob/master/src/thrift/com/twitter/thrift/endpoint.thrift) 90 | serialized into JSON. 
91 | 92 | - The `Endpoint serviceEndpoint` field in ServiceInstance has the `host` and `port` that MySQL 93 | client can connect to. 94 | - Some utilities for watching ZooKeeper and parsing the `ServiceInstance`s: 95 | [Java src](https://github.com/twitter/commons/tree/master/src/java/com/twitter/common/zookeeper) | 96 | [Maven](http://maven.twttr.com/com/twitter/zookeeper-client/LATEST/), 97 | [Python src](https://github.com/twitter/commons/tree/master/src/python/twitter/common/zookeeper) | 98 | [PyPI](https://pypi.python.org/pypi/twitter.common.zookeeper/). 99 | -------------------------------------------------------------------------------- /mysos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/incubator-retired-cotton/4aa9bb0acdd8c609686b5d370ef4b61a520364ef/mysos/__init__.py -------------------------------------------------------------------------------- /mysos/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/incubator-retired-cotton/4aa9bb0acdd8c609686b5d370ef4b61a520364ef/mysos/common/__init__.py -------------------------------------------------------------------------------- /mysos/common/cluster.py: -------------------------------------------------------------------------------- 1 | import Queue 2 | import functools 3 | import posixpath 4 | import sys 5 | import threading 6 | 7 | from mysos.common import zookeeper 8 | 9 | from kazoo.client import KazooClient 10 | from kazoo.exceptions import NoNodeError 11 | from kazoo.protocol.states import EventType 12 | from kazoo.recipe.watchers import ChildrenWatch, DataWatch 13 | from twitter.common import log 14 | from twitter.common.zookeeper.serverset.endpoint import ServiceInstance 15 | 16 | 17 | def get_cluster_path(zk_root, cluster_name): 18 | """ 19 | :param zk_root: the root path for the mysos scheduler. 
class Cluster(object):
  """
  A class that represents all members of the MySQL cluster.

  A newly added cluster member becomes a read-only slave until it's promoted to a master.
  Only the master can write.

  The members of the cluster are maintained in two ZooKeeper groups: a slaves group and a master
  group under the same 'directory'. Slaves have unique member IDs backed by ZooKeeper's sequential
  ZNodes. When a slave is promoted to a master, it is moved (its ID preserved) from the slave
  group to the master group.

  There is at most one member in the master group.
  """
  SLAVES_GROUP = 'slaves'
  MASTER_GROUP = 'master'
  MEMBER_PREFIX = "member_"  # Use the prefix so the path conforms to the ServerSet convention.

  def __init__(self, cluster_path):
    """
    :param cluster_path: The path for this cluster on ZooKeeper; the slaves and master group paths
                         are derived from it.
    """
    self.cluster_path = cluster_path
    self.members = {}  # {ID : (serialized) ServiceInstance} mappings for members of the cluster.
    self.master = None  # The master's member ID.
    self.slaves_group = posixpath.join(cluster_path, self.SLAVES_GROUP)
    self.master_group = posixpath.join(cluster_path, self.MASTER_GROUP)


# TODO(jyx): Handle errors e.g. session expirations and recoverable failures.
class ClusterManager(object):
  """
  Kazoo wrapper used by the scheduler to inform executors about cluster change.

  The manager keeps a local cache (a 'Cluster') of the members it has written to ZooKeeper. Every
  method that reads and then updates that cache does so while holding 'self._lock'.

  NOTE: ClusterManager is thread safe, i.e., it can be accessed from multiple threads at once.
  """

  class Error(Exception): pass

  def __init__(self, client, cluster_path):
    """
    :param client: Kazoo client.
    :param cluster_path: The path for this cluster on ZooKeeper.
    """
    self._client = client
    self._cluster = Cluster(cluster_path)
    self._lock = threading.Lock()
    self._populate()

  def _read_child_content(self, group, member_id):
    """
    :return: The data stored in the member's ZNode under 'group', or None if the ZNode does not
             exist.
    """
    try:
      return self._client.get(posixpath.join(group, member_id))[0]
    except NoNodeError:
      return None

  def _populate(self):
    """Load the members currently stored under the slaves and master groups into the local cache."""
    self._client.ensure_path(self._cluster.slaves_group)
    self._client.ensure_path(self._cluster.master_group)

    # Populate slaves.
    for child in self._client.get_children(self._cluster.slaves_group):
      child_content = self._read_child_content(self._cluster.slaves_group, child)
      if child_content:
        self._cluster.members[child] = child_content

    # Populate the master.
    master_group = self._client.get_children(self._cluster.master_group)
    assert len(master_group) <= 1
    if len(master_group) == 1:
      child = master_group[0]
      child_content = self._read_child_content(self._cluster.master_group, child)
      if child_content:
        self._cluster.members[child] = child_content
        self._cluster.master = child

  def add_member(self, service_instance):
    """
    Add the member to the ZooKeeper group.
    NOTE:
      - New members are slaves until being promoted.
      - A new member is not added if the specified service_instance already exists in the group.
    :return: The member ID for the ServiceInstance generated by ZooKeeper.
    :raises TypeError: If 'service_instance' is not a ServiceInstance.
    """
    if not isinstance(service_instance, ServiceInstance):
      raise TypeError("'service_instance' should be a ServiceInstance")

    content = ServiceInstance.pack(service_instance)

    # The duplicate check and the ZNode creation are done under one lock acquisition so that two
    # threads adding the same instance through this manager cannot both miss the check and create
    # two ZNodes for it.
    with self._lock:
      for k, v in self._cluster.members.items():
        if content == v:
          log.info("%s not added because it already exists in the group" % service_instance)
          return k

      znode_path = self._client.create(
          posixpath.join(self._cluster.slaves_group, self._cluster.MEMBER_PREFIX),
          content,
          sequence=True)
      _, member_id = posixpath.split(znode_path)
      self._cluster.members[member_id] = content
      return member_id

  def remove_member(self, member_id):
    """
    Remove the member if it is in the group.

    :return: True if the member is deleted. False if the member cannot be found.
    """
    with self._lock:
      if member_id not in self._cluster.members:
        log.info("Member %s is not in the ZK group" % member_id)
        return False

      self._cluster.members.pop(member_id, None)

      if member_id == self._cluster.master:
        self._cluster.master = None
        self._client.delete(posixpath.join(self._cluster.master_group, member_id))
      else:
        self._client.delete(posixpath.join(self._cluster.slaves_group, member_id))

      return True

  def promote_member(self, member_id):
    """
    Promote the member with the given ID to be the master of the cluster if it's not already the
    master.

    The previous master (if any) is removed from the cluster as part of the same ZooKeeper
    transaction; it is not moved back to the slaves group.

    :return: True if the member is promoted. False if the member is already the master.
    :raises ValueError: If 'member_id' is not a member of the cluster.
    :raises ClusterManager.Error: If the transaction reports a failed operation. The local cache is
                                  left untouched in that case.
    """
    with self._lock:
      if member_id not in self._cluster.members:
        raise ValueError("Invalid member_id: %s" % member_id)

      old_master = self._cluster.master

      # Do nothing if the member is already the master.
      if old_master and old_master == member_id:
        log.info("Not promoting %s because is already the master" % member_id)
        return False

      tx = self._client.transaction()
      if old_master:
        tx.delete(posixpath.join(self._cluster.master_group, old_master))

      # "Move" the ZNode, i.e., create a ZNode of the same ID in the master group.
      tx.delete(posixpath.join(self._cluster.slaves_group, member_id))
      tx.create(
          posixpath.join(self._cluster.master_group, member_id),
          self._cluster.members[member_id])

      # Kazoo reports per-operation failures as exception instances in the result list instead of
      # raising, so the results must be inspected before the cache is updated.
      results = tx.commit()
      failures = [r for r in (results or []) if isinstance(r, Exception)]
      if failures:
        raise self.Error("Failed to promote %s: %r" % (member_id, failures))

      # Only mirror the change locally once ZooKeeper has accepted it.
      if old_master:
        self._cluster.members.pop(old_master)
      self._cluster.master = member_id

      return True

  def delete_cluster(self):
    """
    Delete the cluster's ZNode and everything under it.

    :raises ClusterManager.Error: If the cluster still has members.
    """
    with self._lock:
      if self._cluster.members:
        raise self.Error("Cannot remove a cluster that is not empty")

      # Need to delete master/slave sub-dirs.
      self._client.delete(self._cluster.cluster_path, recursive=True)
207 | NOTE: Callbacks are executed synchronously in Kazoo's completion thread to ensure the delivery 208 | order of events. Blocking the callback method means no future callbacks will be invoked. 209 | """ 210 | self._client = client 211 | self._cluster = Cluster(cluster_path) 212 | self._self_content = ServiceInstance.pack(self_instance) if self_instance else None 213 | self._master = None 214 | self._master_content = None 215 | self._promotion_callback = promotion_callback or (lambda: True) 216 | self._demotion_callback = demotion_callback or (lambda: True) 217 | self._master_callback = master_callback or (lambda x: True) 218 | self._termination_callback = termination_callback or (lambda: True) 219 | 220 | self._children_watch = None # Set when the watcher detects that the master group exists. 221 | 222 | def start(self): 223 | """ 224 | Start the listener to watch the master group. 225 | 226 | NOTE: The listener only starts watching master after the base ZNode for the group is created. 227 | """ 228 | DataWatch(self._client, self._cluster.cluster_path, func=self._cluster_path_callback) 229 | DataWatch(self._client, self._cluster.master_group, func=self._master_group_callback) 230 | 231 | def _swap(self, master, master_content): 232 | i_was_master = self._self_content and self._master_content == self._self_content 233 | self._master, self._master_content = master, master_content 234 | i_am_master = self._self_content and self._master_content == self._self_content 235 | 236 | # Invoke callbacks accordingly. 237 | # NOTE: No callbacks are invoked if there is currently no master and 'self_instance' wasn't the 238 | # master. 
def resolve_master(
    cluster_url, master_callback=lambda x: True, termination_callback=lambda: True, zk_client=None):
  """
  Resolve the MySQL cluster master's endpoint from the given URL for this cluster.

  A ClusterListener is created and started for the cluster; results are delivered through the
  callbacks rather than returned.

  :param cluster_url: The ZooKeeper URL for this cluster.
  :param master_callback: A callback method with one argument: the ServiceInstance for the elected
                          master.
  :param termination_callback: A callback method with no argument. Invoked when the cluster
                               terminates.
  :param zk_client: Use a custom ZK client instead of Kazoo if specified.
  :raises ValueError: If 'cluster_url' cannot be parsed.
  """
  # NOTE: The default 'master_callback' must accept one argument because the listener invokes it
  # with the master's ServiceInstance.
  try:
    _, zk_servers, cluster_path = zookeeper.parse(cluster_url)
  except Exception as e:
    raise ValueError("Invalid cluster_url: %s" % e)

  # Only create (and start) a Kazoo client when the caller didn't supply one.
  if not zk_client:
    zk_client = KazooClient(zk_servers)
    zk_client.start()

  listener = ClusterListener(
      zk_client,
      cluster_path,
      None,
      master_callback=master_callback,
      termination_callback=termination_callback)
  listener.start()
def logged(func):
  """
  Decorator for methods that logs each call (method name and arguments) at debug level before
  delegating to the wrapped method.

  Positional arguments are paired with the method's declared parameter names (excluding the
  first one, i.e. 'self'); keyword arguments are logged under their own names and forwarded
  unchanged.

  :param func: The method to wrap.
  :return: The wrapping function, which returns whatever 'func' returns.
  """
  arg_names = inspect.getargspec(func).args[1:]

  @functools.wraps(func)
  def wrapped_func(self, *args, **kwargs):
    described = ['%s=%s' % (name, arg) for (name, arg) in zip(arg_names, args)]
    # Sort keyword arguments so the log line is deterministic.
    described.extend('%s=%s' % (name, kwargs[name]) for name in sorted(kwargs))
    log.debug('%s(%s)' % (func.__name__, ', '.join(described)))
    return func(self, *args, **kwargs)
  return wrapped_func
class HDFSFetcher(Fetcher):
  """
  A Fetcher that copies a URI from HDFS into a local directory, bounded by a timeout.

  NOTE: Specify custom config directory using the environment variable 'HADOOP_CONF_DIR'.
  """

  def __init__(self, timeout=Amount(5, Time.MINUTES)):
    """
    :param timeout: An Amount of Time that bounds a single fetch.
    :raises ValueError: If 'timeout' is not an Amount of Time.
    """
    if not isinstance(timeout, Amount) or not isinstance(timeout.unit(), Time):
      raise ValueError("'timeout' must be an Amount of Time")
    self._timeout = timeout

  def fetch(self, uri, directory):
    """
    Copy 'uri' from HDFS to the local 'directory'.

    :param uri: The HDFS URI to fetch.
    :param directory: The local destination passed to HDFSHelper.copy_to_local().
    :raises Fetcher.Error: If the HDFS copy fails or does not finish within the timeout.
    """
    log.info("Fetching %s from HDFS" % uri)

    if "JAVA_HOME" in os.environ:
      log.info("Using JAVA_HOME '%s' for HDFS commands" % os.environ["JAVA_HOME"])

    # The environment variable takes precedence over the module-level default.
    config = os.environ.get("HADOOP_CONF_DIR", HADOOP_CONF_DIR)
    h = HDFSHelper(config, heap_limit=Amount(256, Data.MB))
    try:
      f = lambda: h.copy_to_local(uri, directory)
      deadline(f, timeout=self._timeout, propagate=True, daemon=True)
    except HDFSHelper.InternalError as e:
      raise self.Error('Unable to fetch HDFS package: %s' % e)
    except Timeout as e:
      # Two placeholders for two arguments; with a single '%s' the formatting itself raised
      # TypeError and masked the timeout.
      raise self.Error("Failed to fetch package from HDFS within %s: %s" % (self._timeout, e))
from twitter.common.dirutil import safe_mkdir 4 | 5 | import pkg_resources 6 | 7 | 8 | def unpack_assets(output_dir, module, asset_path, execute=lambda x: None): 9 | """ 10 | Extract files from a module in a package into 'output_dir'. 11 | 12 | :param output_dir: The directory to copy the assets *root* to. 13 | :param module: The module in the package to find the assets in. e.g., mysos.executor. 14 | :param asset_path: The path of the asset relative to the module root to unpack. 15 | :param execute: A function that expects a single argument as the path to an asset file. It can 16 | be used to process this file. 17 | 18 | NOTE: If the specified 'asset_path' is a directory, its contents are copied but it itself is not 19 | recreated in the output directory. e.g., /files/bin/file.sh is copied to 20 | /bin/file.sh. An analogy is that its like `cp -R asset_path/* output_dir` and 21 | not like `cp -R asset_path output_dir`. 22 | """ 23 | _unpack_assets(output_dir, module, asset_path, execute, asset_path) 24 | 25 | 26 | def _unpack_assets(output_dir, module, asset_root, execute, current_path): 27 | """ 28 | The internal helper function for unpack_assets(...) recursion. 
class Fake(object):
  """
  A stand-in object whose every (otherwise undefined) method is a no-op that just records how it
  was called.

  Calls are stored in 'method_calls', keyed by method name, as a list of (args, kwargs) tuples in
  invocation order.
  """

  def __init__(self):
    self.method_calls = defaultdict(list)

  def __getattr__(self, attr):
    # Only consulted for attributes that don't exist, so 'method_calls' itself is unaffected.
    def record_call(*positional, **keyword):
      invocation = (positional, keyword)
      self.method_calls[attr].append(invocation)
    return record_call
def parse(url):
  """
  Parse ZooKeeper URL.

  :param url: The URL in the form of "zk://username:password@servers/path".
  :return: Tuple (credential, servers, path).
    credential: Credential for authentication with "digest" scheme. Optional and default to
                None.
    servers: Compatible with Kazoo's 'hosts' argument.
    path: Optional and default to '/'.
  :raises ValueError: If the URL does not start with "zk://".

  NOTE: This method doesn't validate the values in the returned tuple.
  """
  scheme = "zk://"
  if not url.startswith(scheme):
    raise ValueError("Expecting 'zk://' at the beginning of the URL")

  # Slice the scheme off by length. str.lstrip() treats its argument as a *set of characters*, so
  # it would also eat leading 'z', 'k', ':' and '/' characters of the host (e.g. "zk1.example").
  remainder = url[len(scheme):]

  # Everything up to the first '/' is the (credential@)servers part; the rest is the path.
  servers, _, path = remainder.partition('/')
  path = '/' + path

  credential = None
  if '@' in servers:
    credential, servers = servers.split('@', 1)

  return credential, servers, path
-------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | 3 | from twitter.common.lang import Interface 4 | 5 | 6 | class BackupInfo(object): 7 | def __init__(self, backup_file, cold_backup): 8 | self.backup_file = backup_file 9 | self.cold_backup = cold_backup 10 | 11 | 12 | class BackupStore(Interface): 13 | """ 14 | The storage for Mysos executor state backup (i.e. MySQL data, etc.). 15 | 16 | Thread-safety: The BackupStore implementation is not expected to be thread-safe and the caller 17 | should be responsible for it. 18 | """ 19 | 20 | class Error(Exception): pass 21 | class BackupNotFoundError(Error): pass 22 | 23 | @abstractmethod 24 | def restore(self): 25 | """ 26 | Restore the backup. 27 | 28 | :return: The BackupInfo object. None if no state is restored. 29 | """ 30 | pass 31 | 32 | 33 | class BackupStoreProvider(Interface): 34 | @abstractmethod 35 | def from_task(self, task, sandbox): 36 | """ 37 | Factory method that creates a BackupStore instance from 'task' (TaskInfo). 38 | 39 | :return: The BackupStore instance. 40 | """ 41 | pass 42 | 43 | 44 | class NoopBackupStore(BackupStore): 45 | """Used when no restore is requested.""" 46 | 47 | def restore(self): 48 | return None # Returning 'None' because no state is restored. 
49 | 50 | 51 | class NoopBackupStoreProvider(BackupStoreProvider): 52 | """Used when no restore is requested.""" 53 | 54 | def from_task(self, task, sandbox): 55 | return NoopBackupStore() 56 | -------------------------------------------------------------------------------- /mysos/executor/executor.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | from threading import Event 4 | import traceback 5 | 6 | from mysos.common.decorators import logged 7 | 8 | from .task_runner import TaskError 9 | 10 | from mesos.interface import Executor 11 | import mesos.interface.mesos_pb2 as mesos_pb2 12 | from twitter.common import log 13 | from twitter.common.concurrent import defer 14 | from twitter.common.quantity import Amount, Time 15 | 16 | 17 | class MysosExecutor(Executor): 18 | """ 19 | MysosExecutor is a fine-grained executor, i.e., one executor executes a single task. 20 | """ 21 | 22 | STOP_WAIT = Amount(5, Time.SECONDS) 23 | 24 | def __init__(self, runner_provider, sandbox): 25 | """ 26 | :param runner_provider: An implementation of TaskRunnerProvider. 27 | :param sandbox: The path to the sandbox where all files the executor reads/writes are located. 28 | """ 29 | self._runner_provider = runner_provider 30 | self._runner = None # A singleton task runner created by launchTask(). 31 | self._driver = None # Assigned in registered(). 32 | self._killed = False # True if the executor's singleton task is killed by the scheduler. 33 | self._sandbox = sandbox 34 | 35 | self._terminated = Event() # Set when the runner has terminated. 36 | 37 | # --- Mesos methods. --- 38 | @logged 39 | def registered(self, driver, executorInfo, frameworkInfo, slaveInfo): 40 | log.info('Registered with slave: %s' % slaveInfo) 41 | self._driver = driver # Cache the driver to kill later. 
42 | 43 | @logged 44 | def reregistered(self, driver, slaveInfo): 45 | log.info('Reregistered with slave: %s' % slaveInfo) 46 | 47 | @logged 48 | def disconnected(self, driver): 49 | log.info("ExecutorDriver disconnected from Mesos slave") 50 | 51 | @logged 52 | def launchTask(self, driver, task): 53 | if self._runner: 54 | log.error("Executor allows only one task") 55 | update = mesos_pb2.TaskStatus() 56 | update.state = mesos_pb2.TASK_FAILED 57 | driver.sendStatusUpdate(update) 58 | return 59 | 60 | # Create the runner here in the driver thread so subsequent task launches are rejected. 61 | try: 62 | self._runner = self._runner_provider.from_task(task, self._sandbox) 63 | except (TaskError, ValueError) as e: 64 | # TODO(jyx): These should really all be 'ValueError's from all providers because they are 65 | # simply factory methods. 66 | log.error("Failed to create TaskRunner: %s" % e.message) 67 | self._send_update(task.task_id.value, mesos_pb2.TASK_FAILED, e.message) 68 | self._kill() 69 | return 70 | 71 | # Run the task in a separate daemon thread. 72 | defer(lambda: self._run_task(task)) 73 | 74 | def _run_task(self, task): 75 | assert self._runner, "_runner should be created before this method is called" 76 | 77 | try: 78 | self._runner.start() 79 | log.info("Task runner for task %s started" % task.task_id) 80 | 81 | self._send_update(task.task_id.value, mesos_pb2.TASK_RUNNING) 82 | except TaskError as e: 83 | log.error("Task runner for task %s failed to start: %s" % (task.task_id, str(e))) 84 | # Send TASK_FAILED if the task failed to start. 85 | self._send_update(task.task_id.value, mesos_pb2.TASK_FAILED) 86 | except Exception as e: 87 | log.error("Error occurred while executing the task: %s" % e) 88 | log.error(traceback.format_exc()) 89 | # Send TASK_LOST for unknown errors. 90 | self._send_update(task.task_id.value, mesos_pb2.TASK_LOST) 91 | else: 92 | # Wait for the task's return code (when it terminates). 
93 | try: 94 | returncode = self._runner.join() 95 | # If '_runner' terminates, it has either failed or been killed. 96 | log.warn("Task process terminated with return code %s" % returncode) 97 | except TaskError as e: 98 | log.error("Task terminated: %s" % e) 99 | finally: 100 | if self._killed: 101 | self._send_update(task.task_id.value, mesos_pb2.TASK_KILLED) 102 | else: 103 | self._send_update(task.task_id.value, mesos_pb2.TASK_FAILED) 104 | self._terminated.set() 105 | finally: 106 | # No matter what happens above, when we reach here the executor has no task to run so it 107 | # should just commit seppuku. 108 | self._kill() 109 | 110 | @logged 111 | def frameworkMessage(self, driver, message): 112 | if not self._runner: 113 | log.info('Ignoring framework message because no task is running yet') 114 | return 115 | 116 | defer(lambda: self._framework_message(message)) 117 | 118 | def _framework_message(self, message): 119 | master_epoch = message # The log position request is for electing the master of this 'epoch'. 120 | try: 121 | position = self._runner.get_log_position() 122 | log.info('Obtained log position %s for epoch %s' % (position, master_epoch)) 123 | 124 | assert self._driver 125 | 126 | # TODO(jyx): Define the message in ProtoBuf or Thrift. 127 | self._driver.sendFrameworkMessage(json.dumps({ 128 | 'epoch': master_epoch, # Send the epoch back without parsing it. 129 | 'position': position 130 | })) 131 | except Exception as e: 132 | log.error("Committing suicide due to failure to process framework message: %s" % e) 133 | log.error(traceback.format_exc()) 134 | self._kill() 135 | 136 | @logged 137 | def killTask(self, driver, taskId): 138 | # Killing the task also kills the executor because there is one task per executor. 139 | log.info("Asked to kill task %s" % taskId.value) 140 | 141 | self._kill() 142 | 143 | def _kill(self): 144 | if self._runner: 145 | self._killed = True 146 | self._runner.stop() # It could be already stopped. 
If so, self._runner.stop() is a no-op. 147 | self._terminated.wait(sys.maxint) 148 | 149 | assert self._driver 150 | 151 | # TODO(jyx): Fix https://issues.apache.org/jira/browse/MESOS-243. 152 | defer(lambda: self._driver.stop(), delay=self.STOP_WAIT) 153 | 154 | @logged 155 | def shutdown(self, driver): 156 | log.info("Asked to shut down") 157 | self._kill() 158 | 159 | @logged 160 | def error(self, driver, message): 161 | log.error("Shutting down due to error: %s" % message) 162 | self._kill() 163 | 164 | def _send_update(self, task_id, state, message=None): 165 | update = mesos_pb2.TaskStatus() 166 | if not isinstance(state, int): 167 | raise TypeError('Invalid state type %s, should be int.' % type(state)) 168 | if state not in [ 169 | mesos_pb2.TASK_STARTING, 170 | mesos_pb2.TASK_RUNNING, 171 | mesos_pb2.TASK_FINISHED, 172 | mesos_pb2.TASK_KILLED, 173 | mesos_pb2.TASK_FAILED, 174 | mesos_pb2.TASK_LOST]: 175 | raise ValueError('Invalid state: %s' % state) 176 | update.state = state 177 | update.task_id.value = task_id 178 | if message: 179 | update.message = str(message) 180 | log.info('Updating %s => %s. Reason: %s' % (task_id, mesos_pb2.TaskState.Name(state), message)) 181 | self._driver.sendStatusUpdate(update) 182 | -------------------------------------------------------------------------------- /mysos/executor/files/bin/mysql/scripts/mysos_install_db.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # Install the MySQL server files before starting 'mysqld'. 4 | # 5 | 6 | set -uex 7 | 8 | framework_user=$1 9 | data_dir=$2 10 | conf_file=$3 11 | 12 | # Expecting mysqld to be under $mysql_basedir/bin/ 13 | mysql_basedir=$(dirname $(dirname $(which mysqld))) 14 | 15 | # Remove the datadir if any since we are creating a new instance. 16 | rm -rf $data_dir 17 | 18 | # Initialize the DB. 
19 | mysql_install_db \ 20 | --defaults-file=$conf_file \ 21 | --datadir=$data_dir \ 22 | --user=$framework_user \ 23 | --bind-address=0.0.0.0 \ 24 | --basedir=$mysql_basedir 25 | -------------------------------------------------------------------------------- /mysos/executor/files/bin/mysql/scripts/mysos_launch_mysqld.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -uex 4 | 5 | framework_user=$1 6 | host=$2 7 | port=$3 8 | server_id=$4 9 | data_dir=$5 10 | log_dir=$6 11 | tmp_dir=$7 12 | conf_file=$8 13 | buffer_pool_size=$9 14 | 15 | # Expecting mysqld to be under $mysql_basedir/bin/ 16 | mysql_basedir=$(dirname $(dirname $(which mysqld))) 17 | 18 | # Need a temp directory under /tmp because the sandbox path is too long for '--socket'. We also need 19 | # this path to be unique on the host. 20 | socket_tmp=`mktemp -d` 21 | 22 | # Start the server in read only mode. 23 | mysqld \ 24 | --defaults-file=$conf_file \ 25 | --user=$framework_user \ 26 | --port=$port \ 27 | --server-id=$server_id \ 28 | --socket=$socket_tmp/mysql.sock \ 29 | --pid-file=$log_dir/mysqld.pid \ 30 | --basedir=$mysql_basedir \ 31 | --datadir=$data_dir \ 32 | --tmpdir=$tmp_dir \ 33 | --innodb_data_home_dir=$data_dir \ 34 | --innodb_log_group_home_dir=$log_dir \ 35 | --log-error=$data_dir/mysql-error.log \ 36 | --log-bin=$log_dir/mysql-bin \ 37 | --relay-log=$log_dir/mysql-relay-bin \ 38 | --relay-log-index=$log_dir/mysql-relay-bin.index \ 39 | --innodb_buffer_pool_size=$buffer_pool_size \ 40 | --skip-grant-tables 41 | -------------------------------------------------------------------------------- /mysos/executor/files/bin/mysql/scripts/mysos_log_position.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # Get the log position of the MySQL slave. 4 | # 5 | 6 | set -uxe 7 | 8 | slave_host=$1 9 | slave_port=$2 10 | 11 | # Get the relay log file. 
12 | relay_master_log_file=`mysql -u root -P $slave_port -h $slave_host -e "show slave status\G;" | \ 13 | grep Relay_Master_Log_File | awk '{print $2}'` 14 | 15 | # Get the relay log position. 16 | exec_master_log_pos=`mysql -u root -P $slave_port -h $slave_host -e "show slave status\G;" | \ 17 | grep Exec_Master_Log_Pos | awk '{print $2}'` 18 | 19 | # The output can be empty if the slave has not set up replication. 20 | echo $relay_master_log_file,$exec_master_log_pos 21 | -------------------------------------------------------------------------------- /mysos/executor/files/bin/mysql/scripts/mysos_promote_master.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # Promote the MySQL slave to be a master. 4 | # 5 | 6 | set -ue # No -x due to passwords in the commands. 7 | 8 | host=$1 9 | port=$2 10 | user=$3 11 | password=$4 12 | admin_user=$5 13 | admin_passwd=$6 14 | 15 | # Stop and reset the slave. 16 | mysql -u root -P $port -h $host -e "STOP SLAVE; RESET SLAVE ALL; 17 | 18 | # Put the master in read-write mode. 19 | SET GLOBAL read_only = 0; UNLOCK TABLES; 20 | 21 | # This reloads the grant tables so the following account-management statements can work. 22 | FLUSH PRIVILEGES; 23 | 24 | # Grant all permissions to the admin user (and create it if not exists). 25 | GRANT ALL ON *.* to '$admin_user'@'%' IDENTIFIED BY '$admin_passwd' WITH GRANT OPTION; 26 | 27 | # Grant all permissions to the user (and create it if not exists). 28 | GRANT ALL ON *.* to '$user'@'%' IDENTIFIED BY '$password' WITH GRANT OPTION; 29 | 30 | # Drop anonymous user. 31 | DELETE FROM mysql.user WHERE user=''; FLUSH PRIVILEGES;" 32 | -------------------------------------------------------------------------------- /mysos/executor/files/bin/mysql/scripts/mysos_reparent.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # Reparent the slave to a new master. 
#!/bin/sh
#
# Wait for the mysqld instance to start up and be ready for traffic.
#
# Usage: mysos_wait_for_mysqld.sh <path> <port> <timeout>
#   path:    A file that must exist before the port is checked.
#   port:    The TCP port that must be in the listening state.
#   timeout: The number of one-second polls to wait before giving up.
#
# Exits 0 as soon as the file exists and the port is listening; exits 1 on timeout.
#

set -ue

path=$1
port=$2
timeout=$3

waited=1
while true
do
  if [ -f "$path" ]; then
    # Match the port only at the end of the local address column so that e.g. port 3306 does not
    # match 33060, and accept any number of listeners (IPv4 and IPv6 sockets show up as separate
    # lines).
    if netstat -lnt | awk '{print $4}' | grep -q "[:.]$port\$"; then
      exit 0
    fi
  fi
  sleep 1
  waited=`expr $waited + 1`
  if [ $waited -ge $timeout ]; then
    exit 1
  fi
done
import Queue
import json
import os
import signal
from subprocess import CalledProcessError
import sys
import threading

from mysos.common.cluster import ClusterListener, get_cluster_path
from mysos.common.zookeeper import parse

from .installer import PackageInstaller
from .state import StateManager
from .task_runner import TaskError, TaskRunner, TaskRunnerProvider
from .task_control import TaskControl

from kazoo.client import KazooClient
from twitter.common import log
from twitter.common.concurrent import defer
from twitter.common.zookeeper.serverset.endpoint import Endpoint, ServiceInstance


class MysosTaskRunnerProvider(TaskRunnerProvider):
  """
    Builds a MysosTaskRunner for a task from the task control, installer and backup store
    providers it is configured with.
  """

  def __init__(self, task_control_provider, installer_provider, backup_store_provider):
    self._task_control_provider = task_control_provider
    self._installer_provider = installer_provider
    self._backup_store_provider = backup_store_provider

  def from_task(self, task, sandbox):
    """
      Create a MysosTaskRunner from 'task'.

      'task.data' is a JSON document that must contain the 'cluster', 'host', 'port' and 'zk_url'
      keys.

      :return: The MysosTaskRunner, wired to an already started Kazoo client.
      :raise TaskError: If the task control or the installer cannot be created.
    """
    data = json.loads(task.data)
    cluster_name, host, port, zk_url = data['cluster'], data['host'], data['port'], data['zk_url']
    _, servers, path = parse(zk_url)
    self_instance = ServiceInstance(Endpoint(host, port))

    kazoo = KazooClient(servers)
    kazoo.start()

    try:
      task_control = self._task_control_provider.from_task(task, sandbox)
      installer = self._installer_provider.from_task(task, sandbox)
      backup_store = self._backup_store_provider.from_task(task, sandbox)
    except (TaskControl.Error, PackageInstaller.Error) as e:
      kazoo.stop()  # Kazoo needs to be cleaned up. See kazoo/issues/217.
      raise TaskError(e.message)
    except Exception:
      # Any other failure (e.g. a ValueError from a provider) propagates unchanged but must not
      # leak the started Kazoo client either.
      kazoo.stop()
      raise

    state_manager = StateManager(sandbox, backup_store)

    return MysosTaskRunner(
        self_instance,
        kazoo,
        get_cluster_path(path, cluster_name),
        installer,
        task_control,
        state_manager)


class MysosTaskRunner(TaskRunner):
  """
    A runner that manages the lifecycle of a MySQL task (through the provided 'task_control').

    The task is executed as a long-running process; its return code can be obtained using
    'join()'.

    Thread-safety:
      This class is accessed from the MysosExecutor thread (not the ExecutorDriver thread because
      MysosExecutor invokes operations asynchronously) and the ClusterListener thread and is
      thread-safe.

    TODO(jyx): Push the knowledge of the underlying subprocess down to the task control and stop
               the subprocess using the task control.
  """

  def __init__(self, self_instance, kazoo, cluster_root, installer, task_control, state_manager):
    """
      :param self_instance: The local ServiceInstance associated with this task runner.
      :param kazoo: Kazoo client, it should be started before being passed in.
      :param cluster_root: The ZooKeeper root path for *this cluster*.
      :param installer: The PackageInstaller for MySQL.
      :param task_control: The TaskControl that interacts with the task process.
      :param state_manager: The StateManager for managing the executor state.
    """
    self._installer = installer
    self._env = None  # The environment variables for the 'task_control' commands. Set by the
                      # installer.

    self._task_control = task_control
    self._state_manager = state_manager

    self._lock = threading.Lock()
    self._popen = None  # The singleton task process started by '_task_control'.

    self._started = False  # Indicates whether start() has already been called.
    self._stopping = False  # Indicates whether stop() has already been called.
    self._exited = threading.Event()  # Set when the task process has exited.
    self._result = Queue.Queue()  # The returncode returned by the task process or an exception.

    # Public events and queue.
    self.promoted = threading.Event()
    self.demoted = threading.Event()
    self.master = Queue.Queue()  # Set when a master change is detected.

    self._kazoo = kazoo
    self._listener = ClusterListener(
        kazoo,
        cluster_root,
        self_instance,
        promotion_callback=self._on_promote,
        demotion_callback=self._on_demote,
        master_callback=self._on_master_change)  # Listener started by start().

  # --- Public interface. ---
  def start(self):
    """
      Start the runner in a separate thread and wait for the task process to be forked.

      :raise TaskError: If the runner was already started, or if installation, state bootstrap or
                        the launch of the task subprocess fails.
    """
    with self._lock:
      if self._started:
        raise TaskError("Runner already started")
      self._started = True

      # Can potentially hold the lock for a long time but it's OK since the runner is not accessed
      # by multiple threads until after it's started; can be a noop as well, depending on the
      # installer implementation.
      try:
        # 1. Install the application.
        self._env = self._installer.install()
        log.info("Package installation completed. Resulting environment variables: %s" % self._env)

        # 2. Restore/initialize the application state.
        self._state_manager.bootstrap(self._task_control, self._env)
        log.info("Executor state fully bootstrapped")

        # 3. Start the task subprocess.
        # Store the process so we can kill it if necessary.
        self._popen = self._task_control.start(env=self._env)
        log.info("Task started in subprocess %s" % self._popen.pid)
        defer(self._wait)

        # 4. Start monitoring.
        # Only start listening to ZK events after the task subprocess has been successfully started.
        self._listener.start()
      except (PackageInstaller.Error, StateManager.Error, CalledProcessError) as e:
        raise TaskError("Failed to start MySQL task: %s" % e)

  def _wait(self):
    # Block until the subprocess exits and delivers the return code.
    self._result.put(self._popen.wait())

    # Notify stop() if it is waiting.
    self._exited.set()

  def stop(self, timeout=10):
    """
      Stop the runner (at most once) and always stop the Kazoo client afterwards.

      :param timeout: Passed to '_stop()': how long to wait for the process after each signal.
      :return: The result of '_stop()', or False if stop() has already been called.
    """
    with self._lock:
      # stop() could be called by multiple threads. Locking so we only stop the runner once.
      if self._stopping:
        log.warn("The runner is already stopping/stopped")
        return False
      else:
        log.info("Stopping runner")
        self._stopping = True

    try:
      return self._stop(timeout)
    finally:
      self._kazoo.stop()
      log.info("Runner cleaned up")

  def _stop(self, timeout):
    """
      Stop the runner and wait for its thread (and the sub-processes) to exit.

      :param timeout: The timeout that the process should die before a hard SIGKILL is issued
                      (SIGTERM is used initially).
      :return: True if an active runner is stopped, False if the runner is not started or already
               stopping/stopped.
      :raise TaskError: If the process group is still alive 'timeout' seconds after SIGKILL.
    """
    with self._lock:
      if not self._started:
        log.warn("Cannot stop the runner because it's not started")
        return False

      if not self._popen:
        log.info("The runner task did not start successfully so no need to kill it")
        return False

      try:
        log.info("Terminating process group: %s" % self._popen.pid)
        os.killpg(self._popen.pid, signal.SIGTERM)
      except OSError as e:
        log.info("The sub-processes are already terminated: %s" % e)
        return False

    log.info("Waiting for process to terminate due to SIGTERM")

    # Escalate to SIGKILL if SIGTERM is not sufficient.
    if not self._exited.wait(timeout=timeout):
      with self._lock:
        try:
          log.warn("Killing process group %s which failed to terminate cleanly within %s secs" %
                   (self._popen.pid, timeout))
          os.killpg(self._popen.pid, signal.SIGKILL)
        except OSError as e:
          log.info("The sub-processes are already terminated: %s" % e)
          return False
    else:
      return True

    log.info("Waiting for process to terminate due to SIGKILL")
    if not self._exited.wait(timeout=timeout):
      raise TaskError("Failed to kill process group %s" % self._popen.pid)

    return True

  def get_log_position(self):
    """
      Get the log position of the MySQL slave.

      :return: Whatever the task control's get_log_position() returns for this task.
      :raise TaskError: If the underlying task control command exits with a non-zero code.
    """
    try:
      log_position = self._task_control.get_log_position(env=self._env)
      return log_position
    except CalledProcessError as e:
      raise TaskError("Unable to get the slave's log position: %s" % e)

  def join(self):
    """
      Wait for the runner to terminate.

      :return: The return code of the subprocess. NOTE: A negative value -N indicates that the
               child was terminated by signal N (on Unix).

      :exception: The TaskError exception due to an error in task control operations.
    """
    # Using 'sys.maxint' makes this forever wait interruptible.
    result = self._result.get(True, sys.maxint)
    if isinstance(result, Exception):
      raise result
    else:
      return result

  # --- ClusterListener handlers. ---
  def _on_promote(self):
    self.promoted.set()
    if not self._exited.is_set():
      defer(self._promote)

  def _promote(self):
    # A failed promotion is delivered to join() as a TaskError and shuts the runner down.
    try:
      self._task_control.promote(env=self._env)
    except CalledProcessError as e:
      self._result.put(TaskError("Failed to promote the slave: %s" % e))
      self.stop()

  def _on_demote(self):
    """
      Executor shuts itself down when demoted.
    """
    self.demoted.set()

    # Stop the runner asynchronously.
    if not self._exited.is_set():
      log.info("Shutting down runner because it is demoted.")
      # Call stop() asynchronously because this callback is invoked from the Kazoo thread which we
      # don't want to block.
      defer(self.stop)

  def _on_master_change(self, master):
    self.master.put(master)
    if not self._exited.is_set():
      defer(lambda: self._reparent(master))

  def _reparent(self, master):
    # A failed reparent is delivered to join() as a TaskError and shuts the runner down.
    try:
      self._task_control.reparent(
          master.service_endpoint.host,
          master.service_endpoint.port,
          env=self._env)
    except CalledProcessError as e:
      self._result.put(TaskError("Failed to reparent the slave: %s" % e))
      self.stop()
class MySQLTaskControlProvider(TaskControlProvider):
  """
    The default implementation of MySQLTaskControlProvider.
    There exist other implementations for testing purposes.
  """

  def from_task(self, task, sandbox):
    """
      Create a MySQLTaskControl from 'task'.

      The buffer pool size is derived from the task's 'mem' resource (interpreted as MB) scaled by
      MEM_FRACTION_FOR_BUFFER_POOL; all other settings are read from the JSON in 'task.data'.

      :raise TaskControl.Error: If the task has no (or a zero) 'mem' resource.
    """
    data = json.loads(task.data)
    task_mem = None
    for resource in task.resources:
      if resource.name == 'mem':
        task_mem = resource.scalar.value
        break

    # Raise explicitly rather than 'assert': assertions are stripped under 'python -O'.
    if not task_mem:
      raise TaskControl.Error("Task resources should always include 'mem'")

    buffer_pool_size = int(
        Amount(int(task_mem), Data.MB).as_(Data.BYTES) * MEM_FRACTION_FOR_BUFFER_POOL)
    log.info("Allocating %s bytes of memory to MySQL buffer pool" % buffer_pool_size)

    # TODO(jyx): Use an ephemeral sandbox for now. Will change when Mesos adds persistent resources
    # support: MESOS-1554.
    return MySQLTaskControl(
        sandbox,
        data['framework_user'],
        data['host'],
        data['port'],
        data['cluster'],
        data['cluster_user'],
        data['cluster_password'],
        data['server_id'],
        data['admin_keypath'],
        buffer_pool_size)


class MySQLTaskControl(TaskControl):
  """
    TaskControl implementation that drives mysqld through the shell scripts under
    '<sandbox.bin>/mysql/scripts'.

    NOTE(review): All commands are composed by string interpolation and run with 'shell=True';
    the passwords are passed as command line arguments. Inputs are assumed to be trusted.
  """

  def __init__(
      self,
      sandbox,
      framework_user,
      host,
      port,
      cluster_name,
      cluster_user,
      password,
      server_id,
      admin_keypath,
      buffer_pool_size):
    """
      :param sandbox: The sandbox where all files of this Mysos executor instance reside.
      :param framework_user: The Unix user this framework runs as.
      :param host: The hostname of the host that runs the MySQL instance.
      :param port: The port of the MySQL instance.
      :param cluster_name: The name of the cluster.
      :param cluster_user: The Unix account that mysqld will run as and also the MySQL username.
      :param password: The MySQL password associated with 'cluster_user' in MySQL.
      :param server_id: The ID that identifies the MySQL instance.
      :param admin_keypath: Path to a YAML file with the 'username' and 'password' keys of the
                            admin account.
      :param buffer_pool_size: For the 'innodb_buffer_pool_size' variable in MySQL options.

      :raise TypeError: If 'sandbox' is not a Sandbox.
      :raise ValueError: If 'buffer_pool_size' is not an int or the admin key file cannot be read
                         or parsed.
      :raise TaskControl.Error: If the scripts directory or the option file does not exist.
    """
    if not isinstance(sandbox, Sandbox):
      raise TypeError("'sandbox' should be an instance of Sandbox")
    self._sandbox = sandbox

    self._framework_user = framework_user
    self._host = host
    self._port = port
    self._cluster_name = cluster_name
    self._cluster_user = cluster_user
    self._password = password
    self._server_id = server_id

    if not isinstance(buffer_pool_size, int):
      raise ValueError("'buffer_pool_size' should be an instance of int")

    self._buffer_pool_size = buffer_pool_size

    try:
      with open(admin_keypath, "r") as f:
        # The key file only holds plain scalars, so the safe loader is sufficient and avoids
        # constructing arbitrary objects from the file.
        cred = yaml.safe_load(f)
      self._admin_username = cred["username"]
      self._admin_password = cred["password"]
      log.info("Loaded credentials for admin account %s" % self._admin_username)
    except IOError as e:
      raise ValueError("Unable to obtain admin credentials: %s" % e)
    except (KeyError, TypeError, yaml.YAMLError) as e:
      # TypeError: the file is empty or its top level is not a mapping.
      raise ValueError("Invalid key file format %s" % e)

    self._lock = threading.Lock()
    self._process = None  # The singleton task process that launches mysqld.

    self._scripts_dir = os.path.join(self._sandbox.bin, "mysql", "scripts")
    if not os.path.isdir(self._scripts_dir):
      raise TaskControl.Error("Scripts directory %s does not exist" % self._scripts_dir)

    custom_conf_file = os.environ.get('MYSOS_DEFAULTS_FILE', None)
    if custom_conf_file:
      log.info("Using 'MYSOS_DEFAULTS_FILE': %s" % custom_conf_file)
      self._conf_file = custom_conf_file.strip()
    else:
      self._conf_file = os.path.join(self._sandbox.bin, "mysql", "conf", "my.cnf")

    if not os.path.isfile(self._conf_file):
      raise TaskControl.Error("Option file %s does not exist" % self._conf_file)

  @synchronized
  def initialize(self, env):
    """Run 'mysos_install_db.sh' to initialize a new DB instance in the sandbox data dir."""
    command = "%(cmd)s %(framework_user)s %(data_dir)s %(conf_file)s" % dict(
        cmd=os.path.join(self._scripts_dir, "mysos_install_db.sh"),
        framework_user=self._framework_user,
        data_dir=self._sandbox.mysql_data_dir,
        conf_file=self._conf_file)
    log.info("Executing command: %s" % command)
    subprocess.check_call(command, shell=True, env=env)

  @synchronized
  def start(self, env=None):
    """
      Launch mysqld in its own process group and wait until it is ready.

      :return: The subprocess.Popen of the launch script, or None if a task subprocess already
               exists.
      :raise subprocess.CalledProcessError: If the wait script exits with a non-zero code.
    """
    if self._process:
      log.warn("start() called when a running task subprocess already exists")
      return

    command = (
        "%(cmd)s %(framework_user)s %(host)s %(port)s %(server_id)s %(data_dir)s %(log_dir)s "
        "%(tmp_dir)s %(conf_file)s %(buffer_pool_size)s" % dict(
            cmd=os.path.join(self._scripts_dir, "mysos_launch_mysqld.sh"),
            framework_user=self._framework_user,
            host=self._host,
            port=self._port,
            server_id=self._server_id,
            data_dir=self._sandbox.mysql_data_dir,
            log_dir=self._sandbox.mysql_log_dir,
            tmp_dir=self._sandbox.mysql_tmp_dir,
            conf_file=self._conf_file,
            buffer_pool_size=self._buffer_pool_size))
    log.info("Executing command: %s" % command)
    # 'os.setpgrp' makes the child the leader of a new process group.
    self._process = subprocess.Popen(command, shell=True, env=env, preexec_fn=os.setpgrp)

    # There is a delay before mysqld becomes available to accept requests. Wait for it.
    command = "%(cmd)s %(pid_file)s %(port)s %(timeout)s" % dict(
        cmd=os.path.join(self._scripts_dir, "mysos_wait_for_mysqld.sh"),
        pid_file=os.path.join(self._sandbox.mysql_log_dir, "mysqld.pid"),
        port=self._port,
        timeout=60)
    log.info("Executing command: %s" % command)
    subprocess.check_call(command, shell=True, env=env)

    return self._process

  @synchronized
  def reparent(self, master_host, master_port, env=None):
    """Run 'mysos_reparent.sh' to point this instance at the given master."""
    command = ("%(cmd)s %(master_host)s %(master_port)s %(slave_host)s %(slave_port)s "
               "%(admin_user)s %(admin_password)s")
    params = dict(
        cmd=os.path.join(self._scripts_dir, "mysos_reparent.sh"),
        master_host=master_host,
        master_port=master_port,
        slave_host=self._host,
        slave_port=self._port,
        admin_user=self._admin_username,
        admin_password=self._admin_password)

    # The password is blanked out in the logged command line.
    log.info("Executing command: %s" % (command % dict(params, admin_password="")))
    subprocess.check_call(command % params, shell=True, env=env)

  @synchronized
  def promote(self, env=None):
    """Run 'mysos_promote_master.sh' for this instance."""
    command = ("%(cmd)s %(host)s %(port)s %(cluster_user)s %(password)s %(admin_user)s "
               "%(admin_password)s")
    params = dict(
        cmd=os.path.join(self._scripts_dir, "mysos_promote_master.sh"),
        host=self._host,
        port=self._port,
        cluster_user=self._cluster_user,
        password=self._password,
        admin_user=self._admin_username,
        admin_password=self._admin_password)

    # Both passwords are blanked out in the logged command line.
    log.info("Executing command: %s" % (
        command % dict(params, password="", admin_password="")))
    subprocess.check_call(command % params, shell=True, env=env)

  @synchronized
  def get_log_position(self, env=None):
    """
      Run 'mysos_log_position.sh' and parse its '<log_file>,<log_position>' output.

      :return: A (log_file, log_position) tuple of strings, or None if the output does not consist
               of exactly two comma-separated fields.
    """
    command = '%(cmd)s %(host)s %(port)s' % dict(
        cmd=os.path.join(self._scripts_dir, "mysos_log_position.sh"),
        host=self._host,
        port=self._port)

    log.info("Executing command: %s" % command)
    output = subprocess.check_output(command, shell=True, env=env).strip()

    fields = output.split(',')
    if len(fields) != 2:
      return None

    log_file, log_position = fields  # log_file may be empty.
    log.info('Obtained log position: %s ' % str((log_file, log_position)))
    return log_file, log_position
class Sandbox(object):
  """
    Represents the structure of the Mysos executor sandbox:

    sandbox_root/               # Sandbox for this Mysos instance.
      bin/                      # Binaries, executables.
        mysql/scripts/          # Mysos scripts (come with the executor).
          mysos_install_db.sh
          ...
      lib/                      # Libraries installed by the installer.
      var/                      # For Mysos (and MySQL) to save its state ('datadir', 'tmpdir', etc).
        mysql/                  # The sandbox maintains and exposes some standard mysql directories.
          data/
          tmp/
          logs/
  """

  def __init__(self, root):
    """
      Initializes the sandbox.

      :param root: Root path of the sandbox.
      :raise ValueError: If 'root' is not an absolute path.

      The sandbox makes sure that the folder paths it exposes as properties are created.
    """
    if not isabs(root):
      raise ValueError("Only an absolute path is allowed for 'root'")

    self._root = root

    safe_mkdir(self.bin)
    safe_mkdir(self.lib)
    safe_mkdir(self.var)
    safe_mkdir(self.mysql_var)
    safe_mkdir(self.mysql_data_dir)
    safe_mkdir(self.mysql_tmp_dir)
    safe_mkdir(self.mysql_log_dir)

  @property
  def root(self):
    """Root path of the sandbox."""
    return self._root

  @property
  def bin(self):
    """For Mysos binaries."""
    return join(self._root, "bin")

  @property
  def lib(self):
    """For libraries that the Mysos executor relies on."""
    return join(self._root, "lib")

  @property
  def var(self):
    """For the Mysos executor's application data."""
    return join(self._root, "var")

  @property
  def mysql_var(self):
    """For mysql related data files."""
    return join(self.var, 'mysql')

  @property
  def mysql_data_dir(self):
    """The 'data' directory under 'mysql_var'."""
    return join(self.mysql_var, 'data')

  @property
  def mysql_tmp_dir(self):
    """The 'tmp' directory under 'mysql_var'."""
    return join(self.mysql_var, 'tmp')

  @property
  def mysql_log_dir(self):
    """The 'logs' directory under 'mysql_var'."""
    return join(self.mysql_var, 'logs')

  def __str__(self):
    return self._root
import os
from subprocess import call


HADOOP_CONF_DIR = '/etc/hadoop/conf'


def encrypt(key_file):
  """
    Encrypt the data from stdin and write output to stdout.

    :param key_file: The key file used to encrypt the stream.
    :return: The command string.
    :raise ValueError: If 'key_file' is not an existing file.
  """
  if not os.path.isfile(key_file):
    raise ValueError("Cannot find key_file: %s" % key_file)

  return "openssl aes-256-cbc -salt -pass file:%s" % key_file


def decrypt(key_file):
  """
    Decrypt the data from stdin and write output to stdout.

    :param key_file: The key file used to decrypt the stream.
    :return: The command string.
    :raise ValueError: If 'key_file' is not an existing file.
  """
  if not os.path.isfile(key_file):
    raise ValueError("Cannot find key_file: %s" % key_file)

  return "openssl aes-256-cbc -d -pass file:%s" % key_file


def compress(extension):
  """
    Compress the data from stdin and write output to stdout.

    :param extension: The compression format identified by the file extension. Allowed values are:
                      'gz' for gzip (pigz is preferred when it is in $PATH), 'bz' or 'bz2' for
                      bzip, 'lzo' for lzop.
    :return: The command string.
    :raise ValueError: If 'extension' is not one of the allowed values.
  """
  if extension == "gz":
    cmd = "pigz" if exists("pigz") else "gzip"
  elif extension == "bz" or extension == "bz2":
    cmd = "bzip2"
  elif extension == 'lzo':
    cmd = "lzop"
  else:
    raise ValueError("Unknown compression format/file extension")

  return cmd


def decompress(extension):
  """
    Decompress the data from stdin and write output to stdout.

    :param extension: The compression format identified by the file extension. Allowed values are:
                      'gz' for gzip (pigz is preferred when it is in $PATH), 'bz' or 'bz2' for
                      bzip, 'lzo' for lzop.
    :return: The command string.
    :raise ValueError: If 'extension' is not one of the allowed values.
  """
  if extension == "gz":
    cmd = "pigz -d" if exists("pigz") else "gzip -d"
  elif extension == "bz" or extension == "bz2":
    cmd = "bzip2 -d"
  elif extension == 'lzo':
    cmd = "lzop -d"
  else:
    raise ValueError("Unknown compression format/file extension")

  return cmd


def hdfs_cat(uri, conf=HADOOP_CONF_DIR):
  """
    Fetch the data from the specified uri and write output to stdout.

    :param uri: The HDFS URI.
    :param conf: The hadoop config directory.
    :return: The command string.
  """
  return "hadoop --config %s dfs -cat %s" % (conf, uri)


def pv(size):
  """
    Monitor the progress of data through a pipe. If 'pv' is not available, simply 'cat' it.

    :param size: The size of the data, to calculate percentage.
    :return: The command string.
  """
  if exists('pv'):
    return "pv --wait --size %s" % size
  else:
    return "cat"


def untar(directory):
  """
    Untar the data from stdin into the specified directory.

    :param directory: The directory to write files to.
    :return: The command string.
  """
  return "tar -C %s -x" % directory


def tar(path):
  """
    Tar the path and write output to stdout.

    :param path: All contents under path are 'tar'ed.
    :return: The command string.
    :raise ValueError: If 'path' does not exist.
  """
  if not os.path.exists(path):
    raise ValueError("Invalid argument: 'path' doesn't exist")

  path = path.rstrip(os.sep)
  parent, base = os.path.split(path)
  # tar requires an operation mode; '-c' (create) is the counterpart of the '-x' used by untar().
  return "tar -C %s -c %s" % (parent, base)


def exists(cmd):
  """Return true if 'cmd' exists in $PATH."""
  with open(os.devnull, "w") as f:
    return call(['which', cmd], stdout=f) == 0  # No stdout.
import os
import json
import subprocess

from .backup import BackupStore

from twitter.common import log


BACKUP_INFO_FILENAME = "backup_info.json"


class StateManager(object):
  """
    Restores or initializes the executor's MySQL state inside the sandbox.

    TODO(jyx): Will also periodically back up the state.
  """

  class Error(Exception): pass

  def __init__(self, sandbox, backup_store):
    """
      :param sandbox: The sandbox.
      :param backup_store: The BackupStore implementation.
    """
    self._sandbox = sandbox
    self._backup_store = backup_store

  def bootstrap(self, task_control, env):
    """
      Bootstraps the executor state.

      :param task_control: Task control for carrying out state bootstrapping commands.
      :param env: The environment variables for task_control methods.
      :raise StateManager.Error: If restoring the backup or initializing a new instance fails.
    """
    # A non-empty data folder short-circuits everything else.
    if os.listdir(self._sandbox.mysql_data_dir):
      # TODO(jyx): This will be expected when we use persistent volumes. Validate the state in that
      # case.
      log.warn("MySQL state already exists unexpectedly. Finishing bootstrap without restoration "
               "or initialization")
      return

    # With a clean data folder, try the backup store first. It yields a falsy result when there
    # is nothing to restore.
    restored = self._restore()
    if restored:
      self._persist_backup_info(restored)
      return

    self._initialize(task_control, env)

  def _restore(self):
    # Ask the backup store to restore the state, translating its errors into StateManager.Error.
    try:
      return self._backup_store.restore()
    except BackupStore.Error as e:
      raise self.Error("Failed to restore MySQL state: %s" % e)

  def _persist_backup_info(self, backup_info):
    # Write the restored backup's attributes to a JSON file under 'var' so the user can check the
    # result of the restore.
    log.info("Finished restoring the backup")
    target = os.path.join(self._sandbox.var, BACKUP_INFO_FILENAME)
    attributes = backup_info.__dict__
    with open(target, 'w') as out:
      json.dump(attributes, out)
    log.info("Persisted backup info '%s' to file %s" % (attributes, target))

  def _initialize(self, task_control, env):
    # Nothing was restored: have the task control create a brand new DB instance.
    log.info("No MySQL backup is restored. Initializing a new MySQL instance")
    try:
      task_control.initialize(env)
    except subprocess.CalledProcessError as e:
      raise self.Error("Unable to initialize MySQL state: %s" % e)
41 | :return: A subprocess.Popen object that represents the leader of the process group that executes 42 | the task. 43 | """ 44 | pass 45 | 46 | @abstractmethod 47 | def reparent(self, master_host, master_port, env=None): 48 | """ 49 | Reparent the MySQL slave to the new master. 50 | 51 | :param env: The 'env' necessary for 'subprocess' to run the command. 52 | """ 53 | pass 54 | 55 | @abstractmethod 56 | def promote(self, env=None): 57 | """ 58 | Promote a slave to mastership. 59 | 60 | :param env: The 'env' necessary for 'subprocess' to run the command. 61 | """ 62 | pass 63 | 64 | @abstractmethod 65 | def get_log_position(self, env=None): 66 | """ 67 | Retrieve the log position from mysqld. 68 | 69 | :param env: The 'env' necessary for 'subprocess' to run the command. 70 | :return: The log position, None if it cannot be obtained. 71 | """ 72 | pass 73 | -------------------------------------------------------------------------------- /mysos/executor/task_runner.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | 3 | from twitter.common.lang import Interface 4 | 5 | 6 | class TaskError(Exception): 7 | pass 8 | 9 | 10 | class TaskRunner(Interface): 11 | pass 12 | 13 | 14 | class TaskRunnerProvider(Interface): 15 | @abstractmethod 16 | def from_task(self, task, sandbox): 17 | """ 18 | Factory method that creates a TaskRunner instance from 'task' (TaskInfo). 
"""This module is for all fake implementations of things in Mysos executor for testing."""

import os
import random
import subprocess
import threading

from mysos.common.decorators import synchronized
from mysos.executor.task_control import TaskControl, TaskControlProvider


class FakeTaskControlProvider(TaskControlProvider):
  """
    An implementation of TaskControlProvider for testing.
  """

  def from_task(self, task, sandbox):
    """Return a FakeTaskControl that reports a random log position between 0 and 100."""
    return FakeTaskControl(position=random.randint(0, 100))


class FakeTaskControl(TaskControl):
  """
    A TaskControl whose operations run configurable (by default no-op) shell commands.
  """

  def __init__(
      self,
      mysqld="tail -f /dev/null",
      initialize_cmd=":",
      start_cmd=":",
      reparent_cmd=":",
      promote_cmd=":",
      get_log_position_cmd=":",
      position=1):
    """
      :param mysqld: The command that 'simulates' mysqld (and does nothing).
      :param *_cmd: The commands that are executed for the respective FakeTaskControl operations.
      :param position: The 'mysqld' log position to return as the result of get_log_position().
    """
    self._lock = threading.Lock()
    self._mysqld = mysqld
    self._initialize_cmd = initialize_cmd
    # Store the command strings themselves: a stray trailing comma here would wrap each command
    # in a 1-tuple.
    self._start_cmd = start_cmd
    self._reparent_cmd = reparent_cmd
    self._promote_cmd = promote_cmd
    self._get_log_position_cmd = get_log_position_cmd
    self._position = position
    self._process = None  # The fake 'mysqld' subprocess, set by start().

  @synchronized
  def initialize(self, env):
    subprocess.check_call(self._initialize_cmd, shell=True)

  @synchronized
  def start(self, env=None):
    """
      Launch the fake 'mysqld' in its own process group, then run the start command.

      :return: The subprocess.Popen of the fake 'mysqld', or None if it was already started.
    """
    if self._process:
      return

    self._process = subprocess.Popen(self._mysqld, shell=True, preexec_fn=os.setpgrp)
    subprocess.check_call(self._start_cmd, shell=True)
    return self._process

  @synchronized
  def reparent(self, master_host, master_port, env=None):
    subprocess.check_call(self._reparent_cmd, shell=True)

  @synchronized
  def promote(self, env=None):
    subprocess.check_call(self._promote_cmd, shell=True)

  @synchronized
  def get_log_position(self, env=None):
    """Run the configured command and return the preset 'position'."""
    subprocess.check_call(self._get_log_position_cmd, shell=True)
    return self._position
import Endpoint, ServiceInstance 19 | from zake.fake_client import FakeClient 20 | 21 | 22 | SANDBOX_ROOT = os.path.join(os.path.realpath('.'), "sandbox") 23 | 24 | 25 | class FakeTaskRunnerProvider(TaskRunnerProvider): 26 | """ 27 | Creates a MysosTaskRunner with FakeTaskControl for testing purposes. 28 | NOTE: zake is used so a running ZK server is not required. 29 | """ 30 | 31 | def __init__(self, task_control_provider): 32 | self._task_control_provider = task_control_provider 33 | 34 | def from_task(self, task, sandbox): 35 | data = json.loads(task.data) 36 | cluster_name, port, zk_url = data['cluster'], data['port'], data['zk_url'] 37 | 38 | _, servers, path = zookeeper.parse(zk_url) 39 | 40 | zk_client = FakeClient() 41 | zk_client.start() 42 | self_instance = ServiceInstance(Endpoint(socket.gethostbyname(socket.gethostname()), port)) 43 | task_control = self._task_control_provider.from_task(task, sandbox) 44 | 45 | return MysosTaskRunner( 46 | self_instance, 47 | zk_client, 48 | posixpath.join(path, cluster_name), 49 | NoopPackageInstaller(), 50 | task_control, 51 | Fake()) 52 | 53 | 54 | # This is a testing executor. We log more verbosely. 55 | LogOptions.disable_disk_logging() 56 | LogOptions.set_stderr_log_level('google:DEBUG') 57 | 58 | 59 | def proxy_main(): 60 | def main(args, options): 61 | log.info('Starting testing mysos executor') 62 | 63 | executor = MysosExecutor( 64 | FakeTaskRunnerProvider(FakeTaskControlProvider()), Sandbox(SANDBOX_ROOT)) 65 | 66 | driver = mesos.native.MesosExecutorDriver(executor) 67 | driver.run() 68 | 69 | log.info('Exiting executor main') 70 | 71 | app.main() 72 | -------------------------------------------------------------------------------- /mysos/executor/testing/vagrant_mysos_executor.py: -------------------------------------------------------------------------------- 1 | """This 'testing' executor is built to be run in the vagrant VM. 
2 | 3 | It is basically the same as the normal Mysos executor except that it doesn't rely on HDFS. 4 | """ 5 | 6 | import os 7 | import stat 8 | 9 | from mysos.common.pkgutil import unpack_assets 10 | from mysos.executor.mysql_task_control import MySQLTaskControlProvider 11 | from mysos.executor.executor import MysosExecutor 12 | from mysos.executor.mysos_task_runner import MysosTaskRunnerProvider 13 | from mysos.executor.noop_installer import NoopPackageInstallerProvider 14 | from mysos.executor.sandbox import Sandbox 15 | from mysos.executor.backup import NoopBackupStoreProvider 16 | 17 | from twitter.common import app, log 18 | from twitter.common.log.options import LogOptions 19 | 20 | import mesos.native 21 | 22 | 23 | MYSOS_MODULE = 'mysos.executor' 24 | ASSET_RELPATH = 'files' 25 | 26 | 27 | def chmod_scripts(path): 28 | """Make scripts executable.""" 29 | if path.endswith('.sh'): 30 | st = os.stat(path) 31 | os.chmod(path, st.st_mode | stat.S_IEXEC) 32 | 33 | 34 | LogOptions.disable_disk_logging() 35 | LogOptions.set_stderr_log_level('google:INFO') 36 | 37 | 38 | def proxy_main(): 39 | def main(args, options): 40 | # 'sandbox' directory resides under the working directory assigned by the Mesos slave. 41 | sandbox_root = os.path.join(os.path.realpath('.'), "sandbox") 42 | 43 | unpack_assets(sandbox_root, MYSOS_MODULE, ASSET_RELPATH, execute=chmod_scripts) 44 | 45 | log.info("Starting Vagrant Mysos executor within sandbox %s" % sandbox_root) 46 | 47 | sandbox = Sandbox(sandbox_root) 48 | executor = MysosExecutor( 49 | MysosTaskRunnerProvider( 50 | MySQLTaskControlProvider(), 51 | NoopPackageInstallerProvider(), # Do not install any package. 52 | NoopBackupStoreProvider()), # Do not recover any state. 
53 | sandbox) 54 | driver = mesos.native.MesosExecutorDriver(executor) 55 | driver.run() 56 | 57 | log.info('Exiting executor main') 58 | 59 | app.main() 60 | -------------------------------------------------------------------------------- /mysos/scheduler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/incubator-retired-cotton/4aa9bb0acdd8c609686b5d370ef4b61a520364ef/mysos/scheduler/__init__.py -------------------------------------------------------------------------------- /mysos/scheduler/assets/templates/clusters.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Mysos Clusters 5 | 6 | 7 | 8 | 17 | 18 | 19 | 40 | 41 |
42 |

Clusters

43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | %for cluster in clusters: 54 | 55 | 56 | 57 | 58 | 59 | %endfor 60 | 61 |
nameusernum nodes
${cluster.name}${cluster.user}${cluster.num_nodes}
62 |
63 | 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /mysos/scheduler/elector.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from datetime import datetime, timedelta 3 | import threading 4 | 5 | import mesos.interface.mesos_pb2 as mesos_pb2 6 | from twitter.common import log 7 | from twitter.common.exceptions import ExceptionalThread 8 | from twitter.common.quantity import Amount, Time 9 | 10 | 11 | class MySQLMasterElector(ExceptionalThread): 12 | """ 13 | Elect the most current slave in the cluster to be the new master. 14 | 15 | The current election algorithm: 16 | The elector queries slaves periodically (to tolerate message loss) until all slaves have 17 | responded with their current positions and the slave with the highest (most current) 18 | position is elected the new master. If the election timeout has passed and not all slaves have 19 | responded, the master is elected from the ones that have. 20 | 21 | Thread-safety: 22 | The elector runs as a separate thread and it periodically queries the slaves until the election 23 | completes or is aborted. Its public methods are thread-safe. 24 | 25 | Usage: 26 | After constructing the elector, add all currently known candidates of the election through 27 | 'elector.add_slave()' before calling 'elector.start()' to make sure it waits for all of them. 28 | 29 | NOTE: An elector is not reusable. To restart the election, create another MySQLMasterElector 30 | instance. 31 | """ 32 | 33 | def __init__( 34 | self, 35 | driver, 36 | cluster_name, 37 | epoch, 38 | master_callback, 39 | election_timeout, 40 | query_interval=Amount(1, Time.SECONDS)): 41 | """ 42 | :param driver: The SchedulerDriver for querying the slaves. 43 | :param cluster_name: The name of the MySQL cluster. 44 | :param epoch: The master epoch that identifies this election.
45 | :param master_callback: The callback function with one argument: the 'task_id' of the elected 46 | master which could be None if no one is electable. 47 | :param election_timeout: The amount of time the elector waits for all slaves to respond. If 48 | not all slaves have responded within the timeout, then the master is 49 | elected from the ones who have. 50 | :param query_interval: The timeout before the elector re-sends queries for positions. 51 | 52 | :type epoch: int 53 | :type query_interval: Amount 54 | :type election_timeout: Amount 55 | :type master_callback: function 56 | """ 57 | super(MySQLMasterElector, self).__init__() 58 | 59 | if not isinstance(epoch, int): 60 | raise TypeError("'epoch' should be an int") 61 | if not isinstance(query_interval, Amount) or not isinstance(query_interval.unit(), Time): 62 | raise ValueError("'query_interval' must be an Amount of Time") 63 | if not isinstance(election_timeout, Amount) or not isinstance(election_timeout.unit(), Time): 64 | raise ValueError("'election_timeout' must be an Amount of Time") 65 | if not hasattr(master_callback, '__call__'): 66 | raise TypeError("master_callback must be a function") 67 | 68 | self._query_interval = query_interval.as_(Time.SECONDS) 69 | 70 | self._election_deadline = ( 71 | datetime.utcnow() + timedelta(seconds=election_timeout.as_(Time.SECONDS))) 72 | 73 | self._driver = driver 74 | self._cluster_name = cluster_name # For logging. 75 | self._epoch = epoch 76 | self._master_callback = master_callback 77 | 78 | self._positions = OrderedDict() # Slave {Task ID: Position} mappings. Use OrderedDict so we can 79 | # easily locate the first added slave. 80 | self._mesos_slaves = {} # Slave {Task ID: Mesos slave ID} mappings. 81 | self._master = None # Elected master (its TaskID); initially None and can still be None after 82 | # the election has timed out and there are no slaves to elect from.
83 | 84 | self._lock = threading.Lock() 85 | self._aborted = threading.Event() # Elector thread aborted (don't invoke callback). 86 | self._completed = threading.Event() # Election process completed (invoke callback). 87 | 88 | def add_slave(self, task_id, mesos_slave_id): 89 | """ 90 | Add a new MySQL slave (Task ID, Mesos Slave ID) which could affect the ongoing election. 91 | """ 92 | with self._lock: 93 | if self._completed.is_set(): 94 | log.debug("Ignoring addition of slave %s because the election has completed" % task_id) 95 | return 96 | 97 | log.info('Adding slave %s to the election for cluster %s' % ( 98 | str((task_id, mesos_slave_id)), 99 | self._cluster_name)) 100 | 101 | self._positions[task_id] = None 102 | self._mesos_slaves[task_id] = mesos_slave_id 103 | 104 | def remove_slave(self, task_id): 105 | """ 106 | Remove a slave from the election process. 107 | """ 108 | with self._lock: 109 | if self._completed.is_set(): 110 | log.debug("Ignoring removal of slave %s because the election has completed" % task_id) 111 | return 112 | 113 | log.info('Removing slave %s from election for cluster %s' % ( 114 | str((task_id, self._mesos_slaves[task_id])), self._cluster_name)) 115 | 116 | assert task_id in self._positions 117 | assert task_id in self._mesos_slaves 118 | del self._positions[task_id] 119 | del self._mesos_slaves[task_id] 120 | 121 | def update_position(self, epoch, task_id, position): 122 | """ 123 | Called by the launcher upon receiving the executor's response to position query. 124 | 125 | :type epoch: int 126 | :param epoch: The master epoch this position is for. 127 | :param position: The position of the slave's log (not necessarily numeric). The elector 128 | doesn't care about its exact format as long as it is comparable. 
129 | """ 130 | if not isinstance(epoch, int): 131 | raise TypeError("'epoch' should be an int") 132 | 133 | with self._lock: 134 | if self._completed.is_set(): 135 | log.debug("Ignoring position %s from slave %s because the election has completed" % ( 136 | position, task_id)) 137 | return 138 | 139 | if epoch != self._epoch: 140 | log.info( 141 | "Ignoring position %s from slave %s due to epoch mismatch: (expected: %s, actual: %s)" % 142 | (position, task_id, self._epoch, epoch)) 143 | return 144 | 145 | if task_id not in self._mesos_slaves: 146 | log.warn("Ignoring unsolicited position response %s from MySQL slave %s" % ( 147 | str(position), task_id)) 148 | return 149 | 150 | log.info('Updating position to %s (epoch=%s) for slave %s of cluster %s' % ( 151 | str(position), 152 | epoch, 153 | str((task_id, self._mesos_slaves[task_id])), 154 | self._cluster_name)) 155 | 156 | self._positions[task_id] = position 157 | 158 | def run(self): 159 | # Re-run the election in a loop periodically until a master can be elected or the elector is 160 | # aborted. 161 | while not self._aborted.is_set() and not self._completed.wait(self._query_interval): 162 | if datetime.utcnow() < self._election_deadline: 163 | self._elect(timedout=False) 164 | else: 165 | log.info("Timed out waiting for all slaves to respond. Now elect from existing responses") 166 | self._elect(timedout=True) 167 | if not self._completed.is_set(): 168 | log.warn("No slave is electable after timeout") 169 | 170 | if self._aborted.is_set(): # If asked to stop, directly return without triggering the callback. 171 | log.info("Asked to stop the elector thread for cluster %s. Stopping..." % self._cluster_name) 172 | return 173 | 174 | self._master_callback(self._master) # Invoke the callback from the elector thread. 
175 | log.info( 176 | "Stopping the elector thread for cluster %s (epoch %s) because the election has completed" % 177 | (self._cluster_name, self._epoch)) 178 | 179 | def _elect(self, timedout=False): 180 | """ 181 | Try to elect the master from MySQL slaves. 182 | 183 | Elect the slave with the highest position if all slaves have responded, otherwise re-query the 184 | remaining slaves for log positions (unless 'timedout' is True). 185 | 186 | :param timedout: If True, just elect the slave with the highest position from the ones who 187 | have responded. 188 | 189 | NOTE: If 'timedout' is True and no slaves have responded (but there are running slaves in 190 | the cluster), theoretically we can still randomly elect one from them but currently we don't 191 | as this would suggest some major issues with the MySQL or Mesos cluster and it's dangerous to 192 | do so. 193 | """ 194 | with self._lock: 195 | # Special-casing the first epoch because this is when the cluster first starts and every 196 | # slave should be the same: just pick the first slave that comes up. 197 | if self._epoch == 0 and len(self._positions) > 0: 198 | self._master = next(iter(self._positions)) 199 | self._completed.set() 200 | return 201 | 202 | if timedout or (self._positions and _all(self._positions.values())): 203 | # Pick the slave with the highest position (value of the key-value pair). If all items have 204 | # the same value, an arbitrary (but deterministic) one is chosen. 205 | if _any(self._positions.values()): # Need at least one position. 206 | master_task, _ = max(self._positions.items(), key=lambda kv: kv[1]) 207 | log.info('Elected master %s for cluster %s' % (master_task, self._cluster_name)) 208 | self._master = master_task 209 | self._completed.set() 210 | else: 211 | # (Re)query the remaining slaves for log positions. 
212 | for task_id in self._positions: 213 | if not self._positions[task_id]: 214 | self._query_slave(task_id) 215 | 216 | def _query_slave(self, task_id): 217 | assert task_id in self._mesos_slaves 218 | 219 | log.info('Querying MySQL slave %s for its log position (epoch=%s)' % ( 220 | str((task_id, self._mesos_slaves[task_id])), self._epoch)) 221 | 222 | # Because the elector re-sends messages, it's necessary to use the epoch to differentiate 223 | # responses for each election. 224 | self._driver.sendFrameworkMessage( 225 | mesos_pb2.ExecutorID(value=task_id), 226 | mesos_pb2.SlaveID(value=self._mesos_slaves[task_id]), 227 | str(self._epoch)) # Send the slave the epoch so it can be included in the response. 228 | 229 | def abort(self): 230 | """Stop the elector thread.""" 231 | self._aborted.set() 232 | 233 | 234 | def _all(iterable): 235 | """Same as built-in version of all() except it explicitly checks if an element "is None".""" 236 | for element in iterable: 237 | if element is None: 238 | return False 239 | return True 240 | 241 | 242 | def _any(iterable): 243 | """Same as built-in version of any() except it explicitly checks if an element "is not None".""" 244 | for element in iterable: 245 | if element is not None: 246 | return True 247 | return False 248 | -------------------------------------------------------------------------------- /mysos/scheduler/http.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from .launcher import MySQLClusterLauncher 5 | from .scheduler import MysosScheduler 6 | 7 | import bottle 8 | from mako.template import Template 9 | from twitter.common.http import HttpServer, route, static_file 10 | 11 | 12 | class MysosServer(HttpServer): 13 | def __init__(self, scheduler, asset_dir, metric_sampler): 14 | super(MysosServer, self).__init__() 15 | self._scheduler = scheduler 16 | self._asset_dir = asset_dir 17 | 18 | self._static_dir = 
os.path.join(self._asset_dir, 'static') 19 | self._template_dir = os.path.join(self._asset_dir, 'templates') 20 | 21 | self._clusters_template = Template(filename=os.path.join(self._template_dir, 'clusters.html')) 22 | 23 | self._metric_sampler = metric_sampler 24 | 25 | @route('/clusters/', method=['POST']) 26 | def create(self, clustername): 27 | """Create a db cluster.""" 28 | cluster_name = clustername # For naming consistency. 29 | num_nodes = bottle.request.forms.get('num_nodes', default=1) 30 | cluster_user = bottle.request.forms.get('cluster_user', default=None) 31 | backup_id = bottle.request.forms.get('backup_id', default=None) 32 | size = bottle.request.forms.get('size', default=None) 33 | cluster_password = bottle.request.forms.get('cluster_password', default=None) 34 | 35 | try: 36 | cluster_zk_url, cluster_password = self._scheduler.create_cluster( 37 | cluster_name, 38 | cluster_user, 39 | num_nodes, 40 | size, 41 | backup_id=backup_id, 42 | cluster_password=cluster_password) 43 | return json.dumps(dict(cluster_url=cluster_zk_url, cluster_password=cluster_password)) 44 | except MysosScheduler.ClusterExists as e: 45 | raise bottle.HTTPResponse(e.message, status=409) 46 | except MysosScheduler.InvalidUser as e: 47 | raise bottle.HTTPResponse(e.message, status=400) 48 | except MysosScheduler.ServiceUnavailable as e: 49 | raise bottle.HTTPResponse(e.message, status=503) 50 | except ValueError as e: 51 | raise bottle.HTTPResponse(e.message, status=400) 52 | 53 | @route('/clusters/', method=['DELETE']) 54 | def remove(self, clustername): 55 | """Remove a db cluster.""" 56 | cluster_name = clustername # For naming consistency. 
57 | 58 | password = bottle.request.forms.get('password', default=None) 59 | 60 | try: 61 | cluster_zk_url = self._scheduler.delete_cluster(cluster_name, password) 62 | return json.dumps(dict(cluster_url=cluster_zk_url)) 63 | except MysosScheduler.ClusterNotFound as e: 64 | raise bottle.HTTPResponse(e.message, status=404) 65 | except MySQLClusterLauncher.PermissionError as e: 66 | raise bottle.HTTPResponse(e.message, status=403) 67 | 68 | @route('/', method=['GET']) 69 | def clusters(self): 70 | """Landing page, showing the list of managed clusters.""" 71 | if not self._scheduler.connected.is_set(): 72 | return "

Mysos scheduler is still connecting...

" 73 | 74 | return self._clusters_template.render(clusters=self._scheduler.clusters) 75 | 76 | @route('/static/', method=['GET']) 77 | def serve_static(self, filepath): 78 | return static_file(filepath, root=self._static_dir) 79 | 80 | @route("/vars.json") 81 | def serve_vars_json(self, var=None, value=None): 82 | return self._metric_sampler.sample() 83 | -------------------------------------------------------------------------------- /mysos/scheduler/mysos_scheduler.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | from mysos.common import pkgutil, zookeeper 5 | 6 | from .http import MysosServer 7 | from .scheduler import MysosScheduler 8 | from .state import LocalStateProvider, Scheduler, StateProvider 9 | from .zk_state import ZooKeeperStateProvider 10 | 11 | from kazoo.client import KazooClient 12 | 13 | import mesos.interface 14 | from mesos.interface.mesos_pb2 import Credential, FrameworkInfo 15 | import mesos.native 16 | from twitter.common import app, log 17 | from twitter.common.exceptions import ExceptionalThread 18 | from twitter.common.http import HttpServer 19 | from twitter.common.log.options import LogOptions 20 | from twitter.common.metrics import MetricSampler, RootMetrics 21 | from twitter.common.quantity import Time 22 | from twitter.common.quantity.parse_simple import InvalidTime, parse_time 23 | import yaml 24 | 25 | 26 | FRAMEWORK_NAME = 'mysos' 27 | MYSOS_MODULE = 'mysos.scheduler' 28 | ASSET_RELPATH = 'assets' 29 | 30 | 31 | LogOptions.disable_disk_logging() 32 | LogOptions.set_stderr_log_level('google:INFO') 33 | 34 | 35 | def proxy_main(): 36 | app.add_option( 37 | '--port', 38 | dest='api_port', 39 | type='int', 40 | default=None, 41 | help='Port for the HTTP API server') 42 | 43 | app.add_option( 44 | '--mesos_master', 45 | dest='mesos_master', 46 | default=None, 47 | help='Mesos master address. 
It can be a ZooKeeper URL through which the master can be ' 48 | 'detected') 49 | 50 | app.add_option( 51 | '--framework_user', 52 | dest='framework_user', 53 | help='The Unix user that Mysos executor runs as') 54 | 55 | app.add_option( 56 | '--framework_role', 57 | dest='framework_role', 58 | default='*', 59 | help="The role that Mysos framework runs as. If set, Mysos only uses Mesos pool resources " 60 | "with that role. The default value '*' is what Mesos considers as the default role.\n" 61 | "NOTE: Mesos master needs to be configured to allow the specified role. See its --roles " 62 | "flag") 63 | 64 | app.add_option( 65 | '--executor_uri', 66 | dest='executor_uri', 67 | default=None, 68 | help='URI for the Mysos executor package') 69 | 70 | app.add_option( 71 | '--executor_cmd', 72 | dest='executor_cmd', 73 | default=None, 74 | help='Command to execute the executor package') 75 | 76 | app.add_option( 77 | '--executor_environ', 78 | dest='executor_environ', 79 | default=None, 80 | help="Environment variables for the executors (and the tasks) as a list of dicts keyed by " 81 | "{name, value} in JSON. Note that these variables don't affect Mesos slave components " 82 | "such as the fetcher") 83 | 84 | app.add_option( 85 | '--zk_url', 86 | dest='zk_url', 87 | default=None, 88 | help='ZooKeeper URL for various Mysos operations, in the form of ' 89 | '"zk://username:password@servers/path". The sub-directory /discover is used for ' 90 | 'communicating MySQL cluster information between Mysos scheduler and executors') 91 | 92 | # TODO(jyx): This could also be made a per-cluster configuration. 93 | app.add_option( 94 | '--election_timeout', 95 | dest='election_timeout', 96 | default='60s', 97 | help='The amount of time the scheduler waits for all slaves to respond during a MySQL master ' 98 | 'election, e.g., 60s. 
After the timeout the master is elected from only the slaves that ' 99 | 'have responded') 100 | 101 | app.add_option( 102 | '--admin_keypath', 103 | dest='admin_keypath', 104 | default=None, 105 | help='The path to the key file with MySQL admin credentials on Mesos slaves') 106 | 107 | app.add_option( 108 | '--work_dir', 109 | dest='work_dir', 110 | default=os.path.join(tempfile.gettempdir(), 'mysos'), 111 | help="Directory path to place Mysos work directories, e.g., web assets, state files if " 112 | "--state_storage=local. Default to a system temp directory.") 113 | 114 | app.add_option( 115 | '--state_storage', 116 | dest='state_storage', 117 | default='zk', 118 | help="Mechanism to persist scheduler state. Available options are 'zk' and 'local'. If 'zk' " 119 | "is chosen, the scheduler state is stored under /state; see --zk_url. Otherwise " 120 | "'local' is chosen and the state is persisted under /state; see --work_dir") 121 | 122 | app.add_option( 123 | '--scheduler_keypath', 124 | dest='scheduler_keypath', 125 | help="Path to the key file that the scheduler uses to store secrets such as MySQL " 126 | "cluster passwords. This key must be exactly 32 bytes long") 127 | 128 | app.add_option( 129 | '--framework_failover_timeout', 130 | dest='framework_failover_timeout', 131 | default='14d', 132 | help='Time after which Mysos framework is considered deleted. This implies losing all tasks. ' 133 | 'SHOULD BE VERY HIGH') 134 | 135 | # TODO(jyx): Flags like this are generally optional but specific executor implementations may 136 | # require them. Consider adding validators that can be plugged in so configuration errors can be 137 | # caught in the scheduler. 138 | app.add_option( 139 | '--installer_args', 140 | dest='installer_args', 141 | default=None, 142 | help='Arguments for MySQL installer directly passed along to and parsed by the installer. 
' 143 | 'e.g., a serialized JSON string') 144 | 145 | app.add_option( 146 | '--backup_store_args', 147 | dest='backup_store_args', 148 | default=None, 149 | help="Arguments for the store for MySQL backups. Its use and format are defined by the " 150 | "backup store implementation. e.g., It can be a serialized JSON string") 151 | 152 | app.add_option( 153 | '--framework_authentication_file', 154 | dest='framework_authentication_file', 155 | default=None, 156 | help="Path to the key file for authenticating the framework against Mesos master. Framework " 157 | "will fail to register with Mesos if authentication is required by Mesos and this " 158 | "option is not provided") 159 | 160 | app.add_option( 161 | '--executor_source_prefix', 162 | dest='executor_source_prefix', 163 | default=None, 164 | help="Mysos uses the 'source' field in ExecutorInfo (See Mesos documentation) to group tasks " 165 | "to support metrics tracking by external utilities. The format of ExecutorInfo.source " 166 | "is '..'. This flag specifies the prefix to use in the " 167 | "'source' field. e.g., it can be '.'. 
There is no " 168 | "preceding period if is empty") 169 | 170 | app.add_option( 171 | '--verbose', 172 | dest='verbose', 173 | default=None, 174 | help="Turn on verbose logging") 175 | 176 | def main(args, options): 177 | log.info("Options in use: %s", options) 178 | 179 | if not options.api_port: 180 | app.error('Must specify --port') 181 | 182 | if not options.mesos_master: 183 | app.error('Must specify --mesos_master') 184 | 185 | if not options.framework_user: 186 | app.error('Must specify --framework_user') 187 | 188 | if not options.executor_uri: 189 | app.error('Must specify --executor_uri') 190 | 191 | if not options.executor_cmd: 192 | app.error('Must specify --executor_cmd') 193 | 194 | if not options.zk_url: 195 | app.error('Must specify --zk_url') 196 | 197 | if not options.admin_keypath: 198 | app.error('Must specify --admin_keypath') 199 | 200 | if not options.scheduler_keypath: 201 | app.error('Must specify --scheduler_keypath') 202 | 203 | if options.verbose: 204 | LogOptions.set_stderr_log_level('google:DEBUG') 205 | 206 | try: 207 | election_timeout = parse_time(options.election_timeout) 208 | framework_failover_timeout = parse_time(options.framework_failover_timeout) 209 | except InvalidTime as e: 210 | app.error(e.message) 211 | 212 | try: 213 | _, zk_servers, zk_root = zookeeper.parse(options.zk_url) 214 | except Exception as e: 215 | app.error("Invalid --zk_url: %s" % e.message) 216 | 217 | web_assets_dir = os.path.join(options.work_dir, "web") 218 | pkgutil.unpack_assets(web_assets_dir, MYSOS_MODULE, ASSET_RELPATH) 219 | log.info("Extracted web assets into %s" % options.work_dir) 220 | 221 | fw_principal = None 222 | fw_secret = None 223 | if options.framework_authentication_file: 224 | try: 225 | with open(options.framework_authentication_file, "r") as f: 226 | cred = yaml.load(f) 227 | fw_principal = cred["principal"] 228 | fw_secret = cred["secret"] 229 | log.info("Loaded credential (principal=%s) for framework authentication" % 
fw_principal) 230 | except IOError as e: 231 | app.error("Unable to read the framework authentication key file: %s" % e) 232 | except (KeyError, yaml.YAMLError) as e: 233 | app.error("Invalid framework authentication key file format %s" % e) 234 | 235 | scheduler_key = None 236 | try: 237 | with open(options.scheduler_keypath, 'rb') as f: 238 | scheduler_key = f.read().strip() 239 | if not scheduler_key: 240 | raise ValueError("The key file is empty") 241 | except Exception as e: 242 | app.error("Cannot read --scheduler_keypath: %s" % e) 243 | 244 | log.info("Starting Mysos scheduler") 245 | 246 | kazoo = KazooClient(zk_servers) 247 | kazoo.start() 248 | 249 | if options.state_storage == 'zk': 250 | log.info("Using ZooKeeper (path: %s) for state storage" % zk_root) 251 | state_provider = ZooKeeperStateProvider(kazoo, zk_root) 252 | else: 253 | log.info("Using local disk for state storage") 254 | state_provider = LocalStateProvider(options.work_dir) 255 | 256 | try: 257 | state = state_provider.load_scheduler_state() 258 | except StateProvider.Error as e: 259 | app.error(e.message) 260 | 261 | if state: 262 | log.info("Successfully restored scheduler state") 263 | framework_info = state.framework_info 264 | if framework_info.HasField('id'): 265 | log.info("Recovered scheduler's FrameworkID is %s" % framework_info.id.value) 266 | else: 267 | log.info("No scheduler state to restore") 268 | framework_info = FrameworkInfo( 269 | user=options.framework_user, 270 | name=FRAMEWORK_NAME, 271 | checkpoint=True, 272 | failover_timeout=framework_failover_timeout.as_(Time.SECONDS), 273 | role=options.framework_role) 274 | if fw_principal: 275 | framework_info.principal = fw_principal 276 | state = Scheduler(framework_info) 277 | state_provider.dump_scheduler_state(state) 278 | 279 | scheduler = MysosScheduler( 280 | state, 281 | state_provider, 282 | options.framework_user, 283 | options.executor_uri, 284 | options.executor_cmd, 285 | kazoo, 286 | options.zk_url, 287 | 
election_timeout, 288 | options.admin_keypath, 289 | scheduler_key, 290 | installer_args=options.installer_args, 291 | backup_store_args=options.backup_store_args, 292 | executor_environ=options.executor_environ, 293 | executor_source_prefix=options.executor_source_prefix, 294 | framework_role=options.framework_role) 295 | 296 | RootMetrics().register_observable('scheduler', scheduler) 297 | 298 | if fw_principal and fw_secret: 299 | cred = Credential(principal=fw_principal, secret=fw_secret) 300 | scheduler_driver = mesos.native.MesosSchedulerDriver( 301 | scheduler, 302 | framework_info, 303 | options.mesos_master, 304 | cred) 305 | else: 306 | scheduler_driver = mesos.native.MesosSchedulerDriver( 307 | scheduler, 308 | framework_info, 309 | options.mesos_master) 310 | 311 | scheduler_driver.start() 312 | 313 | metric_sampler = MetricSampler(RootMetrics()) 314 | metric_sampler.start() 315 | 316 | server = HttpServer() 317 | server.mount_routes(MysosServer(scheduler, web_assets_dir, metric_sampler)) 318 | 319 | et = ExceptionalThread( 320 | target=server.run, args=('0.0.0.0', options.api_port, 'cherrypy')) 321 | et.daemon = True 322 | et.start() 323 | 324 | try: 325 | # Wait for the scheduler to stop. 326 | # The use of 'stopped' event instead of scheduler_driver.join() is necessary to stop the 327 | # process with SIGINT. 328 | while not scheduler.stopped.wait(timeout=0.5): 329 | pass 330 | except KeyboardInterrupt: 331 | log.info('Interrupted, exiting.') 332 | else: 333 | log.info('Scheduler exited.') 334 | 335 | app.shutdown(1) # Mysos scheduler is supposed to be long-running thus the use of exit status 1. 
336 | 337 | app.main() 338 | -------------------------------------------------------------------------------- /mysos/scheduler/password.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | 4 | import nacl.exceptions 5 | import nacl.secret 6 | import nacl.utils 7 | 8 | 9 | class PasswordBox(object): 10 | """ 11 | Implements password encryption using PyNaCl. 12 | """ 13 | 14 | class Error(Exception): pass 15 | 16 | def __init__(self, key): 17 | self._secret_box = nacl.secret.SecretBox(key) 18 | 19 | def encrypt(self, plaintext): 20 | try: 21 | return self._secret_box.encrypt( 22 | plaintext, nacl.utils.random(nacl.secret.SecretBox.NONCE_SIZE)) 23 | except nacl.exceptions.CryptoError as e: 24 | raise self.Error("Failed to encrypt the password: %s" % e) 25 | 26 | def decrypt(self, encrypted): 27 | try: 28 | return self._secret_box.decrypt(encrypted) 29 | except nacl.exceptions.CryptoError as e: 30 | raise self.Error("Failed to decrypt the password: %s" % e) 31 | 32 | def match(self, plaintext, encrypted): 33 | return plaintext == self._secret_box.decrypt(encrypted) 34 | 35 | 36 | def gen_password(): 37 | """Return a randomly-generated password of 21 characters.""" 38 | return ''.join(random.choice( 39 | string.ascii_uppercase + 40 | string.ascii_lowercase + 41 | string.digits) for _ in range(21)) 42 | 43 | 44 | def gen_encryption_key(): 45 | """Return a randomly-generated encryption key of 32 characters.""" 46 | return nacl.utils.random(nacl.secret.SecretBox.KEY_SIZE) 47 | -------------------------------------------------------------------------------- /mysos/scheduler/state.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | import cPickle 3 | from cPickle import PickleError 4 | import os 5 | 6 | from twitter.common import log 7 | from twitter.common.collections.orderedset import OrderedSet 8 | from twitter.common.dirutil 
class StateProvider(object):
  """
  Interface for checkpointing and restoring the state of the Mysos scheduler.

  State is addressed through the following key hierarchy:
    /state/scheduler   # Scheduler-level state.
    /state/clusters/   # Folder for all cluster-level states.
      cluster1         # State for 'cluster1'.
      cluster2         # State for 'cluster2'.
      ...

  NOTE(review): the methods below are decorated with @abstractmethod but no ABCMeta metaclass is
  declared on this class, so presumably nothing forces subclasses to override them -- confirm.
  """

  class Error(Exception): pass

  # --- Key helpers shared by the implementations. ---
  @classmethod
  def _get_scheduler_state_key(cls):
    """Key (as a list of path components) of the scheduler-level state."""
    return ['state', 'scheduler']

  @classmethod
  def _get_cluster_state_key(cls, cluster_name):
    """Key (as a list of path components) of the state of cluster 'cluster_name'."""
    return ['state', 'clusters'] + [cluster_name]

  # --- Scheduler-level state. ---
  @abstractmethod
  def dump_scheduler_state(self, state):
    """Persist scheduler-level state."""

  @abstractmethod
  def load_scheduler_state(self):
    """
    Restore scheduler-level state.

    :return: The Scheduler object. None if no state is available.
    """

  # --- Cluster-level state. ---
  @abstractmethod
  def dump_cluster_state(self, state):
    """Persist cluster-level state."""

  @abstractmethod
  def load_cluster_state(self, cluster_name):
    """
    Restore cluster-level state.

    :return: The MySQLCluster object. None if no state is available.
    """

  @abstractmethod
  def remove_cluster_state(self, cluster_name):
    """Remove cluster-level state."""


class Scheduler(object):
  """
  Scheduler-level state.

  NOTE: Clusters are referenced indirectly, by name; their state is stored separately.
  """

  def __init__(self, framework_info):
    self.framework_info = framework_info
    # Names of the clusters this scheduler manages. An OrderedSet is used so that the cluster
    # creation order is preserved.
    self.clusters = OrderedSet()
class MySQLCluster(object):
  """
  The state of a MySQL cluster, including the tasks (MySQLTask) of its members.
  """

  def __init__(self, name, user, encrypted_password, num_nodes, cpus, mem, disk, backup_id=None):
    """
    :raises TypeError: If 'num_nodes' is not an int.
    """
    if not isinstance(num_nodes, int):
      raise TypeError("'num_nodes' should be an int")

    # Settings the cluster was created with.
    self.name = name
    self.user = user
    self.encrypted_password = encrypted_password
    self.num_nodes = num_nodes
    self.cpus = cpus
    self.mem = mem
    self.disk = disk
    self.backup_id = backup_id

    # {TaskID : MemberID} mappings. MemberIDs are assigned by ZooKeeper. A task must be running and
    # published to ZK before it becomes a member.
    self.members = {}
    # MemberID of the MySQL master.
    self.master_id = None
    # {TaskID : MySQLTask} mappings.
    self.tasks = {}
    # Monotonically increasing number after each master change.
    self.next_epoch = 0
    # Monotonically increasing number for unique task IDs.
    self.next_id = 0

  @property
  def active_tasks(self):
    """Tasks that have been launched and have not terminated."""
    launched = []
    for task in self.tasks.values():
      if task.state in (
          mesos_pb2.TASK_STAGING, mesos_pb2.TASK_STARTING, mesos_pb2.TASK_RUNNING):
        launched.append(task)
    return launched

  @property
  def running_tasks(self):
    """Tasks whose state is TASK_RUNNING."""
    return [task for task in self.tasks.values() if task.state == mesos_pb2.TASK_RUNNING]


class MySQLTask(object):
  """The state of a MySQL task."""

  def __init__(self, cluster_name, task_id, mesos_slave_id, hostname, port):
    """
    :raises TypeError: If 'port' is not an int.
    """
    if not isinstance(port, int):
      raise TypeError("'port' should be an int")

    # Name of the owning cluster, so we can refer back to the cluster the task belongs to.
    self.cluster_name = cluster_name
    self.task_id = task_id
    self.mesos_slave_id = mesos_slave_id
    self.hostname = hostname
    self.port = port
    # Initial state. Will be updated by statusUpdate().
    self.state = mesos_pb2.TASK_STAGING
class LocalStateProvider(StateProvider):
  """
  StateProvider implementation that uses local disk to store the (pickled) state.

  NOTE(review): state files are unpickled on load; this assumes only the scheduler itself can
  write under 'work_dir' -- confirm for the deployment.
  """

  # Unpickling truncated or corrupted data can raise more than PickleError (see the 'pickle'
  # module documentation); all of these are reported as StateProvider.Error.
  _LOAD_ERRORS = (PickleError, AttributeError, EOFError, ImportError, IndexError, ValueError)

  def __init__(self, work_dir):
    """
    :param work_dir: The root directory under which the scheduler state is stored. e.g. The path
                     for 'cluster1' is WORK_DIR/state/clusters/cluster1.
    """
    self._work_dir = work_dir

  def dump_scheduler_state(self, state):
    """
    Persist scheduler-level state.

    :raises TypeError: If 'state' is not a Scheduler.
    :raises StateProvider.Error: If 'state' cannot be pickled.
    """
    if not isinstance(state, Scheduler):
      raise TypeError("'state' should be an instance of Scheduler")

    path = self._get_scheduler_state_path()
    safe_mkdir(os.path.dirname(path))

    try:
      with open(path, 'wb') as f:
        cPickle.dump(state, f)
    except PickleError as e:
      raise self.Error('Failed to persist Scheduler: %s' % e)

  def load_scheduler_state(self):
    """
    Restore scheduler-level state.

    :return: The Scheduler object. None if no state is available.
    :raises StateProvider.Error: If the file cannot be unpickled or does not hold a Scheduler.
    """
    path = self._get_scheduler_state_path()
    if not os.path.isfile(path):
      log.info("No scheduler state found on path %s" % path)
      return None

    try:
      with open(path, 'rb') as f:
        state = cPickle.load(f)
    except self._LOAD_ERRORS as e:
      raise self.Error('Failed to recover Scheduler: %s' % e)

    # Same sanity check as the ZooKeeper-backed provider performs.
    if not isinstance(state, Scheduler):
      raise self.Error("Invalid state object. Expect Scheduler, got %s" % type(state))
    return state

  def dump_cluster_state(self, state):
    """
    Persist cluster-level state.

    :raises TypeError: If 'state' is not a MySQLCluster.
    :raises StateProvider.Error: If 'state' cannot be pickled.
    """
    if not isinstance(state, MySQLCluster):
      raise TypeError("'state' should be an instance of MySQLCluster")

    path = self._get_cluster_state_path(state.name)
    safe_mkdir(os.path.dirname(path))

    try:
      with open(path, 'wb') as f:
        cPickle.dump(state, f)
    except PickleError as e:
      raise self.Error('Failed to persist state for cluster %s: %s' % (state.name, e))

  def load_cluster_state(self, cluster_name):
    """
    Restore cluster-level state.

    :return: The MySQLCluster object. None if no state is available.
    :raises StateProvider.Error: If the file cannot be unpickled or does not hold a MySQLCluster.
    """
    path = self._get_cluster_state_path(cluster_name)
    if not os.path.isfile(path):
      log.info("No cluster state found on path %s" % path)
      return None

    try:
      with open(path, 'rb') as f:
        state = cPickle.load(f)
    except self._LOAD_ERRORS as e:
      raise self.Error('Failed to recover MySQLCluster: %s' % e)

    if not isinstance(state, MySQLCluster):
      raise self.Error("Invalid state object. Expect MySQLCluster, got %s" % type(state))
    return state

  def remove_cluster_state(self, cluster_name):
    """Remove cluster-level state. A missing state file is logged and otherwise ignored."""
    path = self._get_cluster_state_path(cluster_name)
    if not os.path.isfile(path):
      log.info("No cluster state found on path %s" % path)
      return

    os.remove(path)

  # --- Helper methods. ---
  def _get_scheduler_state_path(self):
    return os.path.join(self._work_dir, os.path.join(*self._get_scheduler_state_key()))

  def _get_cluster_state_path(self, cluster_name):
    return os.path.join(self._work_dir, os.path.join(*self._get_cluster_state_key(cluster_name)))


class ZooKeeperStateProvider(StateProvider):
  """
  StateProvider implementation backed by ZooKeeper.

  This class is thread-safe.

  NOTE(review): ZNode contents are unpickled on load, which can execute arbitrary code if the data
  was tampered with; this presumably relies on ACLs restricting writes under 'base_path' --
  confirm.
  """

  # Unpickling corrupted data can raise more than PickleError (see the 'pickle' module
  # documentation); together with Kazoo failures they are reported as StateProvider.Error.
  _LOAD_ERRORS = (
      KazooException, PickleError, AttributeError, EOFError, ImportError, IndexError, ValueError)

  def __init__(self, client, base_path):
    """
    :param client: Kazoo client.
    :param base_path: The base path for the scheduler state on ZooKeeper.
    """
    self._client = client
    self._base_path = base_path

  def dump_scheduler_state(self, state):
    """
    Persist scheduler-level state.

    :raises TypeError: If 'state' is not a Scheduler.
    :raises StateProvider.Error: If 'state' cannot be pickled or written to ZooKeeper.
    """
    if not isinstance(state, Scheduler):
      raise TypeError("'state' should be an instance of Scheduler")

    try:
      self._store(self._get_scheduler_state_path(), state)
    except (KazooException, PickleError) as e:
      raise self.Error('Failed to persist Scheduler: %s' % e)

  def load_scheduler_state(self):
    """
    Restore scheduler-level state.

    :return: The Scheduler object. None if no state is available.
    :raises StateProvider.Error: If the ZNode cannot be read, cannot be unpickled or does not hold
                                 a Scheduler.
    """
    path = self._get_scheduler_state_path()

    try:
      content = self._client.get(path)[0]
      state = cPickle.loads(content)
    except NoNodeError:
      log.info('No scheduler state found on path %s' % path)
      return None
    except self._LOAD_ERRORS as e:
      raise self.Error('Failed to recover Scheduler: %s' % e)

    if not isinstance(state, Scheduler):
      raise self.Error("Invalid state object. Expect Scheduler, got %s" % type(state))
    return state

  def dump_cluster_state(self, state):
    """
    Persist cluster-level state.

    :raises TypeError: If 'state' is not a MySQLCluster.
    :raises StateProvider.Error: If 'state' cannot be pickled or written to ZooKeeper.
    """
    if not isinstance(state, MySQLCluster):
      raise TypeError("'state' should be an instance of MySQLCluster")

    # Translate failures into StateProvider.Error, as dump_scheduler_state() does, instead of
    # leaking Kazoo exceptions to callers that are written against the StateProvider interface.
    try:
      self._store(self._get_cluster_state_path(state.name), state)
    except (KazooException, PickleError) as e:
      raise self.Error('Failed to persist state for cluster %s: %s' % (state.name, e))

  def load_cluster_state(self, cluster_name):
    """
    Restore cluster-level state.

    :return: The MySQLCluster object. None if no state is available.
    :raises StateProvider.Error: If the ZNode cannot be read, cannot be unpickled or does not hold
                                 a MySQLCluster.
    """
    path = self._get_cluster_state_path(cluster_name)

    try:
      content = self._client.get(path)[0]
      state = cPickle.loads(content)
    except NoNodeError:
      log.info('No cluster state found on path %s' % path)
      return None
    except self._LOAD_ERRORS as e:
      raise self.Error('Failed to recover MySQLCluster: %s' % e)

    if not isinstance(state, MySQLCluster):
      raise self.Error("Invalid state object. Expect MySQLCluster, got %s" % type(state))
    return state

  def remove_cluster_state(self, cluster_name):
    """
    Remove cluster-level state (recursively).

    :raises StateProvider.Error: If ZooKeeper reports a failure.
    """
    path = self._get_cluster_state_path(cluster_name)
    try:
      self._client.retry(self._client.delete, path, recursive=True)
    except KazooException as e:
      raise self.Error("Failed to remove MySQLCluster: %s" % e)

  # --- Helper methods. ---
  def _get_scheduler_state_path(self):
    return posixpath.join(self._base_path, posixpath.join(*self._get_scheduler_state_key()))

  def _get_cluster_state_path(self, cluster_name):
    return posixpath.join(
        self._base_path, posixpath.join(*self._get_cluster_state_key(cluster_name)))

  def _store(self, path, state):
    """Pickle 'state' and write it to the ZNode at 'path', creating parent ZNodes as needed."""
    # Serialize first so that ZooKeeper is left untouched if the state cannot be pickled.
    content = cPickle.dumps(state)
    self._client.retry(self._client.ensure_path, posixpath.dirname(path))
    self._client.retry(self._create_or_set, path, content)

  def _create_or_set(self, path, content):
    """Set the ZNode if the path exists, otherwise create it."""
    if self._client.exists(path):
      self._client.set(path, content)
    else:
      self._client.create(path, content)
def proxy_main():
  """
  Entry point of the Mysos test client.

  Registers the global options and the 'create' and 'delete' commands with twitter.common.app and
  then hands control over to app.main().
  """
  app.add_option(
      '--api_host',
      dest='api_host',
      help='Host for the HTTP API server')

  app.add_option(
      '--api_port',
      dest='api_port',
      type='int',
      help='Port for the HTTP API server')

  app.add_option(
      '--cluster',
      dest='cluster_name',
      help='Name of the MySQL cluster to create')

  app.add_option(
      '--password_file',
      dest='password_file',
      default=os.path.join(tempfile.gettempdir(), 'mysos', 'mysos_test_client', 'password_file'),
      help="Path to the file for persisting the cluster password for testing purposes")

  def describe_http_error(error):
    """Format an HTTPError as 'code, reason, body' for logging."""
    # Fall back to a placeholder for status codes BaseHTTPServer has no entry for, so that the
    # error report itself cannot fail with a KeyError.
    reason = BaseHTTPServer.BaseHTTPRequestHandler.responses.get(
        error.code, ('Unknown status', ''))
    return "%s, %s, %s" % (error.code, reason, error.read())

  def parse_response(response, required_keys):
    """
    Decode the JSON dictionary returned by the API server.

    Logs the response and quits with status 1 if it is not a JSON dictionary or lacks any of
    'required_keys'.
    """
    try:
      result = json.loads(response)
      if not isinstance(result, dict):
        raise ValueError()
      if any(key not in result for key in required_keys):
        raise ValueError()
    except ValueError:
      log.error("Invalid response: %s" % response)
      app.quit(1)
    return result

  @app.command
  @app.command_option(
      '--num_nodes',
      dest='num_nodes',
      type='int',
      help='Number of nodes this cluster should have')
  @app.command_option(
      '--backup_id',
      dest='backup_id',
      default=None,
      help="The 'backup_id' to restore from")
  @app.command_option(
      '--cluster_user',
      dest='cluster_user',
      help='MySQL user name of the cluster')
  @app.command_option(
      '--size',
      dest='size',
      help="The size of instances in the cluster as a JSON dictionary of 'cpus', 'mem', 'disk'. "
           "'mem' and 'disk' are specified with data size units: kb, mb, gb, etc. If given 'None' "
           "then app defaults are used.")
  @app.command_option(
      '--cluster_password',
      dest='cluster_password',
      help="The password used for accessing MySQL instances in the cluster as well as deleting "
           "the cluster from Mysos.")
  def create(args, options):
    """Create the cluster through the HTTP API and verify that its master accepts queries."""
    validate_common_options(options)

    if not options.num_nodes:
      app.error("--num_nodes is required")

    if not options.cluster_user:
      app.error("--cluster_user is required")

    url = 'http://%s:%s/clusters/%s' % (options.api_host, options.api_port, options.cluster_name)
    values = dict(
        num_nodes=int(options.num_nodes),
        cluster_user=options.cluster_user,
        size=options.size if options.size else '',  # 'urlencode()' doesn't accept None.
        backup_id=options.backup_id if options.backup_id else '',
        cluster_password=options.cluster_password if options.cluster_password else '')

    req = urllib2.Request(url, urllib.urlencode(values))
    try:
      response = urllib2.urlopen(req).read()
    except urllib2.HTTPError as e:
      log.error("POST request failed: %s" % describe_http_error(e))
      app.quit(1)

    result = parse_response(response, ('cluster_password', 'cluster_url'))
    cluster_password = result["cluster_password"]

    log.info("Cluster created. Cluster info: %s" % str(result))
    # The file holds a credential: create it readable/writable by the owner only. (The mode only
    # takes effect when the file is created; an existing file keeps its permissions.)
    fd = os.open(options.password_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, 'w') as f:
      f.write(cluster_password)

    log.info("Waiting for the master for this cluster to be elected...")
    master_endpoint = wait_for_master(result['cluster_url']).service_endpoint

    # User name and password are URL-escaped so that characters such as '@', ':' or '/' in a
    # user-supplied password cannot break the database URL.
    connection_str = "mysql://%s:%s@%s:%d/" % (
        urllib.quote_plus(options.cluster_user),
        urllib.quote_plus(cluster_password),
        master_endpoint.host,
        master_endpoint.port)
    log.info("Connecting to the MySQL cluster master: %s" % connection_str)
    engine = create_engine(connection_str)

    max_attempts = 5  # One attempt per second while waiting for the master to be promoted.
    for attempt in range(max_attempts):
      try:
        # TODO(jyx): Test writing to the master and reading from the slave.
        value = engine.execute("SELECT 1;").scalar()
        assert 1 == int(value), "Expecting result to be 1 but got %s" % value
        break
      except OperationalError:
        if attempt == max_attempts - 1:
          raise
        log.debug("MySQL master not ready yet. Sleep for 1 second...")
        time.sleep(1)

    log.info("Cluster successfully started")

  @app.command
  def delete(args, options):
    """Delete the cluster through the HTTP API and wait for it to terminate."""
    validate_common_options(options)

    with open(options.password_file, 'r') as f:
      password = f.read().strip()
      if not password:
        app.error("Empty password file")

    url = 'http://%s:%s/clusters/%s' % (options.api_host, options.api_port, options.cluster_name)
    values = dict(password=password)

    req = urllib2.Request(url, urllib.urlencode(values))
    req.get_method = lambda: 'DELETE'  # urllib2 has no first-class support for DELETE.

    try:
      response = urllib2.urlopen(req).read()
    except urllib2.HTTPError as e:
      log.error("DELETE request failed: %s" % describe_http_error(e))
      app.quit(1)

    result = parse_response(response, ('cluster_url',))

    log.info("Cluster deletion result: %s" % result)

    log.info("Waiting for the cluster to terminate...")
    wait_for_termination(result['cluster_url'])

    log.info("Cluster terminated/deleted")

  def validate_common_options(options):
    """Check the options shared by all commands and create the password file's directory."""
    if not options.api_host:
      app.error("--api_host is required")

    if not options.api_port:
      app.error("--api_port is required")

    if not options.cluster_name:
      app.error("--cluster is required")

    if not options.password_file:
      app.error("--password_file is required")
    log.info("Using --password_file=%s" % options.password_file)
    safe_mkdir(os.path.dirname(options.password_file))

  def main(args, options):
    """Default command: print the help text."""
    app.help()

  app.main()
-------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup, find_packages 3 | 4 | 5 | COMMONS_VERSION = '==0.3.2' 6 | MESOS_VERSION = '==0.20.1' 7 | 8 | 9 | here = os.path.abspath(os.path.dirname(__file__)) 10 | 11 | 12 | def make_commons_requirement(name): 13 | return 'twitter.common.{0}{1}'.format(name, COMMONS_VERSION) 14 | 15 | 16 | def list_package_data_files(package_root, data_folder): 17 | """List the data files in the data_folder under the given package_root.""" 18 | paths = [] 19 | for root, _, files in os.walk(os.path.join(package_root, data_folder)): 20 | for filename in files: 21 | paths.append(os.path.relpath(os.path.join(root, filename), package_root)) 22 | 23 | return paths 24 | 25 | 26 | setup( 27 | name='mysos', 28 | version='0.1.0-dev0', 29 | description='Mysos (MySQL on Mesos)', 30 | url='https://github.com/twitter/mysos', 31 | license='Apache License, Version 2.0', 32 | classifiers=[ 33 | 'Development Status :: 3 - Alpha', 34 | 'Programming Language :: Python :: 2', 35 | 'Programming Language :: Python :: 2.7', 36 | ], 37 | keywords='mesos mysql', 38 | packages=find_packages(exclude=['tests*']), 39 | package_data={ 40 | '': (list_package_data_files('mysos/executor', 'files') + 41 | list_package_data_files('mysos/scheduler', 'assets')) 42 | }, 43 | install_requires=[ 44 | 'kazoo==1.3.1', 45 | 'mako==0.4.0', 46 | 'mesos.interface{0}'.format(MESOS_VERSION), 47 | 'pyyaml==3.10', 48 | make_commons_requirement('app'), 49 | make_commons_requirement('collections'), 50 | make_commons_requirement('concurrent'), 51 | make_commons_requirement('exceptions'), 52 | make_commons_requirement('http'), 53 | make_commons_requirement('lang'), 54 | make_commons_requirement('log'), 55 | make_commons_requirement('metrics'), 56 | make_commons_requirement('quantity'), 57 | make_commons_requirement('zookeeper'), 58 | ], 59 | extras_require={ 60 | 'test': [ 61 | 'pynacl>=0.3.0', 62 | 'webtest', 63 | 
'zake==0.2.1', 64 | ], 65 | 'scheduler': [ 66 | 'cherrypy==3.2.2', 67 | 'mesos.native{0}'.format(MESOS_VERSION), 68 | 'pynacl>=0.3.0,<1', 69 | ], 70 | 'executor': [ 71 | 'mesos.native{0}'.format(MESOS_VERSION), 72 | ], 73 | 'test_client': [ 74 | 'sqlalchemy', 75 | 'mysql-python' 76 | ] 77 | }, 78 | entry_points={ 79 | 'console_scripts': [ 80 | 'mysos_scheduler=mysos.scheduler.mysos_scheduler:proxy_main [scheduler]', 81 | 'vagrant_mysos_executor=mysos.executor.testing.vagrant_mysos_executor:proxy_main [executor]', 82 | 'mysos_test_client=mysos.testing.mysos_test_client:proxy_main [test_client]', 83 | ], 84 | }, 85 | ) 86 | -------------------------------------------------------------------------------- /tests/common/test_cluster.py: -------------------------------------------------------------------------------- 1 | import Queue 2 | import threading 3 | import unittest 4 | 5 | from mysos.common.cluster import ClusterListener, ClusterManager 6 | 7 | from kazoo.handlers.threading import SequentialThreadingHandler 8 | import pytest 9 | from twitter.common.zookeeper.serverset.endpoint import Endpoint, ServiceInstance 10 | from zake.fake_client import FakeClient 11 | from zake.fake_storage import FakeStorage 12 | 13 | 14 | class CallbackHandler(object): 15 | """Utility for testing callbacks.""" 16 | 17 | def __init__(self): 18 | self.promoted = threading.Event() 19 | self.demoted = threading.Event() 20 | self.detected = Queue.Queue() 21 | self.terminated = threading.Event() 22 | 23 | def promotion_callback(self): 24 | self.promoted.set() 25 | 26 | def demotion_callback(self): 27 | self.demoted.set() 28 | 29 | def master_callback(self, master): 30 | self.detected.put(master) 31 | 32 | def termination_callback(self): 33 | self.terminated.set() 34 | 35 | 36 | class TestCluster(unittest.TestCase): 37 | def setUp(self): 38 | self.storage = FakeStorage(SequentialThreadingHandler()) 39 | self.client = FakeClient(storage=self.storage) 40 | self.client.start() 41 | 42 | def 
tearDown(self): 43 | self.client.stop() 44 | 45 | def test_add_member(self): 46 | manager = ClusterManager(self.client, "/home/my_cluster") 47 | 48 | instance1 = ServiceInstance(Endpoint("host1", 10000)) 49 | member1 = manager.add_member(instance1) 50 | assert member1 == manager.add_member(instance1) # Second insertion is ignored. 51 | 52 | instance2 = ServiceInstance(Endpoint("host2", 10000)) 53 | manager.add_member(instance2) 54 | 55 | assert len(manager._cluster.members) == 2 56 | 57 | assert (self.storage.paths["/home/my_cluster/slaves/member_0000000000"]["data"] == 58 | ServiceInstance.pack(instance1)) 59 | assert (self.storage.paths["/home/my_cluster/slaves/member_0000000001"]["data"] == 60 | ServiceInstance.pack(instance2)) 61 | 62 | def test_promote_member(self): 63 | manager = ClusterManager(self.client, "/home/my_cluster") 64 | instance = ServiceInstance(Endpoint("host", 10000)) 65 | member = manager.add_member(instance) 66 | 67 | assert manager.promote_member(member) 68 | assert not manager.promote_member(member) # The 2nd promotion is a no-op. 69 | 70 | assert (self.storage.paths["/home/my_cluster/master/member_0000000000"]["data"] == 71 | ServiceInstance.pack(instance)) 72 | 73 | def test_remove_member(self): 74 | manager = ClusterManager(self.client, "/home/my_cluster") 75 | instance = ServiceInstance(Endpoint("host", 10000)) 76 | member = manager.add_member(instance) 77 | 78 | assert manager.remove_member(member) 79 | assert not manager.remove_member(member) # The second deletion is ignored. 80 | 81 | assert "/home/my_cluster/master/member_0000000000" not in self.storage.paths 82 | 83 | def test_callbacks(self): 84 | manager = ClusterManager(self.client, "/home/my_cluster") 85 | 86 | # Set up 2 listeners. 
87 | instance1 = ServiceInstance(Endpoint("host1", 10000)) 88 | handler1 = CallbackHandler() 89 | listener1 = ClusterListener( 90 | self.client, 91 | "/home/my_cluster", 92 | instance1, 93 | handler1.promotion_callback, 94 | handler1.demotion_callback, 95 | handler1.master_callback, 96 | handler1.termination_callback) 97 | listener1.start() 98 | member1 = manager.add_member(instance1) 99 | 100 | instance2 = ServiceInstance(Endpoint("host2", 10000)) 101 | handler2 = CallbackHandler() 102 | listener2 = ClusterListener( 103 | self.client, 104 | "/home/my_cluster", 105 | instance2, 106 | handler2.promotion_callback, 107 | handler2.demotion_callback, 108 | handler2.master_callback) 109 | listener2.start() 110 | member2 = manager.add_member(instance2) 111 | 112 | # Test promotion. 113 | manager.promote_member(member1) 114 | 115 | assert handler1.promoted.wait(1) 116 | assert handler2.detected.get(True, 1) == instance1 117 | 118 | assert (self.storage.paths["/home/my_cluster/master/member_0000000000"]["data"] == 119 | ServiceInstance.pack(instance1)) 120 | assert (self.storage.paths["/home/my_cluster/slaves/member_0000000001"]["data"] == 121 | ServiceInstance.pack(instance2)) 122 | 123 | manager.promote_member(member2) 124 | 125 | assert handler1.demoted.wait(1) 126 | assert handler2.promoted.wait(1) 127 | 128 | assert (self.storage.paths["/home/my_cluster/master/member_0000000001"]["data"] == 129 | ServiceInstance.pack(instance2)) 130 | assert "/home/my_cluster/master/member_0000000000" not in self.storage.paths 131 | 132 | manager.remove_member(member2) 133 | assert handler2.demoted.wait(1) 134 | 135 | # Test removing cluster. 
136 | manager.remove_member(member1) 137 | manager.delete_cluster() 138 | assert handler1.terminated.wait(1) 139 | 140 | def test_invalid_arguments(self): 141 | client = FakeClient() 142 | client.start() 143 | manager = ClusterManager(client, "/home/my_cluster") 144 | 145 | with pytest.raises(ValueError) as e: 146 | manager.promote_member("123") 147 | assert e.value.message == 'Invalid member_id: 123' 148 | 149 | def test_invalid_znode(self): 150 | instance1 = ServiceInstance(Endpoint("host1", 10000)) 151 | handler1 = CallbackHandler() 152 | listener1 = ClusterListener( 153 | self.client, 154 | "/home/my_cluster", 155 | instance1, 156 | handler1.promotion_callback, 157 | handler1.demotion_callback, 158 | handler1.master_callback) 159 | listener1.start() 160 | 161 | self.client.ensure_path("/home/my_cluster/master") 162 | self.client.create("/home/my_cluster/master/member_", "Invalid Data", sequence=True) 163 | 164 | # Invalid ZNode data translates into a 'None' return. 165 | assert handler1.detected.get(True, 1) is None 166 | 167 | def test_existing_zk(self): 168 | """ 169 | ClusterManager needs to be able to recover from an existing ZK group for scheduler failover. 170 | """ 171 | manager = ClusterManager(self.client, "/home/my_cluster") 172 | 173 | instance1 = ServiceInstance(Endpoint("host1", 10000)) 174 | member1 = manager.add_member(instance1) 175 | instance2 = ServiceInstance(Endpoint("host2", 10000)) 176 | member2 = manager.add_member(instance2) 177 | 178 | assert (self.storage.paths["/home/my_cluster/slaves/member_0000000000"]["data"] == 179 | ServiceInstance.pack(instance1)) 180 | assert (self.storage.paths["/home/my_cluster/slaves/member_0000000001"]["data"] == 181 | ServiceInstance.pack(instance2)) 182 | 183 | manager.promote_member(member1) 184 | 185 | # Test the new ClusterManager. 
186 | manager2 = ClusterManager(self.client, "/home/my_cluster") 187 | assert len(manager2._cluster.members) == 2 188 | assert member1 in manager2._cluster.members 189 | assert member2 in manager2._cluster.members 190 | assert manager2._cluster.members[member1] == ServiceInstance.pack(instance1) 191 | 192 | def test_remove_cluster(self): 193 | manager = ClusterManager(self.client, "/home/my_cluster") 194 | 195 | instance1 = ServiceInstance(Endpoint("host1", 10000)) 196 | member1 = manager.add_member(instance1) 197 | instance2 = ServiceInstance(Endpoint("host2", 10000)) 198 | member2 = manager.add_member(instance2) 199 | 200 | manager.promote_member(member1) 201 | 202 | with pytest.raises(ClusterManager.Error): 203 | manager.delete_cluster() 204 | 205 | manager.remove_member(member1) 206 | manager.remove_member(member2) 207 | manager.delete_cluster() 208 | 209 | assert "/home/my_cluster" not in self.storage.paths 210 | -------------------------------------------------------------------------------- /tests/common/test_zookeeper.py: -------------------------------------------------------------------------------- 1 | from mysos.common.zookeeper import parse 2 | 3 | import pytest 4 | 5 | 6 | def test_parse(): 7 | assert parse("zk://host1:port1") == (None, "host1:port1", "/") 8 | assert parse("zk://jake:1@host1:port1") == ("jake:1", "host1:port1", "/") 9 | assert parse("zk://jake:1@host1:port1/") == ("jake:1", "host1:port1", "/") 10 | assert (parse("zk://jake:1@host1:port1,host2:port2") == 11 | ("jake:1", "host1:port1,host2:port2", "/")) 12 | assert (parse("zk://jake:1@host1:port1,host2:port2/") == 13 | ("jake:1", "host1:port1,host2:port2", "/")) 14 | assert (parse("zk://jake:1@host1:port1,host2:port2/path/to/znode") == 15 | ("jake:1", "host1:port1,host2:port2", "/path/to/znode")) 16 | 17 | 18 | def test_parse_errors(): 19 | with pytest.raises(ValueError) as e: 20 | parse("host1:port1") 21 | assert e.value.message == "Expecting 'zk://' at the beginning of the URL" 22 | 
23 | # This method doesn't validate the values in the tuple. 24 | assert parse("zk://") == (None, "", "/") 25 | assert parse("zk://host_no_port") == (None, "host_no_port", "/") 26 | assert parse("zk://jake@host") == ("jake", "host", "/") 27 | -------------------------------------------------------------------------------- /tests/executor/test_mysos_task_runner.py: -------------------------------------------------------------------------------- 1 | import os 2 | import signal 3 | import unittest 4 | 5 | from mysos.common.cluster import ClusterManager 6 | from mysos.common.testing import Fake 7 | from mysos.executor.noop_installer import NoopPackageInstaller 8 | from mysos.executor.mysos_task_runner import MysosTaskRunner 9 | from mysos.executor.task_runner import TaskError 10 | from mysos.executor.testing.fake import FakeTaskControl 11 | 12 | from kazoo.handlers.threading import SequentialThreadingHandler 13 | import pytest 14 | from twitter.common.concurrent import deadline 15 | from twitter.common.quantity import Amount, Time 16 | from twitter.common.zookeeper.serverset.endpoint import Endpoint, ServiceInstance 17 | from zake.fake_client import FakeClient 18 | from zake.fake_storage import FakeStorage 19 | 20 | 21 | if 'MYSOS_DEBUG' in os.environ: 22 | from twitter.common import log 23 | from twitter.common.log.options import LogOptions 24 | LogOptions.set_stderr_log_level('google:DEBUG') 25 | LogOptions.set_simple(True) 26 | log.init('mysos_tests') 27 | 28 | 29 | class FakeStateManager(Fake): pass 30 | 31 | 32 | class TestTaskRunner(unittest.TestCase): 33 | def setUp(self): 34 | self._storage = FakeStorage(SequentialThreadingHandler()) 35 | self._client = FakeClient(storage=self._storage) 36 | self._client.start() 37 | self._self_instance = ServiceInstance(Endpoint("host", 10000)) 38 | self._state_manager = FakeStateManager() 39 | 40 | def tearDown(self): 41 | self._client.stop() 42 | 43 | def test_stop(self): 44 | task_control = FakeTaskControl() 45 | runner = 
MysosTaskRunner( 46 | self._self_instance, 47 | self._client, 48 | "/home/test/my_cluster", 49 | NoopPackageInstaller(), 50 | task_control, 51 | self._state_manager) 52 | runner.start() 53 | assert runner.stop() 54 | 55 | # Killed by SIGTERM. 56 | assert deadline(runner.join, Amount(1, Time.SECONDS)) == -signal.SIGTERM 57 | 58 | def test_demote(self): 59 | task_control = FakeTaskControl() 60 | runner = MysosTaskRunner( 61 | self._self_instance, 62 | self._client, 63 | "/home/test/my_cluster", 64 | NoopPackageInstaller(), 65 | task_control, 66 | self._state_manager) 67 | 68 | manager = ClusterManager(self._client, "/home/test/my_cluster") 69 | runner.start() 70 | 71 | self_member = manager.add_member(self._self_instance) 72 | 73 | # 'self_instance' becomes the master. 74 | manager.promote_member(self_member) 75 | 76 | runner.promoted.wait(1) 77 | 78 | another_member = manager.add_member(ServiceInstance(Endpoint("another_host", 10000))) 79 | 80 | # This demotes 'self_instance', which should cause runner to stop. 81 | manager.promote_member(another_member) 82 | 83 | assert deadline(runner.join, Amount(1, Time.SECONDS)) 84 | 85 | def test_reparent(self): 86 | task_control = FakeTaskControl() 87 | runner = MysosTaskRunner( 88 | self._self_instance, 89 | self._client, 90 | "/home/test/my_cluster", 91 | NoopPackageInstaller(), 92 | task_control, 93 | self._state_manager) 94 | 95 | manager = ClusterManager(self._client, "/home/test/my_cluster") 96 | runner.start() 97 | 98 | # Promote another instance. 
99 | master = ServiceInstance(Endpoint("another_host", 10000)) 100 | another_member = manager.add_member(master) 101 | manager.promote_member(another_member) 102 | 103 | assert runner.master.get(True, 1) == master 104 | 105 | assert runner.stop() 106 | assert deadline(runner.join, Amount(1, Time.SECONDS)) 107 | 108 | def test_mysqld_error(self): 109 | task_control = FakeTaskControl(mysqld="exit 123") 110 | runner = MysosTaskRunner( 111 | self._self_instance, 112 | self._client, 113 | "/home/test/my_cluster", 114 | NoopPackageInstaller(), 115 | task_control, 116 | self._state_manager) 117 | 118 | runner.start() 119 | assert deadline(runner.join, Amount(1, Time.SECONDS)) == 123 120 | 121 | def test_start_command_error(self): 122 | task_control = FakeTaskControl(start_cmd="exit 1") 123 | runner = MysosTaskRunner( 124 | self._self_instance, 125 | self._client, 126 | "/home/test/my_cluster", 127 | NoopPackageInstaller(), 128 | task_control, 129 | self._state_manager) 130 | 131 | with pytest.raises(TaskError) as e: 132 | runner.start() 133 | assert e.value.message.startswith("Failed to start MySQL task") 134 | 135 | def test_promote_command_error(self): 136 | task_control = FakeTaskControl(promote_cmd="exit 1") 137 | runner = MysosTaskRunner( 138 | self._self_instance, 139 | self._client, 140 | "/home/test/my_cluster", 141 | NoopPackageInstaller(), 142 | task_control, 143 | self._state_manager) 144 | 145 | manager = ClusterManager(self._client, "/home/test/my_cluster") 146 | runner.start() 147 | 148 | self_member = manager.add_member(self._self_instance) 149 | 150 | # 'self_instance' becomes the master. 
151 | manager.promote_member(self_member) 152 | 153 | runner.promoted.wait(1) 154 | 155 | with pytest.raises(TaskError) as e: 156 | runner.join() 157 | assert e.value.message.startswith("Failed to promote the slave") 158 | 159 | def test_get_log_position(self): 160 | task_control = FakeTaskControl(position=1) 161 | runner = MysosTaskRunner( 162 | self._self_instance, 163 | self._client, 164 | "/home/test/my_cluster", 165 | NoopPackageInstaller(), 166 | task_control, 167 | self._state_manager) 168 | 169 | runner.start() 170 | assert runner.get_log_position() == 1 171 | 172 | def test_get_log_position_error(self): 173 | task_control = FakeTaskControl(get_log_position_cmd="exit 1") 174 | runner = MysosTaskRunner( 175 | self._self_instance, 176 | self._client, 177 | "/home/test/my_cluster", 178 | NoopPackageInstaller(), 179 | task_control, 180 | self._state_manager) 181 | 182 | with pytest.raises(TaskError) as e: 183 | runner.get_log_position() 184 | assert (e.value.message == 185 | "Unable to get the slave's log position: " + 186 | "Command 'exit 1' returned non-zero exit status 1") 187 | 188 | def test_stop_interminable(self): 189 | cmd = """trap "echo Trapped SIGTERM!" 
TERM 190 | while : 191 | do 192 | sleep 60 193 | done 194 | """ 195 | task_control = FakeTaskControl(mysqld=cmd) 196 | runner = MysosTaskRunner( 197 | self._self_instance, 198 | self._client, 199 | "/home/test/my_cluster", 200 | NoopPackageInstaller(), 201 | task_control, 202 | self._state_manager) 203 | 204 | task_control._mysqld = cmd 205 | runner.start() 206 | assert runner.stop(timeout=1) 207 | assert deadline(runner.join, Amount(1, Time.SECONDS)) == -signal.SIGKILL 208 | -------------------------------------------------------------------------------- /tests/scheduler/test_elector.py: -------------------------------------------------------------------------------- 1 | import Queue 2 | import os 3 | import unittest 4 | 5 | from mysos.common.testing import Fake 6 | from mysos.scheduler.elector import MySQLMasterElector 7 | 8 | from twitter.common.quantity import Amount, Time 9 | 10 | 11 | if 'MYSOS_DEBUG' in os.environ: 12 | from twitter.common import log 13 | from twitter.common.log.options import LogOptions 14 | LogOptions.set_stderr_log_level('google:DEBUG') 15 | LogOptions.set_simple(True) 16 | log.init('mysos_tests') 17 | 18 | 19 | class FakeDriver(Fake): pass 20 | 21 | 22 | class TestElector(unittest.TestCase): 23 | def setUp(self): 24 | self._driver = FakeDriver() 25 | self._epoch = 1 26 | self._result = Queue.Queue() 27 | self._elector = MySQLMasterElector( 28 | self._driver, 29 | "cluster0", 30 | self._epoch, 31 | lambda x: self._result.put(x), 32 | Amount(1, Time.SECONDS), 33 | Amount(150, Time.MILLISECONDS)) 34 | self._elector.start() 35 | 36 | def tearDown(self): 37 | if self._elector: # Terminate the elector if it's not used in the test. 
38 | self._elector.abort() 39 | self._elector.join() 40 | 41 | def test_single_slave(self): 42 | slave1 = ("task_id1", "slave_id1") 43 | self._elector.add_slave(*slave1) 44 | self._elector.update_position(self._epoch, slave1[0], 1) 45 | 46 | assert self._result.get(True, 1) == slave1[0] 47 | 48 | def test_two_slaves(self): 49 | slave1 = ("task_id1", "slave_id1") 50 | slave2 = ("task_id2", "slave_id2") 51 | self._elector.add_slave(*slave1) 52 | self._elector.add_slave(*slave2) 53 | 54 | self._elector.update_position(self._epoch, slave1[0], 1) 55 | self._elector.update_position(self._epoch, slave2[0], 2) 56 | 57 | assert self._result.get(True, 1) == slave2[0] 58 | 59 | def test_two_slaves_complex_position(self): 60 | slave1 = ("task_id1", "slave_id1") 61 | slave2 = ("task_id2", "slave_id2") 62 | self._elector.add_slave(*slave1) 63 | self._elector.add_slave(*slave2) 64 | 65 | # The positions are sequences of numeric strings. 66 | self._elector.update_position(self._epoch, slave1[0], ["1", "2"]) 67 | self._elector.update_position(self._epoch, slave2[0], ["2", "1"]) 68 | 69 | assert self._result.get(True, 1) == slave2[0] 70 | 71 | def test_delayed_update(self): 72 | slave1 = ("task_id1", "slave_id1") 73 | slave2 = ("task_id2", "slave_id2") 74 | self._elector.add_slave(*slave1) 75 | self._elector.add_slave(*slave2) 76 | self._elector.update_position(self._epoch, slave2[0], 2) 77 | 78 | # Force an election (after timing out) and test that slave2 is elected because it's the only 79 | # slave that responded. 80 | self._elector._elect(timedout=True) 81 | assert self._result.get(True, 1) == slave2[0] 82 | 83 | def test_position_for_invalid_slave(self): 84 | slave1 = ("task_id1", "slave_id1") 85 | slave2 = ("task_id2", "slave_id2") 86 | self._elector.update_position(self._epoch, slave1[0], 100) # This update is ignored. 
87 | self._elector.add_slave(*slave1) 88 | self._elector.add_slave(*slave2) 89 | self._elector.update_position(self._epoch, slave2[0], 1) 90 | 91 | # Timeout is 1 second. Testing the organic 'timeout' of the thread. 92 | assert self._result.get(True, 2) == slave2[0] 93 | 94 | def test_position_for_previous_epoch(self): 95 | slave1 = ("task_id1", "slave_id1") 96 | slave2 = ("task_id2", "slave_id2") 97 | self._elector.add_slave(*slave1) 98 | self._elector.add_slave(*slave2) 99 | 100 | self._elector.update_position(self._epoch - 1, slave1[0], 100) # Update from a previous epoch. 101 | self._elector.update_position(self._epoch, slave2[0], 1) 102 | 103 | # Induce an election after it timed out. 104 | self._elector._elect(timedout=True) 105 | assert self._result.get(True, 1) == slave2[0] 106 | 107 | def test_remove_slave_after_election(self): 108 | slave1 = ("task_id1", "slave_id1") 109 | slave2 = ("task_id2", "slave_id2") 110 | self._elector.add_slave(*slave1) 111 | self._elector.add_slave(*slave2) 112 | 113 | self._elector.update_position(self._epoch, slave1[0], 1) 114 | self._elector.update_position(self._epoch, slave2[0], 2) 115 | 116 | assert self._result.get(True, 1) == slave2[0] 117 | 118 | # At this point a master is already elected. Slave removal is ignored. 119 | self._elector.remove_slave(slave2[0]) 120 | 121 | assert len(self._elector._positions) == 2 122 | 123 | def test_remove_slave_during_election(self): 124 | slave1 = ("task_id1", "slave_id1") 125 | slave2 = ("task_id2", "slave_id2") 126 | self._elector.add_slave(*slave1) 127 | self._elector.add_slave(*slave2) 128 | 129 | self._elector.update_position(self._epoch, slave2[0], 2) 130 | 131 | # Election still ongoing. Removing slave2 allows slave1 to be elected. 
132 | self._elector.remove_slave(slave2[0]) 133 | self._elector.update_position(self._epoch, slave1[0], 1) 134 | 135 | self._elector._elect() 136 | assert self._result.get(True, 1) == slave1[0] 137 | -------------------------------------------------------------------------------- /tests/scheduler/test_http.py: -------------------------------------------------------------------------------- 1 | import json 2 | import shutil 3 | import tempfile 4 | import unittest 5 | 6 | from mysos.common import pkgutil 7 | from mysos.scheduler.http import MysosServer 8 | from mysos.scheduler.scheduler import MysosScheduler 9 | 10 | import pytest 11 | from webtest import AppError, TestApp 12 | from twitter.common.metrics import MetricSampler, RootMetrics 13 | 14 | 15 | MYSOS_MODULE = 'mysos.scheduler' 16 | ASSET_RELPATH = 'assets' 17 | 18 | 19 | class FakeScheduler(object): 20 | def __init__(self): 21 | self._exception = None 22 | self._response = None 23 | 24 | def set_exception(self, exception): 25 | self._exception = exception 26 | 27 | def set_response(self, response): 28 | self._response = response 29 | 30 | def create_cluster( 31 | self, 32 | cluster_name, 33 | cluster_user, 34 | num_nodes, 35 | size, 36 | backup_id=None, 37 | cluster_password=None): 38 | if self._exception: 39 | raise self._exception 40 | return self._response 41 | 42 | 43 | class TestHTTP(unittest.TestCase): 44 | @classmethod 45 | def setUpClass(cls): 46 | cls.web_assets_dir = tempfile.mkdtemp() 47 | pkgutil.unpack_assets(cls.web_assets_dir, MYSOS_MODULE, ASSET_RELPATH) 48 | 49 | @classmethod 50 | def tearDownClass(cls): 51 | shutil.rmtree(cls.web_assets_dir) 52 | 53 | def setUp(self): 54 | self._scheduler = FakeScheduler() 55 | self._app = TestApp( 56 | MysosServer(self._scheduler, self.web_assets_dir, MetricSampler(RootMetrics())).app) 57 | 58 | def test_create_cluster_successful(self): 59 | response = ('test_cluster_url', 'passwordfortestcluster') 60 | self._scheduler.set_response(response) 61 | body = 
self._app.post( 62 | '/clusters/test_cluster', {'num_nodes': 3, 'cluster_user': 'mysos'}).normal_body 63 | assert json.loads(body) == dict(cluster_url=response[0], cluster_password=response[1]) 64 | 65 | def test_create_cluster_exists(self): 66 | self._scheduler.set_exception(MysosScheduler.ClusterExists()) 67 | 68 | with pytest.raises(AppError) as e: 69 | assert self._app.post('/clusters/test_cluster', {'num_nodes': 3, 'cluster_user': 'mysos'}) 70 | assert e.value.message.startswith('Bad response: 409') 71 | 72 | def test_create_cluster_value_error(self): 73 | self._scheduler.set_exception(ValueError()) 74 | with pytest.raises(AppError) as e: 75 | self._app.post('/clusters/test_cluster', {'num_nodes': 3, 'cluster_user': 'mysos'}) 76 | assert e.value.message.startswith('Bad response: 400') 77 | 78 | def test_create_cluster_invalid_user(self): 79 | self._scheduler.set_exception(MysosScheduler.InvalidUser()) 80 | with pytest.raises(AppError) as e: 81 | self._app.post('/clusters/test_cluster', {'num_nodes': 3, 'cluster_user': 'mysos'}) 82 | assert e.value.message.startswith('Bad response: 400') 83 | -------------------------------------------------------------------------------- /tests/scheduler/test_mysos_scheduler.py: -------------------------------------------------------------------------------- 1 | import getpass 2 | import os 3 | import posixpath 4 | 5 | from mysos.common.cluster import get_cluster_path, wait_for_master, wait_for_termination 6 | from mysos.scheduler.password import gen_encryption_key 7 | from mysos.scheduler.scheduler import MysosScheduler 8 | from mysos.scheduler.state import LocalStateProvider, Scheduler 9 | 10 | from kazoo.handlers.threading import SequentialThreadingHandler 11 | from mesos.interface.mesos_pb2 import DRIVER_STOPPED, FrameworkInfo 12 | from twitter.common import log 13 | from twitter.common.concurrent import deadline 14 | from twitter.common.dirutil import safe_mkdtemp 15 | from twitter.common.metrics import RootMetrics 16 | 
from twitter.common.quantity import Amount, Time 17 | from zake.fake_client import FakeClient 18 | from zake.fake_storage import FakeStorage 19 | 20 | 21 | if 'MYSOS_DEBUG' in os.environ: 22 | from twitter.common.log.options import LogOptions 23 | LogOptions.set_stderr_log_level('google:DEBUG') 24 | LogOptions.set_simple(True) 25 | log.init('mysos_tests') 26 | 27 | 28 | def test_scheduler_runs(): 29 | """ 30 | Verifies that the scheduler successfully launches 3 "no-op" MySQL tasks. 31 | NOTE: Due to the limitation of zake the scheduler's ZK operations are not propagated to 32 | executors in separate processes but they are unit-tested separately. 33 | """ 34 | import mesos.native 35 | 36 | # Make sure fake_mysos_executor.pex is available to be fetched by Mesos slave. 37 | assert os.path.isfile('dist/fake_mysos_executor.pex') 38 | 39 | storage = FakeStorage(SequentialThreadingHandler()) 40 | zk_client = FakeClient(storage=storage) 41 | zk_client.start() 42 | 43 | zk_url = "zk://fake_host/home/mysos/clusters" 44 | cluster_name = "test_cluster" 45 | num_nodes = 3 46 | 47 | state_provider = LocalStateProvider(safe_mkdtemp()) 48 | 49 | framework_info = FrameworkInfo( 50 | user=getpass.getuser(), 51 | name="mysos", 52 | checkpoint=False) 53 | 54 | state = Scheduler(framework_info) 55 | 56 | scheduler = MysosScheduler( 57 | state, 58 | state_provider, 59 | getpass.getuser(), 60 | os.path.abspath("dist/fake_mysos_executor.pex"), 61 | "./fake_mysos_executor.pex", 62 | zk_client, 63 | zk_url, 64 | Amount(40, Time.SECONDS), 65 | "/fakepath", 66 | gen_encryption_key()) 67 | 68 | RootMetrics().register_observable('scheduler', scheduler) 69 | 70 | scheduler_driver = mesos.native.MesosSchedulerDriver( 71 | scheduler, 72 | framework_info, 73 | "local") 74 | scheduler_driver.start() 75 | 76 | # Wait until the scheduler is connected and becomes available. 
77 | assert scheduler.connected.wait(30) 78 | 79 | scheduler.create_cluster(cluster_name, "mysql_user", num_nodes, cluster_password="passwd") 80 | 81 | # A slave is promoted to be the master. 82 | deadline( 83 | lambda: wait_for_master( 84 | get_cluster_path(posixpath.join(zk_url, 'discover'), cluster_name), 85 | zk_client), 86 | Amount(40, Time.SECONDS)) 87 | 88 | scheduler.delete_cluster(cluster_name, password="passwd") 89 | 90 | # The cluster is deleted from ZooKeeper. 91 | deadline( 92 | lambda: wait_for_termination( 93 | get_cluster_path(posixpath.join(zk_url, 'discover'), cluster_name), 94 | zk_client), 95 | Amount(40, Time.SECONDS)) 96 | 97 | sample = RootMetrics().sample() 98 | assert sample['scheduler.tasks_killed'] == 1 99 | 100 | assert scheduler_driver.stop() == DRIVER_STOPPED 101 | -------------------------------------------------------------------------------- /tests/scheduler/test_scheduler.py: -------------------------------------------------------------------------------- 1 | import getpass 2 | import os 3 | import shutil 4 | import tempfile 5 | import unittest 6 | 7 | from mysos.common.testing import Fake 8 | from mysos.scheduler.scheduler import ( 9 | DEFAULT_TASK_CPUS, 10 | DEFAULT_TASK_DISK, 11 | DEFAULT_TASK_MEM, 12 | INCOMPATIBLE_ROLE_OFFER_REFUSE_DURATION, 13 | MysosScheduler) 14 | from mysos.scheduler.launcher import create_resources 15 | from mysos.scheduler.password import gen_encryption_key, PasswordBox 16 | from mysos.scheduler.state import LocalStateProvider, MySQLCluster, Scheduler 17 | 18 | from kazoo.handlers.threading import SequentialThreadingHandler 19 | import mesos.interface.mesos_pb2 as mesos_pb2 20 | from twitter.common import log 21 | from twitter.common.metrics import RootMetrics 22 | from twitter.common.quantity import Amount, Data, Time 23 | from zake.fake_client import FakeClient 24 | from zake.fake_storage import FakeStorage 25 | 26 | import pytest 27 | 28 | 29 | if 'MYSOS_DEBUG' in os.environ: 30 | from 
twitter.common.log.options import LogOptions 31 | LogOptions.set_stderr_log_level('google:DEBUG') 32 | LogOptions.set_simple(True) 33 | log.init('mysos_tests') 34 | 35 | 36 | class FakeDriver(Fake): pass 37 | 38 | 39 | class TestScheduler(unittest.TestCase): 40 | def setUp(self): 41 | self._driver = FakeDriver() 42 | self._storage = FakeStorage(SequentialThreadingHandler()) 43 | self._zk_client = FakeClient(storage=self._storage) 44 | self._zk_client.start() 45 | 46 | self._framework_id = mesos_pb2.FrameworkID() 47 | self._framework_id.value = "framework_id_0" 48 | 49 | self._offer = mesos_pb2.Offer() 50 | self._offer.id.value = "offer_id_0" 51 | self._offer.framework_id.value = self._framework_id.value 52 | self._offer.slave_id.value = "slave_id_0" 53 | self._offer.hostname = "localhost" 54 | 55 | resources = create_resources( 56 | cpus=DEFAULT_TASK_CPUS * 3, 57 | mem=DEFAULT_TASK_MEM * 3, 58 | disk=DEFAULT_TASK_DISK * 3, 59 | ports=set([10000, 10001, 10002])) 60 | self._offer.resources.extend(resources) 61 | 62 | self._framework_user = "framework_user" 63 | 64 | self._zk_url = "zk://host/mysos/test" 65 | self._cluster = MySQLCluster( 66 | "cluster0", "user", "pass", 3, DEFAULT_TASK_CPUS, DEFAULT_TASK_MEM, DEFAULT_TASK_DISK) 67 | 68 | self._tmpdir = tempfile.mkdtemp() 69 | self._state_provider = LocalStateProvider(self._tmpdir) 70 | 71 | framework_info = mesos_pb2.FrameworkInfo( 72 | user=getpass.getuser(), 73 | name="mysos", 74 | checkpoint=False) 75 | self._state = Scheduler(framework_info) 76 | 77 | def tearDown(self): 78 | shutil.rmtree(self._tmpdir, True) # Clean up after ourselves. 
79 | 80 | def test_scheduler_recovery(self): 81 | scheduler_key = gen_encryption_key() 82 | 83 | scheduler1 = MysosScheduler( 84 | self._state, 85 | self._state_provider, 86 | self._framework_user, 87 | "./executor.pex", 88 | "cmd.sh", 89 | self._zk_client, 90 | self._zk_url, 91 | Amount(5, Time.SECONDS), 92 | "/etc/mysos/admin_keyfile.yml", 93 | scheduler_key) 94 | scheduler1.registered(self._driver, self._framework_id, object()) 95 | scheduler1.create_cluster("cluster1", "mysql_user", 3) 96 | scheduler1.resourceOffers(self._driver, [self._offer]) 97 | 98 | # One task is launched for one offer. 99 | assert len(scheduler1._launchers["cluster1"]._cluster.tasks) == 1 100 | 101 | with pytest.raises(MysosScheduler.ClusterExists): 102 | scheduler1.create_cluster("cluster1", "mysql_user", 3) 103 | 104 | # FrameworkID should have been persisted. 105 | self._state = self._state_provider.load_scheduler_state() 106 | assert self._state.framework_info.id.value == self._framework_id.value 107 | 108 | # Simulate restart. 109 | scheduler2 = MysosScheduler( 110 | self._state, 111 | self._state_provider, 112 | self._framework_user, 113 | "./executor.pex", 114 | "cmd.sh", 115 | self._zk_client, 116 | self._zk_url, 117 | Amount(5, Time.SECONDS), 118 | "/etc/mysos/admin_keyfile.yml", 119 | scheduler_key) 120 | 121 | # Scheduler always receives registered() with the same FrameworkID after failover. 122 | scheduler2.registered(self._driver, self._framework_id, object()) 123 | 124 | assert len(scheduler2._launchers) == 1 125 | assert scheduler2._launchers["cluster1"].cluster_name == "cluster1" 126 | 127 | # Scheduler has recovered the cluster so it doesn't accept another of the same name. 
128 | with pytest.raises(MysosScheduler.ClusterExists): 129 | scheduler2.create_cluster("cluster1", "mysql_user", 3) 130 | 131 | def test_scheduler_recovery_failure_before_launch(self): 132 | scheduler_key = gen_encryption_key() 133 | 134 | scheduler1 = MysosScheduler( 135 | self._state, 136 | self._state_provider, 137 | self._framework_user, 138 | "./executor.pex", 139 | "cmd.sh", 140 | self._zk_client, 141 | self._zk_url, 142 | Amount(5, Time.SECONDS), 143 | "/etc/mysos/admin_keyfile.yml", 144 | scheduler_key) 145 | scheduler1.registered(self._driver, self._framework_id, object()) 146 | _, password = scheduler1.create_cluster("cluster1", "mysql_user", 3) 147 | 148 | # Simulate restart before the task is successfully launched. 149 | scheduler2 = MysosScheduler( 150 | self._state, 151 | self._state_provider, 152 | self._framework_user, 153 | "./executor.pex", 154 | "cmd.sh", 155 | self._zk_client, 156 | self._zk_url, 157 | Amount(5, Time.SECONDS), 158 | "/etc/mysos/admin_keyfile.yml", 159 | scheduler_key) 160 | 161 | assert len(scheduler2._launchers) == 0 # No launchers are recovered. 162 | 163 | # Scheduler always receives registered() with the same FrameworkID after failover. 164 | scheduler2.registered(self._driver, self._framework_id, object()) 165 | 166 | assert len(scheduler2._launchers) == 1 167 | assert scheduler2._launchers["cluster1"].cluster_name == "cluster1" 168 | 169 | password_box = PasswordBox(scheduler_key) 170 | 171 | assert password_box.match( 172 | password, scheduler2._launchers["cluster1"]._cluster.encrypted_password) 173 | 174 | # Now offer the resources for this task. 175 | scheduler2.resourceOffers(self._driver, [self._offer]) 176 | 177 | # One task is launched for the offer. 178 | assert len(scheduler2._launchers["cluster1"]._cluster.active_tasks) == 1 179 | 180 | # Scheduler has recovered the cluster so it doesn't accept another of the same name. 
181 | with pytest.raises(MysosScheduler.ClusterExists): 182 | scheduler2.create_cluster("cluster1", "mysql_user", 3) 183 | 184 | def test_incompatible_resource_role(self): 185 | scheduler1 = MysosScheduler( 186 | self._state, 187 | self._state_provider, 188 | self._framework_user, 189 | "./executor.pex", 190 | "cmd.sh", 191 | self._zk_client, 192 | self._zk_url, 193 | Amount(5, Time.SECONDS), 194 | "/etc/mysos/admin_keyfile.yml", 195 | gen_encryption_key(), 196 | framework_role='mysos') # Require 'mysos' but the resources are in '*'. 197 | 198 | RootMetrics().register_observable('scheduler', scheduler1) 199 | 200 | scheduler1.registered(self._driver, self._framework_id, object()) 201 | scheduler1.create_cluster("cluster1", "mysql_user", 3) 202 | scheduler1.resourceOffers(self._driver, [self._offer]) 203 | 204 | assert "declineOffer" in self._driver.method_calls 205 | assert len(self._driver.method_calls["declineOffer"]) == 1 206 | # [0][0][1]: [First declineOffer call][The positional args][The first positional arg], which is 207 | # a 'Filters' object. 
208 | assert (self._driver.method_calls["declineOffer"][0][0][1].refuse_seconds == 209 | INCOMPATIBLE_ROLE_OFFER_REFUSE_DURATION.as_(Time.SECONDS)) 210 | 211 | sample = RootMetrics().sample() 212 | assert sample['scheduler.offers_incompatible_role'] == 1 213 | 214 | def test_scheduler_metrics(self): 215 | scheduler_key = gen_encryption_key() 216 | 217 | scheduler = MysosScheduler( 218 | self._state, 219 | self._state_provider, 220 | self._framework_user, 221 | "./executor.pex", 222 | "cmd.sh", 223 | self._zk_client, 224 | self._zk_url, 225 | Amount(5, Time.SECONDS), 226 | "/etc/mysos/admin_keyfile.yml", 227 | scheduler_key) 228 | 229 | RootMetrics().register_observable('scheduler', scheduler) 230 | 231 | scheduler.registered(self._driver, self._framework_id, object()) 232 | 233 | sample = RootMetrics().sample() 234 | assert sample['scheduler.framework_registered'] == 1 235 | 236 | scheduler.create_cluster( 237 | "cluster1", "mysql_user", 3, cluster_password='test_password') 238 | 239 | sample = RootMetrics().sample() 240 | assert sample['scheduler.cluster_count'] == 1 241 | assert sample['scheduler.total_requested_mem_mb'] == DEFAULT_TASK_MEM.as_(Data.MB) * 3 242 | assert sample['scheduler.total_requested_disk_mb'] == DEFAULT_TASK_DISK.as_(Data.MB) * 3 243 | assert sample['scheduler.total_requested_cpus'] == DEFAULT_TASK_CPUS * 3 244 | 245 | scheduler.resourceOffers(self._driver, [self._offer]) 246 | sample = RootMetrics().sample() 247 | assert sample['scheduler.resource_offers'] == 1 248 | assert sample['scheduler.tasks_launched'] == 1 249 | 250 | status = mesos_pb2.TaskStatus() 251 | status.state = mesos_pb2.TASK_RUNNING 252 | status.slave_id.value = self._offer.slave_id.value 253 | status.task_id.value = 'mysos-cluster1-0' 254 | 255 | scheduler.statusUpdate(self._driver, status) 256 | 257 | status.state = mesos_pb2.TASK_FAILED 258 | scheduler.statusUpdate(self._driver, status) 259 | 260 | sample = RootMetrics().sample() 261 | assert 
sample['scheduler.tasks_failed'] == 1 262 | 263 | scheduler.delete_cluster("cluster1", 'test_password') 264 | 265 | sample = RootMetrics().sample() 266 | assert sample['scheduler.cluster_count'] == 0 267 | assert sample['scheduler.total_requested_mem_mb'] == 0 268 | assert sample['scheduler.total_requested_disk_mb'] == 0 269 | assert sample['scheduler.total_requested_cpus'] == 0 270 | 271 | def test_scheduler_delete_empty_cluster(self): 272 | scheduler_key = gen_encryption_key() 273 | 274 | scheduler = MysosScheduler( 275 | self._state, 276 | self._state_provider, 277 | self._framework_user, 278 | "./executor.pex", 279 | "cmd.sh", 280 | self._zk_client, 281 | self._zk_url, 282 | Amount(5, Time.SECONDS), 283 | "/etc/mysos/admin_keyfile.yml", 284 | scheduler_key) 285 | 286 | scheduler.registered(self._driver, self._framework_id, object()) 287 | _, password = scheduler.create_cluster("cluster1", "mysql_user", 3) 288 | 289 | assert len(scheduler._launchers) == 1 290 | 291 | # Deleting the cluster before any offer comes in for launching any task. 
292 | scheduler.delete_cluster("cluster1", password) 293 | 294 | assert len(scheduler._launchers) == 0 295 | -------------------------------------------------------------------------------- /tests/scheduler/test_state.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import tempfile 4 | import unittest 5 | 6 | from mysos.scheduler.scheduler import DEFAULT_TASK_CPUS, DEFAULT_TASK_MEM, DEFAULT_TASK_DISK 7 | from mysos.scheduler.state import ( 8 | LocalStateProvider, 9 | MySQLCluster, 10 | MySQLTask, 11 | Scheduler 12 | ) 13 | from mysos.scheduler.password import gen_encryption_key, PasswordBox 14 | 15 | from mesos.interface.mesos_pb2 import FrameworkInfo 16 | 17 | 18 | if 'MYSOS_DEBUG' in os.environ: 19 | from twitter.common import log 20 | from twitter.common.log.options import LogOptions 21 | LogOptions.set_stderr_log_level('google:DEBUG') 22 | LogOptions.set_simple(True) 23 | log.init('mysos_tests') 24 | 25 | 26 | class TestState(unittest.TestCase): 27 | def setUp(self): 28 | self._tmpdir = tempfile.mkdtemp() 29 | self._state_provider = LocalStateProvider(self._tmpdir) 30 | 31 | def tearDown(self): 32 | shutil.rmtree(self._tmpdir, True) 33 | 34 | def test_scheduler_state(self): 35 | expected = Scheduler(FrameworkInfo( 36 | user='test_user', 37 | name='test_fw_name', 38 | checkpoint=True)) 39 | expected.clusters.add('cluster2') 40 | expected.clusters.add('cluster1') 41 | 42 | self._state_provider.dump_scheduler_state(expected) 43 | actual = self._state_provider.load_scheduler_state() 44 | 45 | assert expected.framework_info == actual.framework_info 46 | assert expected.clusters == actual.clusters 47 | 48 | def test_cluster_state(self): 49 | password_box = PasswordBox(gen_encryption_key()) 50 | 51 | expected = MySQLCluster( 52 | 'cluster1', 53 | 'cluster_user', 54 | password_box.encrypt('cluster_password'), 55 | 3, 56 | DEFAULT_TASK_CPUS, 57 | DEFAULT_TASK_MEM, 58 | DEFAULT_TASK_DISK) 59 | 60 | 
expected.tasks['task1'] = MySQLTask( 61 | 'cluster1', 'task1', 'slave1', 'host1', 10000) 62 | 63 | self._state_provider.dump_cluster_state(expected) 64 | actual = self._state_provider.load_cluster_state('cluster1') 65 | 66 | assert expected.user == actual.user 67 | assert isinstance(actual.num_nodes, int) 68 | assert expected.num_nodes == actual.num_nodes 69 | assert len(expected.tasks) == len(actual.tasks) 70 | assert expected.tasks['task1'].port == actual.tasks['task1'].port 71 | assert expected.encrypted_password == actual.encrypted_password 72 | assert password_box.match('cluster_password', actual.encrypted_password) 73 | -------------------------------------------------------------------------------- /tests/scheduler/test_zk_state.py: -------------------------------------------------------------------------------- 1 | import cPickle 2 | import os 3 | import unittest 4 | 5 | from mysos.scheduler.scheduler import DEFAULT_TASK_CPUS, DEFAULT_TASK_MEM, DEFAULT_TASK_DISK 6 | from mysos.scheduler.state import ( 7 | MySQLCluster, 8 | MySQLTask, 9 | Scheduler, 10 | StateProvider 11 | ) 12 | from mysos.scheduler.zk_state import ZooKeeperStateProvider 13 | 14 | from kazoo.handlers.threading import SequentialThreadingHandler 15 | from mesos.interface.mesos_pb2 import FrameworkInfo 16 | import pytest 17 | from zake.fake_client import FakeClient 18 | from zake.fake_storage import FakeStorage 19 | 20 | 21 | if 'MYSOS_DEBUG' in os.environ: 22 | from twitter.common import log 23 | from twitter.common.log.options import LogOptions 24 | LogOptions.set_stderr_log_level('google:DEBUG') 25 | LogOptions.set_simple(True) 26 | log.init('mysos_tests') 27 | 28 | 29 | class TestZooKeeperStateProvider(unittest.TestCase): 30 | def setUp(self): 31 | self._storage = FakeStorage(SequentialThreadingHandler()) 32 | self._client = FakeClient(storage=self._storage) 33 | self._client.start() 34 | self._state_provider = ZooKeeperStateProvider(self._client, '/mysos') 35 | 36 | def tearDown(self): 37 
| self._client.stop() 38 | 39 | def test_scheduler_state(self): 40 | expected = Scheduler(FrameworkInfo( 41 | user='test_user', 42 | name='test_fw_name', 43 | checkpoint=True)) 44 | expected.tasks = dict(taks1='cluster1', task2='cluster2') 45 | 46 | self._state_provider.dump_scheduler_state(expected) 47 | actual = self._state_provider.load_scheduler_state() 48 | 49 | assert expected.framework_info == actual.framework_info 50 | assert expected.tasks == actual.tasks 51 | 52 | def test_scheduler_state_errors(self): 53 | assert not self._state_provider.load_scheduler_state() # Not an error for scheduler state to be 54 | # not found. 55 | 56 | self._client.ensure_path("/mysos/state") 57 | self._client.create("/mysos/state/scheduler", cPickle.dumps(object())) 58 | with pytest.raises(StateProvider.Error): 59 | self._state_provider.load_scheduler_state() 60 | 61 | def test_cluster_state(self): 62 | expected = MySQLCluster( 63 | 'cluster1', 64 | 'cluster_user', 65 | 'cluster_password', 66 | 3, 67 | DEFAULT_TASK_CPUS, 68 | DEFAULT_TASK_MEM, 69 | DEFAULT_TASK_DISK) 70 | 71 | expected.tasks['task1'] = MySQLTask( 72 | 'cluster1', 'task1', 'slave1', 'host1', 10000) 73 | 74 | self._state_provider.dump_cluster_state(expected) 75 | actual = self._state_provider.load_cluster_state('cluster1') 76 | 77 | assert expected.user == actual.user 78 | assert isinstance(actual.num_nodes, int) 79 | assert expected.num_nodes == actual.num_nodes 80 | assert len(expected.tasks) == len(actual.tasks) 81 | assert expected.tasks['task1'].port == actual.tasks['task1'].port 82 | 83 | def test_cluster_state_errors(self): 84 | assert not self._state_provider.load_cluster_state('nonexistent') 85 | 86 | self._client.ensure_path("/mysos/state/clusters") 87 | self._client.create("/mysos/state/clusters/cluster1", cPickle.dumps(object())) 88 | with pytest.raises(StateProvider.Error): 89 | self._state_provider.load_cluster_state('cluster1') 90 | 
-------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27 3 | 4 | [testenv] 5 | deps = pytest 6 | install_command = pip install -e .[test] --find-links {toxinidir}/3rdparty {opts} {packages} 7 | commands = py.test -v -k 'not test_scheduler_runs' tests {posargs:} 8 | 9 | [testenv:integration] 10 | deps = 11 | pex<1 12 | pytest 13 | wheel 14 | whitelist_externals=mkdir 15 | commands = 16 | mkdir -p {toxinidir}/dist/ 17 | pip install --find-links {toxinidir}/3rdparty -e .[scheduler] 18 | pex \ 19 | --source-dir={toxinidir} \ 20 | --output-file={toxinidir}/dist/fake_mysos_executor.pex \ 21 | --requirement=mesos.native \ 22 | --requirement=zake \ 23 | --repo={toxinidir}/3rdparty \ 24 | --entry-point="mysos.executor.testing.fake_mysos_executor:proxy_main" 25 | py.test -v -k 'test_scheduler_runs' tests {posargs:} 26 | 27 | [testenv:style] 28 | deps = 29 | twitter.common.app==0.3.0 30 | twitter.common.collections==0.3.0 31 | twitter.common.lang==0.3.0 32 | twitter.common.log==0.3.0 33 | twitter.checkstyle==0.1.0 34 | skip_install = True 35 | commands = twitterstyle -n ImportOrder mysos tests 36 | 37 | # This currently requires the Vagrant VM to be up. 38 | # TODO(jyx): Launch Vagrant here directly. 39 | [testenv:vagrant] 40 | install_command = pip install -e .[test_client] --find-links {toxinidir}/3rdparty {opts} {packages} 41 | commands = {toxinidir}/vagrant/test.sh 42 | -------------------------------------------------------------------------------- /vagrant/bin/mysos_executor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -uex 4 | 5 | virtualenv venv # Create venv in the sandbox. 6 | 7 | # Using python to run pip and vagrant_mysos_executor because the shebang in venv/bin/pip can 8 | # exceed system limit and cannot be executed directly. 
9 | 10 | # 'protobuf' is a dependency of mesos.interface's but we install it separately because otherwise 11 | # 3.0.0-alpha is installed and it breaks the mesos.interface install. 12 | venv/bin/python venv/bin/pip install --find-links /home/vagrant/mysos/deps \ 13 | 'protobuf==2.6.1' mesos.native 14 | venv/bin/python venv/bin/pip install --pre --find-links /home/vagrant/mysos/deps --find-links . \ 15 | mysos[executor] 16 | 17 | venv/bin/python venv/bin/vagrant_mysos_executor 18 | -------------------------------------------------------------------------------- /vagrant/bin/mysos_scheduler.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -uex 4 | 5 | TMPDIR=$(mktemp -d) 6 | 7 | virtualenv $TMPDIR # Create venv under /tmp. 8 | 9 | # 'protobuf' is a dependency of mesos.interface's but we install it separately because otherwise 10 | # 3.0.0-alpha is installed and it breaks the mesos.interface install. 11 | $TMPDIR/bin/pip install --find-links /home/vagrant/mysos/deps 'protobuf==2.6.1' mesos.native 12 | $TMPDIR/bin/pip install --pre --find-links /home/vagrant/mysos/dist \ 13 | --find-links /home/vagrant/mysos/deps mysos[scheduler] 14 | 15 | ZK_HOST=192.168.33.17 16 | API_PORT=55001 17 | 18 | # NOTE: In --executor_environ we are pointing MYSOS_DEFAULTS_FILE to an empty MySQL defaults file. 19 | # The file 'my5.6.cnf' is pre-installed by the 'mysql-server-5.6' package on the VM. 
20 | $TMPDIR/bin/mysos_scheduler \ 21 | --port=$API_PORT \ 22 | --framework_user=vagrant \ 23 | --mesos_master=zk://$ZK_HOST:2181/mesos/master \ 24 | --executor_uri=/home/vagrant/mysos/dist/mysos-0.1.0-dev0.zip \ 25 | --executor_cmd=/home/vagrant/mysos/vagrant/bin/mysos_executor.sh \ 26 | --zk_url=zk://$ZK_HOST:2181/mysos \ 27 | --admin_keypath=/home/vagrant/mysos/vagrant/etc/admin_keyfile.yml \ 28 | --framework_failover_timeout=1m \ 29 | --framework_role=mysos \ 30 | --framework_authentication_file=/home/vagrant/mysos/vagrant/etc/fw_auth_keyfile.yml \ 31 | --scheduler_keypath=/home/vagrant/mysos/vagrant/etc/scheduler_keyfile.txt \ 32 | --executor_source_prefix='vagrant.devcluster' \ 33 | --executor_environ='[{"name": "MYSOS_DEFAULTS_FILE", "value": "/etc/mysql/conf.d/my5.6.cnf"}]' 34 | -------------------------------------------------------------------------------- /vagrant/etc/admin_keyfile.yml: -------------------------------------------------------------------------------- 1 | username: sys.mysos.0 2 | password: testpassword -------------------------------------------------------------------------------- /vagrant/etc/framework_keys.txt: -------------------------------------------------------------------------------- 1 | mysos testpassword -------------------------------------------------------------------------------- /vagrant/etc/fw_auth_keyfile.yml: -------------------------------------------------------------------------------- 1 | principal: mysos 2 | secret: testpassword -------------------------------------------------------------------------------- /vagrant/etc/scheduler_keyfile.txt: -------------------------------------------------------------------------------- 1 | 73SZAptK4K6i2sB8fw6B0aQf0qLO6zmw -------------------------------------------------------------------------------- /vagrant/provision-dev-cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | # This script requires mysos 
binaries to be built. Run `tox` first. 4 | 5 | # Install dependencies. 6 | export DEBIAN_FRONTEND=noninteractive 7 | aptitude update -q 8 | aptitude install -q -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" \ 9 | curl \ # We use curl --silent to download packages. 10 | libcurl3-dev \ # Mesos requirement. 11 | libsasl2-dev \ # Mesos requirement. 12 | python-dev \ 13 | zookeeper \ 14 | mysql-server-5.6 \ 15 | libmysqlclient-dev \ 16 | python-virtualenv \ 17 | libffi-dev # For pynacl. 18 | 19 | # Fix up Ubuntu mysql-server-5.6 issue: mysql_install_db looks for this file even if we don't need 20 | # it. 21 | ln -sf /usr/share/doc/mysql-server-5.6/examples/my-default.cnf /usr/share/mysql/my-default.cnf 22 | 23 | # Set the hostname to the IP address. This simplifies things for components that want to advertise 24 | # the hostname to the user, or other components. 25 | hostname 192.168.33.17 26 | 27 | # Install an update-mysos tool to sync mysos binaries and configs into the VM. 28 | cat > /usr/local/bin/update-mysos <