├── .gitignore
├── LICENSE
├── README.md
├── atlas.json
├── bin
│   ├── install.sh
│   ├── local_deploy.sh
│   └── local_install.sh
├── dat
│   └── foo.tsv
├── doc
│   ├── 00.mesos_ui.png
│   ├── 01.framework.png
│   ├── 02.sandbox.png
│   ├── activity.png
│   ├── classes.png
│   ├── framework.png
│   ├── hashring.png
│   ├── kernel.png
│   ├── sample_tsp.png
│   ├── tsp_fitness.R
│   ├── tsp_fitness.png
│   ├── tsp_fitness.tsv
│   └── tutorial.graffle
│       ├── data.plist
│       └── image1.tiff
└── src
    ├── contain.py
    ├── exelixi.py
    ├── ga.py
    ├── hashring.py
    ├── monoids.py
    ├── resource.py
    ├── sample_lmd.py
    ├── sample_tsp.py
    ├── service.py
    ├── uow.py
    └── util.py

/.gitignore:
--------------------------------------------------------------------------------
1 | **~
2 | *.py[cod]
3 | exelixi.log
4 | .DS_Store
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Packages
10 | *.egg
11 | *.egg-info
12 | dist
13 | build
14 | eggs
15 | parts
16 | var
17 | sdist
18 | develop-eggs
19 | .installed.cfg
20 | lib
21 | lib64
22 | __pycache__
23 | 
24 | # Installer logs
25 | pip-log.txt
26 | 
27 | # Unit test / coverage reports
28 | .coverage
29 | .tox
30 | nosetests.xml
31 | 
32 | # Translations
33 | *.mo
34 | 
35 | # Mr Developer
36 | .mr.developer.cfg
37 | .project
38 | .pydevproject
39 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 | 
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 | 1. Definitions.
8 | 
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 | 
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 | 
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 | 
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 | 
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 | 
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 | 
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship.
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 | 
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 | 
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 | 
176 | END OF TERMS AND CONDITIONS
177 | 
178 | APPENDIX: How to apply the Apache License to your work.
179 | 
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 | 
189 | Copyright {yyyy} {name of copyright owner}
190 | 
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 | 
195 | http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 | 
203 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Exelixi
2 | 
3 | **Exelixi** is a distributed framework based on [Apache Mesos],
4 | mostly implemented in Python using [gevent] for high-performance concurrency.
5 | It is intended to run cluster computing jobs (partitioned batch jobs, which include some messaging) in pure Python.
6 | By default, it runs [genetic algorithms] at scale.
7 | However, it can handle a broad range of other problem domains by
8 | using the `--uow` command line option to override the `UnitOfWorkFactory` class definition.
9 | 
10 | Please see the [project wiki](https://github.com/ceteri/exelixi/wiki) for more details,
11 | including a [tutorial](https://github.com/ceteri/exelixi/wiki/Tutorial:-Fog-Computing-at-Hella-Scale)
12 | on how to build Mesos-based frameworks.
13 | 
14 | 
15 | ### Quick Start
16 | 
17 | To check out the [GA] on a laptop (with Python 2.7 installed), simply run:
18 | 
19 | ./src/ga.py
20 | 
21 | Otherwise, to run at scale, the following steps will help you get **Exelixi** running on [Apache Mesos].
22 | For help in general with command line options:
23 | 
24 | ./src/exelixi.py -h
25 | 
26 | The following instructions are based on using the [Elastic Mesos] service,
27 | which uses Ubuntu Linux servers running on [Amazon AWS].
28 | Even so, the basic outline of steps shown here applies in general.
29 | 
30 | First, launch an [Apache Mesos] cluster.
31 | Once you have confirmation that your cluster is running
32 | (e.g., [Elastic Mesos] sends you an email message with a list of masters and slaves),
33 | use `ssh` to log in on any of the masters:
34 | 
35 | ssh -A -l ubuntu <master>
36 | 
37 | You must install the [Python bindings](https://github.com/apache/mesos/tree/master/src/python) for [Apache Mesos].
38 | The default Mesos version referenced in this code changes as [Elastic Mesos](https://elastic.mesosphere.io/) gets updated,
39 | since the tutorials are based on that service.
40 | You can check [http://mesosphere.io/downloads/](http://mesosphere.io/downloads/) for the latest.
41 | If you run Mesos in a different environment,
42 | simply make a one-line change to the `EGG` environment variable in the `bin/local_install.sh` script.
43 | You also need to install the **Exelixi** source.
44 | 
45 | On the Mesos master, download the `master` branch of the **Exelixi** code repo on GitHub and install the required libraries:
46 | 
47 | wget https://github.com/ceteri/exelixi/archive/master.zip ; \
48 | unzip master.zip ; \
49 | cd exelixi-master ; \
50 | ./bin/local_install.sh
51 | 
52 | If you've customized the code by forking your own GitHub code repo, then substitute that download URL instead.
53 | Alternatively, if you've customized by subclassing the `uow.UnitOfWorkFactory` default [GA],
54 | then place that Python source file into the `src/` subdirectory (see the sketch below).
55 | 
56 | Next, run the installation command on the master to set up each of the slaves:
57 | 
58 | ./src/exelixi.py -n localhost:5050 | ./bin/install.sh
59 | 
60 | Now launch the Framework, which in turn launches the worker services remotely on slave nodes.
61 | In the following case, it runs workers on two slave nodes:
62 | 
63 | ./src/exelixi.py -m localhost:5050 -w 2
64 | 
65 | Once everything has been set up successfully, the log file `exelixi.log` will show the line:
66 | 
67 | all worker services launched and init tasks completed
68 | 
69 | From there, the [GA] runs.
70 | See a [GitHub gist](https://gist.github.com/ceteri/7609046) for an example of a successful run.
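
### Customizing the Unit of Work

The `--uow` option described above takes a `PKG.CLASS` name for a `UnitOfWorkFactory` subclass.
As a purely hypothetical sketch (the `src/my_uow.py` file, the `MyFactory` class, and the bit-string encoding below are illustrative, not part of this repo), a custom [GA] only needs to override the hooks that `src/ga.py` actually calls:

    # src/my_uow.py -- hypothetical example of a custom UnitOfWorkFactory
    from random import randint, random
    from uow import UnitOfWorkFactory

    class MyFactory (UnitOfWorkFactory):
        """toy GA: maximize the number of 1 bits in a feature set"""

        def generate_features (self):
            # a random 8-bit feature set
            return tuple(randint(0, 1) for _ in range(8))

        def mutate_features (self, feature_set):
            # flip one randomly chosen bit
            pos = randint(0, len(feature_set) - 1)
            return tuple(1 - b if i == pos else b for i, b in enumerate(feature_set))

        def breed_features (self, f_feature_set, m_feature_set):
            # uniform crossover of the two parent feature sets
            return tuple(f if random() < 0.5 else m for f, m in zip(f_feature_set, m_feature_set))

        def get_fitness (self, feature_set):
            # fitness ranges [0.0, 1.0]; higher is better
            return sum(feature_set) / float(len(feature_set))

Population size, generation count, and the selection/mutation rates would be inherited from the default factory.
Assuming such a file is placed in `src/` as described above, it would launch with:

    ./src/exelixi.py -m localhost:5050 -w 2 --uow my_uow.MyFactory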
71 | 72 | 73 | ### Blame List 74 | 75 | [Paco Nathan](https://github.com/ceteri) 76 | 77 | 78 | [Amazon AWS]: http://aws.amazon.com/ 79 | [Apache Mesos]: http://mesos.apache.org/ 80 | [Elastic Mesos]: https://elastic.mesosphere.io/ 81 | [GA]: http://en.wikipedia.org/wiki/Genetic_algorithm 82 | [Python egg]: https://wiki.python.org/moin/egg 83 | [genetic algorithms]: http://en.wikipedia.org/wiki/Genetic_algorithm 84 | [gevent]: http://www.gevent.org/ 85 | -------------------------------------------------------------------------------- /atlas.json: -------------------------------------------------------------------------------- 1 | { 2 | "files": [ 3 | "README.md" 4 | ], 5 | "formats": { 6 | "pdf": { 7 | "version": false, 8 | "index": false, 9 | "toc": false 10 | }, 11 | "epub": { 12 | "index": false, 13 | "toc": false, 14 | "epubcheck": false 15 | }, 16 | "mobi": { 17 | "index": false, 18 | "toc": false 19 | }, 20 | "html": { 21 | "index": true, 22 | "toc": true 23 | } 24 | }, 25 | "theme": "oreillymedia/atlas_tech1c_theme", 26 | "title": "exelixi" 27 | } -------------------------------------------------------------------------------- /bin/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 4 | 5 | # build a tarball/container for the Executor 6 | 7 | rm -rf /tmp/exelixi.tgz 8 | tar cvzf /tmp/exelixi.tgz ../exelixi-master/bin ../exelixi-master/src ../exelixi-master/dat 9 | 10 | # distribute tarball/container to the Mesos slaves via HDFS 11 | 12 | hadoop fs -rm -f -R /exelixi 13 | hadoop fs -mkdir /exelixi 14 | hadoop fs -put /tmp/exelixi.tgz /exelixi 15 | 16 | # run installer on each of the Mesos slaves 17 | 18 | printf "UserKnownHostsFile /dev/null\nStrictHostKeyChecking no\n" >> ~/.ssh/config 19 | 20 | while read slave 21 | do 22 | echo $slave 23 | ssh $slave 'bash -s' < $DIR/local_install.sh 24 | ssh $slave 'bash -s' < $DIR/local_deploy.sh 25 | 26 | if [ ! -z $1 ] 27 | then 28 | # optional job-specific installations 29 | ssh $slave 'bash -s' < $1 30 | fi 31 | done -------------------------------------------------------------------------------- /bin/local_deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | echo "deploying Exelixi..." 4 | rm -rf exelixi.tgz exelixi-master 5 | hadoop fs -get /exelixi/exelixi.tgz 6 | tar xvzf exelixi.tgz 7 | -------------------------------------------------------------------------------- /bin/local_install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | EGG="mesos_0.15.0-rc4_amd64.egg" 4 | 5 | echo "installing Python/Mesos..." 6 | ## NB: TODO de-Ubuntu-fy the Python parts of this install, hopefully via Anaconda/conda? 7 | sudo aptitude -y install python-setuptools 8 | sudo aptitude -y install python-protobuf 9 | sudo aptitude -y install python-gevent 10 | sudo aptitude -y install python-psutil 11 | sudo aptitude -y install python-dev 12 | sudo aptitude -y install python-pip 13 | 14 | sudo aptitude -y install git 15 | sudo pip install cython 16 | sudo pip install git+https://github.com/kmike/hat-trie.git#egg=hat-trie 17 | 18 | rm -rf $EGG 19 | wget http://downloads.mesosphere.io/master/ubuntu/13.10/$EGG 20 | sudo easy_install $EGG 21 | 22 | echo "testing Python/Mesos..." 
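# NB: a non-zero exit from the import test below indicates the egg did not install correctly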
23 | python -c 'import mesos' 24 | -------------------------------------------------------------------------------- /dat/foo.tsv: -------------------------------------------------------------------------------- 1 | 93 11 23 69 2 | -------------------------------------------------------------------------------- /doc/00.mesos_ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2iq-archive/exelixi/81bb97d3e99fe055e3816a5692b4dc29cdce6c94/doc/00.mesos_ui.png -------------------------------------------------------------------------------- /doc/01.framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2iq-archive/exelixi/81bb97d3e99fe055e3816a5692b4dc29cdce6c94/doc/01.framework.png -------------------------------------------------------------------------------- /doc/02.sandbox.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2iq-archive/exelixi/81bb97d3e99fe055e3816a5692b4dc29cdce6c94/doc/02.sandbox.png -------------------------------------------------------------------------------- /doc/activity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2iq-archive/exelixi/81bb97d3e99fe055e3816a5692b4dc29cdce6c94/doc/activity.png -------------------------------------------------------------------------------- /doc/classes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2iq-archive/exelixi/81bb97d3e99fe055e3816a5692b4dc29cdce6c94/doc/classes.png -------------------------------------------------------------------------------- /doc/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2iq-archive/exelixi/81bb97d3e99fe055e3816a5692b4dc29cdce6c94/doc/framework.png -------------------------------------------------------------------------------- /doc/hashring.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2iq-archive/exelixi/81bb97d3e99fe055e3816a5692b4dc29cdce6c94/doc/hashring.png -------------------------------------------------------------------------------- /doc/kernel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2iq-archive/exelixi/81bb97d3e99fe055e3816a5692b4dc29cdce6c94/doc/kernel.png -------------------------------------------------------------------------------- /doc/sample_tsp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2iq-archive/exelixi/81bb97d3e99fe055e3816a5692b4dc29cdce6c94/doc/sample_tsp.png -------------------------------------------------------------------------------- /doc/tsp_fitness.R: -------------------------------------------------------------------------------- 1 | data <- read.delim('~/src/exelixi/doc/tsp_fitness.tsv', header=F) 2 | plot(ecdf(data$V1), main="TSP fitness distribution", xlab="fitness value", ylab="CDF") 3 | abline(h=.8, col="blue") -------------------------------------------------------------------------------- /doc/tsp_fitness.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/d2iq-archive/exelixi/81bb97d3e99fe055e3816a5692b4dc29cdce6c94/doc/tsp_fitness.png -------------------------------------------------------------------------------- /doc/tsp_fitness.tsv: -------------------------------------------------------------------------------- 1 | 0.2750 2 | 0.2750 3 | 0.2750 4 | 0.3202 5 | 0.3202 6 | 0.3202 7 | 0.3202 8 | 0.3202 9 | 0.3250 10 | 0.3250 11 | 0.3250 12 | 0.3375 13 | 0.3375 14 | 0.3375 15 | 0.3538 16 | 0.3538 17 | 0.3538 18 | 0.5096 19 | 0.5144 20 | 0.5240 21 | 0.5240 22 | 0.5240 23 | 0.5288 24 | 0.5337 25 | 0.5337 26 | 0.6106 27 | 0.6106 28 | 0.6106 29 | 0.6154 30 | 0.6154 31 | 0.6154 32 | 0.6202 33 | 0.6202 34 | 0.6202 35 | 0.6202 36 | 0.6202 37 | 0.6202 38 | 0.6202 39 | 0.6250 40 | 0.6250 41 | 0.6250 42 | 0.6298 43 | 0.6298 44 | 0.6298 45 | 0.6298 46 | 0.6298 47 | 0.6298 48 | 0.6346 49 | 0.6346 50 | 0.6394 51 | 0.6394 52 | 0.6442 53 | 0.6538 54 | 0.6538 55 | 0.6538 56 | 0.6635 57 | 0.6635 58 | 0.6635 59 | 0.6683 60 | 0.6683 61 | 0.6683 62 | 0.6731 63 | 0.6731 64 | 0.6779 65 | 0.6779 66 | 0.6779 67 | 0.6779 68 | 0.6827 69 | 0.6827 70 | 0.7163 71 | 0.7212 72 | 0.7260 73 | 0.7260 74 | 0.7308 75 | 0.7308 76 | 0.7404 77 | 0.7404 78 | 0.7500 79 | 0.7596 80 | 0.7596 81 | 0.7596 82 | 0.7644 83 | 0.7644 84 | 0.7644 85 | 0.7644 86 | 0.7644 87 | 0.7692 88 | 0.7692 89 | 0.7692 90 | 0.7692 91 | 0.7692 92 | 0.7692 93 | 0.7740 94 | 0.7740 95 | 0.7788 96 | 0.7788 97 | 0.7837 98 | -------------------------------------------------------------------------------- /doc/tutorial.graffle/image1.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d2iq-archive/exelixi/81bb97d3e99fe055e3816a5692b4dc29cdce6c94/doc/tutorial.graffle/image1.tiff -------------------------------------------------------------------------------- /src/contain.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | from collections import namedtuple 5 | from gevent import Greenlet 6 | from json import dumps, loads 7 | from os.path import abspath 8 | from service import UnitOfWork 9 | from uow import UnitOfWorkFactory 10 | import logging 11 | import sys 12 | 13 | 14 | ###################################################################### 15 | ## class definitions 16 | 17 | class Container (object): 18 | """Container for a distrib Py UnitOfWork""" 19 | 20 | def __init__ (self): 21 | """constructor""" 22 | self.param_space = [] 23 | 24 | ## NB: override to specify the data source 25 | self.file_name = abspath('dat/foo.tsv') 26 | ## NB: override to define the fields of a result tuple 27 | self.Result = namedtuple('Foo', ['bar', 'ugh']) 28 | 29 | 30 | def data_load (self, file_name): 31 | """load the specified data file""" 32 | ## NB: override to load the data file 33 | self.param_space.append(23) 34 | 35 | 36 | def run_calc (self, params): 37 | """run calculations based on the given param space element""" 38 | ## NB: override to calculate a job 39 | return self.Result(93, 11) 40 | 41 | 42 | class ContainerUOWFactory (UnitOfWorkFactory): 43 | """UnitOfWorkFactory definition for distrib Py jobs""" 44 | 45 | def __init__ (self): 46 | #super(UnitOfWorkFactory, self).__init__() 47 | pass 48 | 49 | def instantiate_uow (self, uow_name, prefix): 50 | return ContainerUOW(uow_name, prefix, Container()) 51 | 52 | 53 | class ContainerUOW (UnitOfWork): 54 | """UnitOfWork definition for distrib Py jobs""" 55 | def __init__ (self, 
uow_name, prefix, container): 56 | super(ContainerUOW, self).__init__(uow_name, prefix) 57 | self._shard = {} 58 | 59 | self._container = container 60 | self.results = [] 61 | 62 | 63 | def perform_task (self, payload): 64 | """perform a task consumed from the Worker.task_queue""" 65 | logging.debug(payload) 66 | 67 | if "job" in payload: 68 | result = self._container.run_calc(payload["job"]) 69 | self.results.append(result) 70 | logging.debug(result) 71 | elif "nop" in payload: 72 | pass 73 | 74 | 75 | def orchestrate (self, framework): 76 | """initialize shards, then iterate until all percentiles are trained""" 77 | framework.send_ring_rest("shard/init", {}) 78 | framework.send_ring_rest("data/load", { "file": self._container.file_name }) 79 | 80 | self._container.data_load(self._container.file_name) 81 | framework.phase_barrier() 82 | 83 | while len(self._container.param_space) > 0: 84 | for shard_id, shard_uri in framework.get_worker_list(): 85 | if len(self._container.param_space) > 0: 86 | params = self._container.param_space.pop(0) 87 | framework.send_worker_rest(shard_id, shard_uri, "calc/run", { "params": params }) 88 | 89 | framework.phase_barrier() 90 | 91 | # report the results 92 | needs_header = True 93 | 94 | for shard_msg in framework.send_ring_rest("shard/dump", {}): 95 | payload = loads(shard_msg) 96 | 97 | if needs_header: 98 | print "\t".join(payload["fields"]) 99 | needs_header = False 100 | 101 | for result in payload["results"]: 102 | print "\t".join(map(lambda x: str(x), result)) 103 | 104 | 105 | def handle_endpoints (self, worker, uri_path, env, start_response, body): 106 | """UnitOfWork REST endpoints, delegated from the Worker""" 107 | if uri_path == '/shard/init': 108 | # initialize the shard 109 | Greenlet(self.shard_init, worker, env, start_response, body).start() 110 | return True 111 | elif uri_path == '/data/load': 112 | # load the data 113 | Greenlet(self.data_load, worker, env, start_response, body).start() 114 | return True 115 | elif uri_path == '/calc/run': 116 | # run the calculations 117 | Greenlet(self.calc_run, worker, env, start_response, body).start() 118 | return True 119 | elif uri_path == '/shard/dump': 120 | # dump the results 121 | Greenlet(self.shard_dump, worker, env, start_response, body).start() 122 | return True 123 | else: 124 | return False 125 | 126 | 127 | ###################################################################### 128 | ## job-specific REST endpoints implemented as gevent coroutines 129 | 130 | def shard_init (self, *args, **kwargs): 131 | """initialize a shard""" 132 | worker = args[0] 133 | payload, start_response, body = worker.get_response_context(args[1:]) 134 | 135 | if worker.auth_request(payload, start_response, body): 136 | self.set_ring(worker.shard_id, worker.ring) 137 | worker.prep_task_queue() 138 | 139 | start_response('200 OK', [('Content-Type', 'text/plain')]) 140 | body.put("Bokay\r\n") 141 | body.put(StopIteration) 142 | 143 | 144 | def data_load (self, *args, **kwargs): 145 | """prepare for calculations""" 146 | worker = args[0] 147 | payload, start_response, body = worker.get_response_context(args[1:]) 148 | 149 | if worker.auth_request(payload, start_response, body): 150 | with worker.wrap_task_event(): 151 | # HTTP response first, then initiate long-running task 152 | start_response('200 OK', [('Content-Type', 'text/plain')]) 153 | body.put("Bokay\r\n") 154 | body.put(StopIteration) 155 | 156 | # load the data file 157 | logging.debug(payload["file"]) 158 | 
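# delegate to the Container's data_load() override, which loads the specified file on this worker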
self._container.data_load(payload["file"])
159 | 
160 | # put a NOP into the queue, so we'll have something to join on
161 | worker.put_task_queue({ "nop": True })
162 | 
163 | 
164 | def calc_run (self, *args, **kwargs):
165 | """enqueue one calculation"""
166 | worker = args[0]
167 | payload, start_response, body = worker.get_response_context(args[1:])
168 | 
169 | if worker.auth_request(payload, start_response, body):
170 | with worker.wrap_task_event():
171 | # caller expects JSON response
172 | start_response('200 OK', [('Content-Type', 'application/json')])
173 | body.put(dumps({ "ok": 1 }))
174 | body.put("\r\n")
175 | body.put(StopIteration)
176 | 
177 | # put the params into the queue
178 | worker.put_task_queue({ "job": payload["params"] })
179 | 
180 | 
181 | def shard_dump (self, *args, **kwargs):
182 | """dump the results"""
183 | worker = args[0]
184 | payload, start_response, body = worker.get_response_context(args[1:])
185 | 
186 | if worker.auth_request(payload, start_response, body):
187 | start_response('200 OK', [('Content-Type', 'application/json')])
188 | body.put(dumps({ "fields": self.results[0]._fields, "results": self.results }))
189 | body.put("\r\n")
190 | body.put(StopIteration)
191 | 
192 | 
193 | if __name__=='__main__':
194 | ## test the Container-based UnitOfWork in standalone mode, without distributed services
195 | pass
196 | 
--------------------------------------------------------------------------------
/src/exelixi.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | 
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 16 | # author: Paco Nathan 17 | # https://github.com/ceteri/exelixi 18 | 19 | 20 | from argparse import ArgumentParser 21 | from os.path import abspath 22 | from service import Framework, Worker 23 | from util import get_master_leader, get_master_state, pipe_slave_list 24 | import logging 25 | import sys 26 | 27 | 28 | ###################################################################### 29 | ## globals 30 | 31 | APP_NAME = "Exelixi" 32 | 33 | 34 | ###################################################################### 35 | ## command line arguments 36 | 37 | def parse_cli_args (): 38 | parser = ArgumentParser(prog="Exelixi", usage="one of the operational modes shown below...", add_help=True, 39 | description="Exelixi, a distributed framework for genetic algorithms, based on Apache Mesos") 40 | 41 | group1 = parser.add_argument_group("Mesos Framework", "run as a distributed framework on an Apache Mesos cluster") 42 | group1.add_argument("-m", "--master", metavar="HOST:PORT", nargs=1, 43 | help="location for one of the masters") 44 | group1.add_argument("-w", "--workers", nargs=1, type=int, default=[1], 45 | help="number of workers to be launched") 46 | 47 | group1.add_argument("--cpu", nargs=1, type=int, default=[1], 48 | help="CPU allocation per worker, as CPU count") 49 | group1.add_argument("--mem", nargs=1, type=int, default=[32], 50 | help="MEM allocation per worker, as MB/shard") 51 | 52 | group2 = parser.add_argument_group("Mesos Executor", "run as an Apache Mesos executor (using no arguments)") 53 | 54 | group3 = parser.add_argument_group("Standalone Framework", "run as a test framework in standalone mode") 55 | group3.add_argument("-s", "--slaves", nargs="+", metavar="HOST:PORT", 56 | help="list of slaves (HOST:PORT) on which to run workers") 57 | 58 | group4 = parser.add_argument_group("Standalone Worker", "run as a test worker in standalone mode") 59 | group4.add_argument("-p", "--port", nargs=1, metavar="PORT", 60 | help="port number to use for this service") 61 | 62 | group5 = parser.add_argument_group("Nodes", "enumerate the slave nodes in an Apache Mesos cluster") 63 | group5.add_argument("-n", "--nodes", nargs="?", metavar="HOST:PORT", 64 | help="location for one of the Apache Mesos masters") 65 | 66 | parser.add_argument("--uow", nargs=1, metavar="PKG.CLASS", default=["uow.UnitOfWorkFactory"], 67 | help="subclassed UnitOfWork definition") 68 | 69 | parser.add_argument("--prefix", nargs=1, default=["hdfs://exelixi"], 70 | help="path prefix for durable storage") 71 | 72 | parser.add_argument("--log", nargs=1, default=["DEBUG"], 73 | help="logging level: INFO, DEBUG, WARNING, ERROR, CRITICAL") 74 | 75 | return parser.parse_args() 76 | 77 | 78 | if __name__=='__main__': 79 | # interpret CLI arguments 80 | args = parse_cli_args() 81 | 82 | if args.nodes: 83 | # query and report the slave list, then exit... 
84 | # NB: one per line, to handle large clusters gracefully
85 | pipe_slave_list(args.nodes)
86 | sys.exit(0)
87 | 
88 | # set up logging
89 | numeric_log_level = getattr(logging, args.log[0], None)
90 | 
91 | if not isinstance(numeric_log_level, int):
92 | raise ValueError("Invalid log level: %s" % args.log[0])
93 | 
94 | logging.basicConfig(format="%(asctime)s\t%(levelname)s\t%(message)s",
95 | filename="exelixi.log",
96 | filemode="w",
97 | level=numeric_log_level
98 | )
99 | logging.debug(args)
100 | 
101 | # report settings for options
102 | opts = []
103 | 
104 | if args.uow:
105 | opts.append(" ...using %s for the UnitOfWork definitions" % (args.uow[0]))
106 | 
107 | if args.prefix:
108 | opts.append(" ...using %s for the path prefix in durable storage" % (args.prefix[0]))
109 | 
110 | # handle the different operational modes
111 | if args.master:
112 | logging.info("%s: running a Framework atop an Apache Mesos cluster", APP_NAME)
113 | logging.info(" ...with master %s and %d worker(s)", args.master[0], args.workers[0])
114 | 
115 | for x in opts:
116 | logging.info(x)
117 | 
118 | try:
119 | from resource import MesosScheduler
120 | 
121 | master_uri = get_master_leader(args.master[0])
122 | exe_path = abspath(sys.argv[0])
123 | 
124 | # run Mesos driver to launch Framework and manage resource offers
125 | driver = MesosScheduler.start_framework(master_uri, exe_path, args.workers[0], args.uow[0], args.prefix[0], args.cpu[0], args.mem[0])
126 | MesosScheduler.stop_framework(driver)
127 | except ImportError as e:
128 | logging.critical("Python module 'mesos' has not been installed", exc_info=True)
129 | raise
130 | 
131 | elif args.slaves:
132 | logging.info("%s: running a Framework in standalone mode", APP_NAME)
133 | logging.info(" ...with slave(s) %s", args.slaves)
134 | 
135 | for x in opts:
136 | logging.info(x)
137 | 
138 | # run UnitOfWork orchestration via REST endpoints on the workers
139 | fra = Framework(args.uow[0], args.prefix[0])
140 | fra.set_worker_list(args.slaves)
141 | fra.orchestrate_uow()
142 | 
143 | elif args.port:
144 | logging.info("%s: running a worker service on port %s", APP_NAME, args.port[0])
145 | 
146 | try:
147 | svc = Worker(port=int(args.port[0]))
148 | svc.shard_start()
149 | except KeyboardInterrupt:
150 | pass
151 | 
152 | else:
153 | logging.info("%s: running an Executor on an Apache Mesos slave", APP_NAME)
154 | 
155 | try:
156 | from resource import MesosExecutor
157 | MesosExecutor.run_executor()
158 | except ImportError as e:
159 | logging.critical("Python module 'mesos' has not been installed", exc_info=True)
160 | raise
161 | except KeyboardInterrupt:
162 | pass
163 | 
--------------------------------------------------------------------------------
/src/ga.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | 
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | # author: Paco Nathan
17 | # https://github.com/ceteri/exelixi
18 | 
19 | 
20 | from hat_trie import Trie
21 | from collections import Counter
22 | from gevent import Greenlet
23 | from hashlib import sha224
24 | from hashring import HashRing
25 | from json import dumps, loads
26 | from monoids import dictm
27 | from random import random, sample
28 | from service import UnitOfWork
29 | from string import ascii_lowercase
30 | from util import instantiate_class, post_distrib_rest
31 | import logging
32 | import sys
33 | 
34 | 
35 | ######################################################################
36 | ## class definitions
37 | 
38 | class Population (UnitOfWork):
39 | def __init__ (self, uow_name, prefix, indiv_instance):
40 | super(Population, self).__init__(uow_name, prefix)
41 | 
42 | logging.debug("INIT POPULATION")
43 | 
44 | self.indiv_class = indiv_instance.__class__
45 | self.total_indiv = 0
46 | self.current_gen = 0
47 | 
48 | self._shard = {}
49 | self._trie = Trie(ascii_lowercase)
50 | 
51 | 
52 | def perform_task (self, payload):
53 | """perform a task consumed from the Worker.task_queue"""
54 | key = payload["key"]
55 | gen = payload["gen"]
56 | feature_set = payload["feature_set"]
57 | self.receive_reify(key, gen, feature_set)
58 | 
59 | 
60 | def orchestrate (self, framework):
61 | """
62 | initialize a Population of unique Individuals at generation 0,
63 | then iterate N times or until a "good enough" solution is found
64 | """
65 | framework.send_ring_rest("pop/init", {})
66 | framework.send_ring_rest("pop/gen", {})
67 | fitness_cutoff = 0  # NB: default cutoff, in case termination triggers before the first histogram pass
68 | while True:
69 | framework.phase_barrier()
70 | 
71 | if self.current_gen == self.uow_factory.n_gen:
72 | break
73 | 
74 | # determine the fitness cutoff threshold
75 | self.total_indiv = 0
76 | hist = {}
77 | 
78 | for shard_msg in framework.send_ring_rest("pop/hist", {}):
79 | logging.debug(shard_msg)
80 | payload = loads(shard_msg)
81 | self.total_indiv += payload["total_indiv"]
82 | hist = dictm.fold([hist, payload["hist"]])
83 | 
84 | # test for the terminating condition
85 | hist_items = sorted(map(lambda x: (float(x[0]), x[1],), hist.items()), reverse=True)
86 | 
87 | if self.test_termination(self.current_gen, hist_items):
88 | break
89 | 
90 | ## NB: TODO save Framework state to Zookeeper
91 | 
92 | # apply the fitness cutoff and breed "children" for the
93 | # next generation
94 | fitness_cutoff = self.get_fitness_cutoff(hist_items)
95 | framework.send_ring_rest("pop/next", { "current_gen": self.current_gen, "fitness_cutoff": fitness_cutoff })
96 | self.current_gen += 1
97 | 
98 | # report the best Individuals in the final result
99 | results = []
100 | 
101 | for l in framework.send_ring_rest("pop/enum", { "fitness_cutoff": fitness_cutoff }):
102 | results.extend(loads(l))
103 | 
104 | results.sort(reverse=True)
105 | 
106 | for x in results:
107 | # print results to stdout
108 | print "\t".join(x)
109 | 
110 | 
111 | def handle_endpoints (self, worker, uri_path, env, start_response, body):
112 | """UnitOfWork REST endpoints, delegated from the Worker"""
113 | if uri_path == '/pop/init':
114 | # initialize the Population subset on this shard
115 | Greenlet(self.pop_init, worker, env, start_response, body).start()
116 | return True
117 | elif uri_path == '/pop/gen':
118 | # create generation 0 in this shard
119 | Greenlet(self.pop_gen, worker, env, start_response, body).start()
120 | return True
121 | elif uri_path == '/pop/hist':
122 | # calculate a partial histogram for the fitness distribution
123 | Greenlet(self.pop_hist, worker, env, start_response,
body).start() 124 | return True 125 | elif uri_path == '/pop/next': 126 | # attempt to run another generation 127 | Greenlet(self.pop_next, worker, env, start_response, body).start() 128 | return True 129 | elif uri_path == '/pop/enum': 130 | # enumerate the Individuals in this shard of the Population 131 | Greenlet(self.pop_enum, worker, env, start_response, body).start() 132 | return True 133 | elif uri_path == '/pop/reify': 134 | # test/add a new Individual into the Population (birth) 135 | Greenlet(self.pop_reify, worker, env, start_response, body).start() 136 | return True 137 | else: 138 | return False 139 | 140 | 141 | ###################################################################### 142 | ## GA-specific REST endpoints implemented as gevent coroutines 143 | 144 | def pop_init (self, *args, **kwargs): 145 | """initialize a Population of unique Individuals on this shard""" 146 | worker = args[0] 147 | payload, start_response, body = worker.get_response_context(args[1:]) 148 | 149 | if worker.auth_request(payload, start_response, body): 150 | self.set_ring(worker.shard_id, worker.ring) 151 | worker.prep_task_queue() 152 | 153 | start_response('200 OK', [('Content-Type', 'text/plain')]) 154 | body.put("Bokay\r\n") 155 | body.put(StopIteration) 156 | 157 | 158 | def pop_gen (self, *args, **kwargs): 159 | """create generation 0 of Individuals in this shard of the Population""" 160 | worker = args[0] 161 | payload, start_response, body = worker.get_response_context(args[1:]) 162 | 163 | if worker.auth_request(payload, start_response, body): 164 | with worker.wrap_task_event(): 165 | # HTTP response first, then initiate long-running task 166 | start_response('200 OK', [('Content-Type', 'text/plain')]) 167 | body.put("Bokay\r\n") 168 | body.put(StopIteration) 169 | 170 | self.populate(0) 171 | 172 | 173 | def pop_hist (self, *args, **kwargs): 174 | """calculate a partial histogram for the fitness distribution""" 175 | worker = args[0] 176 | payload, start_response, body = worker.get_response_context(args[1:]) 177 | 178 | if worker.auth_request(payload, start_response, body): 179 | start_response('200 OK', [('Content-Type', 'application/json')]) 180 | body.put(dumps({ "total_indiv": self.total_indiv, "hist": self.get_part_hist() })) 181 | body.put("\r\n") 182 | body.put(StopIteration) 183 | 184 | 185 | def pop_next (self, *args, **kwargs): 186 | """iterate N times or until a 'good enough' solution is found""" 187 | worker = args[0] 188 | payload, start_response, body = worker.get_response_context(args[1:]) 189 | 190 | if worker.auth_request(payload, start_response, body): 191 | with worker.wrap_task_event(): 192 | # HTTP response first, then initiate long-running task 193 | start_response('200 OK', [('Content-Type', 'text/plain')]) 194 | body.put("Bokay\r\n") 195 | body.put(StopIteration) 196 | 197 | current_gen = payload["current_gen"] 198 | fitness_cutoff = payload["fitness_cutoff"] 199 | self.next_generation(current_gen, fitness_cutoff) 200 | 201 | 202 | def pop_enum (self, *args, **kwargs): 203 | """enumerate the Individuals in this shard of the Population""" 204 | worker = args[0] 205 | payload, start_response, body = worker.get_response_context(args[1:]) 206 | 207 | if worker.auth_request(payload, start_response, body): 208 | fitness_cutoff = payload["fitness_cutoff"] 209 | 210 | start_response('200 OK', [('Content-Type', 'application/json')]) 211 | body.put(dumps(self.enum(fitness_cutoff))) 212 | body.put("\r\n") 213 | body.put(StopIteration) 214 | 215 | 216 | def pop_reify (self, 
*args, **kwargs): 217 | """test/add a newly generated Individual into the Population (birth)""" 218 | worker = args[0] 219 | payload, start_response, body = worker.get_response_context(args[1:]) 220 | 221 | if worker.auth_request(payload, start_response, body): 222 | worker.put_task_queue(payload) 223 | 224 | start_response('200 OK', [('Content-Type', 'text/plain')]) 225 | body.put("Bokay\r\n") 226 | body.put(StopIteration) 227 | 228 | 229 | ###################################################################### 230 | ## Individual lifecycle within the local subset of the Population 231 | 232 | def populate (self, current_gen): 233 | """initialize the population""" 234 | for _ in xrange(self.uow_factory.n_pop): 235 | # constructor pattern 236 | indiv = self.indiv_class() 237 | indiv.populate(current_gen, self.uow_factory.generate_features()) 238 | 239 | # add the generated Individual to the Population 240 | # failure semantics: must filter nulls from initial population 241 | self.reify(indiv) 242 | 243 | 244 | def reify (self, indiv): 245 | """test/add a newly generated Individual into the Population (birth)""" 246 | neighbor_shard_id = None 247 | shard_uri = None 248 | 249 | if self._hash_ring: 250 | neighbor_shard_id = self._hash_ring.get_node(indiv.key) 251 | 252 | if neighbor_shard_id != self._shard_id: 253 | shard_uri = self._shard_dict[neighbor_shard_id] 254 | 255 | # distribute the tasks in this phase throughout the HashRing, 256 | # using a remote task_queue with synchronization based on a 257 | # barrier pattern 258 | 259 | if shard_uri: 260 | msg = { "key": indiv.key, "gen": indiv.gen, "feature_set": loads(indiv.get_json_feature_set()) } 261 | lines = post_distrib_rest(self.prefix, neighbor_shard_id, shard_uri, "pop/reify", msg) 262 | return False 263 | else: 264 | return self._reify_locally(indiv) 265 | 266 | 267 | def receive_reify (self, key, gen, feature_set): 268 | """test/add a received reify request """ 269 | indiv = self.indiv_class() 270 | indiv.populate(gen, feature_set) 271 | self._reify_locally(indiv) 272 | 273 | 274 | def _reify_locally (self, indiv): 275 | """test/add a newly generated Individual into the Population locally (birth)""" 276 | if not (indiv.key in self._trie): 277 | self._trie[indiv.key] = 1 278 | self.total_indiv += 1 279 | 280 | # potentially an expensive operation, deferred until remote reification 281 | indiv.get_fitness(self.uow_factory, force=True) 282 | self._shard[indiv.key] = indiv 283 | 284 | return True 285 | else: 286 | return False 287 | 288 | 289 | def evict (self, indiv): 290 | """remove an Individual from the Population (death)""" 291 | if indiv.key in self._shard: 292 | # Individual only needs to be removed locally 293 | del self._shard[indiv.key] 294 | 295 | # NB: serialize to disk (write behinds) 296 | url = self._get_storage_path(indiv) 297 | 298 | 299 | def get_part_hist (self): 300 | """tally counts for the partial histogram of the fitness distribution""" 301 | l = [ round(indiv.get_fitness(self.uow_factory, force=False), self.uow_factory.hist_granularity) for indiv in self._shard.values() ] 302 | return dict(Counter(l)) 303 | 304 | 305 | def get_fitness_cutoff (self, hist_items): 306 | """determine fitness cutoff (bin lower bounds) for the parent selection filter""" 307 | logging.debug("fit: %s", hist_items) 308 | 309 | n_indiv = sum([ count for bin, count in hist_items ]) 310 | part_sum = 0 311 | break_next = False 312 | 313 | for bin, count in hist_items: 314 | if break_next: 315 | break 316 | 317 | part_sum += count 318 | 
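# running percentile: the cumulative fraction of the population counted so far, scanning from the fittest bin down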
percentile = part_sum / float(n_indiv) 319 | break_next = percentile >= self.uow_factory.selection_rate 320 | 321 | logging.debug("fit: percentile %f part_sum %d n_indiv %d bin %f", percentile, part_sum, n_indiv, bin) 322 | return bin 323 | 324 | 325 | def _get_storage_path (self, indiv): 326 | """create a path for durable storage of an Individual""" 327 | return self.prefix + "/" + indiv.key 328 | 329 | 330 | def _boost_diversity (self, current_gen, indiv): 331 | """randomly select other individuals and mutate them, to promote genetic diversity""" 332 | if self.uow_factory.mutation_rate > random(): 333 | indiv.mutate(self, current_gen, self.uow_factory) 334 | elif len(self._shard.values()) >= 3: 335 | # NB: ensure that at least three parents remain in each 336 | # shard per generation 337 | self.evict(indiv) 338 | 339 | 340 | def _select_parents (self, current_gen, fitness_cutoff): 341 | """select the parents for the next generation""" 342 | partition = map(lambda x: (round(x.get_fitness(), self.uow_factory.hist_granularity) > fitness_cutoff, x), self._shard.values()) 343 | good_fit = map(lambda x: x[1], filter(lambda x: x[0], partition)) 344 | poor_fit = map(lambda x: x[1], filter(lambda x: not x[0], partition)) 345 | 346 | # randomly select other individuals to promote genetic 347 | # diversity, while removing the remnant 348 | for indiv in poor_fit: 349 | self._boost_diversity(current_gen, indiv) 350 | 351 | return self._shard.values() 352 | 353 | 354 | def next_generation (self, current_gen, fitness_cutoff): 355 | """select/mutate/crossover parents to produce a new generation""" 356 | parents = self._select_parents(current_gen, fitness_cutoff) 357 | 358 | for _ in xrange(self.uow_factory.n_pop - len(parents)): 359 | f, m = sample(parents, 2) 360 | success = f.breed(self, current_gen, m, self.uow_factory) 361 | 362 | # backfill to replenish / avoid the dreaded Population collapse 363 | new_count = 0 364 | 365 | for _ in xrange(self.uow_factory.n_pop - len(self._shard.values())): 366 | # constructor pattern 367 | indiv = self.indiv_class() 368 | indiv.populate(current_gen, self.uow_factory.generate_features()) 369 | self.reify(indiv) 370 | 371 | logging.info("gen\t%d\tshard\t%s\tsize\t%d\ttotal\t%d", current_gen, self._shard_id, len(self._shard.values()), self.total_indiv) 372 | 373 | 374 | def test_termination (self, current_gen, hist): 375 | """evaluate the terminating condition for this generation and report progress""" 376 | return self.uow_factory.test_termination(current_gen, hist, self.total_indiv) 377 | 378 | 379 | def enum (self, fitness_cutoff): 380 | """enum all Individuals that exceed the given fitness cutoff""" 381 | return [[ "indiv", "%0.4f" % indiv.get_fitness(), str(indiv.gen), indiv.get_json_feature_set() ] 382 | for indiv in filter(lambda x: x.get_fitness() >= fitness_cutoff, self._shard.values()) ] 383 | 384 | 385 | class Individual (object): 386 | def __init__ (self): 387 | """create an Individual member of the Population""" 388 | self.gen = None 389 | self.key = None 390 | self._feature_set = None 391 | self._fitness = None 392 | 393 | 394 | def get_fitness (self, uow_factory=None, force=False): 395 | """determine the fitness ranging [0.0, 1.0]; higher is better""" 396 | if uow_factory and uow_factory.use_force(force): 397 | # potentially the most expensive operation, deferred with careful consideration 398 | self._fitness = uow_factory.get_fitness(self._feature_set) 399 | 400 | return self._fitness 401 | 402 | 403 | def get_json_feature_set (self): 404 | """dump 
the feature set as a JSON string"""
405 | return dumps(tuple(self._feature_set))
406 | 
407 | 
408 | def populate (self, gen, feature_set):
409 | """populate the instance variables"""
410 | self.gen = gen
411 | self._feature_set = feature_set
412 | 
413 | # create a unique key using a SHA-2 (SHA-224) digest of the JSON representing this feature set
414 | m = sha224()
415 | m.update(self.get_json_feature_set())
416 | self.key = unicode(m.hexdigest())
417 | 
418 | 
419 | def mutate (self, pop, gen, uow_factory):
420 | """attempt to mutate the feature set"""
421 | # constructor pattern
422 | mutant = self.__class__()
423 | mutant.populate(gen, uow_factory.mutate_features(self._feature_set))
424 | 
425 | # add the mutant Individual to the Population, but remove its prior self
426 | # failure semantics: ignore, mutation rate is approx upper bounds
427 | if pop.reify(mutant):
428 | pop.evict(self)
429 | return True
430 | else:
431 | return False
432 | 
433 | 
434 | def breed (self, pop, gen, mate, uow_factory):
435 | """breed with a mate to produce a child"""
436 | # constructor pattern
437 | child = self.__class__()
438 | child.populate(gen, uow_factory.breed_features(self._feature_set, mate._feature_set))
439 | 
440 | # add the child Individual to the Population
441 | # failure semantics: ignore, the count will rebalance over the hash ring
442 | return pop.reify(child)
443 | 
444 | 
445 | if __name__=='__main__':
446 | ## test GA in standalone mode, without distributed services
447 | 
448 | # parse command line options
449 | if len(sys.argv) < 2:
450 | uow_name = "uow.UnitOfWorkFactory"
451 | else:
452 | uow_name = sys.argv[1]
453 | 
454 | uow_factory = instantiate_class(uow_name)
455 | 
456 | # initialize a Population of unique Individuals at generation 0
457 | uow = uow_factory.instantiate_uow(uow_name, "/tmp/exelixi")
458 | uow.populate(uow.current_gen)
459 | fitness_cutoff = 0
460 | 
461 | # iterate N times or until a "good enough" solution is found
462 | while uow.current_gen < uow_factory.n_gen:
463 | hist = uow.get_part_hist()
464 | hist_items = map(lambda x: (float(x[0]), x[1],), sorted(hist.items(), reverse=True))
465 | 
466 | if uow.test_termination(uow.current_gen, hist_items):
467 | break
468 | 
469 | fitness_cutoff = uow.get_fitness_cutoff(hist_items)
470 | uow.next_generation(uow.current_gen, fitness_cutoff)
471 | 
472 | uow.current_gen += 1
473 | 
474 | # report summary
475 | for x in sorted(uow.enum(fitness_cutoff), reverse=True):
476 | print "\t".join(x)
477 | 
--------------------------------------------------------------------------------
/src/hashring.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | 
4 | # Copyright (c) 2012, Amir Salihefendic
5 | # All rights reserved.
6 | #
7 | # Redistribution and use in source and binary forms, with or without modification,
8 | # are permitted provided that the following conditions are met:
9 | #
10 | # 1. Redistributions of source code must retain the above copyright notice, this
11 | # list of conditions and the following disclaimer.
12 | #
13 | # 2. Redistributions in binary form must reproduce the above copyright notice,
14 | # this list of conditions and the following disclaimer in the documentation
15 | # and/or other materials provided with the distribution.
16 | #
17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 | # IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
21 | # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
22 | # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
25 | # OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
26 | # OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 
28 | # author: Amir Salihefendic
29 | # http://amix.dk/blog/post/19367
30 | 
31 | 
32 | import md5
33 | 
34 | 
35 | class HashRing(object):
36 | 
37 | def __init__(self, nodes=None, replicas=3):
38 | """Manages a hash ring.
39 | 
40 | `nodes` is a list of objects that have a proper __str__ representation.
41 | `replicas` indicates how many virtual points should be used per node;
42 | replicas are required to improve the distribution.
43 | """
44 | self.replicas = replicas
45 | 
46 | self.ring = dict()
47 | self._sorted_keys = []
48 | 
49 | if nodes:
50 | for node in nodes:
51 | self.add_node(node)
52 | 
53 | def add_node(self, node):
54 | """Adds a `node` to the hash ring (including a number of replicas).
55 | """
56 | for i in xrange(0, self.replicas):
57 | key = self.gen_key('%s:%s' % (node, i))
58 | self.ring[key] = node
59 | self._sorted_keys.append(key)
60 | 
61 | self._sorted_keys.sort()
62 | 
63 | def remove_node(self, node):
64 | """Removes `node` from the hash ring and its replicas.
65 | """
66 | for i in xrange(0, self.replicas):
67 | key = self.gen_key('%s:%s' % (node, i))
68 | del self.ring[key]
69 | self._sorted_keys.remove(key)
70 | 
71 | def get_node(self, string_key):
72 | """Given a string key, the corresponding node in the hash ring is returned.
73 | 
74 | If the hash ring is empty, `None` is returned.
75 | """
76 | return self.get_node_pos(string_key)[0]
77 | 
78 | def get_node_pos(self, string_key):
79 | """Given a string key, the corresponding node in the hash ring is returned
80 | along with its position in the ring.
81 | 
82 | If the hash ring is empty, (`None`, `None`) is returned.
83 | """
84 | if not self.ring:
85 | return None, None
86 | 
87 | key = self.gen_key(string_key)
88 | 
89 | nodes = self._sorted_keys
90 | for i in xrange(0, len(nodes)):
91 | node = nodes[i]
92 | if key <= node:
93 | return self.ring[node], i
94 | 
95 | return self.ring[nodes[0]], 0
96 | 
97 | def get_nodes(self, string_key):
98 | """Given a string key, it returns the nodes that can hold the key, as a generator.
99 | 
100 | The generator is never ending and iterates through the ring
101 | starting at the correct position.
102 | """
103 | if not self.ring:
104 | yield None, None
105 | return
106 | node, pos = self.get_node_pos(string_key)
107 | for key in self._sorted_keys[pos:]:
108 | yield self.ring[key]
109 | 
110 | while True:
111 | for key in self._sorted_keys:
112 | yield self.ring[key]
113 | 
114 | def gen_key(self, key):
115 | """Given a string key, it returns a long value;
116 | this long value represents a place on the hash ring.
117 | 
118 | md5 is currently used because it mixes well.
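(md5 is used here only to spread keys evenly around the ring, not for security)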
/src/monoids.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | 
4 | # Francisco Mota, 2011-11-09
5 | # http://fmota.eu/blog/monoids-in-python.html
6 | # see also: http://arxiv.org/abs/1304.7544
7 | 
8 | class Monoid (object):
9 |     def __init__ (self, null, lift, op):
10 |         self.null = null
11 |         self.lift = lift
12 |         self.op = op
13 | 
14 |     def fold (self, xs):
15 |         if hasattr(xs, "__fold__"):
16 |             return xs.__fold__(self)
17 |         else:
18 |             return reduce(self.op, (self.lift(x) for x in xs), self.null)
19 | 
20 |     def __call__ (self, *args):
21 |         return self.fold(args)
22 | 
23 |     def star (self):
24 |         # lifts to a monoid over iterables, e.g. summ.star()([1, 2], [3, 4]) == 10
25 |         return Monoid(self.null, self.fold, self.op)
26 | 
27 | def dict_op (a, b):
28 |     for key, val in b.items():
29 |         if key not in a:
30 |             a[key] = val
31 |         else:
32 |             a[key] += val
33 | 
34 |     return a
35 | 
36 | 
37 | summ = Monoid(0, lambda x: x, lambda a,b: a+b)
38 | joinm = Monoid('', lambda x: str(x), lambda a,b: a+b)
39 | listm = Monoid([], lambda x: [x], lambda a,b: a+b)
40 | tuplem = Monoid((), lambda x: (x,), lambda a,b: a+b)
41 | lenm = Monoid(0, lambda x: 1, lambda a,b: a+b)
42 | prodm = Monoid(1, lambda x: x, lambda a,b: a*b)
43 | dictm = Monoid({}, lambda x: x, lambda a,b: dict_op(a, b))
44 | 
45 | 
46 | if __name__=='__main__':
47 |     x1 = { "a": 2, "b": 3 }
48 |     x2 = { "b": 2, "c": 7 }
49 | 
50 |     print x1, x2
51 |     print dictm.fold([x1, x2])
52 | 
--------------------------------------------------------------------------------
/src/resource.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | 
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 16 | # author: Paco Nathan 17 | # https://github.com/ceteri/exelixi 18 | 19 | 20 | from json import dumps, loads 21 | from service import Framework, Worker, WorkerInfo 22 | from threading import Thread 23 | from util import get_telemetry 24 | from uuid import uuid1 25 | import logging 26 | import mesos 27 | import mesos_pb2 28 | import os 29 | import subprocess 30 | import sys 31 | import time 32 | 33 | 34 | ###################################################################### 35 | ## class definitions 36 | 37 | class MesosScheduler (mesos.Scheduler): 38 | # https://github.com/apache/mesos/blob/master/src/python/src/mesos.py 39 | 40 | def __init__ (self, executor, exe_path, n_workers, uow_name, prefix, cpu_alloc, mem_alloc): 41 | self.executor = executor 42 | self.taskData = {} 43 | self.tasksLaunched = 0 44 | self.tasksFinished = 0 45 | self.messagesSent = 0 46 | self.messagesReceived = 0 47 | 48 | # resource requirements 49 | self._cpu_alloc = cpu_alloc 50 | self._mem_alloc = mem_alloc 51 | 52 | # protected members to customize for Exelixi needs 53 | self._executors = {} 54 | self._exe_path = exe_path 55 | self._n_workers = n_workers 56 | self._uow_name = uow_name 57 | self._prefix = prefix 58 | 59 | 60 | def registered (self, driver, frameworkId, masterInfo): 61 | """ 62 | Invoked when the scheduler successfully registers with a Mesos 63 | master. It is called with the frameworkId, a unique ID 64 | generated by the master, and the masterInfo which is 65 | information about the master itself. 66 | """ 67 | 68 | logging.info("registered with framework ID %s", frameworkId.value) 69 | 70 | 71 | def resourceOffers (self, driver, offers): 72 | """ 73 | Invoked when resources have been offered to this framework. A 74 | single offer will only contain resources from a single slave. 75 | Resources associated with an offer will not be re-offered to 76 | _this_ framework until either (a) this framework has rejected 77 | those resources (see SchedulerDriver.launchTasks) or (b) those 78 | resources have been rescinded (see Scheduler.offerRescinded). 79 | Note that resources may be concurrently offered to more than 80 | one framework at a time (depending on the allocator being 81 | used). In that case, the first framework to launch tasks 82 | using those resources will be able to use them while the other 83 | frameworks will have those resources rescinded (or if a 84 | framework has already launched tasks with those resources then 85 | those tasks will fail with a TASK_LOST status and a message 86 | saying as much). 87 | """ 88 | 89 | logging.debug("Mesos Scheduler: received %d resource offers", len(offers)) 90 | 91 | for offer in offers: 92 | tasks = [] 93 | logging.debug("Mesos Scheduler: received resource offer %s", offer.id.value) 94 | 95 | ## NB: currently we force 'offer.hostname' to be unique per Executor... 
96 | ## could be changed, but we'd need to juggle the service port numbers 97 | 98 | if self.tasksLaunched < self._n_workers and offer.hostname not in self._executors: 99 | tid = self.tasksLaunched 100 | self.tasksLaunched += 1 101 | logging.debug("Mesos Scheduler: accepting offer on slave %s to start task %d", offer.hostname, tid) 102 | 103 | task = mesos_pb2.TaskInfo() 104 | task.task_id.value = str(tid) 105 | task.slave_id.value = offer.slave_id.value 106 | task.name = "task %d" % tid 107 | task.executor.MergeFrom(self.executor) 108 | 109 | cpus = task.resources.add() 110 | cpus.name = "cpus" 111 | cpus.type = mesos_pb2.Value.SCALAR 112 | cpus.scalar.value = self._cpu_alloc 113 | 114 | mem = task.resources.add() 115 | mem.name = "mem" 116 | mem.type = mesos_pb2.Value.SCALAR 117 | mem.scalar.value = self._mem_alloc 118 | 119 | tasks.append(task) 120 | self.taskData[task.task_id.value] = (offer.slave_id, task.executor.executor_id) 121 | 122 | # record and report the Mesos slave node's telemetry and state 123 | self._executors[offer.hostname] = WorkerInfo(offer, task) 124 | 125 | for exe in self._executors.values(): 126 | logging.debug(exe.report()) 127 | 128 | # request the driver to launch the task 129 | driver.launchTasks(offer.id, tasks) 130 | 131 | 132 | def statusUpdate (self, driver, update): 133 | """ 134 | Invoked when the status of a task has changed (e.g., a slave 135 | is lost and so the task is lost, a task finishes and an 136 | executor sends a status update saying so, etc.) Note that 137 | returning from this callback acknowledges receipt of this 138 | status update. If for whatever reason the scheduler aborts 139 | during this callback (or the process exits) another status 140 | update will be delivered. Note, however, that this is 141 | currently not true if the slave sending the status update is 142 | lost or fails during that time. 143 | """ 144 | 145 | logging.debug("Mesos Scheduler: task %s is in state %d", update.task_id.value, update.state) 146 | 147 | if update.state == mesos_pb2.TASK_FINISHED: 148 | self.tasksFinished += 1 149 | slave_id, executor_id = self.taskData[update.task_id.value] 150 | 151 | # update WorkerInfo with telemetry from initial discovery task 152 | telemetry = loads(str(update.data)) 153 | logging.info("telemetry from slave %s, executor %s\n%s", slave_id.value, executor_id.value, str(update.data)) 154 | 155 | exe = self.lookup_executor(slave_id.value, executor_id.value) 156 | exe.ip_addr = telemetry["ip_addr"] 157 | 158 | ## NB: TODO make the service port a parameter 159 | exe.port = Worker.DEFAULT_PORT 160 | 161 | if self.tasksFinished == self._n_workers: 162 | logging.info("Mesos Scheduler: %d init tasks completed", self._n_workers) 163 | 164 | # request to launch service as a child process 165 | self.messagesSent += 1 166 | message = str(dumps([ self._exe_path, "-p", exe.port ])) 167 | driver.sendFrameworkMessage(executor_id, slave_id, message) 168 | 169 | 170 | def frameworkMessage (self, driver, executorId, slaveId, message): 171 | """ 172 | Invoked when an executor sends a message. These messages are 173 | best effort; do not expect a framework message to be 174 | retransmitted in any reliable fashion. 
175 | """ 176 | 177 | self.messagesReceived += 1 178 | logging.info("Mesos Scheduler: slave %s executor %s", slaveId.value, executorId.value) 179 | logging.info("message %d received: %s", self.messagesReceived, str(message)) 180 | 181 | if self.messagesReceived == self._n_workers: 182 | if self.messagesReceived != self.messagesSent: 183 | logging.critical("Mesos Scheduler: framework messages lost! sent %d received %d", self.messagesSent, self.messagesReceived) 184 | sys.exit(1) 185 | 186 | for exe in self._executors.values(): 187 | logging.debug(exe.report()) 188 | 189 | logging.info("all worker services launched and init tasks completed") 190 | exe_info = self._executors.values() 191 | worker_list = [ exe.get_shard_uri() for exe in exe_info ] 192 | 193 | # run UnitOfWork orchestration via REST endpoints on the workers 194 | fra = Framework(self._uow_name, self._prefix) 195 | fra.set_worker_list(worker_list, exe_info) 196 | 197 | time.sleep(1) 198 | fra.orchestrate_uow() 199 | 200 | # shutdown the Executors after the end of an algorithm run 201 | driver.stop() 202 | 203 | 204 | def lookup_executor (self, slave_id, executor_id): 205 | """lookup the Executor based on IDs""" 206 | for exe in self._executors.values(): 207 | if exe.slave_id == slave_id: 208 | return exe 209 | 210 | 211 | @staticmethod 212 | def start_framework (master_uri, exe_path, n_workers, uow_name, prefix, cpu_alloc, mem_alloc): 213 | # initialize an executor 214 | executor = mesos_pb2.ExecutorInfo() 215 | executor.executor_id.value = uuid1().hex 216 | executor.command.value = exe_path 217 | executor.name = "Exelixi Executor" 218 | executor.source = "per-job build" 219 | 220 | ## NB: TODO download tarball/container from HDFS 221 | #uri = executor.command.uris.add() 222 | #uri.executable = false 223 | #uri.value = "hdfs://namenode/exelixi/exelixi.tgz" 224 | 225 | # initialize the framework 226 | framework = mesos_pb2.FrameworkInfo() 227 | framework.user = "" # have Mesos fill in the current user 228 | framework.name = "Exelixi Framework" 229 | 230 | if os.getenv("MESOS_CHECKPOINT"): 231 | logging.debug("Mesos Scheduler: enabling checkpoint for the framework") 232 | framework.checkpoint = True 233 | 234 | # create a scheduler and capture the command line options 235 | sched = MesosScheduler(executor, exe_path, n_workers, uow_name, prefix, cpu_alloc, mem_alloc) 236 | 237 | # initialize a driver 238 | if os.getenv("MESOS_AUTHENTICATE"): 239 | logging.debug("Mesos Scheduler: enabling authentication for the framework") 240 | 241 | if not os.getenv("DEFAULT_PRINCIPAL"): 242 | logging.critical("Mesos Scheduler: expecting authentication principal in the environment") 243 | sys.exit(1); 244 | 245 | if not os.getenv("DEFAULT_SECRET"): 246 | logging.critical("Mesos Scheduler: expecting authentication secret in the environment") 247 | sys.exit(1); 248 | 249 | credential = mesos_pb2.Credential() 250 | credential.principal = os.getenv("DEFAULT_PRINCIPAL") 251 | credential.secret = os.getenv("DEFAULT_SECRET") 252 | 253 | driver = mesos.MesosSchedulerDriver(sched, framework, master_uri, credential) 254 | else: 255 | driver = mesos.MesosSchedulerDriver(sched, framework, master_uri) 256 | 257 | return driver 258 | 259 | 260 | @staticmethod 261 | def stop_framework (driver): 262 | """ensure that the driver process terminates""" 263 | status = 0 if driver.run() == mesos_pb2.DRIVER_STOPPED else 1 264 | driver.stop(); 265 | sys.exit(status) 266 | 267 | 268 | class MesosExecutor (mesos.Executor): 269 | # 
https://github.com/apache/mesos/blob/master/src/python/src/mesos.py 270 | 271 | def launchTask (self, driver, task): 272 | """ 273 | Invoked when a task has been launched on this executor 274 | (initiated via Scheduler.launchTasks). Note that this task 275 | can be realized with a thread, a process, or some simple 276 | computation, however, no other callbacks will be invoked on 277 | this executor until this callback has returned. 278 | """ 279 | 280 | ## NB: the following code runs on the Mesos slave (source of the resource offer) 281 | 282 | def run_task(): 283 | logging.debug("Mesos Executor: requested task %s", task.task_id.value) 284 | 285 | update = mesos_pb2.TaskStatus() 286 | update.task_id.value = task.task_id.value 287 | update.state = mesos_pb2.TASK_RUNNING 288 | update.data = str("running discovery task") 289 | 290 | logging.debug(update.data) 291 | driver.sendStatusUpdate(update) 292 | 293 | update = mesos_pb2.TaskStatus() 294 | update.task_id.value = task.task_id.value 295 | update.state = mesos_pb2.TASK_FINISHED 296 | 297 | ## NB: TODO test port availability... 298 | update.data = str(dumps(get_telemetry(), indent=4)) 299 | 300 | ## NB: TODO download tarball/container for service launch 301 | 302 | # notify scheduler: ready to launch service 303 | logging.debug(update.data) 304 | driver.sendStatusUpdate(update) 305 | 306 | # now create a thread to run the requested task: run tasks in 307 | # new threads or processes, rather than inside launchTask... 308 | # NB: gevent/coroutines/Greenlets conflict here... must run 309 | # those in a child shell process 310 | 311 | thread = Thread(target=run_task) 312 | thread.start() 313 | 314 | 315 | def frameworkMessage (self, driver, message): 316 | """ 317 | Invoked when a framework message has arrived for this 318 | executor. These messages are best effort; do not expect a 319 | framework message to be retransmitted in any reliable fashion. 320 | """ 321 | 322 | # launch service 323 | logging.info("Mesos Executor: service launched: %s", message) 324 | subprocess.Popen(loads(message)) 325 | 326 | # notify scheduler: service was successfully launched 327 | driver.sendFrameworkMessage(str("service launched")) 328 | 329 | 330 | @staticmethod 331 | def run_executor (): 332 | """run the executor until it is stopped externally by the framework""" 333 | driver = mesos.MesosExecutorDriver(MesosExecutor()) 334 | sys.exit(0 if driver.run() == mesos_pb2.DRIVER_STOPPED else 1) 335 | 336 | 337 | if __name__=='__main__': 338 | print "Starting executor..." 339 | MesosExecutor.run_executor() 340 | -------------------------------------------------------------------------------- /src/sample_lmd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 
16 | # author: Paco Nathan
17 | # https://github.com/ceteri/exelixi
18 | 
19 | 
20 | from collections import namedtuple
21 | from copy import deepcopy
22 | from random import randint, sample
23 | from uow import UnitOfWorkFactory
24 | import logging
25 | import sys
26 | 
27 | 
28 | ######################################################################
29 | ## class definitions
30 | 
31 | OPS = ( "rend", "turn", "sup", "loop" )
32 | 
33 | Point = namedtuple('Point', 'x y')
34 | 
35 | DIR_W = Point(1, 0)    # exec_op_turn: DIR_W -> DIR_N
36 | DIR_S = Point(0, 1)    # exec_op_turn: DIR_S -> DIR_W
37 | DIR_E = Point(-1, 0)   # exec_op_turn: DIR_E -> DIR_S
38 | DIR_N = Point(0, -1)   # exec_op_turn: DIR_N -> DIR_E
39 | 
40 | 
41 | class Drone (object):
42 |     def __init__ (self, x, y):
43 |         self.pos = Point(x, y)
44 |         self.dir = Point(1, 0)
45 | 
46 | 
47 |     def _mod_math (self, pos, dir, mod):
48 |         result = pos + dir
49 | 
50 |         if result < 0:
51 |             result += mod
52 |         else:
53 |             result %= mod
54 | 
55 |         return result
56 | 
57 | 
58 |     def exec_op_sup (self, mod, sup):
59 |         x = self._mod_math(self.pos.x, sup.x, mod)
60 |         y = self._mod_math(self.pos.y, sup.y, mod)
61 |         self.pos = Point(x, y)
62 |         return x, y
63 | 
64 | 
65 |     def exec_op_move (self, mod):
66 |         x = self._mod_math(self.pos.x, self.dir.x, mod)
67 |         y = self._mod_math(self.pos.y, self.dir.y, mod)
68 |         self.pos = Point(x, y)
69 |         return x, y
70 | 
71 | 
72 |     def exec_op_turn (self):
73 |         if self.dir.x == DIR_W.x and self.dir.y == DIR_W.y:
74 |             self.dir = DIR_N
75 |         elif self.dir.x == DIR_S.x and self.dir.y == DIR_S.y:
76 |             self.dir = DIR_W
77 |         elif self.dir.x == DIR_E.x and self.dir.y == DIR_E.y:
78 |             self.dir = DIR_S
79 |         elif self.dir.x == DIR_N.x and self.dir.y == DIR_N.y:
80 |             self.dir = DIR_E
81 | 
82 | 
83 | class LMDFactory (UnitOfWorkFactory):
84 |     """UnitOfWork definition for Lawnmower Drone GP"""
85 | 
86 |     def __init__ (self):
87 |         #super(UnitOfWorkFactory, self).__init__()
88 |         self.n_pop = 300
89 |         self.n_gen = 200
90 |         self.max_indiv = 20000
91 |         self.selection_rate = 0.3
92 |         self.mutation_rate = 0.3
93 |         self.term_limit = 5.0e-02
94 |         self.hist_granularity = 3
95 | 
96 |         self.grid = [
97 |             [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ],
98 |             [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ],
99 |             [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ],
100 |             [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ],
101 |             [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ],
102 |             [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ],
103 |             [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ],
104 |             [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ],
105 |             [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ],
106 |             [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ],
107 |             ]
108 | 
109 |         # sampling parameters
110 |         self.length = len(self.grid) ** 2
111 |         self.min = 0
112 |         self.max = len(OPS) - 1
113 | 
114 | 
115 |     def generate_features (self):
116 |         """generate a new feature set for a lawnmower drone"""
117 |         rand_len = randint(1, self.length)
118 |         feature_set = []
119 | 
120 |         while len(feature_set) < rand_len:
121 |             op = randint(self.min, self.max)
122 | 
123 |             if op == OPS.index("sup"):
124 |                 feature_set.append(op)
125 |                 feature_set.append(randint(0, len(self.grid) - 1))
126 |                 feature_set.append(randint(0, len(self.grid) - 1))
127 | 
128 |             elif op == OPS.index("loop"):
129 |                 if len(feature_set) > 2:
130 |                     offset = randint(1, len(feature_set) - 1)
131 |                     feature_set.append(op)
132 |                     feature_set.append(offset)
133 | 
134 |             else:
135 |                 feature_set.append(op)
136 | 
137 |         return feature_set
138 | 
139 | 
140 |     def mutate_features (self, feature_set):
141 |         """mutate a copy of the given GP program"""
142 |         pos_to_mutate = randint(0, len(feature_set) - 1)
143 |         mutated_feature_set = list(feature_set)
144 |         mutated_feature_set[pos_to_mutate] = randint(self.min, self.max)
145 |         return mutated_feature_set
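    # illustrative decoding of the op-codes above: the feature set is a linear
    # program over OPS, e.g. [0, 1, 0] reads as "rend, turn, rend"; mow one
    # cell forward, rotate, then mow one cell in the new direction. "sup"
    # consumes the next two ints as an (x, y) jump and "loop" consumes one int
    # as a backward offset, so a point mutation can corrupt an operand as well
    # as an op, which _simulate() below treats as an invalid program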
146 | 
147 | 
148 |     def breed_features (self, f_feature_set, m_feature_set):
149 |         """breed two GP programs to produce a toddler GP program"""
150 |         split = randint(1, min(len(f_feature_set), len(m_feature_set)))
151 |         return f_feature_set[split:] + m_feature_set[:split]
152 | 
153 | 
154 |     def _simulate (self, grid, code, drone):
155 |         """simulate the lawnmower grid"""
156 |         sp = 0
157 |         mod = len(self.grid)
158 |         num_ops = 0
159 |         max_ops = self.length
160 |         result = None
161 | 
162 |         try:
163 |             while sp < len(code) and num_ops < max_ops:
164 |                 num_ops += 1
165 |                 op = code[sp]
166 | 
167 |                 if op == OPS.index("rend"):
168 |                     x, y = drone.exec_op_move(mod)
169 |                     grid[y][x] = 0
170 | 
171 |                 elif op == OPS.index("turn"):
172 |                     drone.exec_op_turn()
173 | 
174 |                 elif op == OPS.index("sup"):
175 |                     sup = Point(code[sp + 1], code[sp + 2])
176 |                     sp += 2
177 | 
178 |                     if sup.x == 0 and sup.y == 0:
179 |                         return None
180 | 
181 |                     x, y = drone.exec_op_sup(mod, sup)
182 |                     grid[y][x] = 0
183 | 
184 |                 elif op == OPS.index("loop"):
185 |                     offset = code[sp + 1]
186 | 
187 |                     if offset == 0 or offset > sp:
188 |                         return None
189 | 
190 |                     sp -= offset
191 | 
192 |                 else:
193 |                     return None
194 | 
195 |                 #print num_ops, sp, "pos", drone.pos, "dir", drone.dir
196 |                 sp += 1
197 | 
198 |             result = grid
199 | 
200 |         finally:
201 |             return result  ## NB: returning from "finally" suppresses any exception raised by a malformed program, so it simply yields None
202 | 
203 | 
204 |     def get_fitness (self, feature_set):
205 |         """determine the fitness ranging [0.0, 1.0]; higher is better"""
206 |         drone = Drone(randint(0, len(self.grid) - 1), randint(0, len(self.grid) - 1))
207 |         grid = self._simulate(deepcopy(self.grid), feature_set, drone)
208 |         fitness = 0.0
209 | 
210 |         if grid:
211 |             terrorists = 0
212 | 
213 |             for row in grid:
214 |                 #print row
215 |                 terrorists += sum(row)
216 | 
217 |             fitness = (self.length - terrorists) / float(self.length)
218 | 
219 |             if len(feature_set) > 5:
220 |                 penalty = len(feature_set) / 10.0  ## NB: a divisor below 1.0 boosts fitness for programs shorter than 10 ops
221 |                 fitness /= penalty
222 | 
223 |         #print fitness, feature_set
224 |         return fitness
225 | 
226 | 
227 | if __name__=='__main__':
228 |     uow = LMDFactory()
229 | 
230 |     print uow.grid
231 | 
--------------------------------------------------------------------------------
/src/sample_tsp.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | 
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 16 | # author: Paco Nathan 17 | # https://github.com/ceteri/exelixi 18 | 19 | 20 | from random import randint, sample 21 | from uow import UnitOfWorkFactory 22 | import logging 23 | import sys 24 | 25 | 26 | ###################################################################### 27 | ## class definitions 28 | 29 | class TSPFactory (UnitOfWorkFactory): 30 | """UnitOfWork definition for Traveling Salesperson Problem""" 31 | 32 | def __init__ (self): 33 | #super(UnitOfWorkFactory, self).__init__() 34 | self.n_pop = 10 35 | self.n_gen = 23 36 | self.max_indiv = 2000 37 | self.selection_rate = 0.2 38 | self.mutation_rate = 0.02 39 | self.term_limit = 5.0e-03 40 | self.hist_granularity = 3 41 | 42 | # cost matrix for an example TSP: optimize the bicycling route 43 | # for weekend chores in Mountain View for a young Steve Jobs 44 | # tuple definition: (name, addr, duration) 45 | 46 | self.route_meta = ( ( "Home", "secret", 0 ), 47 | ( "Piazzas Fine Foods", "3922 Middlefield Rd, Palo Alto, CA 94303", 45 ), 48 | ( "Mountain View Public Library", "585 Franklin St, Mountain View, CA 94041", 30 ), 49 | ( "Seascapes Fish & Pets Inc", "298 Castro St, Mountain View, CA 94041", 10 ), 50 | ( "Dana Street Roasting Company", "744 W Dana St, Mountain View, CA 94041", 20 ), 51 | ( "Supercuts", "2420 Charleston Rd, Mountain View, CA 94043", 60 ), 52 | ) 53 | 54 | self.route_cost = ( ( 0, 7, 11, 12, 14, 8 ), 55 | ( 7, 0, 18, 18, 19, 5 ), 56 | ( 14, 19, 0, 2, 3, 19 ), 57 | ( 12, 20, 3, 0, 1, 19 ), 58 | ( 12, 18, 3, 1, 0, 18 ), 59 | ( 8, 5, 18, 18, 19, 0 ), 60 | ) 61 | 62 | # sampling parameters 63 | self.length = len(self.route_cost) - 1 64 | self.min = 1 65 | self.max = self.length 66 | 67 | 68 | def generate_features (self): 69 | """generate a new feature set for young Steve pedaling""" 70 | features = [] 71 | expected = list(xrange(self.min, self.max + 1)) 72 | 73 | # sample row indices in the cost matrix, without replacement 74 | for _ in xrange(self.length): 75 | x = sample(expected, 1)[0] 76 | features.append(x) 77 | expected.remove(x) 78 | 79 | return features 80 | 81 | 82 | def mutate_features (self, feature_set): 83 | """mutate a copy of the given feature set""" 84 | pos_to_mutate = randint(0, len(feature_set) - 1) 85 | mutated_feature_set = list(feature_set) 86 | mutated_feature_set[pos_to_mutate] = randint(self.min, self.max) 87 | return mutated_feature_set 88 | 89 | 90 | def breed_features (self, f_feature_set, m_feature_set): 91 | """breed two feature sets to produce a child""" 92 | half = len(f_feature_set) / 2 93 | return f_feature_set[half:] + m_feature_set[:half] 94 | 95 | 96 | def get_fitness (self, feature_set): 97 | """determine the fitness ranging [0.0, 1.0]; higher is better""" 98 | #print feature_set 99 | 100 | # 1st estimator: all points were visited? 101 | expected = set(xrange(self.min, self.max + 1)) 102 | observed = set(feature_set) 103 | cost1 = len(expected - observed) / float(len(expected)) 104 | #print expected, observed, cost1 105 | 106 | # 2nd estimator: travel time was minimized? 
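        # worked example: feature_set [1, 2, 3, 4, 5] pedals the route
        # 0 -> 1 -> 2 -> 3 -> 4 -> 5 -> 0, so total_cost below becomes
        # 7 + 18 + 2 + 1 + 18 + 8 = 54 against a worst_case of 104,
        # giving cost2 of roughly 0.52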
107 | total_cost = 0 108 | worst_case = float(sum(self.route_cost[0])) * 2.0 109 | x0 = 0 110 | 111 | for x1 in feature_set: 112 | total_cost += self.route_cost[x0][x1] 113 | x0 = x1 114 | 115 | total_cost += self.route_cost[x0][0] 116 | cost2 = min(1.0, total_cost / worst_case) 117 | #print total_cost, worst_case, cost2 118 | 119 | # combine the two estimators into a fitness score 120 | fitness = 1.0 - (cost1 + cost2) / 2.0 121 | 122 | if cost1 > 0.0: 123 | fitness /= 2.0 124 | 125 | #print cost1, cost2, fitness, feature_set 126 | return fitness 127 | 128 | 129 | if __name__=='__main__': 130 | uow = TSPFactory() 131 | 132 | print uow.route_meta 133 | print uow.route_cost 134 | -------------------------------------------------------------------------------- /src/service.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # author: Paco Nathan 17 | # https://github.com/ceteri/exelixi 18 | 19 | 20 | from contextlib import contextmanager 21 | from gevent import monkey, shutdown, signal, spawn, wsgi, Greenlet 22 | from gevent.event import Event 23 | from gevent.queue import JoinableQueue 24 | from hashring import HashRing 25 | from json import dumps, loads 26 | from signal import SIGQUIT 27 | from util import instantiate_class, post_distrib_rest 28 | from uuid import uuid1 29 | import logging 30 | import sys 31 | 32 | 33 | ###################################################################### 34 | ## class definitions 35 | 36 | class Worker (object): 37 | # http://www.gevent.org/gevent.wsgi.html 38 | # http://toastdriven.com/blog/2011/jul/31/gevent-long-polling-you/ 39 | # http://blog.pythonisito.com/2012/07/gevent-and-greenlets.html 40 | 41 | DEFAULT_PORT = "9311" 42 | 43 | 44 | def __init__ (self, port=DEFAULT_PORT): 45 | # REST services 46 | monkey.patch_all() 47 | signal(SIGQUIT, shutdown) 48 | self.is_config = False 49 | self.server = wsgi.WSGIServer(('', int(port)), self._response_handler, log=None) 50 | 51 | # sharding 52 | self.prefix = None 53 | self.shard_id = None 54 | self.ring = None 55 | 56 | # concurrency based on message passing / barrier pattern 57 | self._task_event = None 58 | self._task_queue = None 59 | 60 | # UnitOfWork 61 | self._uow = None 62 | 63 | 64 | def shard_start (self): 65 | """start the worker service for this shard""" 66 | self.server.serve_forever() 67 | 68 | 69 | def shard_stop (self, *args, **kwargs): 70 | """stop the worker service for this shard""" 71 | payload = args[0] 72 | 73 | if (self.prefix == payload["prefix"]) and (self.shard_id == payload["shard_id"]): 74 | logging.info("worker service stopping... 
you can safely ignore any exceptions that follow") 75 | self.server.stop() 76 | else: 77 | # returns incorrect response in this case, to avoid exception 78 | logging.error("incorrect shard %s prefix %s", payload["shard_id"], payload["prefix"]) 79 | 80 | 81 | ###################################################################### 82 | ## authentication methods 83 | 84 | def auth_request (self, payload, start_response, body): 85 | """test the authentication credentials for a REST call""" 86 | if (self.prefix == payload["prefix"]) and (self.shard_id == payload["shard_id"]): 87 | return True 88 | else: 89 | # UoW caller did not provide correct credentials to access shard 90 | start_response('403 Forbidden', [('Content-Type', 'text/plain')]) 91 | body.put("Forbidden, incorrect credentials for this shard\r\n") 92 | body.put(StopIteration) 93 | 94 | logging.error("incorrect credentials shard %s prefix %s", payload["shard_id"], payload["prefix"]) 95 | return False 96 | 97 | 98 | def shard_config (self, *args, **kwargs): 99 | """configure the service to run a shard""" 100 | payload, start_response, body = self.get_response_context(args) 101 | 102 | if self.is_config: 103 | # hey, somebody call security... 104 | start_response('403 Forbidden', [('Content-Type', 'text/plain')]) 105 | body.put("Forbidden, shard is already in a configured state\r\n") 106 | body.put(StopIteration) 107 | 108 | logging.warning("denied configuring shard %s prefix %s", self.shard_id, self.prefix) 109 | else: 110 | self.is_config = True 111 | self.prefix = payload["prefix"] 112 | self.shard_id = payload["shard_id"] 113 | 114 | # dependency injection for UnitOfWork 115 | uow_name = payload["uow_name"] 116 | logging.info("initializing unit of work based on %s", uow_name) 117 | 118 | ff = instantiate_class(uow_name) 119 | self._uow = ff.instantiate_uow(uow_name, self.prefix) 120 | 121 | start_response('200 OK', [('Content-Type', 'text/plain')]) 122 | body.put("Bokay\r\n") 123 | body.put(StopIteration) 124 | 125 | logging.info("configuring shard %s prefix %s", self.shard_id, self.prefix) 126 | 127 | 128 | ###################################################################### 129 | ## barrier pattern methods 130 | 131 | @contextmanager 132 | def wrap_task_event (self): 133 | """initialize a gevent.Event, to which the UnitOfWork will wait as a listener""" 134 | self._task_event = Event() 135 | yield 136 | 137 | # complete the Event, notifying the UnitOfWork which waited 138 | self._task_event.set() 139 | self._task_event = None 140 | 141 | 142 | def _consume_task_queue (self): 143 | """consume/serve requests until the task_queue empties""" 144 | while True: 145 | payload = self._task_queue.get() 146 | 147 | try: 148 | self._uow.perform_task(payload) 149 | finally: 150 | self._task_queue.task_done() 151 | 152 | 153 | def prep_task_queue (self): 154 | """prepare task_queue for another set of distributed tasks""" 155 | self._task_queue = JoinableQueue() 156 | spawn(self._consume_task_queue) 157 | 158 | 159 | def put_task_queue (self, payload): 160 | """put the given task definition into the task_queue""" 161 | self._task_queue.put_nowait(payload) 162 | 163 | 164 | def queue_wait (self, *args, **kwargs): 165 | """wait until all shards finished sending task_queue requests""" 166 | payload, start_response, body = self.get_response_context(args) 167 | 168 | if self.auth_request(payload, start_response, body): 169 | if self._task_event: 170 | self._task_event.wait() 171 | 172 | # HTTP response first, then initiate long-running task 173 | 
start_response('200 OK', [('Content-Type', 'text/plain')]) 174 | body.put("Bokay\r\n") 175 | body.put(StopIteration) 176 | 177 | 178 | def queue_join (self, *args, **kwargs): 179 | """join on the task_queue, as a barrier to wait until it empties""" 180 | payload, start_response, body = self.get_response_context(args) 181 | 182 | if self.auth_request(payload, start_response, body): 183 | start_response('200 OK', [('Content-Type', 'text/plain')]) 184 | body.put("join queue...\r\n") 185 | 186 | ## NB: TODO this step of emptying out the task_queue on 187 | ## shards could take a while on a large run... perhaps use 188 | ## a long-polling HTTP request or websocket instead? 189 | self._task_queue.join() 190 | 191 | body.put("done\r\n") 192 | body.put(StopIteration) 193 | 194 | 195 | ###################################################################### 196 | ## hash ring methods 197 | 198 | def ring_init (self, *args, **kwargs): 199 | """initialize the HashRing""" 200 | payload, start_response, body = self.get_response_context(args) 201 | 202 | if self.auth_request(payload, start_response, body): 203 | self.ring = payload["ring"] 204 | 205 | start_response('200 OK', [('Content-Type', 'text/plain')]) 206 | body.put("Bokay\r\n") 207 | body.put(StopIteration) 208 | 209 | logging.info("setting hash ring %s", self.ring) 210 | 211 | 212 | ###################################################################### 213 | ## WSGI handler for REST endpoints 214 | 215 | def get_response_context (self, args): 216 | """decode the WSGI response context from the Greenlet args""" 217 | env = args[0] 218 | msg = env["wsgi.input"].read() 219 | payload = loads(msg) 220 | start_response = args[1] 221 | body = args[2] 222 | 223 | return payload, start_response, body 224 | 225 | 226 | def _response_handler (self, env, start_response): 227 | """handle HTTP request/response""" 228 | uri_path = env["PATH_INFO"] 229 | body = JoinableQueue() 230 | 231 | if self._uow and self._uow.handle_endpoints(self, uri_path, env, start_response, body): 232 | pass 233 | 234 | ########################################## 235 | # Worker endpoints 236 | 237 | elif uri_path == '/shard/config': 238 | # configure the service to run a shard 239 | Greenlet(self.shard_config, env, start_response, body).start() 240 | 241 | elif uri_path == '/shard/stop': 242 | # shutdown the service 243 | ## NB: must parse POST data specially, to avoid exception 244 | payload = loads(env["wsgi.input"].read()) 245 | Greenlet(self.shard_stop, payload).start_later(1) 246 | 247 | # HTTP response starts first, to avoid error after server stops 248 | start_response('200 OK', [('Content-Type', 'text/plain')]) 249 | body.put("Goodbye\r\n") 250 | body.put(StopIteration) 251 | 252 | elif uri_path == '/queue/wait': 253 | # wait until all shards have finished sending task_queue requests 254 | Greenlet(self.queue_wait, env, start_response, body).start() 255 | 256 | elif uri_path == '/queue/join': 257 | # join on the task_queue, as a barrier to wait until it empties 258 | Greenlet(self.queue_join, env, start_response, body).start() 259 | 260 | elif uri_path == '/check/persist': 261 | ## NB: TODO checkpoint the service state to durable storage 262 | start_response('200 OK', [('Content-Type', 'text/plain')]) 263 | body.put("Bokay\r\n") 264 | body.put(StopIteration) 265 | 266 | elif uri_path == '/check/recover': 267 | ## NB: TODO restart the service, recovering from most recent checkpoint 268 | start_response('200 OK', [('Content-Type', 'text/plain')]) 269 | body.put("Bokay\r\n") 270 | 
body.put(StopIteration) 271 | 272 | ########################################## 273 | # HashRing endpoints 274 | 275 | elif uri_path == '/ring/init': 276 | # initialize the HashRing 277 | Greenlet(self.ring_init, env, start_response, body).start() 278 | 279 | elif uri_path == '/ring/add': 280 | ## NB: TODO add a node to the HashRing 281 | start_response('200 OK', [('Content-Type', 'text/plain')]) 282 | body.put("Bokay\r\n") 283 | body.put(StopIteration) 284 | 285 | elif uri_path == '/ring/del': 286 | ## NB: TODO delete a node from the HashRing 287 | start_response('200 OK', [('Content-Type', 'text/plain')]) 288 | body.put("Bokay\r\n") 289 | body.put(StopIteration) 290 | 291 | ########################################## 292 | # utility endpoints 293 | 294 | elif uri_path == '/': 295 | # dump info about the service in general 296 | start_response('200 OK', [('Content-Type', 'text/plain')]) 297 | body.put(str(env) + "\r\n") 298 | body.put(StopIteration) 299 | 300 | else: 301 | # ne znayu 302 | start_response('404 Not Found', [('Content-Type', 'text/plain')]) 303 | body.put('Not Found\r\n') 304 | body.put(StopIteration) 305 | 306 | return body 307 | 308 | 309 | class WorkerInfo (object): 310 | def __init__ (self, offer, task): 311 | self.host = offer.hostname 312 | self.slave_id = offer.slave_id.value 313 | self.task_id = task.task_id.value 314 | self.executor_id = task.executor.executor_id.value 315 | self.ip_addr = None 316 | self.port = None 317 | 318 | def get_shard_uri (self): 319 | """generate a URI for this worker service""" 320 | return self.ip_addr + ":" + self.port 321 | 322 | 323 | def report (self): 324 | """report the slave telemetry + state""" 325 | return "host %s slave %s task %s exe %s ip %s:%s" % (self.host, self.slave_id, str(self.task_id), self.executor_id, self.ip_addr, self.port) 326 | 327 | 328 | class Framework (object): 329 | def __init__ (self, uow_name, prefix="/tmp/exelixi"): 330 | """initialize the system parameters, which represent operational state""" 331 | self.uuid = uuid1().hex 332 | self.prefix = prefix + "/" + self.uuid 333 | logging.info("prefix: %s", self.prefix) 334 | 335 | # dependency injection for UnitOfWork 336 | self.uow_name = uow_name 337 | logging.info("initializing unit of work based on %s", uow_name) 338 | 339 | ff = instantiate_class(self.uow_name) 340 | self._uow = ff.instantiate_uow(self.uow_name, self.prefix) 341 | 342 | self._shard_assoc = None 343 | self._ring = None 344 | 345 | 346 | def _gen_shard_id (self, i, n): 347 | """generate a shard_id""" 348 | s = str(i) 349 | z = ''.join([ '0' for _ in xrange(len(str(n)) - len(s)) ]) 350 | return "shard/" + z + s 351 | 352 | 353 | def set_worker_list (self, worker_list, exe_info=None): 354 | """associate shards with Executors""" 355 | self._shard_assoc = {} 356 | 357 | for i in xrange(len(worker_list)): 358 | shard_id = self._gen_shard_id(i, len(worker_list)) 359 | 360 | if not exe_info: 361 | self._shard_assoc[shard_id] = [worker_list[i], None] 362 | else: 363 | self._shard_assoc[shard_id] = [worker_list[i], exe_info[i]] 364 | 365 | logging.info("shard list: %s", str(self._shard_assoc)) 366 | 367 | 368 | def get_worker_list (self): 369 | """generator for the worker shards""" 370 | for shard_id, (shard_uri, exe_info) in self._shard_assoc.items(): 371 | yield shard_id, shard_uri 372 | 373 | 374 | def get_worker_count (self): 375 | """count the worker shards""" 376 | return len(self._shard_assoc) 377 | 378 | 379 | def send_worker_rest (self, shard_id, shard_uri, path, base_msg): 380 | """access a 
REST endpoint on the specified shard"""
381 |         return post_distrib_rest(self.prefix, shard_id, shard_uri, path, base_msg)
382 | 
383 | 
384 |     def send_ring_rest (self, path, base_msg):
385 |         """access a REST endpoint on each of the shards"""
386 |         json_str = []
387 | 
388 |         for shard_id, (shard_uri, exe_info) in self._shard_assoc.items():
389 |             lines = post_distrib_rest(self.prefix, shard_id, shard_uri, path, base_msg)
390 |             json_str.append(lines[0])
391 | 
392 |         return json_str
393 | 
394 | 
395 |     def phase_barrier (self):
396 |         """
397 |         implements a two-phase barrier to (1) wait until all shards
398 |         have finished sending task_queue requests, then (2) join on
399 |         each task_queue, to wait until it has emptied
400 |         """
401 |         self.send_ring_rest("queue/wait", {})
402 |         self.send_ring_rest("queue/join", {})
403 | 
404 | 
405 |     def orchestrate_uow (self):
406 |         """orchestrate a UnitOfWork distributed across the HashRing via REST endpoints"""
407 |         # configure the shards and the hash ring
408 |         self.send_ring_rest("shard/config", { "uow_name": self.uow_name })
409 | 
410 |         self._ring = { shard_id: shard_uri for shard_id, (shard_uri, exe_info) in self._shard_assoc.items() }
411 |         self.send_ring_rest("ring/init", { "ring": self._ring })
412 | 
413 |         # distribute the UnitOfWork tasks
414 |         self._uow.orchestrate(self)
415 | 
416 |         # shutdown
417 |         self.send_ring_rest("shard/stop", {})
418 | 
419 | 
420 | class UnitOfWork (object):
421 |     def __init__ (self, uow_name, prefix):
422 |         self.uow_name = uow_name
423 |         self.uow_factory = instantiate_class(uow_name)
424 | 
425 |         self.prefix = prefix
426 | 
427 |         self._shard_id = None
428 |         self._shard_dict = None
429 |         self._hash_ring = None
430 | 
431 | 
432 |     def set_ring (self, shard_id, shard_dict):
433 |         """initialize the HashRing"""
434 |         self._shard_id = shard_id
435 |         self._shard_dict = shard_dict
436 |         self._hash_ring = HashRing(shard_dict.keys())
437 | 
438 | 
439 |     def perform_task (self, payload):
440 |         """perform a task consumed from the Worker.task_queue"""
441 |         pass
442 | 
443 | 
444 |     def orchestrate (self, framework):
445 |         """orchestrate Workers via REST endpoints"""
446 |         pass
447 | 
448 | 
449 |     def handle_endpoints (self, worker, uri_path, env, start_response, body):
450 |         """UnitOfWork REST endpoints"""
451 |         pass
452 | 
453 | 
454 | if __name__=='__main__':
455 |     if len(sys.argv) < 3:
456 |         print "usage:\n  %s <shard_uri> <uow_name>" % (sys.argv[0])
457 |         sys.exit(1)
458 | 
459 |     shard_uri = sys.argv[1]
460 |     uow_name = sys.argv[2]
461 | 
462 |     fra = Framework(uow_name)
463 |     print "framework launching based on %s stored at %s..." % (fra.uow_name, fra.prefix)
464 | 
465 |     fra.set_worker_list([ shard_uri ])
466 |     fra.orchestrate_uow()
467 | 
--------------------------------------------------------------------------------
/src/uow.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | 
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 16 | # author: Paco Nathan 17 | # https://github.com/ceteri/exelixi 18 | 19 | 20 | from ga import Individual, Population 21 | from random import randint 22 | from util import instantiate_class 23 | import logging 24 | 25 | 26 | ###################################################################### 27 | ## class definitions 28 | 29 | class UnitOfWorkFactory (object): 30 | """encapsulates all of the dependency injection and UnitOfWork definitions""" 31 | 32 | def __init__ (self): 33 | ## NB: override these GA parameters 34 | self.n_pop = 23 35 | self.n_gen = 10 36 | self.term_limit = 5.0e-03 37 | self.hist_granularity = 3 38 | self.selection_rate = 0.2 39 | self.mutation_rate = 0.02 40 | self.max_indiv = 2000 41 | 42 | ## NB: override these feature set parameters 43 | self.length = 5 44 | self.min = 0 45 | self.max = 100 46 | self.target = 231 47 | 48 | 49 | def instantiate_uow (self, uow_name, prefix): 50 | """instantiate a UnitOfWork, to decouple services from the GA problem domain""" 51 | ## NB: override these class references to customize the GA definition 52 | return Population(uow_name, prefix, Individual()) 53 | 54 | 55 | def get_fitness (self, feature_set): 56 | """determine the fitness ranging [0.0, 1.0]; higher is better""" 57 | ## NB: override this fitness function 58 | return 1.0 - abs(sum(feature_set) - self.target) / float(self.target) 59 | 60 | 61 | def use_force (self, force): 62 | """determine whether to force recalculation of a fitness function""" 63 | # NB: override in some use cases, e.g., when required for evaluating shared resources 64 | return force 65 | 66 | 67 | def generate_features (self): 68 | """generate a new feature set""" 69 | ## NB: override this feature set generator 70 | return sorted([ randint(self.min, self.max) for _ in xrange(self.length) ]) 71 | 72 | 73 | def mutate_features (self, feature_set): 74 | """mutate a copy of the given feature set""" 75 | ## NB: override this feature set mutator 76 | pos_to_mutate = randint(0, len(feature_set) - 1) 77 | mutated_feature_set = list(feature_set) 78 | mutated_feature_set[pos_to_mutate] = randint(self.min, self.max) 79 | return sorted(mutated_feature_set) 80 | 81 | 82 | def breed_features (self, f_feature_set, m_feature_set): 83 | """breed two feature sets to produce a child""" 84 | ## NB: override this feature set crossover 85 | half = len(f_feature_set) / 2 86 | return sorted(f_feature_set[half:] + m_feature_set[:half]) 87 | 88 | 89 | def _calc_median_hist (self, hist_items, n_indiv): 90 | """calculate the median from a fitness histogram""" 91 | sum_count = 0 92 | mid_count = float(n_indiv) / 2 93 | 94 | if n_indiv == 1: 95 | return hist_items[0][0] 96 | else: 97 | for i in xrange(len(hist_items)): 98 | bin, count = hist_items[i] 99 | sum_count += count 100 | 101 | if sum_count == mid_count: 102 | return bin 103 | elif sum_count > mid_count: 104 | bin0, count0 = hist_items[i - 1] 105 | return ((bin0 * count0) + (bin * count)) / (count0 + count) 106 | 107 | 108 | def test_termination (self, current_gen, hist_items, total_indiv): 109 | """evaluate the terminating condition for this generation and report progress""" 110 | ## NB: override this termination test 111 | 112 | # calculate a mean squared error (MSE) of fitness for a Population 113 | hist_keys = map(lambda x: x[0], hist_items) 114 | n_indiv = sum([ count for bin, count in hist_items ]) 115 | fit_mse = sum([ count * (1.0 - float(bin)) ** 2.0 for bin, count in hist_items ]) / float(n_indiv) 116 | 117 | # calculate summary stats 118 | fit_max = 
max(hist_keys)
119 |         fit_avg = sum([ bin * count for bin, count in hist_items ]) / float(n_indiv)
120 |         fit_med = self._calc_median_hist(hist_items, n_indiv)
121 | 
122 |         # report the progress for one generation
123 |         gen_report = "gen\t%d\tsize\t%d\ttotal\t%d\tmse\t%.2e\tmax\t%.2e\tmed\t%.2e\tavg\t%.2e" % (current_gen, n_indiv, total_indiv, fit_mse, fit_max, fit_med, fit_avg)
124 |         print gen_report
125 |         logging.info(gen_report)
126 |         logging.debug(filter(lambda x: x[1] > 0, hist_items))
127 | 
128 |         # stop when a "good enough" solution is found
129 |         return (fit_mse <= self.term_limit) or (total_indiv >= self.max_indiv)
130 | 
131 | 
132 | if __name__=='__main__':
133 |     # a simple test
134 |     uow_name = "uow.UnitOfWorkFactory"
135 |     uow = instantiate_class(uow_name)
136 | 
137 |     print uow
138 | 
--------------------------------------------------------------------------------
/src/util.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | 
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | # author: Paco Nathan
17 | # https://github.com/ceteri/exelixi
18 | 
19 | 
20 | from collections import OrderedDict
21 | from httplib import BadStatusLine
22 | from importlib import import_module
23 | from json import dumps, loads
24 | from os.path import abspath
25 | from random import random
26 | from urllib2 import urlopen, Request, URLError
27 | import logging
28 | import psutil
29 | import socket
30 | 
31 | 
32 | ######################################################################
33 | ## utilities
34 | 
35 | def instantiate_class (class_path):
36 |     """instantiate a class from the given package.class name"""
37 |     module_name, class_name = class_path.split(".")
38 |     return getattr(import_module(module_name), class_name)()
39 | 
40 | 
41 | def post_distrib_rest (prefix, shard_id, shard_uri, path, base_msg):
42 |     """POST a JSON-based message to a REST endpoint on a shard"""
43 |     msg = base_msg.copy()
44 | 
45 |     # populate credentials
46 |     msg["prefix"] = prefix
47 |     msg["shard_id"] = shard_id
48 | 
49 |     # POST the JSON payload to the REST endpoint
50 |     uri = "http://" + shard_uri + "/" + path
51 |     req = Request(uri)
52 |     req.add_header('Content-Type', 'application/json')
53 | 
54 |     logging.debug("send %s %s", shard_uri, path)
55 |     logging.debug(dumps(msg))
56 | 
57 |     # read/collect the response
58 |     try:
59 |         f = urlopen(req, dumps(msg))
60 |         return f.readlines()
61 |     except URLError as e:
62 |         logging.critical("could not reach REST endpoint %s error: %s", uri, str(e.reason), exc_info=True)
63 |         raise
64 |     except BadStatusLine as e:
65 |         logging.critical("REST endpoint died %s error: %s", uri, str(e.line), exc_info=True)
66 | 
67 | 
68 | def get_telemetry ():
69 |     """get system resource telemetry on a Mesos slave via psutil"""
70 |     telemetry = OrderedDict()
71 | 
72 |     telemetry["ip_addr"] = socket.gethostbyname(socket.gethostname())
73 | 
74 |     telemetry["mem_free"] = psutil.virtual_memory().free
75 | 
76 |     telemetry["cpu_num"] = 
psutil.NUM_CPUS 77 | 78 | x = psutil.cpu_times() 79 | telemetry["cpu_times"] = OrderedDict([ ("user", x.user), ("system", x.system), ("idle", x.idle) ]) 80 | 81 | x = psutil.disk_usage("/tmp") 82 | telemetry["disk_usage"] = OrderedDict([ ("free", x.free), ("percent", x.percent) ]) 83 | 84 | x = psutil.disk_io_counters() 85 | telemetry["disk_io"] = OrderedDict([ ("read_count", x.read_count), ("write_count", x.write_count), ("read_bytes", x.read_bytes), ("write_bytes", x.write_bytes), ("read_time", x.read_time), ("write_time", x.write_time) ]) 86 | 87 | x = psutil.network_io_counters() 88 | telemetry["network_io"] = OrderedDict([ ("bytes_sent", x.bytes_sent), ("bytes_recv", x.bytes_recv), ("packets_sent", x.packets_sent), ("packets_recv", x.packets_recv), ("errin", x.errin), ("errout", x.errout), ("dropin", x.dropin), ("dropout", x.dropout) ]) 89 | 90 | return telemetry 91 | 92 | 93 | def get_master_state (master_uri): 94 | """get current state, represented as JSON, from the Mesos master""" 95 | uri = "http://" + master_uri + "/master/state.json" 96 | 97 | try: 98 | response = urlopen(uri) 99 | return loads(response.read()) 100 | except URLError as e: 101 | logging.critical("could not reach REST endpoint %s error: %s", uri, str(e.reason), exc_info=True) 102 | raise 103 | 104 | 105 | def get_master_leader (master_uri): 106 | """get the host:port for the Mesos master leader""" 107 | state = get_master_state(master_uri) 108 | return state["leader"].split("@")[1] 109 | 110 | 111 | def pipe_slave_list (master_uri): 112 | """report a list of slave IP addr, one per line to stdout -- for building pipes""" 113 | state = get_master_state(get_master_leader(master_uri)) 114 | 115 | for s in state["slaves"]: 116 | print s["pid"].split("@")[1].split(":")[0] 117 | 118 | 119 | if __name__=='__main__': 120 | pass 121 | --------------------------------------------------------------------------------
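Taken together, util.py is the seam that keeps the rest of the sources decoupled: dependency injection flows through instantiate_class, and all shard traffic flows through post_distrib_rest. A short sketch of the injection path, runnable from src/ with one of the factories in this repo:

    from util import instantiate_class

    # the same "package.class" strings that ga.py, service.py, and
    # resource.py pass around as uow_name
    uow = instantiate_class("sample_tsp.TSPFactory")

    features = uow.generate_features()            # a random route permutation
    print features, uow.get_fitness(features)     # fitness scored in [0.0, 1.0]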