├── .gitignore
├── LICENSE
├── README.md
├── atlas.json
├── bin
    ├── install.sh
    ├── local_deploy.sh
    └── local_install.sh
├── dat
    └── foo.tsv
├── doc
    ├── 00.mesos_ui.png
    ├── 01.framework.png
    ├── 02.sandbox.png
    ├── activity.png
    ├── classes.png
    ├── framework.png
    ├── hashring.png
    ├── kernel.png
    ├── sample_tsp.png
    ├── tsp_fitness.R
    ├── tsp_fitness.png
    ├── tsp_fitness.tsv
    └── tutorial.graffle
    │   ├── data.plist
    │   └── image1.tiff
└── src
    ├── contain.py
    ├── exelixi.py
    ├── ga.py
    ├── hashring.py
    ├── monoids.py
    ├── resource.py
    ├── sample_lmd.py
    ├── sample_tsp.py
    ├── service.py
    ├── uow.py
    └── util.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | **~
 2 | *.py[cod]
 3 | exelixi.log
 4 | .DS_Store
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Packages
10 | *.egg
11 | *.egg-info
12 | dist
13 | build
14 | eggs
15 | parts
16 | var
17 | sdist
18 | develop-eggs
19 | .installed.cfg
20 | lib
21 | lib64
22 | __pycache__
23 | 
24 | # Installer logs
25 | pip-log.txt
26 | 
27 | # Unit test / coverage reports
28 | .coverage
29 | .tox
30 | nosetests.xml
31 | 
32 | # Translations
33 | *.mo
34 | 
35 | # Mr Developer
36 | .mr.developer.cfg
37 | .project
38 | .pydevproject
39 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "{}"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright {yyyy} {name of copyright owner}
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 
203 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Exelixi
 2 | 
 3 | **Exelixi** is a distributed framework based on [Apache Mesos],
 4 | mostly implemented in Python using [gevent] for high-performance concurrency
 5 | It is intended to run cluster computing jobs (partitioned batch jobs, which include some messaging) in pure Python.
 6 | By default, it runs [genetic algorithms] at scale.
 7 | However, it can handle a broad range of other problem domains by 
 8 | using `--uow` command line option to override the `UnitOfWorkFactory` class definition.
 9 | 
10 | Please see the [project wiki](https://github.com/ceteri/exelixi/wiki) for more details,
11 | including a [tutorial](https://github.com/ceteri/exelixi/wiki/Tutorial:-Fog-Computing-at-Hella-Scale)
12 | on how to build Mesos-based frameworks.
13 | 
14 | 
15 | ### Quick Start
16 | 
17 | To check out the [GA] on a laptop (with Python 2.7 installed), simply run:
18 | 
19 |     ./src/ga.py
20 | 
21 | Otherwise, to run at scale, the following steps will help you get **Exelixi** running on [Apache Mesos].
22 | For help in general with command line options:
23 | 
24 |     ./src/exelixi.py -h
25 | 
26 | The following instructions are based on using the [Elastic Mesos] service,
27 | which uses Ubuntu Linux servers running on [Amazon AWS].
28 | Even so, the basic outline of steps shown here apply in general.
29 | 
30 | First, launch an [Apache Mesos] cluster.
31 | Once you have confirmation that your cluster is running
32 | (e.g., [Elastic Mesos] sends you an email messages with a list of masters and slaves)
33 | then use `ssh` to login on any of the masters:
34 | 
35 |     ssh -A -l ubuntu <master-public-ip>
36 | 
37 | You must install the [Python bindings](https://github.com/apache/mesos/tree/master/src/python) for [Apache Mesos].
38 | The default version of Mesos changes in this code as there are updates to [Elastic Mesos](https://elastic.mesosphere.io/),
39 | since the tutorials are based on that service.
40 | You can check [http://mesosphere.io/downloads/](http://mesosphere.io/downloads/) for the latest.
41 | If you run Mesos in different environment, 
42 | simply make a one-line change to the `EGG` environment variable in the `bin/local_install.sh` script.
43 | Also, you need to install the **Exelixi** source.
44 | 
45 | On the Mesos master, download the `master` branch of the **Exelixi** code repo on GitHub and install the required libraries:
46 | 
47 |     wget https://github.com/ceteri/exelixi/archive/master.zip ; \
48 |     unzip master.zip ; \
49 |     cd exelixi-master ; \
50 |     ./bin/local_install.sh
51 | 
52 | If you've customized the code by forking your own GitHub code repo, then substitute that download URL instead.
53 | Alternatively, if you've customized by subclassing the `uow.UnitOfWorkFactory` default [GA],
54 | then place that Python source file into the `src/` subdirectory.
55 | 
56 | Next, run the installation command on the master, to set up each of the slaves:
57 | 
58 |     ./src/exelixi.py -n localhost:5050 | ./bin/install.sh
59 | 
60 | Now launch the Framework, which in turn launches the worker services remotely on slave nodes.
61 | In the following case, it runs workers on two slave nodes:
62 | 
63 |     ./src/exelixi.py -m localhost:5050 -w 2
64 | 
65 | Once everything has been set up successfully, the log file in `exelixi.log` will show a line:
66 | 
67 |     all worker services launched and init tasks completed
68 | 
69 | From there, the [GA] runs.
70 | See a [GitHub gist](https://gist.github.com/ceteri/7609046) for an example of a successful run.
71 | 
72 | 
73 | ### Blame List
74 | 
75 | [Paco Nathan](https://github.com/ceteri)
76 | 
77 | 
78 | [Amazon AWS]: http://aws.amazon.com/
79 | [Apache Mesos]: http://mesos.apache.org/
80 | [Elastic Mesos]: https://elastic.mesosphere.io/
81 | [GA]: http://en.wikipedia.org/wiki/Genetic_algorithm
82 | [Python egg]: https://wiki.python.org/moin/egg
83 | [genetic algorithms]: http://en.wikipedia.org/wiki/Genetic_algorithm
84 | [gevent]: http://www.gevent.org/
85 | 


--------------------------------------------------------------------------------
/atlas.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "files": [
 3 |     "README.md"
 4 |   ],
 5 |   "formats": {
 6 |     "pdf": {
 7 |       "version": false,
 8 |       "index": false,
 9 |       "toc": false
10 |     },
11 |     "epub": {
12 |       "index": false,
13 |       "toc": false,
14 |       "epubcheck": false
15 |     },
16 |     "mobi": {
17 |       "index": false,
18 |       "toc": false
19 |     },
20 |     "html": {
21 |       "index": true,
22 |       "toc": true
23 |     }
24 |   },
25 |   "theme": "oreillymedia/atlas_tech1c_theme",
26 |   "title": "exelixi"
27 | }


--------------------------------------------------------------------------------
/bin/install.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -x
 2 | 
 3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 4 | 
 5 | # build a tarball/container for the Executor
 6 | 
 7 | rm -rf /tmp/exelixi.tgz
 8 | tar cvzf /tmp/exelixi.tgz ../exelixi-master/bin ../exelixi-master/src ../exelixi-master/dat
 9 | 
10 | # distribute tarball/container to the Mesos slaves via HDFS
11 | 
12 | hadoop fs -rm -f -R /exelixi
13 | hadoop fs -mkdir /exelixi
14 | hadoop fs -put /tmp/exelixi.tgz /exelixi
15 | 
16 | # run installer on each of the Mesos slaves
17 | 
18 | printf "UserKnownHostsFile /dev/null\nStrictHostKeyChecking no\n" >> ~/.ssh/config
19 | 
20 | while read slave
21 | do
22 |   echo $slave
23 |   ssh $slave 'bash -s' < $DIR/local_install.sh
24 |   ssh $slave 'bash -s' < $DIR/local_deploy.sh
25 | 
26 |   if [ ! -z $1 ]
27 |   then
28 |     # optional job-specific installations
29 |     ssh $slave 'bash -s' < $1
30 |   fi
31 | done


--------------------------------------------------------------------------------
/bin/local_deploy.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -x
2 | 
3 | echo "deploying Exelixi..."
4 | rm -rf exelixi.tgz exelixi-master
5 | hadoop fs -get /exelixi/exelixi.tgz
6 | tar xvzf exelixi.tgz
7 | 


--------------------------------------------------------------------------------
/bin/local_install.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -x
 2 | 
 3 | EGG="mesos_0.15.0-rc4_amd64.egg"
 4 | 
 5 | echo "installing Python/Mesos..."
 6 | ## NB: TODO de-Ubuntu-fy the Python parts of this install, hopefully via Anaconda/conda?
 7 | sudo aptitude -y install python-setuptools
 8 | sudo aptitude -y install python-protobuf
 9 | sudo aptitude -y install python-gevent
10 | sudo aptitude -y install python-psutil 
11 | sudo aptitude -y install python-dev
12 | sudo aptitude -y install python-pip
13 | 
14 | sudo aptitude -y install git
15 | sudo pip install cython
16 | sudo pip install git+https://github.com/kmike/hat-trie.git#egg=hat-trie
17 | 
18 | rm -rf $EGG
19 | wget http://downloads.mesosphere.io/master/ubuntu/13.10/$EGG
20 | sudo easy_install $EGG
21 | 
22 | echo "testing Python/Mesos..."
23 | python -c 'import mesos'
24 | 


--------------------------------------------------------------------------------
/dat/foo.tsv:
--------------------------------------------------------------------------------
1 | 93	11	23	69
2 | 


--------------------------------------------------------------------------------
/doc/00.mesos_ui.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/d2iq-archive/exelixi/81bb97d3e99fe055e3816a5692b4dc29cdce6c94/doc/00.mesos_ui.png


--------------------------------------------------------------------------------
/doc/01.framework.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/d2iq-archive/exelixi/81bb97d3e99fe055e3816a5692b4dc29cdce6c94/doc/01.framework.png


--------------------------------------------------------------------------------
/doc/02.sandbox.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/d2iq-archive/exelixi/81bb97d3e99fe055e3816a5692b4dc29cdce6c94/doc/02.sandbox.png


--------------------------------------------------------------------------------
/doc/activity.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/d2iq-archive/exelixi/81bb97d3e99fe055e3816a5692b4dc29cdce6c94/doc/activity.png


--------------------------------------------------------------------------------
/doc/classes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/d2iq-archive/exelixi/81bb97d3e99fe055e3816a5692b4dc29cdce6c94/doc/classes.png


--------------------------------------------------------------------------------
/doc/framework.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/d2iq-archive/exelixi/81bb97d3e99fe055e3816a5692b4dc29cdce6c94/doc/framework.png


--------------------------------------------------------------------------------
/doc/hashring.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/d2iq-archive/exelixi/81bb97d3e99fe055e3816a5692b4dc29cdce6c94/doc/hashring.png


--------------------------------------------------------------------------------
/doc/kernel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/d2iq-archive/exelixi/81bb97d3e99fe055e3816a5692b4dc29cdce6c94/doc/kernel.png


--------------------------------------------------------------------------------
/doc/sample_tsp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/d2iq-archive/exelixi/81bb97d3e99fe055e3816a5692b4dc29cdce6c94/doc/sample_tsp.png


--------------------------------------------------------------------------------
/doc/tsp_fitness.R:
--------------------------------------------------------------------------------
1 | data <- read.delim('~/src/exelixi/doc/tsp_fitness.tsv', header=F)
2 | plot(ecdf(data$V1), main="TSP fitness distribution", xlab="fitness value", ylab="CDF")
3 | abline(h=.8, col="blue")


--------------------------------------------------------------------------------
/doc/tsp_fitness.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/d2iq-archive/exelixi/81bb97d3e99fe055e3816a5692b4dc29cdce6c94/doc/tsp_fitness.png


--------------------------------------------------------------------------------
/doc/tsp_fitness.tsv:
--------------------------------------------------------------------------------
 1 | 0.2750
 2 | 0.2750
 3 | 0.2750
 4 | 0.3202
 5 | 0.3202
 6 | 0.3202
 7 | 0.3202
 8 | 0.3202
 9 | 0.3250
10 | 0.3250
11 | 0.3250
12 | 0.3375
13 | 0.3375
14 | 0.3375
15 | 0.3538
16 | 0.3538
17 | 0.3538
18 | 0.5096
19 | 0.5144
20 | 0.5240
21 | 0.5240
22 | 0.5240
23 | 0.5288
24 | 0.5337
25 | 0.5337
26 | 0.6106
27 | 0.6106
28 | 0.6106
29 | 0.6154
30 | 0.6154
31 | 0.6154
32 | 0.6202
33 | 0.6202
34 | 0.6202
35 | 0.6202
36 | 0.6202
37 | 0.6202
38 | 0.6202
39 | 0.6250
40 | 0.6250
41 | 0.6250
42 | 0.6298
43 | 0.6298
44 | 0.6298
45 | 0.6298
46 | 0.6298
47 | 0.6298
48 | 0.6346
49 | 0.6346
50 | 0.6394
51 | 0.6394
52 | 0.6442
53 | 0.6538
54 | 0.6538
55 | 0.6538
56 | 0.6635
57 | 0.6635
58 | 0.6635
59 | 0.6683
60 | 0.6683
61 | 0.6683
62 | 0.6731
63 | 0.6731
64 | 0.6779
65 | 0.6779
66 | 0.6779
67 | 0.6779
68 | 0.6827
69 | 0.6827
70 | 0.7163
71 | 0.7212
72 | 0.7260
73 | 0.7260
74 | 0.7308
75 | 0.7308
76 | 0.7404
77 | 0.7404
78 | 0.7500
79 | 0.7596
80 | 0.7596
81 | 0.7596
82 | 0.7644
83 | 0.7644
84 | 0.7644
85 | 0.7644
86 | 0.7644
87 | 0.7692
88 | 0.7692
89 | 0.7692
90 | 0.7692
91 | 0.7692
92 | 0.7692
93 | 0.7740
94 | 0.7740
95 | 0.7788
96 | 0.7788
97 | 0.7837
98 | 


--------------------------------------------------------------------------------
/doc/tutorial.graffle/image1.tiff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/d2iq-archive/exelixi/81bb97d3e99fe055e3816a5692b4dc29cdce6c94/doc/tutorial.graffle/image1.tiff


--------------------------------------------------------------------------------
/src/contain.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # encoding: utf-8
  3 | 
  4 | from collections import namedtuple
  5 | from gevent import Greenlet
  6 | from json import dumps, loads
  7 | from os.path import abspath
  8 | from service import UnitOfWork
  9 | from uow import UnitOfWorkFactory
 10 | import logging
 11 | import sys
 12 | 
 13 | 
 14 | ######################################################################
 15 | ## class definitions
 16 | 
 17 | class Container (object):
 18 |     """Container for a distrib Py UnitOfWork"""
 19 | 
 20 |     def __init__ (self):
 21 |         """constructor"""
 22 |         self.param_space = []
 23 | 
 24 |         ## NB: override to specify the data source
 25 |         self.file_name = abspath('dat/foo.tsv')
 26 |         ## NB: override to define the fields of a result tuple
 27 |         self.Result = namedtuple('Foo', ['bar', 'ugh'])
 28 | 
 29 | 
 30 |     def data_load (self, file_name):
 31 |         """load the specified data file"""
 32 |         ## NB: override to load the data file
 33 |         self.param_space.append(23)
 34 | 
 35 | 
 36 |     def run_calc (self, params):
 37 |         """run calculations based on the given param space element"""
 38 |         ## NB: override to calculate a job
 39 |         return self.Result(93, 11)
 40 | 
 41 | 
 42 | class ContainerUOWFactory (UnitOfWorkFactory):
 43 |     """UnitOfWorkFactory definition for distrib Py jobs"""
 44 | 
 45 |     def __init__ (self):
 46 |         #super(UnitOfWorkFactory, self).__init__()
 47 |         pass
 48 | 
 49 |     def instantiate_uow (self, uow_name, prefix):
 50 |         return ContainerUOW(uow_name, prefix, Container())
 51 | 
 52 | 
 53 | class ContainerUOW (UnitOfWork):
 54 |     """UnitOfWork definition for distrib Py jobs"""
 55 |     def __init__ (self, uow_name, prefix, container):
 56 |         super(ContainerUOW, self).__init__(uow_name, prefix)
 57 |         self._shard = {}
 58 | 
 59 |         self._container = container
 60 |         self.results = []
 61 | 
 62 | 
 63 |     def perform_task (self, payload):
 64 |         """perform a task consumed from the Worker.task_queue"""
 65 |         logging.debug(payload)
 66 | 
 67 |         if "job" in payload:
 68 |             result = self._container.run_calc(payload["job"])
 69 |             self.results.append(result)
 70 |             logging.debug(result)
 71 |         elif "nop" in payload:
 72 |             pass
 73 | 
 74 | 
 75 |     def orchestrate (self, framework):
 76 |         """initialize shards, then iterate until all percentiles are trained"""
 77 |         framework.send_ring_rest("shard/init", {})
 78 |         framework.send_ring_rest("data/load", { "file": self._container.file_name })
 79 | 
 80 |         self._container.data_load(self._container.file_name)
 81 |         framework.phase_barrier()
 82 | 
 83 |         while len(self._container.param_space) > 0:
 84 |             for shard_id, shard_uri in framework.get_worker_list():
 85 |                 if len(self._container.param_space) > 0:
 86 |                     params = self._container.param_space.pop(0)
 87 |                     framework.send_worker_rest(shard_id, shard_uri, "calc/run", { "params": params })
 88 | 
 89 |         framework.phase_barrier()
 90 | 
 91 |         # report the results
 92 |         needs_header = True
 93 | 
 94 |         for shard_msg in framework.send_ring_rest("shard/dump", {}):
 95 |             payload = loads(shard_msg)
 96 | 
 97 |             if needs_header:
 98 |                 print "\t".join(payload["fields"])
 99 |                 needs_header = False
100 | 
101 |             for result in payload["results"]:
102 |                 print "\t".join(map(lambda x: str(x), result))
103 | 
104 | 
105 |     def handle_endpoints (self, worker, uri_path, env, start_response, body):
106 |         """UnitOfWork REST endpoints, delegated from the Worker"""
107 |         if uri_path == '/shard/init':
108 |             # initialize the shard
109 |             Greenlet(self.shard_init, worker, env, start_response, body).start()
110 |             return True
111 |         elif uri_path == '/data/load':
112 |             # load the data
113 |             Greenlet(self.data_load, worker, env, start_response, body).start()
114 |             return True
115 |         elif uri_path == '/calc/run':
116 |             # run the calculations
117 |             Greenlet(self.calc_run, worker, env, start_response, body).start()
118 |             return True
119 |         elif uri_path == '/shard/dump':
120 |             # dump the results
121 |             Greenlet(self.shard_dump, worker, env, start_response, body).start()
122 |             return True
123 |         else:
124 |             return False
125 | 
126 | 
127 |     ######################################################################
128 |     ## job-specific REST endpoints implemented as gevent coroutines
129 | 
130 |     def shard_init (self, *args, **kwargs):
131 |         """initialize a shard"""
132 |         worker = args[0]
133 |         payload, start_response, body = worker.get_response_context(args[1:])
134 | 
135 |         if worker.auth_request(payload, start_response, body):
136 |             self.set_ring(worker.shard_id, worker.ring)
137 |             worker.prep_task_queue()
138 | 
139 |             start_response('200 OK', [('Content-Type', 'text/plain')])
140 |             body.put("Bokay\r\n")
141 |             body.put(StopIteration)
142 | 
143 | 
144 |     def data_load (self, *args, **kwargs):
145 |         """prepare for calculations"""
146 |         worker = args[0]
147 |         payload, start_response, body = worker.get_response_context(args[1:])
148 | 
149 |         if worker.auth_request(payload, start_response, body):
150 |             with worker.wrap_task_event():
151 |                 # HTTP response first, then initiate long-running task
152 |                 start_response('200 OK', [('Content-Type', 'text/plain')])
153 |                 body.put("Bokay\r\n")
154 |                 body.put(StopIteration)
155 | 
156 |                 # load the data file
157 |                 logging.debug(payload["file"])
158 |                 self._container.data_load(payload["file"])
159 | 
160 |                 # put a NOP into the queue, so we'll have something to join on
161 |                 worker.put_task_queue({ "nop": True })
162 | 
163 | 
164 |     def calc_run (self, *args, **kwargs):
165 |         """enqueue one calculation"""
166 |         worker = args[0]
167 |         payload, start_response, body = worker.get_response_context(args[1:])
168 | 
169 |         if worker.auth_request(payload, start_response, body):
170 |             with worker.wrap_task_event():
171 |                 # caller expects JSON response
172 |                 start_response('200 OK', [('Content-Type', 'application/json')])
173 |                 body.put(dumps({ "ok": 1 }))
174 |                 body.put("\r\n")
175 |                 body.put(StopIteration)
176 | 
177 |                 # put the params into the queue
178 |                 worker.put_task_queue({ "job": payload["params"] })
179 | 
180 | 
181 |     def shard_dump (self, *args, **kwargs):
182 |         """dump the results"""
183 |         worker = args[0]
184 |         payload, start_response, body = worker.get_response_context(args[1:])
185 | 
186 |         if worker.auth_request(payload, start_response, body):
187 |             start_response('200 OK', [('Content-Type', 'application/json')])
188 |             body.put(dumps({ "fields": self.results[0]._fields, "results": self.results }))
189 |             body.put("\r\n")
190 |             body.put(StopIteration)
191 | 
192 | 
193 | if __name__=='__main__':
194 |     ## test GA in standalone-mode, without distributed services
195 |     pass
196 | 


--------------------------------------------------------------------------------
/src/exelixi.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # encoding: utf-8
  3 | 
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | # 
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | # 
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | # author: Paco Nathan
 17 | # https://github.com/ceteri/exelixi
 18 | 
 19 | 
 20 | from argparse import ArgumentParser
 21 | from os.path import abspath
 22 | from service import Framework, Worker
 23 | from util import get_master_leader, get_master_state, pipe_slave_list
 24 | import logging
 25 | import sys
 26 | 
 27 | 
 28 | ######################################################################
 29 | ## globals
 30 | 
 31 | APP_NAME = "Exelixi"
 32 | 
 33 | 
 34 | ######################################################################
 35 | ## command line arguments
 36 | 
 37 | def parse_cli_args ():
 38 |     parser = ArgumentParser(prog="Exelixi", usage="one of the operational modes shown below...", add_help=True,
 39 |                             description="Exelixi, a distributed framework for genetic algorithms, based on Apache Mesos")
 40 | 
 41 |     group1 = parser.add_argument_group("Mesos Framework", "run as a distributed framework on an Apache Mesos cluster")
 42 |     group1.add_argument("-m", "--master", metavar="HOST:PORT", nargs=1,
 43 |                         help="location for one of the masters")
 44 |     group1.add_argument("-w", "--workers", nargs=1, type=int, default=[1],
 45 |                         help="number of workers to be launched")
 46 | 
 47 |     group1.add_argument("--cpu", nargs=1, type=int, default=[1],
 48 |                         help="CPU allocation per worker, as CPU count")
 49 |     group1.add_argument("--mem", nargs=1, type=int, default=[32],
 50 |                         help="MEM allocation per worker, as MB/shard")
 51 | 
 52 |     group2 = parser.add_argument_group("Mesos Executor", "run as an Apache Mesos executor (using no arguments)")
 53 | 
 54 |     group3 = parser.add_argument_group("Standalone Framework", "run as a test framework in standalone mode")
 55 |     group3.add_argument("-s", "--slaves", nargs="+", metavar="HOST:PORT",
 56 |                         help="list of slaves (HOST:PORT) on which to run workers")
 57 | 
 58 |     group4 = parser.add_argument_group("Standalone Worker", "run as a test worker in standalone mode")
 59 |     group4.add_argument("-p", "--port", nargs=1, metavar="PORT",
 60 |                         help="port number to use for this service")
 61 | 
 62 |     group5 = parser.add_argument_group("Nodes", "enumerate the slave nodes in an Apache Mesos cluster")
 63 |     group5.add_argument("-n", "--nodes", nargs="?", metavar="HOST:PORT",
 64 |                         help="location for one of the Apache Mesos masters")
 65 | 
 66 |     parser.add_argument("--uow", nargs=1, metavar="PKG.CLASS", default=["uow.UnitOfWorkFactory"],
 67 |                         help="subclassed UnitOfWork definition")
 68 | 
 69 |     parser.add_argument("--prefix", nargs=1, default=["hdfs://exelixi"],
 70 |                         help="path prefix for durable storage")
 71 | 
 72 |     parser.add_argument("--log", nargs=1, default=["DEBUG"],
 73 |                         help="logging level: INFO, DEBUG, WARNING, ERROR, CRITICAL")
 74 | 
 75 |     return parser.parse_args()
 76 | 
 77 | 
 78 | if __name__=='__main__':
 79 |     # interpret CLI arguments
 80 |     args = parse_cli_args()
 81 | 
 82 |     if args.nodes:
 83 |         # query and report the slave list, then exit...
 84 |         # NB: one per line, to handle large clusters gracefully
 85 |         pipe_slave_list(args.nodes)
 86 |         sys.exit(0)
 87 | 
 88 |     # set up logging
 89 |     numeric_log_level = getattr(logging, args.log[0], None)
 90 | 
 91 |     if not isinstance(numeric_log_level, int):
 92 |         raise ValueError("Invalid log level: %s" % loglevel)
 93 | 
 94 |     logging.basicConfig(format="%(asctime)s\t%(levelname)s\t%(message)s", 
 95 |                         filename="exelixi.log", 
 96 |                         filemode="w",
 97 |                         level=numeric_log_level
 98 |                         )
 99 |     logging.debug(args)
100 | 
101 |     # report settings for options
102 |     opts = []
103 | 
104 |     if args.uow:
105 |         opts.append(" ...using %s for the UnitOfWork definitions" % (args.uow[0]))
106 | 
107 |     if args.prefix:
108 |         opts.append(" ...using %s for the path prefix in durable storage" % (args.prefix[0]))
109 | 
110 |     # handle the different operational modes
111 |     if args.master:
112 |         logging.info("%s: running a Framework atop an Apache Mesos cluster", APP_NAME)
113 |         logging.info(" ...with master %s and %d workers(s)", args.master[0], args.workers[0])
114 | 
115 |         for x in opts:
116 |             logging.info(x)
117 | 
118 |         try:
119 |             from resource import MesosScheduler
120 | 
121 |             master_uri = get_master_leader(args.master[0])
122 |             exe_path = abspath(sys.argv[0])
123 | 
124 |             # run Mesos driver to launch Framework and manage resource offers
125 |             driver = MesosScheduler.start_framework(master_uri, exe_path, args.workers[0], args.uow[0], args.prefix[0], args.cpu[0], args.mem[0])
126 |             MesosScheduler.stop_framework(driver)
127 |         except ImportError as e:
128 |             logging.critical("Python module 'mesos' has not been installed", exc_info=True)
129 |             raise
130 | 
131 |     elif args.slaves:
132 |         logging.info("%s: running a Framework in standalone mode", APP_NAME)
133 |         logging.info(" ...with slave(s) %s", args.slaves)
134 | 
135 |         for x in opts:
136 |             logging.info(x)
137 | 
138 |         # run UnitOfWork orchestration via REST endpoints on the workers
139 |         fra = Framework(args.uow[0], args.prefix[0])
140 |         fra.set_worker_list(args.slaves)
141 |         fra.orchestrate_uow()
142 | 
143 |     elif args.port:
144 |         logging.info("%s: running a worker service on port %s", APP_NAME, args.port[0])
145 | 
146 |         try:
147 |             svc = Worker(port=int(args.port[0]))
148 |             svc.shard_start()
149 |         except KeyboardInterrupt:
150 |             pass
151 | 
152 |     else:
153 |         logging.info("%s: running an Executor on an Apache Mesos slave", APP_NAME)
154 | 
155 |         try:
156 |             from resource import MesosExecutor
157 |             MesosExecutor.run_executor()
158 |         except ImportError as e:
159 |             logging.critical("Python module 'mesos' has not been installed", exc_info=True)
160 |             raise
161 |         except KeyboardInterrupt:
162 |             pass
163 | 


--------------------------------------------------------------------------------
/src/ga.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # encoding: utf-8
  3 | 
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | # 
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | # 
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | # author: Paco Nathan
 17 | # https://github.com/ceteri/exelixi
 18 | 
 19 | 
 20 | from hat_trie import Trie
 21 | from collections import Counter
 22 | from gevent import Greenlet
 23 | from hashlib import sha224
 24 | from hashring import HashRing
 25 | from json import dumps, loads
 26 | from monoids import dictm
 27 | from random import random, sample
 28 | from service import UnitOfWork
 29 | from string import ascii_lowercase
 30 | from util import instantiate_class, post_distrib_rest
 31 | import logging
 32 | import sys
 33 | 
 34 | 
 35 | ######################################################################
 36 | ## class definitions
 37 | 
 38 | class Population (UnitOfWork):
 39 |     def __init__ (self, uow_name, prefix, indiv_instance):
 40 |         super(Population, self).__init__(uow_name, prefix)
 41 | 
 42 |         logging.debug("INIT POPULATION")
 43 | 
 44 |         self.indiv_class = indiv_instance.__class__
 45 |         self.total_indiv = 0
 46 |         self.current_gen = 0
 47 | 
 48 |         self._shard = {}
 49 |         self._trie = Trie(ascii_lowercase)
 50 | 
 51 | 
 52 |     def perform_task (self, payload):
 53 |         """perform a task consumed from the Worker.task_queue"""
 54 |         key = payload["key"]
 55 |         gen = payload["gen"]
 56 |         feature_set = payload["feature_set"]
 57 |         self.receive_reify(key, gen, feature_set)
 58 | 
 59 | 
 60 |     def orchestrate (self, framework):
 61 |         """
 62 |         initialize a Population of unique Individuals at generation 0,
 63 |         then iterate N times or until a "good enough" solution is found
 64 |         """
 65 |         framework.send_ring_rest("pop/init", {})
 66 |         framework.send_ring_rest("pop/gen", {})
 67 | 
 68 |         while True:
 69 |             framework.phase_barrier()
 70 | 
 71 |             if self.current_gen == self.uow_factory.n_gen:
 72 |                 break
 73 | 
 74 |             # determine the fitness cutoff threshold
 75 |             self.total_indiv = 0
 76 |             hist = {}
 77 | 
 78 |             for shard_msg in framework.send_ring_rest("pop/hist", {}):
 79 |                 logging.debug(shard_msg)
 80 |                 payload = loads(shard_msg)
 81 |                 self.total_indiv += payload["total_indiv"]
 82 |                 hist = dictm.fold([hist, payload["hist"]])
 83 | 
 84 |             # test for the terminating condition
 85 |             hist_items = map(lambda x: (float(x[0]), x[1],), sorted(hist.items(), reverse=True))
 86 | 
 87 |             if self.test_termination(self.current_gen, hist_items):
 88 |                 break
 89 | 
 90 |             ## NB: TODO save Framework state to Zookeeper
 91 | 
 92 |             # apply the fitness cutoff and breed "children" for the
 93 |             # next generation
 94 |             fitness_cutoff = self.get_fitness_cutoff(hist_items)
 95 |             framework.send_ring_rest("pop/next", { "current_gen": self.current_gen, "fitness_cutoff": fitness_cutoff })
 96 |             self.current_gen += 1
 97 | 
 98 |         # report the best Individuals in the final result
 99 |         results = []
100 | 
101 |         for l in framework.send_ring_rest("pop/enum", { "fitness_cutoff": fitness_cutoff }):
102 |             results.extend(loads(l))
103 | 
104 |         results.sort(reverse=True)
105 | 
106 |         for x in results:
107 |             # print results to stdout
108 |             print "\t".join(x)
109 | 
110 | 
111 |     def handle_endpoints (self, worker, uri_path, env, start_response, body):
112 |         """UnitOfWork REST endpoints, delegated from the Worker"""
113 |         if uri_path == '/pop/init':
114 |             # initialize the Population subset on this shard
115 |             Greenlet(self.pop_init, worker, env, start_response, body).start()
116 |             return True
117 |         elif uri_path == '/pop/gen':
118 |             # create generation 0 in this shard
119 |             Greenlet(self.pop_gen, worker, env, start_response, body).start()
120 |             return True
121 |         elif uri_path == '/pop/hist':
122 |             # calculate a partial histogram for the fitness distribution
123 |             Greenlet(self.pop_hist, worker, env, start_response, body).start()
124 |             return True
125 |         elif uri_path == '/pop/next':
126 |             # attempt to run another generation
127 |             Greenlet(self.pop_next, worker, env, start_response, body).start()
128 |             return True
129 |         elif uri_path == '/pop/enum':
130 |             # enumerate the Individuals in this shard of the Population
131 |             Greenlet(self.pop_enum, worker, env, start_response, body).start()
132 |             return True
133 |         elif uri_path == '/pop/reify':
134 |             # test/add a new Individual into the Population (birth)
135 |             Greenlet(self.pop_reify, worker, env, start_response, body).start()
136 |             return True
137 |         else:
138 |             return False
139 | 
140 | 
141 |     ######################################################################
142 |     ## GA-specific REST endpoints implemented as gevent coroutines
143 | 
144 |     def pop_init (self, *args, **kwargs):
145 |         """initialize a Population of unique Individuals on this shard"""
146 |         worker = args[0]
147 |         payload, start_response, body = worker.get_response_context(args[1:])
148 | 
149 |         if worker.auth_request(payload, start_response, body):
150 |             self.set_ring(worker.shard_id, worker.ring)
151 |             worker.prep_task_queue()
152 | 
153 |             start_response('200 OK', [('Content-Type', 'text/plain')])
154 |             body.put("Bokay\r\n")
155 |             body.put(StopIteration)
156 | 
157 | 
158 |     def pop_gen (self, *args, **kwargs):
159 |         """create generation 0 of Individuals in this shard of the Population"""
160 |         worker = args[0]
161 |         payload, start_response, body = worker.get_response_context(args[1:])
162 | 
163 |         if worker.auth_request(payload, start_response, body):
164 |             with worker.wrap_task_event():
165 |                 # HTTP response first, then initiate long-running task
166 |                 start_response('200 OK', [('Content-Type', 'text/plain')])
167 |                 body.put("Bokay\r\n")
168 |                 body.put(StopIteration)
169 | 
170 |                 self.populate(0)
171 | 
172 | 
173 |     def pop_hist (self, *args, **kwargs):
174 |         """calculate a partial histogram for the fitness distribution"""
175 |         worker = args[0]
176 |         payload, start_response, body = worker.get_response_context(args[1:])
177 | 
178 |         if worker.auth_request(payload, start_response, body):
179 |             start_response('200 OK', [('Content-Type', 'application/json')])
180 |             body.put(dumps({ "total_indiv": self.total_indiv, "hist": self.get_part_hist() }))
181 |             body.put("\r\n")
182 |             body.put(StopIteration)
183 | 
184 | 
185 |     def pop_next (self, *args, **kwargs):
186 |         """iterate N times or until a 'good enough' solution is found"""
187 |         worker = args[0]
188 |         payload, start_response, body = worker.get_response_context(args[1:])
189 | 
190 |         if worker.auth_request(payload, start_response, body):
191 |             with worker.wrap_task_event():
192 |                 # HTTP response first, then initiate long-running task
193 |                 start_response('200 OK', [('Content-Type', 'text/plain')])
194 |                 body.put("Bokay\r\n")
195 |                 body.put(StopIteration)
196 | 
197 |                 current_gen = payload["current_gen"]
198 |                 fitness_cutoff = payload["fitness_cutoff"]
199 |                 self.next_generation(current_gen, fitness_cutoff)
200 | 
201 | 
202 |     def pop_enum (self, *args, **kwargs):
203 |         """enumerate the Individuals in this shard of the Population"""
204 |         worker = args[0]
205 |         payload, start_response, body = worker.get_response_context(args[1:])
206 | 
207 |         if worker.auth_request(payload, start_response, body):
208 |             fitness_cutoff = payload["fitness_cutoff"]
209 | 
210 |             start_response('200 OK', [('Content-Type', 'application/json')])
211 |             body.put(dumps(self.enum(fitness_cutoff)))
212 |             body.put("\r\n")
213 |             body.put(StopIteration)
214 | 
215 | 
216 |     def pop_reify (self, *args, **kwargs):
217 |         """test/add a newly generated Individual into the Population (birth)"""
218 |         worker = args[0]
219 |         payload, start_response, body = worker.get_response_context(args[1:])
220 | 
221 |         if worker.auth_request(payload, start_response, body):
222 |             worker.put_task_queue(payload)
223 | 
224 |             start_response('200 OK', [('Content-Type', 'text/plain')])
225 |             body.put("Bokay\r\n")
226 |             body.put(StopIteration)
227 | 
228 | 
229 |     ######################################################################
230 |     ## Individual lifecycle within the local subset of the Population
231 | 
232 |     def populate (self, current_gen):
233 |         """initialize the population"""
234 |         for _ in xrange(self.uow_factory.n_pop):
235 |             # constructor pattern
236 |             indiv = self.indiv_class()
237 |             indiv.populate(current_gen, self.uow_factory.generate_features())
238 | 
239 |             # add the generated Individual to the Population
240 |             # failure semantics: must filter nulls from initial population
241 |             self.reify(indiv)
242 | 
243 | 
244 |     def reify (self, indiv):
245 |         """test/add a newly generated Individual into the Population (birth)"""
246 |         neighbor_shard_id = None
247 |         shard_uri = None
248 | 
249 |         if self._hash_ring:
250 |             neighbor_shard_id = self._hash_ring.get_node(indiv.key)
251 | 
252 |             if neighbor_shard_id != self._shard_id:
253 |                 shard_uri = self._shard_dict[neighbor_shard_id]
254 | 
255 |         # distribute the tasks in this phase throughout the HashRing,
256 |         # using a remote task_queue with synchronization based on a
257 |         # barrier pattern
258 | 
259 |         if shard_uri:
260 |             msg = { "key": indiv.key, "gen": indiv.gen, "feature_set": loads(indiv.get_json_feature_set()) }
261 |             lines = post_distrib_rest(self.prefix, neighbor_shard_id, shard_uri, "pop/reify", msg)
262 |             return False
263 |         else:
264 |             return self._reify_locally(indiv)
265 | 
266 | 
267 |     def receive_reify (self, key, gen, feature_set):
268 |         """test/add a received reify request """
269 |         indiv = self.indiv_class()
270 |         indiv.populate(gen, feature_set)
271 |         self._reify_locally(indiv)
272 | 
273 | 
274 |     def _reify_locally (self, indiv):
275 |         """test/add a newly generated Individual into the Population locally (birth)"""
276 |         if not (indiv.key in self._trie):
277 |             self._trie[indiv.key] = 1
278 |             self.total_indiv += 1
279 | 
280 |             # potentially an expensive operation, deferred until remote reification
281 |             indiv.get_fitness(self.uow_factory, force=True)
282 |             self._shard[indiv.key] = indiv
283 | 
284 |             return True
285 |         else:
286 |             return False
287 | 
288 | 
289 |     def evict (self, indiv):
290 |         """remove an Individual from the Population (death)"""
291 |         if indiv.key in self._shard:
292 |             # Individual only needs to be removed locally
293 |             del self._shard[indiv.key]
294 | 
295 |             # NB: serialize to disk (write behinds)
296 |             url = self._get_storage_path(indiv)
297 | 
298 | 
299 |     def get_part_hist (self):
300 |         """tally counts for the partial histogram of the fitness distribution"""
301 |         l = [ round(indiv.get_fitness(self.uow_factory, force=False), self.uow_factory.hist_granularity) for indiv in self._shard.values() ]
302 |         return dict(Counter(l))
303 | 
304 | 
305 |     def get_fitness_cutoff (self, hist_items):
306 |         """determine fitness cutoff (bin lower bounds) for the parent selection filter"""
307 |         logging.debug("fit: %s", hist_items)
308 | 
309 |         n_indiv = sum([ count for bin, count in hist_items ])
310 |         part_sum = 0
311 |         break_next = False
312 | 
313 |         for bin, count in hist_items:
314 |             if break_next:
315 |                 break
316 | 
317 |             part_sum += count
318 |             percentile = part_sum / float(n_indiv)
319 |             break_next = percentile >= self.uow_factory.selection_rate
320 | 
321 |         logging.debug("fit: percentile %f part_sum %d n_indiv %d bin %f", percentile, part_sum, n_indiv, bin)
322 |         return bin
323 | 
324 | 
325 |     def _get_storage_path (self, indiv):
326 |         """create a path for durable storage of an Individual"""
327 |         return self.prefix + "/" + indiv.key
328 | 
329 | 
330 |     def _boost_diversity (self, current_gen, indiv):
331 |         """randomly select other individuals and mutate them, to promote genetic diversity"""
332 |         if self.uow_factory.mutation_rate > random():
333 |             indiv.mutate(self, current_gen, self.uow_factory)
334 |         elif len(self._shard.values()) >= 3:
335 |             # NB: ensure that at least three parents remain in each
336 |             # shard per generation
337 |             self.evict(indiv)
338 | 
339 | 
340 |     def _select_parents (self, current_gen, fitness_cutoff):
341 |         """select the parents for the next generation"""
342 |         partition = map(lambda x: (round(x.get_fitness(), self.uow_factory.hist_granularity) > fitness_cutoff, x), self._shard.values())
343 |         good_fit = map(lambda x: x[1], filter(lambda x: x[0], partition))
344 |         poor_fit = map(lambda x: x[1], filter(lambda x: not x[0], partition))
345 | 
346 |         # randomly select other individuals to promote genetic
347 |         # diversity, while removing the remnant
348 |         for indiv in poor_fit:
349 |             self._boost_diversity(current_gen, indiv)
350 | 
351 |         return self._shard.values()
352 | 
353 | 
354 |     def next_generation (self, current_gen, fitness_cutoff):
355 |         """select/mutate/crossover parents to produce a new generation"""
356 |         parents = self._select_parents(current_gen, fitness_cutoff)
357 | 
358 |         for _ in xrange(self.uow_factory.n_pop - len(parents)):
359 |             f, m = sample(parents, 2) 
360 |             success = f.breed(self, current_gen, m, self.uow_factory)
361 | 
362 |         # backfill to replenish / avoid the dreaded Population collapse
363 |         new_count = 0
364 | 
365 |         for _ in xrange(self.uow_factory.n_pop - len(self._shard.values())):
366 |             # constructor pattern
367 |             indiv = self.indiv_class()
368 |             indiv.populate(current_gen, self.uow_factory.generate_features())
369 |             self.reify(indiv)
370 | 
371 |         logging.info("gen\t%d\tshard\t%s\tsize\t%d\ttotal\t%d", current_gen, self._shard_id, len(self._shard.values()), self.total_indiv)
372 | 
373 | 
374 |     def test_termination (self, current_gen, hist):
375 |         """evaluate the terminating condition for this generation and report progress"""
376 |         return self.uow_factory.test_termination(current_gen, hist, self.total_indiv)
377 | 
378 | 
379 |     def enum (self, fitness_cutoff):
380 |         """enum all Individuals that exceed the given fitness cutoff"""
381 |         return [[ "indiv", "%0.4f" % indiv.get_fitness(), str(indiv.gen), indiv.get_json_feature_set() ]
382 |                 for indiv in filter(lambda x: x.get_fitness() >= fitness_cutoff, self._shard.values()) ]
383 | 
384 | 
385 | class Individual (object):
386 |     def __init__ (self):
387 |         """create an Individual member of the Population"""
388 |         self.gen = None
389 |         self.key = None
390 |         self._feature_set = None
391 |         self._fitness = None
392 | 
393 | 
394 |     def get_fitness (self, uow_factory=None, force=False):
395 |         """determine the fitness ranging [0.0, 1.0]; higher is better"""
396 |         if uow_factory and uow_factory.use_force(force):
397 |             # potentially the most expensive operation, deferred with careful consideration
398 |             self._fitness = uow_factory.get_fitness(self._feature_set)
399 | 
400 |         return self._fitness
401 | 
402 | 
403 |     def get_json_feature_set (self):
404 |         """dump the feature set as a JSON string"""
405 |         return dumps(tuple(self._feature_set))
406 | 
407 | 
408 |     def populate (self, gen, feature_set):
409 |         """populate the instance variables"""
410 |         self.gen = gen
411 |         self._feature_set = feature_set
412 | 
413 |         # create a unique key using a SHA-3 digest of the JSON representing this feature set
414 |         m = sha224()
415 |         m.update(self.get_json_feature_set())
416 |         self.key = unicode(m.hexdigest())
417 | 
418 | 
419 |     def mutate (self, pop, gen, uow_factory):
420 |         """attempt to mutate the feature set"""
421 |         # constructor pattern
422 |         mutant = self.__class__()
423 |         mutant.populate(gen, uow_factory.mutate_features(self._feature_set))
424 | 
425 |         # add the mutant Individual to the Population, but remove its prior self
426 |         # failure semantics: ignore, mutation rate is approx upper bounds
427 |         if pop.reify(mutant):
428 |             pop.evict(self)
429 |             return True
430 |         else:
431 |             return False
432 | 
433 | 
434 |     def breed (self, pop, gen, mate, uow_factory):
435 |         """breed with a mate to produce a child"""
436 |         # constructor pattern
437 |         child = self.__class__()
438 |         child.populate(gen, uow_factory.breed_features(self._feature_set, mate._feature_set))
439 | 
440 |         # add the child Individual to the Population
441 |         # failure semantics: ignore, the count will rebalance over the hash ring
442 |         return pop.reify(child)
443 | 
444 | 
445 | if __name__=='__main__':
446 |     ## test GA in standalone-mode, without distributed services
447 | 
448 |     # parse command line options
449 |     if len(sys.argv) < 2:
450 |         uow_name = "uow.UnitOfWorkFactory"
451 |     else:
452 |         uow_name = sys.argv[1]
453 | 
454 |     uow_factory = instantiate_class(uow_name)
455 | 
456 |     # initialize a Population of unique Individuals at generation 0
457 |     uow = uow_factory.instantiate_uow(uow_name, "/tmp/exelixi")
458 |     uow.populate(uow.current_gen)
459 |     fitness_cutoff = 0
460 | 
461 |     # iterate N times or until a "good enough" solution is found
462 |     while uow.current_gen < uow_factory.n_gen:
463 |         hist = uow.get_part_hist()
464 |         hist_items = map(lambda x: (float(x[0]), x[1],), sorted(hist.items(), reverse=True))
465 | 
466 |         if uow.test_termination(uow.current_gen, hist_items):
467 |             break
468 | 
469 |         fitness_cutoff = uow.get_fitness_cutoff(hist_items)
470 |         uow.next_generation(uow.current_gen, fitness_cutoff)
471 | 
472 |         uow.current_gen += 1
473 | 
474 |     # report summary
475 |     for x in sorted(uow.enum(fitness_cutoff), reverse=True):
476 |         print "\t".join(x)
477 | 


--------------------------------------------------------------------------------
/src/hashring.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # encoding: utf-8
  3 | 
  4 | # Copyright (c) 2012, Amir Salihefendic
  5 | # All rights reserved.
  6 | # 
  7 | # Redistribution and use in source and binary forms, with or without modification,
  8 | # are permitted provided that the following conditions are met:
  9 | # 
 10 | # 1. Redistributions of source code must retain the above copyright notice, this 
 11 | # list of conditions and the following disclaimer.
 12 | # 
 13 | # 2. Redistributions in binary form must reproduce the above copyright notice, 
 14 | # this list of conditions and the following disclaimer in the documentation 
 15 | # and/or other materials provided with the distribution.
 16 | # 
 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 
 18 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 
 19 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 20 | # IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 21 | # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 22 | # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 23 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 24 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 25 | # OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 26 | # OF THE POSSIBILITY OF SUCH DAMAGE.
 27 | 
 28 | # author: Amir Salihefendic
 29 | # http://amix.dk/blog/post/19367
 30 | 
 31 | 
 32 | import md5
 33 | 
 34 | 
 35 | class HashRing(object):
 36 | 
 37 |     def __init__(self, nodes=None, replicas=3):
 38 |         """Manages a hash ring.
 39 | 
 40 |         `nodes` is a list of objects that have a proper __str__ representation.
 41 |         `replicas` indicates how many virtual points should be used pr. node,
 42 |         replicas are required to improve the distribution.
 43 |         """
 44 |         self.replicas = replicas
 45 | 
 46 |         self.ring = dict()
 47 |         self._sorted_keys = []
 48 | 
 49 |         if nodes:
 50 |             for node in nodes:
 51 |                 self.add_node(node)
 52 | 
 53 |     def add_node(self, node):
 54 |         """Adds a `node` to the hash ring (including a number of replicas).
 55 |         """
 56 |         for i in xrange(0, self.replicas):
 57 |             key = self.gen_key('%s:%s' % (node, i))
 58 |             self.ring[key] = node
 59 |             self._sorted_keys.append(key)
 60 | 
 61 |         self._sorted_keys.sort()
 62 | 
 63 |     def remove_node(self, node):
 64 |         """Removes `node` from the hash ring and its replicas.
 65 |         """
 66 |         for i in xrange(0, self.replicas):
 67 |             key = self.gen_key('%s:%s' % (node, i))
 68 |             del self.ring[key]
 69 |             self._sorted_keys.remove(key)
 70 | 
 71 |     def get_node(self, string_key):
 72 |         """Given a string key a corresponding node in the hash ring is returned.
 73 | 
 74 |         If the hash ring is empty, `None` is returned.
 75 |         """
 76 |         return self.get_node_pos(string_key)[0]
 77 | 
 78 |     def get_node_pos(self, string_key):
 79 |         """Given a string key a corresponding node in the hash ring is returned
 80 |         along with it's position in the ring.
 81 | 
 82 |         If the hash ring is empty, (`None`, `None`) is returned.
 83 |         """
 84 |         if not self.ring:
 85 |             return None, None
 86 | 
 87 |         key = self.gen_key(string_key)
 88 | 
 89 |         nodes = self._sorted_keys
 90 |         for i in xrange(0, len(nodes)):
 91 |             node = nodes[i]
 92 |             if key <= node:
 93 |                 return self.ring[node], i
 94 | 
 95 |         return self.ring[nodes[0]], 0
 96 | 
 97 |     def get_nodes(self, string_key):
 98 |         """Given a string key it returns the nodes as a generator that can hold the key.
 99 | 
100 |         The generator is never ending and iterates through the ring
101 |         starting at the correct position.
102 |         """
103 |         if not self.ring:
104 |             yield None, None
105 | 
106 |         node, pos = self.get_node_pos(string_key)
107 |         for key in self._sorted_keys[pos:]:
108 |             yield self.ring[key]
109 | 
110 |         while True:
111 |             for key in self._sorted_keys:
112 |                 yield self.ring[key]
113 | 
114 |     def gen_key(self, key):
115 |         """Given a string key it returns a long value,
116 |         this long value represents a place on the hash ring.
117 | 
118 |         md5 is currently used because it mixes well.
119 |         """
120 |         m = md5.new()
121 |         m.update(key)
122 |         return long(m.hexdigest(), 16)
123 | 
124 | 
125 | if __name__=='__main__':
126 |     import random
127 | 
128 |     memcache_servers = ['192.168.0.246:11212',
129 |                         '192.168.0.247:11212',
130 |                         '192.168.0.249:11212']
131 | 
132 |     ring = HashRing(memcache_servers)
133 | 
134 |     print ring.get_node('my_key')
135 |     print ring.get_node('foo bar')
136 |     print ring.get_node(str(random.random()))
137 | 


--------------------------------------------------------------------------------
/src/monoids.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | # Francisco Mota, 2011-11-09
 5 | # http://fmota.eu/blog/monoids-in-python.html
 6 | # see also: http://arxiv.org/abs/1304.7544
 7 | 
 8 | class Monoid (object):
 9 |     def __init__ (self, null, lift, op):
10 |         self.null = null
11 |         self.lift = lift
12 |         self.op   = op
13 |  
14 |     def fold (self, xs):
15 |         if hasattr(xs, "__fold__"):
16 |             return xs.__fold__(self)
17 |         else:
18 |             return reduce(self.op, (self.lift(x) for x in xs), self.null)
19 |  
20 |     def __call__ (self, *args):
21 |         return self.fold(args)
22 |  
23 |     def star (self):
24 |         return Monoid(self.null, self.fold, self.op)
25 | 
26 | 
27 | def dict_op (a, b):
28 |     for key, val in b.items():
29 |         if not key in a:
30 |             a[key] = val
31 |         else:
32 |             a[key] += val
33 | 
34 |     return a
35 | 
36 | 
37 | summ   = Monoid(0,  lambda x: x,      lambda a,b: a+b)
38 | joinm  = Monoid('', lambda x: str(x), lambda a,b: a+b)
39 | listm  = Monoid([], lambda x: [x],    lambda a,b: a+b)
40 | tuplem = Monoid((), lambda x: (x,),   lambda a,b: a+b)
41 | lenm   = Monoid(0,  lambda x: 1,      lambda a,b: a+b)
42 | prodm  = Monoid(1,  lambda x: x,      lambda a,b: a*b)
43 | dictm  = Monoid({}, lambda x: x,      lambda a,b: dict_op(a, b))
44 | 
45 | 
46 | if __name__=='__main__':
47 |     x1 = { "a": 2, "b": 3 }
48 |     x2 = { "b": 2, "c": 7 }
49 | 
50 |     print x1, x2
51 |     print dictm.fold([x1, x2])
52 | 


--------------------------------------------------------------------------------
/src/resource.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # encoding: utf-8
  3 | 
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | # 
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | # 
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | # author: Paco Nathan
 17 | # https://github.com/ceteri/exelixi
 18 | 
 19 | 
 20 | from json import dumps, loads
 21 | from service import Framework, Worker, WorkerInfo
 22 | from threading import Thread
 23 | from util import get_telemetry
 24 | from uuid import uuid1
 25 | import logging
 26 | import mesos
 27 | import mesos_pb2
 28 | import os
 29 | import subprocess
 30 | import sys
 31 | import time
 32 | 
 33 | 
 34 | ######################################################################
 35 | ## class definitions
 36 | 
 37 | class MesosScheduler (mesos.Scheduler):
 38 |     # https://github.com/apache/mesos/blob/master/src/python/src/mesos.py
 39 | 
 40 |     def __init__ (self, executor, exe_path, n_workers, uow_name, prefix, cpu_alloc, mem_alloc):
 41 |         self.executor = executor
 42 |         self.taskData = {}
 43 |         self.tasksLaunched = 0
 44 |         self.tasksFinished = 0
 45 |         self.messagesSent = 0
 46 |         self.messagesReceived = 0
 47 | 
 48 |         # resource requirements
 49 |         self._cpu_alloc = cpu_alloc
 50 |         self._mem_alloc = mem_alloc
 51 | 
 52 |         # protected members to customize for Exelixi needs
 53 |         self._executors = {}
 54 |         self._exe_path = exe_path
 55 |         self._n_workers = n_workers
 56 |         self._uow_name = uow_name
 57 |         self._prefix = prefix
 58 | 
 59 | 
 60 |     def registered (self, driver, frameworkId, masterInfo):
 61 |         """
 62 |         Invoked when the scheduler successfully registers with a Mesos
 63 |         master. It is called with the frameworkId, a unique ID
 64 |         generated by the master, and the masterInfo which is
 65 |         information about the master itself.
 66 |         """
 67 | 
 68 |         logging.info("registered with framework ID %s", frameworkId.value)
 69 | 
 70 | 
 71 |     def resourceOffers (self, driver, offers):
 72 |         """
 73 |         Invoked when resources have been offered to this framework. A
 74 |         single offer will only contain resources from a single slave.
 75 |         Resources associated with an offer will not be re-offered to
 76 |         _this_ framework until either (a) this framework has rejected
 77 |         those resources (see SchedulerDriver.launchTasks) or (b) those
 78 |         resources have been rescinded (see Scheduler.offerRescinded).
 79 |         Note that resources may be concurrently offered to more than
 80 |         one framework at a time (depending on the allocator being
 81 |         used).  In that case, the first framework to launch tasks
 82 |         using those resources will be able to use them while the other
 83 |         frameworks will have those resources rescinded (or if a
 84 |         framework has already launched tasks with those resources then
 85 |         those tasks will fail with a TASK_LOST status and a message
 86 |         saying as much).
 87 |         """
 88 | 
 89 |         logging.debug("Mesos Scheduler: received %d resource offers", len(offers))
 90 | 
 91 |         for offer in offers:
 92 |             tasks = []
 93 |             logging.debug("Mesos Scheduler: received resource offer %s", offer.id.value)
 94 | 
 95 |             ## NB: currently we force 'offer.hostname' to be unique per Executor...
 96 |             ## could be changed, but we'd need to juggle the service port numbers
 97 | 
 98 |             if self.tasksLaunched < self._n_workers and offer.hostname not in self._executors:
 99 |                 tid = self.tasksLaunched
100 |                 self.tasksLaunched += 1
101 |                 logging.debug("Mesos Scheduler: accepting offer on slave %s to start task %d", offer.hostname, tid)
102 | 
103 |                 task = mesos_pb2.TaskInfo()
104 |                 task.task_id.value = str(tid)
105 |                 task.slave_id.value = offer.slave_id.value
106 |                 task.name = "task %d" % tid
107 |                 task.executor.MergeFrom(self.executor)
108 | 
109 |                 cpus = task.resources.add()
110 |                 cpus.name = "cpus"
111 |                 cpus.type = mesos_pb2.Value.SCALAR
112 |                 cpus.scalar.value = self._cpu_alloc
113 | 
114 |                 mem = task.resources.add()
115 |                 mem.name = "mem"
116 |                 mem.type = mesos_pb2.Value.SCALAR
117 |                 mem.scalar.value = self._mem_alloc
118 | 
119 |                 tasks.append(task)
120 |                 self.taskData[task.task_id.value] = (offer.slave_id, task.executor.executor_id)
121 | 
122 |                 # record and report the Mesos slave node's telemetry and state
123 |                 self._executors[offer.hostname] = WorkerInfo(offer, task)
124 | 
125 |                 for exe in self._executors.values():
126 |                     logging.debug(exe.report())
127 | 
128 |             # request the driver to launch the task
129 |             driver.launchTasks(offer.id, tasks)
130 | 
131 | 
132 |     def statusUpdate (self, driver, update):
133 |         """
134 |         Invoked when the status of a task has changed (e.g., a slave
135 |         is lost and so the task is lost, a task finishes and an
136 |         executor sends a status update saying so, etc.) Note that
137 |         returning from this callback acknowledges receipt of this
138 |         status update.  If for whatever reason the scheduler aborts
139 |         during this callback (or the process exits) another status
140 |         update will be delivered.  Note, however, that this is
141 |         currently not true if the slave sending the status update is
142 |         lost or fails during that time.
143 |         """
144 | 
145 |         logging.debug("Mesos Scheduler: task %s is in state %d", update.task_id.value, update.state)
146 | 
147 |         if update.state == mesos_pb2.TASK_FINISHED:
148 |             self.tasksFinished += 1
149 |             slave_id, executor_id = self.taskData[update.task_id.value]
150 | 
151 |             # update WorkerInfo with telemetry from initial discovery task
152 |             telemetry = loads(str(update.data))
153 |             logging.info("telemetry from slave %s, executor %s\n%s", slave_id.value, executor_id.value, str(update.data))
154 | 
155 |             exe = self.lookup_executor(slave_id.value, executor_id.value)
156 |             exe.ip_addr = telemetry["ip_addr"]
157 | 
158 |             ## NB: TODO make the service port a parameter
159 |             exe.port = Worker.DEFAULT_PORT
160 | 
161 |             if self.tasksFinished == self._n_workers:
162 |                 logging.info("Mesos Scheduler: %d init tasks completed", self._n_workers)
163 | 
164 |             # request to launch service as a child process
165 |             self.messagesSent += 1
166 |             message = str(dumps([ self._exe_path, "-p", exe.port ]))
167 |             driver.sendFrameworkMessage(executor_id, slave_id, message)
168 | 
169 | 
170 |     def frameworkMessage (self, driver, executorId, slaveId, message):
171 |         """
172 |         Invoked when an executor sends a message. These messages are
173 |         best effort; do not expect a framework message to be
174 |         retransmitted in any reliable fashion.
175 |         """
176 | 
177 |         self.messagesReceived += 1
178 |         logging.info("Mesos Scheduler: slave %s executor %s", slaveId.value, executorId.value)
179 |         logging.info("message %d received: %s", self.messagesReceived, str(message))
180 | 
181 |         if self.messagesReceived == self._n_workers:
182 |             if self.messagesReceived != self.messagesSent:
183 |                 logging.critical("Mesos Scheduler: framework messages lost! sent %d received %d", self.messagesSent, self.messagesReceived)
184 |                 sys.exit(1)
185 | 
186 |             for exe in self._executors.values():
187 |                 logging.debug(exe.report())
188 | 
189 |             logging.info("all worker services launched and init tasks completed")
190 |             exe_info = self._executors.values()
191 |             worker_list = [ exe.get_shard_uri() for exe in exe_info ]
192 | 
193 |             # run UnitOfWork orchestration via REST endpoints on the workers
194 |             fra = Framework(self._uow_name, self._prefix)
195 |             fra.set_worker_list(worker_list, exe_info)
196 | 
197 |             time.sleep(1)
198 |             fra.orchestrate_uow()
199 | 
200 |             # shutdown the Executors after the end of an algorithm run
201 |             driver.stop()
202 | 
203 | 
204 |     def lookup_executor (self, slave_id, executor_id):
205 |         """lookup the Executor based on IDs"""
206 |         for exe in self._executors.values():
207 |             if exe.slave_id == slave_id:
208 |                 return exe
209 | 
210 | 
211 |     @staticmethod
212 |     def start_framework (master_uri, exe_path, n_workers, uow_name, prefix, cpu_alloc, mem_alloc):
213 |         # initialize an executor
214 |         executor = mesos_pb2.ExecutorInfo()
215 |         executor.executor_id.value = uuid1().hex
216 |         executor.command.value = exe_path
217 |         executor.name = "Exelixi Executor"
218 |         executor.source = "per-job build"
219 | 
220 |         ## NB: TODO download tarball/container from HDFS
221 |         #uri = executor.command.uris.add()
222 |         #uri.executable = false
223 |         #uri.value = "hdfs://namenode/exelixi/exelixi.tgz"
224 | 
225 |         # initialize the framework
226 |         framework = mesos_pb2.FrameworkInfo()
227 |         framework.user = "" # have Mesos fill in the current user
228 |         framework.name = "Exelixi Framework"
229 | 
230 |         if os.getenv("MESOS_CHECKPOINT"):
231 |             logging.debug("Mesos Scheduler: enabling checkpoint for the framework")
232 |             framework.checkpoint = True
233 |     
234 |         # create a scheduler and capture the command line options
235 |         sched = MesosScheduler(executor, exe_path, n_workers, uow_name, prefix, cpu_alloc, mem_alloc)
236 | 
237 |         # initialize a driver
238 |         if os.getenv("MESOS_AUTHENTICATE"):
239 |             logging.debug("Mesos Scheduler: enabling authentication for the framework")
240 |     
241 |             if not os.getenv("DEFAULT_PRINCIPAL"):
242 |                 logging.critical("Mesos Scheduler: expecting authentication principal in the environment")
243 |                 sys.exit(1);
244 | 
245 |             if not os.getenv("DEFAULT_SECRET"):
246 |                 logging.critical("Mesos Scheduler: expecting authentication secret in the environment")
247 |                 sys.exit(1);
248 | 
249 |             credential = mesos_pb2.Credential()
250 |             credential.principal = os.getenv("DEFAULT_PRINCIPAL")
251 |             credential.secret = os.getenv("DEFAULT_SECRET")
252 | 
253 |             driver = mesos.MesosSchedulerDriver(sched, framework, master_uri, credential)
254 |         else:
255 |             driver = mesos.MesosSchedulerDriver(sched, framework, master_uri)
256 | 
257 |         return driver
258 | 
259 | 
260 |     @staticmethod
261 |     def stop_framework (driver):
262 |         """ensure that the driver process terminates"""
263 |         status = 0 if driver.run() == mesos_pb2.DRIVER_STOPPED else 1
264 |         driver.stop();
265 |         sys.exit(status)
266 | 
267 | 
268 | class MesosExecutor (mesos.Executor):
269 |     # https://github.com/apache/mesos/blob/master/src/python/src/mesos.py
270 | 
271 |     def launchTask (self, driver, task):
272 |         """
273 |         Invoked when a task has been launched on this executor
274 |         (initiated via Scheduler.launchTasks).  Note that this task
275 |         can be realized with a thread, a process, or some simple
276 |         computation, however, no other callbacks will be invoked on
277 |         this executor until this callback has returned.
278 |         """
279 | 
280 |         ## NB: the following code runs on the Mesos slave (source of the resource offer)
281 | 
282 |         def run_task():
283 |             logging.debug("Mesos Executor: requested task %s", task.task_id.value)
284 | 
285 |             update = mesos_pb2.TaskStatus()
286 |             update.task_id.value = task.task_id.value
287 |             update.state = mesos_pb2.TASK_RUNNING
288 |             update.data = str("running discovery task")
289 | 
290 |             logging.debug(update.data)
291 |             driver.sendStatusUpdate(update)
292 | 
293 |             update = mesos_pb2.TaskStatus()
294 |             update.task_id.value = task.task_id.value
295 |             update.state = mesos_pb2.TASK_FINISHED
296 | 
297 |             ## NB: TODO test port availability...
298 |             update.data = str(dumps(get_telemetry(), indent=4))
299 | 
300 |             ## NB: TODO download tarball/container for service launch
301 | 
302 |             # notify scheduler: ready to launch service
303 |             logging.debug(update.data)
304 |             driver.sendStatusUpdate(update)
305 | 
306 |         # now create a thread to run the requested task: run tasks in
307 |         # new threads or processes, rather than inside launchTask...
308 |         # NB: gevent/coroutines/Greenlets conflict here... must run
309 |         # those in a child shell process
310 | 
311 |         thread = Thread(target=run_task)
312 |         thread.start()
313 | 
314 | 
315 |     def frameworkMessage (self, driver, message):
316 |         """
317 |         Invoked when a framework message has arrived for this
318 |         executor. These messages are best effort; do not expect a
319 |         framework message to be retransmitted in any reliable fashion.
320 |         """
321 | 
322 |         # launch service
323 |         logging.info("Mesos Executor: service launched: %s", message)
324 |         subprocess.Popen(loads(message))
325 | 
326 |         # notify scheduler: service was successfully launched
327 |         driver.sendFrameworkMessage(str("service launched"))
328 | 
329 | 
330 |     @staticmethod
331 |     def run_executor ():
332 |         """run the executor until it is stopped externally by the framework"""
333 |         driver = mesos.MesosExecutorDriver(MesosExecutor())
334 |         sys.exit(0 if driver.run() == mesos_pb2.DRIVER_STOPPED else 1)
335 | 
336 | 
337 | if __name__=='__main__':
338 |     print "Starting executor..."
339 |     MesosExecutor.run_executor()
340 | 


--------------------------------------------------------------------------------
/src/sample_lmd.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # encoding: utf-8
  3 | 
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | # 
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | # 
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | # author: Paco Nathan
 17 | # https://github.com/ceteri/exelixi
 18 | 
 19 | 
 20 | from collections import namedtuple
 21 | from copy import deepcopy
 22 | from random import randint, sample
 23 | from uow import UnitOfWorkFactory
 24 | import logging
 25 | import sys
 26 | 
 27 | 
 28 | ######################################################################
 29 | ## class definitions
 30 | 
 31 | OPS = ( "rend", "turn", "sup", "loop" )
 32 | 
 33 | Point = namedtuple('Point', 'x y')
 34 | 
 35 | DIR_W = Point(1, 0)	# DIR_N
 36 | DIR_S = Point(0, 1)	# DIR_W
 37 | DIR_E = Point(-1, 0)	# DIR_S
 38 | DIR_N = Point(0, -1)	# DIR_E
 39 | 
 40 | 
 41 | class Drone (object):
 42 |     def __init__ (self, x, y):
 43 |         self.pos = Point(x, y)
 44 |         self.dir = Point(1, 0)
 45 | 
 46 | 
 47 |     def _mod_math (self, pos, dir, mod):
 48 |         result = pos + dir
 49 | 
 50 |         if result < 0:
 51 |             result += mod
 52 |         else:
 53 |             result %= mod
 54 | 
 55 |         return result
 56 | 
 57 | 
 58 |     def exec_op_sup (self, mod, sup):
 59 |         x = self._mod_math(self.pos.x, sup.x, mod)
 60 |         y = self._mod_math(self.pos.y, sup.y, mod)
 61 |         self.pos = Point(x, y)
 62 |         return x, y
 63 | 
 64 | 
 65 |     def exec_op_move (self, mod):
 66 |         x = self._mod_math(self.pos.x, self.dir.x, mod)
 67 |         y = self._mod_math(self.pos.y, self.dir.y, mod)
 68 |         self.pos = Point(x, y)
 69 |         return x, y
 70 | 
 71 | 
 72 |     def exec_op_turn (self):
 73 |         if self.dir.x == DIR_W.x and self.dir.y == DIR_W.y:
 74 |             self.dir = DIR_N
 75 |         elif self.dir.x == DIR_S.x and self.dir.y == DIR_W.y:
 76 |             self.dir = DIR_W
 77 |         elif self.dir.x == DIR_E.x and self.dir.y == DIR_E.y:
 78 |             self.dir = DIR_S
 79 |         elif self.dir.x == DIR_N.x and self.dir.y == DIR_N.y:
 80 |             self.dir = DIR_E
 81 | 
 82 | 
 83 | class LMDFactory (UnitOfWorkFactory):
 84 |     """UnitOfWork definition for Lawnmower Drone GP"""
 85 | 
 86 |     def __init__ (self):
 87 |         #super(UnitOfWorkFactory, self).__init__()
 88 |         self.n_pop = 300
 89 |         self.n_gen = 200
 90 |         self.max_indiv = 20000
 91 |         self.selection_rate = 0.3
 92 |         self.mutation_rate = 0.3
 93 |         self.term_limit = 5.0e-02
 94 |         self.hist_granularity = 3
 95 | 
 96 |         self.grid = [
 97 |             [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ],
 98 |             [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ],
 99 |             [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ],
100 |             [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ],
101 |             [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ],
102 |             [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ],
103 |             [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ],
104 |             [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ],
105 |             [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ],
106 |             [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ],
107 |             ]
108 | 
109 |         # sampling parameters
110 |         self.length = len(self.grid) ** 2
111 |         self.min = 0
112 |         self.max = len(OPS) - 1
113 | 
114 | 
115 |     def generate_features (self):
116 |         """generate a new feature set for a lawnmower drone"""
117 |         rand_len = randint(1, self.length)
118 |         feature_set = []
119 | 
120 |         while len(feature_set) < rand_len:
121 |             op = randint(self.min, self.max)
122 | 
123 |             if op == OPS.index("sup"):
124 |                 feature_set.append(op)
125 |                 feature_set.append(randint(0, len(self.grid) - 1))
126 |                 feature_set.append(randint(0, len(self.grid) - 1))
127 | 
128 |             elif op == OPS.index("loop"):
129 |                 if len(feature_set) > 2:
130 |                     offset = randint(1, len(feature_set) - 1)
131 |                     feature_set.append(op)
132 |                     feature_set.append(offset)
133 | 
134 |             else:
135 |                 feature_set.append(op)
136 | 
137 |         return feature_set
138 | 
139 | 
140 |     def mutate_features (self, feature_set):
141 |         """mutate a copy of the given GP program"""
142 |         pos_to_mutate = randint(0, len(feature_set) - 1)
143 |         mutated_feature_set = list(feature_set)
144 |         mutated_feature_set[pos_to_mutate] = randint(self.min, self.max)
145 |         return mutated_feature_set
146 | 
147 | 
148 |     def breed_features (self, f_feature_set, m_feature_set):
149 |         """breed two GP programs to produce a toddler GP program"""
150 |         split = randint(1, min(len(f_feature_set), len(m_feature_set)))
151 |         return f_feature_set[split:] + m_feature_set[:split]
152 | 
153 | 
154 |     def _simulate (self, grid, code, drone):
155 |         """simulate the lawnmower grid"""
156 |         sp = 0
157 |         mod = len(self.grid)
158 |         num_ops = 0
159 |         max_ops = self.length
160 |         result = None
161 | 
162 |         try:
163 |             while sp < len(code) and num_ops < max_ops:
164 |                 num_ops += 1
165 |                 op = code[sp]
166 | 
167 |                 if op == OPS.index("rend"):
168 |                     x, y = drone.exec_op_move(mod)
169 |                     grid[y][x] = 0
170 | 
171 |                 elif op == OPS.index("turn"):
172 |                     drone.exec_op_turn()
173 | 
174 |                 elif op == OPS.index("sup"):
175 |                     sup = Point(code[sp + 1], code[sp + 2])
176 |                     sp += 2
177 | 
178 |                     if sup.x == 0 and sup.y == 0:
179 |                         return None
180 | 
181 |                     x, y = drone.exec_op_sup(mod, sup)
182 |                     grid[y][x] = 0
183 | 
184 |                 elif op == OPS.index("loop"):
185 |                     offset = code[sp + 1]
186 | 
187 |                     if offset == 0 or offset > sp:
188 |                         return None
189 | 
190 |                     sp -= offset
191 | 
192 |                 else:
193 |                     return None
194 | 
195 |                 #print num_ops, sp, "pos", drone.pos, "dir", drone.dir
196 |                 sp += 1
197 | 
198 |             result = grid
199 | 
200 |         finally:
201 |             return result
202 | 
203 | 
204 |     def get_fitness (self, feature_set):
205 |         """determine the fitness ranging [0.0, 1.0]; higher is better"""
206 |         drone = Drone(randint(0, len(self.grid)), randint(0, len(self.grid)))
207 |         grid = self._simulate(deepcopy(self.grid), feature_set, drone)
208 |         fitness = 0.0
209 | 
210 |         if grid:
211 |             terrorists = 0
212 | 
213 |             for row in grid:
214 |                 #print row
215 |                 terrorists += sum(row)
216 | 
217 |             fitness = (self.length - terrorists) / float(self.length)
218 | 
219 |             if len(feature_set) > 5:
220 |                 penalty = len(feature_set) / 10.0
221 |                 fitness /= penalty
222 | 
223 |         #print fitness, feature_set
224 |         return fitness
225 | 
226 | 
227 | if __name__=='__main__':
228 |     uow = LMDFactory()
229 | 
230 |     print uow.grid
231 | 


--------------------------------------------------------------------------------
/src/sample_tsp.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # encoding: utf-8
  3 | 
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | # 
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | # 
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | # author: Paco Nathan
 17 | # https://github.com/ceteri/exelixi
 18 | 
 19 | 
 20 | from random import randint, sample
 21 | from uow import UnitOfWorkFactory
 22 | import logging
 23 | import sys
 24 | 
 25 | 
 26 | ######################################################################
 27 | ## class definitions
 28 | 
 29 | class TSPFactory (UnitOfWorkFactory):
 30 |     """UnitOfWork definition for Traveling Salesperson Problem"""
 31 | 
 32 |     def __init__ (self):
 33 |         #super(UnitOfWorkFactory, self).__init__()
 34 |         self.n_pop = 10
 35 |         self.n_gen = 23
 36 |         self.max_indiv = 2000
 37 |         self.selection_rate = 0.2
 38 |         self.mutation_rate = 0.02
 39 |         self.term_limit = 5.0e-03
 40 |         self.hist_granularity = 3
 41 | 
 42 |         # cost matrix for an example TSP: optimize the bicycling route
 43 |         # for weekend chores in Mountain View for a young Steve Jobs
 44 |         # tuple definition: (name, addr, duration)
 45 | 
 46 |         self.route_meta = ( ( "Home", "secret", 0 ),
 47 |                             ( "Piazzas Fine Foods", "3922 Middlefield Rd, Palo Alto, CA 94303", 45 ),
 48 |                             ( "Mountain View Public Library", "585 Franklin St, Mountain View, CA 94041", 30 ),
 49 |                             ( "Seascapes Fish & Pets Inc", "298 Castro St, Mountain View, CA 94041", 10 ),
 50 |                             ( "Dana Street Roasting Company", "744 W Dana St, Mountain View, CA 94041", 20 ),
 51 |                             ( "Supercuts", "2420 Charleston Rd, Mountain View, CA 94043", 60 ),
 52 |                             )
 53 | 
 54 |         self.route_cost = ( ( 0, 7, 11, 12, 14, 8 ),
 55 |                             ( 7, 0, 18, 18, 19, 5 ),
 56 |                             ( 14, 19, 0, 2, 3, 19 ),
 57 |                             ( 12, 20, 3, 0, 1, 19 ),
 58 |                             ( 12, 18, 3, 1, 0, 18 ),
 59 |                             ( 8, 5, 18, 18, 19, 0 ),
 60 |                             )
 61 | 
 62 |         # sampling parameters
 63 |         self.length = len(self.route_cost) - 1
 64 |         self.min = 1
 65 |         self.max = self.length
 66 | 
 67 | 
 68 |     def generate_features (self):
 69 |         """generate a new feature set for young Steve pedaling"""
 70 |         features = []
 71 |         expected = list(xrange(self.min, self.max + 1))
 72 | 
 73 |         # sample row indices in the cost matrix, without replacement
 74 |         for _ in xrange(self.length):
 75 |             x = sample(expected, 1)[0]
 76 |             features.append(x)
 77 |             expected.remove(x)
 78 | 
 79 |         return features
 80 | 
 81 | 
 82 |     def mutate_features (self, feature_set):
 83 |         """mutate a copy of the given feature set"""
 84 |         pos_to_mutate = randint(0, len(feature_set) - 1)
 85 |         mutated_feature_set = list(feature_set)
 86 |         mutated_feature_set[pos_to_mutate] = randint(self.min, self.max)
 87 |         return mutated_feature_set
 88 | 
 89 | 
 90 |     def breed_features (self, f_feature_set, m_feature_set):
 91 |         """breed two feature sets to produce a child"""
 92 |         half = len(f_feature_set) / 2
 93 |         return f_feature_set[half:] + m_feature_set[:half]
 94 | 
 95 | 
 96 |     def get_fitness (self, feature_set):
 97 |         """determine the fitness ranging [0.0, 1.0]; higher is better"""
 98 |         #print feature_set
 99 | 
100 |         # 1st estimator: all points were visited?
101 |         expected = set(xrange(self.min, self.max + 1))
102 |         observed = set(feature_set)
103 |         cost1 = len(expected - observed) / float(len(expected))
104 |         #print expected, observed, cost1
105 | 
106 |         # 2nd estimator: travel time was minimized?
107 |         total_cost = 0
108 |         worst_case = float(sum(self.route_cost[0])) * 2.0
109 |         x0 = 0
110 | 
111 |         for x1 in feature_set:
112 |             total_cost += self.route_cost[x0][x1]
113 |             x0 = x1
114 | 
115 |         total_cost += self.route_cost[x0][0]
116 |         cost2 = min(1.0, total_cost / worst_case)
117 |         #print total_cost, worst_case, cost2
118 | 
119 |         # combine the two estimators into a fitness score
120 |         fitness = 1.0 - (cost1 + cost2) / 2.0
121 | 
122 |         if cost1 > 0.0:
123 |             fitness /= 2.0
124 | 
125 |         #print cost1, cost2, fitness, feature_set
126 |         return fitness
127 | 
128 | 
129 | if __name__=='__main__':
130 |     uow = TSPFactory()
131 | 
132 |     print uow.route_meta
133 |     print uow.route_cost
134 | 


--------------------------------------------------------------------------------
/src/service.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # encoding: utf-8
  3 | 
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | # 
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | # 
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | # author: Paco Nathan
 17 | # https://github.com/ceteri/exelixi
 18 | 
 19 | 
 20 | from contextlib import contextmanager
 21 | from gevent import monkey, shutdown, signal, spawn, wsgi, Greenlet
 22 | from gevent.event import Event
 23 | from gevent.queue import JoinableQueue
 24 | from hashring import HashRing
 25 | from json import dumps, loads
 26 | from signal import SIGQUIT
 27 | from util import instantiate_class, post_distrib_rest
 28 | from uuid import uuid1
 29 | import logging
 30 | import sys
 31 | 
 32 | 
 33 | ######################################################################
 34 | ## class definitions
 35 | 
 36 | class Worker (object):
 37 |     # http://www.gevent.org/gevent.wsgi.html
 38 |     # http://toastdriven.com/blog/2011/jul/31/gevent-long-polling-you/
 39 |     # http://blog.pythonisito.com/2012/07/gevent-and-greenlets.html
 40 | 
 41 |     DEFAULT_PORT = "9311"
 42 | 
 43 | 
 44 |     def __init__ (self, port=DEFAULT_PORT):
 45 |         # REST services
 46 |         monkey.patch_all()
 47 |         signal(SIGQUIT, shutdown)
 48 |         self.is_config = False
 49 |         self.server = wsgi.WSGIServer(('', int(port)), self._response_handler, log=None)
 50 | 
 51 |         # sharding
 52 |         self.prefix = None
 53 |         self.shard_id = None
 54 |         self.ring = None
 55 | 
 56 |         # concurrency based on message passing / barrier pattern
 57 |         self._task_event = None
 58 |         self._task_queue = None
 59 | 
 60 |         # UnitOfWork
 61 |         self._uow = None
 62 | 
 63 | 
 64 |     def shard_start (self):
 65 |         """start the worker service for this shard"""
 66 |         self.server.serve_forever()
 67 | 
 68 | 
 69 |     def shard_stop (self, *args, **kwargs):
 70 |         """stop the worker service for this shard"""
 71 |         payload = args[0]
 72 | 
 73 |         if (self.prefix == payload["prefix"]) and (self.shard_id == payload["shard_id"]):
 74 |             logging.info("worker service stopping... you can safely ignore any exceptions that follow")
 75 |             self.server.stop()
 76 |         else:
 77 |             # returns incorrect response in this case, to avoid exception
 78 |             logging.error("incorrect shard %s prefix %s", payload["shard_id"], payload["prefix"])
 79 | 
 80 | 
 81 |     ######################################################################
 82 |     ## authentication methods
 83 | 
 84 |     def auth_request (self, payload, start_response, body):
 85 |         """test the authentication credentials for a REST call"""
 86 |         if (self.prefix == payload["prefix"]) and (self.shard_id == payload["shard_id"]):
 87 |             return True
 88 |         else:
 89 |             # UoW caller did not provide correct credentials to access shard
 90 |             start_response('403 Forbidden', [('Content-Type', 'text/plain')])
 91 |             body.put("Forbidden, incorrect credentials for this shard\r\n")
 92 |             body.put(StopIteration)
 93 | 
 94 |             logging.error("incorrect credentials shard %s prefix %s", payload["shard_id"], payload["prefix"])
 95 |             return False
 96 | 
 97 | 
 98 |     def shard_config (self, *args, **kwargs):
 99 |         """configure the service to run a shard"""
100 |         payload, start_response, body = self.get_response_context(args)
101 | 
102 |         if self.is_config:
103 |             # hey, somebody call security...
104 |             start_response('403 Forbidden', [('Content-Type', 'text/plain')])
105 |             body.put("Forbidden, shard is already in a configured state\r\n")
106 |             body.put(StopIteration)
107 | 
108 |             logging.warning("denied configuring shard %s prefix %s", self.shard_id, self.prefix)
109 |         else:
110 |             self.is_config = True
111 |             self.prefix = payload["prefix"]
112 |             self.shard_id = payload["shard_id"]
113 | 
114 |             # dependency injection for UnitOfWork
115 |             uow_name = payload["uow_name"]
116 |             logging.info("initializing unit of work based on %s", uow_name)
117 | 
118 |             ff = instantiate_class(uow_name)
119 |             self._uow = ff.instantiate_uow(uow_name, self.prefix)
120 | 
121 |             start_response('200 OK', [('Content-Type', 'text/plain')])
122 |             body.put("Bokay\r\n")
123 |             body.put(StopIteration)
124 | 
125 |             logging.info("configuring shard %s prefix %s", self.shard_id, self.prefix)
126 | 
127 | 
128 |     ######################################################################
129 |     ## barrier pattern methods
130 | 
131 |     @contextmanager
132 |     def wrap_task_event (self):
133 |         """initialize a gevent.Event, to which the UnitOfWork will wait as a listener"""
134 |         self._task_event = Event()
135 |         yield
136 | 
137 |         # complete the Event, notifying the UnitOfWork which waited
138 |         self._task_event.set()
139 |         self._task_event = None
140 | 
141 | 
142 |     def _consume_task_queue (self):
143 |         """consume/serve requests until the task_queue empties"""
144 |         while True:
145 |             payload = self._task_queue.get()
146 | 
147 |             try:
148 |                 self._uow.perform_task(payload)
149 |             finally:
150 |                 self._task_queue.task_done()
151 | 
152 | 
153 |     def prep_task_queue (self):
154 |         """prepare task_queue for another set of distributed tasks"""
155 |         self._task_queue = JoinableQueue()
156 |         spawn(self._consume_task_queue)
157 | 
158 | 
159 |     def put_task_queue (self, payload):
160 |         """put the given task definition into the task_queue"""
161 |         self._task_queue.put_nowait(payload)
162 | 
163 | 
164 |     def queue_wait (self, *args, **kwargs):
165 |         """wait until all shards finished sending task_queue requests"""
166 |         payload, start_response, body = self.get_response_context(args)
167 | 
168 |         if self.auth_request(payload, start_response, body):
169 |             if self._task_event:
170 |                 self._task_event.wait()
171 | 
172 |             # HTTP response first, then initiate long-running task
173 |             start_response('200 OK', [('Content-Type', 'text/plain')])
174 |             body.put("Bokay\r\n")
175 |             body.put(StopIteration)
176 | 
177 | 
178 |     def queue_join (self, *args, **kwargs):
179 |         """join on the task_queue, as a barrier to wait until it empties"""
180 |         payload, start_response, body = self.get_response_context(args)
181 | 
182 |         if self.auth_request(payload, start_response, body):
183 |             start_response('200 OK', [('Content-Type', 'text/plain')])
184 |             body.put("join queue...\r\n")
185 | 
186 |             ## NB: TODO this step of emptying out the task_queue on
187 |             ## shards could take a while on a large run... perhaps use
188 |             ## a long-polling HTTP request or websocket instead?
189 |             self._task_queue.join()
190 | 
191 |             body.put("done\r\n")
192 |             body.put(StopIteration)
193 | 
194 | 
195 |     ######################################################################
196 |     ## hash ring methods
197 | 
198 |     def ring_init (self, *args, **kwargs):
199 |         """initialize the HashRing"""
200 |         payload, start_response, body = self.get_response_context(args)
201 | 
202 |         if self.auth_request(payload, start_response, body):
203 |             self.ring = payload["ring"]
204 | 
205 |             start_response('200 OK', [('Content-Type', 'text/plain')])
206 |             body.put("Bokay\r\n")
207 |             body.put(StopIteration)
208 | 
209 |             logging.info("setting hash ring %s", self.ring)
210 | 
211 | 
212 |     ######################################################################
213 |     ## WSGI handler for REST endpoints
214 | 
215 |     def get_response_context (self, args):
216 |         """decode the WSGI response context from the Greenlet args"""
217 |         env = args[0]
218 |         msg = env["wsgi.input"].read()
219 |         payload = loads(msg)
220 |         start_response = args[1]
221 |         body = args[2]
222 | 
223 |         return payload, start_response, body
224 | 
225 | 
226 |     def _response_handler (self, env, start_response):
227 |         """handle HTTP request/response"""
228 |         uri_path = env["PATH_INFO"]
229 |         body = JoinableQueue()
230 | 
231 |         if self._uow and self._uow.handle_endpoints(self, uri_path, env, start_response, body):
232 |             pass
233 | 
234 |         ##########################################
235 |         # Worker endpoints
236 | 
237 |         elif uri_path == '/shard/config':
238 |             # configure the service to run a shard
239 |             Greenlet(self.shard_config, env, start_response, body).start()
240 | 
241 |         elif uri_path == '/shard/stop':
242 |             # shutdown the service
243 |             ## NB: must parse POST data specially, to avoid exception
244 |             payload = loads(env["wsgi.input"].read())
245 |             Greenlet(self.shard_stop, payload).start_later(1)
246 | 
247 |             # HTTP response starts first, to avoid error after server stops
248 |             start_response('200 OK', [('Content-Type', 'text/plain')])
249 |             body.put("Goodbye\r\n")
250 |             body.put(StopIteration)
251 | 
252 |         elif uri_path == '/queue/wait':
253 |             # wait until all shards have finished sending task_queue requests
254 |             Greenlet(self.queue_wait, env, start_response, body).start()
255 | 
256 |         elif uri_path == '/queue/join':
257 |             # join on the task_queue, as a barrier to wait until it empties
258 |             Greenlet(self.queue_join, env, start_response, body).start()
259 | 
260 |         elif uri_path == '/check/persist':
261 |             ## NB: TODO checkpoint the service state to durable storage
262 |             start_response('200 OK', [('Content-Type', 'text/plain')])
263 |             body.put("Bokay\r\n")
264 |             body.put(StopIteration)
265 | 
266 |         elif uri_path == '/check/recover':
267 |             ## NB: TODO restart the service, recovering from most recent checkpoint
268 |             start_response('200 OK', [('Content-Type', 'text/plain')])
269 |             body.put("Bokay\r\n")
270 |             body.put(StopIteration)
271 | 
272 |         ##########################################
273 |         # HashRing endpoints
274 | 
275 |         elif uri_path == '/ring/init':
276 |             # initialize the HashRing
277 |             Greenlet(self.ring_init, env, start_response, body).start()
278 | 
279 |         elif uri_path == '/ring/add':
280 |             ## NB: TODO add a node to the HashRing
281 |             start_response('200 OK', [('Content-Type', 'text/plain')])
282 |             body.put("Bokay\r\n")
283 |             body.put(StopIteration)
284 | 
285 |         elif uri_path == '/ring/del':
286 |             ## NB: TODO delete a node from the HashRing
287 |             start_response('200 OK', [('Content-Type', 'text/plain')])
288 |             body.put("Bokay\r\n")
289 |             body.put(StopIteration)
290 | 
291 |         ##########################################
292 |         # utility endpoints
293 | 
294 |         elif uri_path == '/':
295 |             # dump info about the service in general
296 |             start_response('200 OK', [('Content-Type', 'text/plain')])
297 |             body.put(str(env) + "\r\n")
298 |             body.put(StopIteration)
299 | 
300 |         else:
301 |             # ne znayu
302 |             start_response('404 Not Found', [('Content-Type', 'text/plain')])
303 |             body.put('Not Found\r\n')
304 |             body.put(StopIteration)
305 | 
306 |         return body
307 | 
308 | 
309 | class WorkerInfo (object):
310 |     def __init__ (self, offer, task):
311 |         self.host = offer.hostname
312 |         self.slave_id = offer.slave_id.value
313 |         self.task_id = task.task_id.value
314 |         self.executor_id = task.executor.executor_id.value
315 |         self.ip_addr = None
316 |         self.port = None
317 | 
318 |     def get_shard_uri (self):
319 |         """generate a URI for this worker service"""
320 |         return self.ip_addr + ":" + self.port
321 | 
322 | 
323 |     def report (self):
324 |         """report the slave telemetry + state"""
325 |         return "host %s slave %s task %s exe %s ip %s:%s" % (self.host, self.slave_id, str(self.task_id), self.executor_id, self.ip_addr, self.port)
326 | 
327 | 
328 | class Framework (object):
329 |     def __init__ (self, uow_name, prefix="/tmp/exelixi"):
330 |         """initialize the system parameters, which represent operational state"""
331 |         self.uuid = uuid1().hex
332 |         self.prefix = prefix + "/" + self.uuid
333 |         logging.info("prefix: %s", self.prefix)
334 | 
335 |         # dependency injection for UnitOfWork
336 |         self.uow_name = uow_name
337 |         logging.info("initializing unit of work based on %s", uow_name)
338 | 
339 |         ff = instantiate_class(self.uow_name)
340 |         self._uow = ff.instantiate_uow(self.uow_name, self.prefix)
341 | 
342 |         self._shard_assoc = None
343 |         self._ring = None
344 | 
345 | 
346 |     def _gen_shard_id (self, i, n):
347 |         """generate a shard_id"""
348 |         s = str(i)
349 |         z = ''.join([ '0' for _ in xrange(len(str(n)) - len(s)) ])
350 |         return "shard/" + z + s
351 | 
352 | 
353 |     def set_worker_list (self, worker_list, exe_info=None):
354 |         """associate shards with Executors"""
355 |         self._shard_assoc = {}
356 | 
357 |         for i in xrange(len(worker_list)):
358 |             shard_id = self._gen_shard_id(i, len(worker_list))
359 | 
360 |             if not exe_info:
361 |                 self._shard_assoc[shard_id] = [worker_list[i], None]
362 |             else:
363 |                 self._shard_assoc[shard_id] = [worker_list[i], exe_info[i]]
364 | 
365 |         logging.info("shard list: %s", str(self._shard_assoc))
366 | 
367 | 
368 |     def get_worker_list (self):
369 |         """generator for the worker shards"""
370 |         for shard_id, (shard_uri, exe_info) in self._shard_assoc.items():
371 |             yield shard_id, shard_uri
372 | 
373 | 
374 |     def get_worker_count (self):
375 |         """count the worker shards"""
376 |         return len(self._shard_assoc)
377 | 
378 | 
379 |     def send_worker_rest (self, shard_id, shard_uri, path, base_msg):
380 |         """access a REST endpoint on the specified shard"""
381 |         return post_distrib_rest(self.prefix, shard_id, shard_uri, path, base_msg)
382 | 
383 | 
384 |     def send_ring_rest (self, path, base_msg):
385 |         """access a REST endpoint on each of the shards"""
386 |         json_str = []
387 | 
388 |         for shard_id, (shard_uri, exe_info) in self._shard_assoc.items():
389 |             lines = post_distrib_rest(self.prefix, shard_id, shard_uri, path, base_msg)
390 |             json_str.append(lines[0])
391 | 
392 |         return json_str
393 | 
394 | 
395 |     def phase_barrier (self):
396 |         """
397 |         implements a two-phase barrier to (1) wait until all shards
398 |         have finished sending task_queue requests, then (2) join on
399 |         each task_queue, to wait until it has emptied
400 |         """
401 |         self.send_ring_rest("queue/wait", {})
402 |         self.send_ring_rest("queue/join", {})
403 | 
404 | 
405 |     def orchestrate_uow (self):
406 |         """orchestrate a UnitOfWork distributed across the HashRing via REST endpoints"""
407 |         # configure the shards and the hash ring
408 |         self.send_ring_rest("shard/config", { "uow_name": self.uow_name })
409 | 
410 |         self._ring = { shard_id: shard_uri for shard_id, (shard_uri, exe_info) in self._shard_assoc.items() }
411 |         self.send_ring_rest("ring/init", { "ring": self._ring })
412 | 
413 |         # distribute the UnitOfWork tasks
414 |         self._uow.orchestrate(self)
415 | 
416 |         # shutdown
417 |         self.send_ring_rest("shard/stop", {})
418 | 
419 | 
420 | class UnitOfWork (object):
421 |     def __init__ (self, uow_name, prefix):
422 |         self.uow_name = uow_name
423 |         self.uow_factory = instantiate_class(uow_name)
424 | 
425 |         self.prefix = prefix
426 | 
427 |         self._shard_id = None
428 |         self._shard_dict = None
429 |         self._hash_ring = None
430 | 
431 | 
432 |     def set_ring (self, shard_id, shard_dict):
433 |         """initialize the HashRing"""
434 |         self._shard_id = shard_id
435 |         self._shard_dict = shard_dict
436 |         self._hash_ring = HashRing(shard_dict.keys())
437 | 
438 | 
439 |     def perform_task (self, payload):
440 |         """perform a task consumed from the Worker.task_queue"""
441 |         pass
442 | 
443 | 
444 |     def orchestrate (self, framework):
445 |         """orchestrate Workers via REST endpoints"""
446 |         pass
447 | 
448 | 
449 |     def handle_endpoints (self, worker, uri_path, env, start_response, body):
450 |         """UnitOfWork REST endpoints"""
451 |         pass
452 | 
453 | 
454 | if __name__=='__main__':
455 |     if len(sys.argv) < 2:
456 |         print "usage:\n  %s <host:port> <factory>" % (sys.argv[0])
457 |         sys.exit(1)
458 | 
459 |     shard_uri = sys.argv[1]
460 |     uow_name = sys.argv[2]
461 | 
462 |     fra = Framework(uow_name)
463 |     print "framework launching based on %s stored at %s..." % (fra.uow_name, fra.prefix)
464 | 
465 |     fra.set_worker_list([ shard_uri ])
466 |     fra.orchestrate_uow()
467 | 


--------------------------------------------------------------------------------
/src/uow.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # encoding: utf-8
  3 | 
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | # 
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | # 
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | # author: Paco Nathan
 17 | # https://github.com/ceteri/exelixi
 18 | 
 19 | 
 20 | from ga import Individual, Population
 21 | from random import randint
 22 | from util import instantiate_class
 23 | import logging
 24 | 
 25 | 
 26 | ######################################################################
 27 | ## class definitions
 28 | 
 29 | class UnitOfWorkFactory (object):
 30 |     """encapsulates all of the dependency injection and UnitOfWork definitions"""
 31 | 
 32 |     def __init__ (self):
 33 |         ## NB: override these GA parameters
 34 |         self.n_pop = 23
 35 |         self.n_gen = 10
 36 |         self.term_limit = 5.0e-03
 37 |         self.hist_granularity = 3
 38 |         self.selection_rate = 0.2
 39 |         self.mutation_rate = 0.02
 40 |         self.max_indiv = 2000
 41 | 
 42 |         ## NB: override these feature set parameters
 43 |         self.length = 5
 44 |         self.min = 0
 45 |         self.max = 100
 46 |         self.target = 231
 47 | 
 48 | 
 49 |     def instantiate_uow (self, uow_name, prefix):
 50 |         """instantiate a UnitOfWork, to decouple services from the GA problem domain"""
 51 |         ## NB: override these class references to customize the GA definition
 52 |         return Population(uow_name, prefix, Individual())
 53 | 
 54 | 
 55 |     def get_fitness (self, feature_set):
 56 |         """determine the fitness ranging [0.0, 1.0]; higher is better"""
 57 |         ## NB: override this fitness function
 58 |         return 1.0 - abs(sum(feature_set) - self.target) / float(self.target)
 59 | 
 60 | 
 61 |     def use_force (self, force):
 62 |         """determine whether to force recalculation of a fitness function"""
 63 |         # NB: override in some use cases, e.g., when required for evaluating shared resources
 64 |         return force
 65 | 
 66 | 
 67 |     def generate_features (self):
 68 |         """generate a new feature set"""
 69 |         ## NB: override this feature set generator
 70 |         return sorted([ randint(self.min, self.max) for _ in xrange(self.length) ])
 71 | 
 72 | 
 73 |     def mutate_features (self, feature_set):
 74 |         """mutate a copy of the given feature set"""
 75 |         ## NB: override this feature set mutator
 76 |         pos_to_mutate = randint(0, len(feature_set) - 1)
 77 |         mutated_feature_set = list(feature_set)
 78 |         mutated_feature_set[pos_to_mutate] = randint(self.min, self.max)
 79 |         return sorted(mutated_feature_set)
 80 | 
 81 | 
 82 |     def breed_features (self, f_feature_set, m_feature_set):
 83 |         """breed two feature sets to produce a child"""
 84 |         ## NB: override this feature set crossover
 85 |         half = len(f_feature_set) / 2
 86 |         return sorted(f_feature_set[half:] + m_feature_set[:half])
 87 | 
 88 | 
 89 |     def _calc_median_hist (self, hist_items, n_indiv):
 90 |         """calculate the median from a fitness histogram"""
 91 |         sum_count = 0
 92 |         mid_count = float(n_indiv) / 2
 93 | 
 94 |         if n_indiv == 1:
 95 |             return hist_items[0][0]
 96 |         else:
 97 |             for i in xrange(len(hist_items)):
 98 |                 bin, count = hist_items[i]
 99 |                 sum_count += count
100 | 
101 |                 if sum_count == mid_count:
102 |                     return bin
103 |                 elif sum_count > mid_count:
104 |                     bin0, count0 = hist_items[i - 1]
105 |                     return ((bin0 * count0) + (bin * count)) / (count0 + count)
106 | 
107 | 
108 |     def test_termination (self, current_gen, hist_items, total_indiv):
109 |         """evaluate the terminating condition for this generation and report progress"""
110 |         ## NB: override this termination test
111 | 
112 |         # calculate a mean squared error (MSE) of fitness for a Population
113 |         hist_keys = map(lambda x: x[0], hist_items)
114 |         n_indiv = sum([ count for bin, count in hist_items ])
115 |         fit_mse = sum([ count * (1.0 - float(bin)) ** 2.0 for bin, count in hist_items ]) / float(n_indiv)
116 | 
117 |         # calculate summary stats
118 |         fit_max = max(hist_keys)
119 |         fit_avg = sum(hist_keys) / float(n_indiv)
120 |         fit_med = self._calc_median_hist(hist_items, n_indiv)
121 | 
122 |         # report the progress for one generation
123 |         gen_report = "gen\t%d\tsize\t%d\ttotal\t%d\tmse\t%.2e\tmax\t%.2e\tmed\t%.2e\tavg\t%.2e" % (current_gen, n_indiv, total_indiv, fit_mse, fit_max, fit_med, fit_avg)
124 |         print gen_report
125 |         logging.info(gen_report)
126 |         logging.debug(filter(lambda x: x[1] > 0, hist_items))
127 | 
128 |         # stop when a "good enough" solution is found
129 |         return (fit_mse <= self.term_limit) or (total_indiv >= self.max_indiv)
130 | 
131 | 
132 | if __name__=='__main__':
133 |     # a simple test
134 |     uow_name = "uow.UnitOfWorkFactory"
135 |     uow = instantiate_class(uow_name)
136 | 
137 |     print uow
138 | 


--------------------------------------------------------------------------------
/src/util.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # encoding: utf-8
  3 | 
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | # 
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | # 
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | # author: Paco Nathan
 17 | # https://github.com/ceteri/exelixi
 18 | 
 19 | 
 20 | from collections import OrderedDict
 21 | from httplib import BadStatusLine
 22 | from importlib import import_module
 23 | from json import dumps, loads
 24 | from os.path import abspath
 25 | from random import random
 26 | from urllib2 import urlopen, Request, URLError
 27 | import logging
 28 | import psutil
 29 | import socket
 30 | 
 31 | 
 32 | ######################################################################
 33 | ## utilities
 34 | 
 35 | def instantiate_class (class_path):
 36 |     """instantiate a class from the given package.class name"""
 37 |     module_name, class_name = class_path.split(".")
 38 |     return getattr(import_module(module_name), class_name)()
 39 | 
 40 | 
 41 | def post_distrib_rest (prefix, shard_id, shard_uri, path, base_msg):
 42 |     """POST a JSON-based message to a REST endpoint on a shard"""
 43 |     msg = base_msg.copy()
 44 | 
 45 |     # populate credentials
 46 |     msg["prefix"] = prefix
 47 |     msg["shard_id"] = shard_id
 48 | 
 49 |     # POST the JSON payload to the REST endpoint
 50 |     uri = "http://" + shard_uri + "/" + path
 51 |     req = Request(uri)
 52 |     req.add_header('Content-Type', 'application/json')
 53 | 
 54 |     logging.debug("send %s %s", shard_uri, path)
 55 |     logging.debug(dumps(msg))
 56 | 
 57 |     # read/collect the response
 58 |     try:
 59 |         f = urlopen(req, dumps(msg))
 60 |         return f.readlines()
 61 |     except URLError as e:
 62 |         logging.critical("could not reach REST endpoint %s error: %s", uri, str(e.reason), exc_info=True)
 63 |         raise
 64 |     except BadStatusLine as e:
 65 |         logging.critical("REST endpoint died %s error: %s", uri, str(e.line), exc_info=True)
 66 | 
 67 | 
 68 | def get_telemetry ():
 69 |     """get system resource telemetry on a Mesos slave via psutil"""
 70 |     telemetry = OrderedDict()
 71 | 
 72 |     telemetry["ip_addr"] = socket.gethostbyname(socket.gethostname())
 73 | 
 74 |     telemetry["mem_free"] =  psutil.virtual_memory().free
 75 | 
 76 |     telemetry["cpu_num"] = psutil.NUM_CPUS
 77 | 
 78 |     x = psutil.cpu_times()
 79 |     telemetry["cpu_times"] = OrderedDict([ ("user", x.user), ("system", x.system), ("idle", x.idle) ])
 80 | 
 81 |     x = psutil.disk_usage("/tmp")
 82 |     telemetry["disk_usage"] = OrderedDict([ ("free", x.free), ("percent", x.percent) ])
 83 | 
 84 |     x = psutil.disk_io_counters()
 85 |     telemetry["disk_io"] = OrderedDict([ ("read_count", x.read_count), ("write_count", x.write_count), ("read_bytes", x.read_bytes), ("write_bytes", x.write_bytes), ("read_time", x.read_time), ("write_time", x.write_time) ])
 86 | 
 87 |     x = psutil.network_io_counters()
 88 |     telemetry["network_io"] = OrderedDict([ ("bytes_sent", x.bytes_sent), ("bytes_recv", x.bytes_recv), ("packets_sent", x.packets_sent), ("packets_recv", x.packets_recv), ("errin", x.errin), ("errout", x.errout), ("dropin", x.dropin), ("dropout", x.dropout) ])
 89 | 
 90 |     return telemetry
 91 | 
 92 | 
 93 | def get_master_state (master_uri):
 94 |     """get current state, represented as JSON, from the Mesos master"""
 95 |     uri = "http://" + master_uri + "/master/state.json"
 96 | 
 97 |     try:
 98 |         response = urlopen(uri)
 99 |         return loads(response.read())
100 |     except URLError as e:
101 |         logging.critical("could not reach REST endpoint %s error: %s", uri, str(e.reason), exc_info=True)
102 |         raise
103 | 
104 | 
105 | def get_master_leader (master_uri):
106 |     """get the host:port for the Mesos master leader"""
107 |     state = get_master_state(master_uri)
108 |     return state["leader"].split("@")[1]
109 | 
110 | 
111 | def pipe_slave_list (master_uri):
112 |     """report a list of slave IP addr, one per line to stdout -- for building pipes"""
113 |     state = get_master_state(get_master_leader(master_uri))
114 | 
115 |     for s in state["slaves"]:
116 |         print s["pid"].split("@")[1].split(":")[0] 
117 | 
118 | 
119 | if __name__=='__main__':
120 |     pass
121 | 


--------------------------------------------------------------------------------