├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── README.md ├── design.md ├── example.py ├── kubeface ├── __init__.py ├── backend.py ├── backends.py ├── bucket_storage.py ├── client.py ├── commands │ ├── __init__.py │ ├── copy.py │ ├── job.py │ ├── run.py │ └── run_task.py ├── common.py ├── context.py ├── job.py ├── kubernetes_backend.py ├── local_process_backend.py ├── local_process_docker_backend.py ├── naming.py ├── remote_object.py ├── result.py ├── serialization.py ├── status_writer.py ├── storage.py ├── stringable.py ├── task.py └── worker_configuration.py ├── remote_object_example.py ├── setup.py └── tests ├── __init__.py ├── test_client.py ├── test_job_command.py ├── test_naming.py ├── test_storage.py └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | language: python 3 | services: 4 | - docker 5 | python: 6 | - "3.6.1" 7 | install: 8 | - pip install -e . 
9 | script: 10 | - nosetests 11 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:cudnn-runtime 2 | 3 | MAINTAINER Tim O'Donnell 4 | 5 | RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections && \ 6 | apt-get clean && \ 7 | apt-get update && \ 8 | apt-get install --yes \ 9 | gfortran \ 10 | git \ 11 | libatlas-base-dev \ 12 | libatlas3gf-base \ 13 | libblas-dev \ 14 | libfreetype6-dev \ 15 | libhdf5-serial-dev \ 16 | liblapack-dev \ 17 | libpng12-dev \ 18 | libxml2-dev \ 19 | libxslt1-dev \ 20 | libyaml-dev \ 21 | libzmq3-dev \ 22 | pkg-config \ 23 | python-virtualenv \ 24 | python3-dev \ 25 | python-dev && \ 26 | apt-get clean && \ 27 | useradd --create-home --home-dir /home/user --shell /bin/bash -G sudo user && \ 28 | echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers 29 | 30 | RUN locale-gen en_US.UTF-8 31 | ENV LANG en_US.UTF-8 32 | ENV LANGUAGE en_US:en 33 | ENV LC_ALL en_US.UTF-8 34 | 35 | USER user 36 | ENV HOME=/home/user 37 | ENV SHELL=/bin/bash 38 | ENV USER=user 39 | WORKDIR /home/user 40 | 41 | # Setup virtual envs and install convenience packages. Note: installing 42 | RUN virtualenv venv-py3 --python=python3 && \ 43 | venv-py3/bin/pip install --upgrade pip && \ 44 | venv-py3/bin/pip install --upgrade \ 45 | numpy \ 46 | bokeh \ 47 | cherrypy \ 48 | jupyter \ 49 | lxml \ 50 | scipy \ 51 | scikit-learn \ 52 | dill \ 53 | seaborn 54 | 55 | ENV PATH /home/user/venv-py3/bin:$PATH 56 | COPY . ./kubeface 57 | RUN venv-py3/bin/pip install ./kubeface 58 | 59 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | kubeface 2 | [![Build Status](https://travis-ci.org/hammerlab/kubeface.svg?branch=master)](https://travis-ci.org/hammerlab/kubeface) 3 | ======== 4 | 5 | Python library for parallel maps running directly on Kubernetes. Intended for running many expensive tasks (minutes in runtime). Alpha stage. Currently supports only Google Cloud. 
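A minimal script looks like this (condensed from [example.py](example.py); the `--kubeface-*` flags it picks up choose the backend and storage location):

```py
import argparse
import kubeface

parser = argparse.ArgumentParser()
kubeface.Client.add_args(parser)  # adds the --kubeface-* options
args = parser.parse_args()

client = kubeface.Client.from_args(args)
for result in client.map(lambda x: x**2, range(10)):
    print(result)
```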
6 | 7 | Overview 8 | ======== 9 | 10 | Kubeface aims for reasonably efficient execution of many long running Python tasks with medium sized (up to a few gigabytes) inputs and outputs. Design choices and assumptions: 11 | 12 | * Each task runs in its own bare kubernetes pod. There is no state shared between tasks 13 | * All communication is through Google Storage Buckets 14 | * Each task's input and output must fit in memory, but we do not assume that more than one task's data fits simultaneously 15 | * Work performed as part of jobs that crash can be re-used for reruns 16 | * We favor debuggability over performance 17 | 18 | The primary motivating application has been neural network model selection for the [MHCflurry](https://github.com/hammerlab/mhcflurry) project. 19 | 20 | See [example.py](example.py) for a simple working example. 21 | 22 | Nomenclature 23 | ------------- 24 | 25 | * **Master:** the Python process the user launches. It uses kubeface to run *jobs* 26 | * **Worker:** a process running external to the master (probably on a cluster) that executes a *task* 27 | * **Job:** Each call to `client.map(...)` creates a *job* 28 | * **Task:** Each invocation of the function given to map is a *task* 29 | 30 | Backends 31 | ------------- 32 | 33 | * The *kubernetes* backend runs tasks on Kubernetes. This is what is used in production 34 | * The *local-process* backend runs tasks as local processes. Useful for development and testing of both kubeface and code that uses it 35 | * The *local-process-docker* backend runs tasks as local processes in a docker container. This is used for testing kubeface 36 | 37 | 38 | Life of a job 39 | ------------- 40 | 41 | If a user calls (where `client` is a [kubeface.Client](kubeface/client.py) instance): 42 | 43 | ```py 44 | client.map(lambda x: x**2, range(10)) 45 | ``` 46 | 47 | This creates a *job* containing 10 *tasks*. The return value is a generator that will yield the square of the numbers 0-9. The job is executed as follows: 48 | 49 | * Submission: for each task: 50 | * an input file containing a pickled (we use the [dill](https://github.com/uqfoundation/dill) library) representation of the task's input is uploaded to cloud storage. In this example the input data is a number 0-9. 51 | * A `kubectl` command is issued that creates a bare pod whose entrypoint (i.e. what runs in the pod) installs kubeface if necessary then calls the command `_kubeface-run-task `. 52 | * The `_kubeface-run-task` command downloads the input file from cloud storage, runs the task, and uploads the result to the specified path. 53 | * After all tasks have been submitted, kubeface waits for all results to appear in cloud storage. It may speculatively re-submit some tasks that appear to be straggling or crashed. 54 | * Once all results are available, each task’s result is read by the master and yielded to the client code 55 | 56 | 57 | Docker images 58 | ------------- 59 | 60 | Kubeface tasks execute in the context of a particular docker image, since they run in a kubernetes pod. You can use any docker image with python installed. If your docker image does not have kubeface installed, then by default kubeface will try to install itself using `pip`. This is inefficient since it will run for every task. If you plan on running many tasks it's a good idea to create your own docker image with kubeface installed. 61 | 62 | Inspecting job status 63 | ---------------------- 64 | Kubeface writes out HTML and JSON status pages to cloud storage and logs to stdout. 
However, the best way to figure out what's going on with your job is to use Kubernetes directly, via `kubectl get pods` and `kubectl logs <pod name>`. 65 | 66 | 67 | Installation 68 | ============ 69 | 70 | From a checkout: 71 | 72 | pip install -e . 73 | 74 | To run the tests: 75 | 76 | # Setting this environment variable is optional. 77 | # If you set it, the tests will run against a real Google Storage bucket. 78 | # See https://developers.google.com/identity/protocols/application-default-credentials#howtheywork; 79 | # you need to get Application Default Credentials before writing to your bucket. 80 | KUBEFACE_STORAGE=gs://kubeface-test # tests will write to gs://kubeface-test. 81 | 82 | # Run tests: 83 | nosetests 84 | 85 | Shell Example 86 | ============= 87 | 88 | The `kubeface-run` command runs a job from the shell, which is useful for testing or simple tasks. 89 | 90 | If you don’t already have a Kubernetes cluster running, use a command like this to start one: 91 | 92 | gcloud config set compute/zone us-east1-c 93 | gcloud components install kubectl # if you haven't already installed kubectl 94 | gcloud container clusters create kubeface-cluster-$(whoami) \ 95 | --scopes storage-full \ 96 | --zone us-east1-c \ 97 | --num-nodes=2 \ 98 | --enable-autoscaling --min-nodes=1 --max-nodes=100 \ 99 | --machine-type=n1-standard-16 100 | 101 | You should see your cluster listed in the Google Cloud console. 102 | 103 | Then run this to set it as the default for your session: 104 | 105 | gcloud config set container/cluster kubeface-cluster-$(whoami) 106 | gcloud container clusters get-credentials kubeface-cluster-$(whoami) 107 | 108 | Now launch a command: 109 | 110 | kubeface-run \ 111 | --expression 'value**2' \ 112 | --generator-expression 'range(10)' \ 113 | --kubeface-max-simultaneous-tasks 10 \ 114 | --kubeface-backend kubernetes \ 115 | --kubeface-worker-image continuumio/anaconda3 \ 116 | --kubeface-kubernetes-task-resources-cpu 1 \ 117 | --kubeface-kubernetes-task-resources-memory-mb 500 \ 118 | --verbose \ 119 | --out-csv /tmp/result.csv 120 | 121 | If you kill the above command, you can run this to kill all the running pods in your cluster: 122 | 123 | kubectl delete pods --all 124 | 125 | When you’re done working, delete your cluster: 126 | 127 | gcloud container clusters delete kubeface-cluster-$(whoami) 128 | 129 | -------------------------------------------------------------------------------- /design.md: -------------------------------------------------------------------------------- 1 | # Design 2 | 3 | ## Motivation 4 | 5 | We would like to run fairly long-running Python tasks over Kubernetes on Google Cloud. 6 | 7 | Two applications we need this for are MHCflurry model selection and data preparation for antigen presentation predictors, where we would like to run some analyses over the full peptidome. 8 | 9 | We have previously experimented with an approach based on running [dask-distributed](https://github.com/dask/distributed) on Kubernetes as described [here](https://github.com/hammerlab/dask-distributed-on-kubernetes). 10 | 11 | However, having long-running server processes as in dask distributed has resulted in reliability issues for us. When results are large the distributed scheduler seems to slow down or crash. Since we don't care about latency, I think it would be less error-prone to run each task in its own Kubernetes job and use Google Buckets to shuffle data around. 12 | 13 | Our MHCflurry code can use any parallel map implementation, see e.g.
[here](https://github.com/hammerlab/mhcflurry/blob/master/mhcflurry/class1_allele_specific/train.py#L308). We should be able to make a library that plugs in there without any significant modification to MHCflurry. 14 | 15 | Design parameters: 16 | * There's a master process, which the user launches. It calls a parallel map implementation to do work on the cluster. 17 | * Tasks are independent, do not communicate 18 | * Long running tasks, say 5 min - 5 hours. 19 | * Many tasks: as many as 10k. 20 | * Significant data exchange. Input and result to *each task* may be as high as 1 GB. Full input dataset to all tasks does not fit in memory on any node. Full result set across tasks 21 | also does not fit. Input and result from any single task fits on all nodes. 22 | * No attempt at recovery if a task throws a Python exception. Kill the whole run. 23 | 24 | The main goal here is simplicity and reliability. We do not care at all about latency; fine if it takes 5 minutes for Kubernetes to launch a task. We want to push all tricky issues, in particular scheduling of tasks and recovery of failed nodes, onto Kubernetes. We should never have two Python processes talking directly to each other. We should only interact with Kubernetes and Google Storage Buckets. 25 | 26 | ## Interface 27 | 28 | This project should expose a library that implements a parallel map, e.g. 29 | 30 | ```python 31 | def parallel_map(func, iterable): 32 | """ 33 | Parallel map. Each invocation of func is run in its own kubernetes Job. 34 | 35 | Returns (func(x) for x in iterable) 36 | """ 37 | ``` 38 | 39 | There is some configuration that is shared across invocations of parallel_map, so it makes sense to put this in a class and then have parallel_map as a method of it, e.g. 40 | 41 | ```python 42 | class Client(object): 43 | def __init__( 44 | self, 45 | image, 46 | bucket, 47 | image_pull_policy="Always", 48 | cluster=None, 49 | available_parallelism=None, 50 | python_path='/usr/bin/env python', 51 | run_locally=False): 52 | """ 53 | Create a client for running tasks on Kubernetes. 54 | 55 | Parameters 56 | -------------- 57 | 58 | image : string 59 | Docker image to use (on docker hub) 60 | 61 | image_pull_policy : string, optional 62 | Kubernetes imagePullPolicy setting. See [1] 63 | 64 | cluster : string 65 | Kubernetes cluster to schedule jobs on 66 | 67 | available_parallelism : int 68 | If specified, max number of jobs to schedule on Kubernetes at once 69 | 70 | python_path : string 71 | Path to Python binary in the image 72 | 73 | run_locally: boolean 74 | Run tasks in the current process. Useful for testing 75 | 76 | [1] http://kubernetes.io/docs/user-guide/images/ 77 | """ 78 | 79 | def parallel_map(self, func, iterable): 80 | ... 81 | ``` 82 | 83 | 84 | ## Implementation 85 | 86 | Possible first-pass implementation. For each task (this is running on the master node): 87 | 88 | * Serialize the function to run and its input (using e.g. [dill](https://github.com/uqfoundation/dill)) 89 | * Copy serialized data to a Google Bucket, give the file a unique name. 90 | * Schedule a Kubernetes job that runs a Python script that downloads the serialized data from Google Bucket, deserializes it, runs the function on the data, and copies the serialized result to a unique filename on the Google Bucket 91 | 92 | Then the master node would poll for the results on Google Bucket, and perhaps issue Kubernetes commands to watch what's been scheduled etc. and report the progress to the user.
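For concreteness, a minimal sketch of the per-task submission step using the command-line tools might look like the following (the names `submit_task` and `run_task.py`, and the bucket layout, are illustrative assumptions, not a finished API; it requires `dill`, `gsutil`, and `kubectl` on the PATH):

```python
# Sketch of the per-task flow described above: serialize the function and its
# input with dill, copy the file to a bucket, and launch a bare pod that runs
# the task and uploads its result.
import subprocess
import tempfile
import uuid

import dill


def submit_task(func, arg, bucket, image, python_path="python"):
    """Serialize (func, arg), upload it to the bucket, and launch a bare pod."""
    task_id = uuid.uuid4().hex
    input_url = "gs://%s/inputs/%s.pkl" % (bucket, task_id)
    result_url = "gs://%s/results/%s.pkl" % (bucket, task_id)

    # Serialize the function and its input, then copy the file to the bucket.
    with tempfile.NamedTemporaryFile(suffix=".pkl") as fd:
        dill.dump((func, arg), fd)
        fd.flush()
        subprocess.check_call(["gsutil", "cp", fd.name, input_url])

    # Schedule a pod whose entrypoint downloads the input, runs the function,
    # and uploads the serialized result. run_task.py is assumed to be baked
    # into the worker image.
    subprocess.check_call([
        "kubectl", "run", "task-%s" % task_id,
        "--image=%s" % image,
        "--restart=Never",
        "--command", "--",
        python_path, "run_task.py", input_url, result_url,
    ])
    return result_url
```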
93 | 94 | We can either issue kubernetes and gsutil commandline calls directly or interact with them through their REST APIs using a project like [pykube](https://github.com/kelproject/pykube). 95 | 96 | Kubernetes [secrets](http://kubernetes.io/docs/user-guide/secrets/) may be an alternative approach to sending each task its input data. 97 | 98 | 99 | ## Unknowns 100 | 101 | * Is Google Bucket going to hold up to having tons of tasks hitting it with downloads and uploads? Is it fast enough? 102 | * Is Kubernetes stable enough? 103 | * How can we test this library without actually using Google Cloud? [Kubernetes on vagrant](https://coreos.com/kubernetes/docs/latest/kubernetes-on-vagrant-single.html) may be relevant here. Not sure what to do about Google Bucket dependency. 104 | 105 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Kubeface simple example. 3 | 4 | Computes the square of numbers 1 .. N, where N is specified on the commandline. 5 | 6 | Example: 7 | 8 | $ python example.py 10 --kubeface-backend local-process --kubeface-storage /tmp 9 | 10 | """ 11 | 12 | import argparse 13 | import sys 14 | 15 | import kubeface 16 | 17 | parser = argparse.ArgumentParser(usage=__doc__) 18 | parser.add_argument("n", type=int) 19 | kubeface.Client.add_args(parser) # Add kubeface arguments 20 | 21 | 22 | def my_function(x): 23 | return x**2 24 | 25 | 26 | def main(argv): 27 | args = parser.parse_args(argv) 28 | client = kubeface.Client.from_args(args) 29 | 30 | input_values = range(1, args.n + 1) 31 | results = client.map(my_function, input_values) 32 | 33 | for (x, result) in zip(input_values, results): 34 | print("%5d**2 = %5d" % (x, result)) 35 | 36 | 37 | if __name__ == '__main__': 38 | main(sys.argv[1:]) 39 | -------------------------------------------------------------------------------- /kubeface/__init__.py: -------------------------------------------------------------------------------- 1 | from .client import Client 2 | from .local_process_backend import LocalProcessBackend 3 | from .local_process_docker_backend import LocalProcessDockerBackend 4 | from .kubernetes_backend import KubernetesBackend 5 | from .worker_configuration import WorkerConfiguration 6 | 7 | 8 | __all__ = [ 9 | "Client", 10 | "LocalProcessBackend", 11 | "LocalProcessDockerBackend", 12 | "KubernetesBackend", 13 | "WorkerConfiguration", 14 | ] 15 | -------------------------------------------------------------------------------- /kubeface/backend.py: -------------------------------------------------------------------------------- 1 | class Backend(object): 2 | def submit_task(self, task_input, task_output): 3 | raise NotImplementedError 4 | 5 | def supports_storage(self, path_or_url): 6 | return True 7 | -------------------------------------------------------------------------------- /kubeface/backends.py: -------------------------------------------------------------------------------- 1 | import collections 2 | from . import local_process_backend 3 | from . import local_process_docker_backend 4 | from . 
import kubernetes_backend 5 | 6 | BACKENDS = collections.OrderedDict([ 7 | ('local-process', local_process_backend.LocalProcessBackend), 8 | ('local-process-docker', 9 | local_process_docker_backend.LocalProcessDockerBackend), 10 | ('kubernetes', kubernetes_backend.KubernetesBackend), 11 | ]) 12 | 13 | 14 | def add_args(parser): 15 | parser.add_argument( 16 | "--kubeface-backend", 17 | choices=tuple(BACKENDS), 18 | default=tuple(BACKENDS)[0]) 19 | 20 | for (backend, klass) in BACKENDS.items(): 21 | klass.add_args(parser) 22 | return parser 23 | 24 | 25 | def backend_from_args(args): 26 | return BACKENDS[args.kubeface_backend].from_args(args) 27 | -------------------------------------------------------------------------------- /kubeface/bucket_storage.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import tempfile 3 | import time 4 | 5 | from googleapiclient import discovery 6 | from googleapiclient import http 7 | 8 | from oauth2client.client import GoogleCredentials 9 | 10 | # Some of this is copied from: 11 | # https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/storage/api/crud_object.py 12 | # and: 13 | # https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/storage/api/list_objects.py 14 | 15 | RETRIES_BEFORE_FAILURE = 12 16 | FIRST_RETRY_SLEEP = 2.0 17 | _SERVICE = None 18 | 19 | 20 | def get_service(): 21 | global _SERVICE 22 | if _SERVICE is None: 23 | _SERVICE = create_service() 24 | return _SERVICE 25 | 26 | 27 | def create_service(): 28 | # Get the application default credentials. When running locally, these are 29 | # available after running `gcloud init`. When running on compute 30 | # engine, these are available from the environment. 31 | credentials = GoogleCredentials.get_application_default() 32 | 33 | # Construct the service object for interacting with the Cloud Storage API - 34 | # the 'storage' service, at version 'v1'. 35 | # You can browse other available api services and versions here: 36 | # http://g.co/dev/api-client-library/python/apis/ 37 | return discovery.build('storage', 'v1', credentials=credentials) 38 | 39 | 40 | def robustify(function): 41 | def robust_function(*args, **kwargs): 42 | error_num = 0 43 | while True: 44 | try: 45 | return function(*args, **kwargs) 46 | except Exception as e: 47 | error_num += 1 48 | logging.warning( 49 | "Exception calling %s: '%s'. " 50 | "This call has failed %d times. Will retry up to " 51 | "%d times." % ( 52 | str(function), 53 | str(e), 54 | error_num, 55 | RETRIES_BEFORE_FAILURE)) 56 | 57 | if error_num > RETRIES_BEFORE_FAILURE: 58 | raise 59 | 60 | sleep_time = FIRST_RETRY_SLEEP**error_num 61 | logging.warn("Sleeping for %0.2f seconds." % sleep_time) 62 | time.sleep(sleep_time) 63 | return robust_function 64 | 65 | 66 | def split_bucket_and_name(url): 67 | if not url.startswith("gs://"): 68 | raise ValueError("Not a gs:// url: %s" % url) 69 | return url[len("gs://"):].split("/", 1) 70 | 71 | 72 | @robustify 73 | def list_contents(prefix): 74 | splitted = split_bucket_and_name(prefix) 75 | if len(splitted) == 1: 76 | (bucket_name, file_name_prefix) = (splitted[0], "") 77 | else: 78 | (bucket_name, file_name_prefix) = splitted 79 | 80 | # Create a request to objects.list to retrieve a list of objects. 
81 | fields_to_return = \ 82 | 'nextPageToken,items(name)' 83 | req = get_service().objects().list( 84 | bucket=bucket_name, 85 | prefix=file_name_prefix, 86 | maxResults=100000, 87 | fields=fields_to_return) 88 | 89 | all_objects = [] 90 | # If you have too many items to list in one request, list_next() will 91 | # automatically handle paging with the pageToken. 92 | while req: 93 | resp = req.execute() 94 | all_objects.extend(resp.get('items', [])) 95 | req = get_service().objects().list_next(req, resp) 96 | return [item['name'] for item in all_objects] 97 | 98 | 99 | @robustify 100 | def move(source, dest): 101 | # From https://cloud.google.com/storage/docs/json_api/v1/objects/rewrite 102 | (bucket_name, source_object) = split_bucket_and_name(source) 103 | (bucket_name2, dest_object) = split_bucket_and_name(dest) 104 | service = get_service() 105 | 106 | request = service.objects().rewrite( 107 | sourceBucket=bucket_name, 108 | sourceObject=source_object, 109 | destinationBucket=bucket_name, 110 | destinationObject=dest_object, 111 | body={}) 112 | request.execute() 113 | 114 | # Delete source. 115 | request = service.objects().delete( 116 | bucket=bucket_name, 117 | object=source_object) 118 | request.execute() 119 | 120 | 121 | @robustify 122 | def put( 123 | name, 124 | input_handle, 125 | readers=[], 126 | owners=[], 127 | mime_type='application/octet-stream'): 128 | input_handle.seek(0) 129 | (bucket_name, file_name) = split_bucket_and_name(name) 130 | 131 | # This is the request body as specified: 132 | # http://g.co/cloud/storage/docs/json_api/v1/objects/insert#request 133 | body = { 134 | 'name': file_name, 135 | } 136 | 137 | # If specified, create the access control objects and add them to the 138 | # request body 139 | if readers or owners: 140 | body['acl'] = [] 141 | 142 | for r in readers: 143 | body['acl'].append({ 144 | 'entity': 'user-%s' % r, 145 | 'role': 'READER', 146 | 'email': r 147 | }) 148 | for o in owners: 149 | body['acl'].append({ 150 | 'entity': 'user-%s' % o, 151 | 'role': 'OWNER', 152 | 'email': o 153 | }) 154 | 155 | # Now insert them into the specified bucket as a media insertion. 156 | req = get_service().objects().insert( 157 | bucket=bucket_name, 158 | body=body, 159 | # You can also just set media_body=filename, but # for the sake of 160 | # demonstration, pass in the more generic file handle, which could 161 | # very well be a StringIO or similar. 
162 | media_body=http.MediaIoBaseUpload(input_handle, mime_type)) 163 | resp = req.execute() 164 | 165 | return resp 166 | 167 | 168 | @robustify 169 | def get(name, output_handle=None): 170 | (bucket_name, file_name) = split_bucket_and_name(name) 171 | 172 | if output_handle is None: 173 | output_handle = tempfile.TemporaryFile( 174 | prefix="kubeface-bucket-storage-", 175 | suffix=".data") 176 | 177 | # Use get_media instead of get to get the actual contents of the object 178 | req = get_service().objects().get_media( 179 | bucket=bucket_name, 180 | object=file_name) 181 | downloader = http.MediaIoBaseDownload(output_handle, req) 182 | 183 | done = False 184 | while done is False: 185 | (status, done) = downloader.next_chunk() 186 | logging.debug("Download {}%.".format(int(status.progress() * 100))) 187 | output_handle.seek(0) 188 | return output_handle 189 | 190 | 191 | @robustify 192 | def delete(name): 193 | (bucket_name, file_name) = split_bucket_and_name(name) 194 | req = get_service().objects().delete(bucket=bucket_name, object=file_name) 195 | return req.execute() 196 | 197 | 198 | def access_info(name): 199 | (bucket_name, file_name) = split_bucket_and_name(name) 200 | return ( 201 | "https://storage.cloud.google.com/" 202 | "{bucket_name}/{file_name}\t[ {name} ]".format( 203 | bucket_name=bucket_name, 204 | file_name=file_name, 205 | name=name)) 206 | -------------------------------------------------------------------------------- /kubeface/client.py: -------------------------------------------------------------------------------- 1 | import math 2 | import logging 3 | import os 4 | 5 | from .remote_object import RemoteObject 6 | from .job import Job 7 | from .task import Task 8 | from . import ( 9 | backends, 10 | worker_configuration, 11 | naming, 12 | context, 13 | storage) 14 | 15 | 16 | def run_multiple(function, values): 17 | return [function(v) for v in values] 18 | 19 | 20 | class Client(object): 21 | """ 22 | User interface to Kubeface. 23 | """ 24 | 25 | @staticmethod 26 | def add_args(parser): 27 | """ 28 | Add commandline arguments to argument parser. 29 | 30 | Parameters 31 | ---------- 32 | parser : argparse.ArgumentParser 33 | """ 34 | group = parser.add_argument_group("kubeface client") 35 | group.add_argument( 36 | "--kubeface-max-simultaneous-tasks", 37 | type=int, 38 | default=10) 39 | group.add_argument( 40 | "--kubeface-poll-seconds", 41 | type=float, 42 | default=30.0) 43 | group.add_argument( 44 | "--kubeface-storage", 45 | default=os.environ.get("KUBEFACE_STORAGE", "gs://kubeface"), 46 | help="Default: %(default)s") 47 | group.add_argument( 48 | "--kubeface-cache-key-prefix") 49 | group.add_argument( 50 | "--kubeface-never-cleanup", 51 | action="store_true", 52 | default=False) 53 | group.add_argument( 54 | "--kubeface-wait-to-raise-task-exception", 55 | action="store_true", 56 | default=False) 57 | group.add_argument( 58 | "--kubeface-speculation-percent", 59 | type=float, 60 | default=20) 61 | group.add_argument( 62 | "--kubeface-speculation-runtime-percentile", 63 | type=float, 64 | default=99) 65 | group.add_argument( 66 | "--kubeface-speculation-max-reruns", 67 | type=int, 68 | default=3) 69 | 70 | worker_configuration.WorkerConfiguration.add_args(group) 71 | backends.add_args(group) 72 | 73 | @staticmethod 74 | def from_args(args): 75 | """ 76 | Instantiate a Client from commandline args. 
77 | 78 | Parameters 79 | ---------- 80 | args : argparse.Namespace 81 | 82 | Returns 83 | ------- 84 | Client 85 | 86 | """ 87 | backend = backends.backend_from_args(args) 88 | if not backend.supports_storage(args.kubeface_storage): 89 | raise ValueError( 90 | "Backend '%s' does not support storage: %s" % ( 91 | args.kubeface_backend, args.kubeface_storage)) 92 | return Client( 93 | backend, 94 | max_simultaneous_tasks=args.kubeface_max_simultaneous_tasks, 95 | poll_seconds=args.kubeface_poll_seconds, 96 | storage=args.kubeface_storage, 97 | cache_key_prefix=args.kubeface_cache_key_prefix, 98 | never_cleanup=args.kubeface_never_cleanup, 99 | wait_to_raise_task_exception=( 100 | args.kubeface_wait_to_raise_task_exception), 101 | speculation_percent=args.kubeface_speculation_percent, 102 | speculation_runtime_percentile=( 103 | args.kubeface_speculation_runtime_percentile), 104 | speculation_max_reruns=args.kubeface_speculation_max_reruns) 105 | 106 | def __init__( 107 | self, 108 | backend, 109 | max_simultaneous_tasks=10, 110 | poll_seconds=30.0, 111 | storage="gs://kubeface", 112 | cache_key_prefix=None, 113 | never_cleanup=False, 114 | wait_to_raise_task_exception=False, 115 | speculation_percent=0, 116 | speculation_runtime_percentile=99, 117 | speculation_max_reruns=1): 118 | """ 119 | Parameters 120 | ---------- 121 | backend : kubeface.Backend 122 | 123 | max_simultaneous_tasks : int 124 | Maximum number of tasks to submit at once. 125 | 126 | poll_seconds : float 127 | How often to poll for task results 128 | 129 | storage : str 130 | Bucket or (for local file process backend) local filesystem path to 131 | write task inputs and outputs. 132 | 133 | cache_key_prefix : str 134 | If you set this to the same value in multiple clients, they will 135 | reuse each other's results. Advanced use only. 136 | 137 | never_cleanup : boolean 138 | Do not cleanup after successful tasks. 139 | 140 | wait_to_raise_task_exception : boolean 141 | If True, all tasks are run before any failing task's exception is 142 | raised. If False, then the exception is raised as soon as it is 143 | received. 144 | 145 | speculation_percent : float 146 | No speculation occurs until all tasks have been submitted and at 147 | least 100 - speculation_percent tasks have completed. So if you set 148 | this to 20 then the last 20% of tasks will be considered for 149 | speculatively rerunning. 150 | 151 | speculation_runtime_percentile : float 152 | A task will be rerun when its queue time exceeds 153 | speculation_runtime_percentile of the queue times of the tasks that 154 | completed successfully without speculation. 155 | 156 | speculation_max_reruns : int 157 | Tasks can be rerun up to speculation_max_reruns times. 
158 | """ 159 | 160 | self.backend = backend 161 | self.max_simultaneous_tasks = max_simultaneous_tasks 162 | self.poll_seconds = poll_seconds 163 | self.storage = storage 164 | self.cache_key_prefix = ( 165 | cache_key_prefix if cache_key_prefix 166 | else naming.make_cache_key_prefix()) 167 | self.never_cleanup = never_cleanup 168 | self.wait_to_raise_task_exception = wait_to_raise_task_exception 169 | self.speculation_percent = speculation_percent 170 | self.speculation_runtime_percentile = speculation_runtime_percentile 171 | self.speculation_max_reruns = speculation_max_reruns 172 | 173 | self.submitted_jobs = [] 174 | self.next_object_num = 1 175 | 176 | def __getstate__(self): 177 | # Don't serialize jobs 178 | d = dict(self.__dict__) 179 | d['submitted_jobs'] = [] 180 | return d 181 | 182 | def next_cache_key(self): 183 | return "%s-%03d" % ( 184 | self.cache_key_prefix, 185 | len(self.submitted_jobs)) 186 | 187 | def submit(self, tasks, num_tasks=None, cache_key=None): 188 | """ 189 | Run a Job. 190 | 191 | Parameters 192 | ---------- 193 | tasks : iterable of kubeface.Task 194 | 195 | num_tasks : int 196 | If tasks has no len(...), for example in the case of a generator, 197 | if you specify num_tasks then your progress output will use that 198 | number of tasks. 199 | 200 | cache_key : str 201 | Advanced use only for reusing pre-existing results. 202 | 203 | Returns 204 | ------- 205 | kubeface.Job 206 | 207 | """ 208 | if num_tasks is None: 209 | try: 210 | num_tasks = len(tasks) 211 | except TypeError: 212 | pass 213 | job = Job( 214 | self.backend, 215 | tasks, 216 | num_tasks=num_tasks, 217 | cache_key=cache_key if cache_key else self.next_cache_key(), 218 | max_simultaneous_tasks=self.max_simultaneous_tasks, 219 | storage=self.storage, 220 | wait_to_raise_task_exception=self.wait_to_raise_task_exception, 221 | speculation_percent=self.speculation_percent, 222 | speculation_runtime_percentile=self.speculation_runtime_percentile, 223 | speculation_max_reruns=self.speculation_max_reruns) 224 | self.submitted_jobs.append(job) 225 | return job 226 | 227 | def map( 228 | self, 229 | function, 230 | iterable, 231 | items_per_task=1, 232 | num_items=None, 233 | cache_key=None): 234 | """ 235 | Parallel map. This is the primary user-facing API. 236 | 237 | Parameters 238 | ---------- 239 | function : callable 240 | Python function to run over each item 241 | 242 | iterable : iterable of object 243 | items to pass to function 244 | 245 | items_per_task : int 246 | If items_per_task is 1 then each item to map over gets its own task. 247 | If it's 10 then the first 10 items are one task, the next 10 are 248 | another, etc. 249 | 250 | num_items : int 251 | If the iterable provided has no len(...) then setting num_items 252 | will give better progress output. Not required in any case though. 253 | 254 | cache_key : str 255 | Advanced use only for reusing pre-existing results. 
256 | 257 | Returns 258 | ------- 259 | generator of task results, in order 260 | 261 | """ 262 | def grouped(): 263 | iterator = iter(iterable) 264 | while True: 265 | items = [] 266 | try: 267 | while len(items) < items_per_task: 268 | items.append(next(iterator)) 269 | except StopIteration: 270 | pass 271 | if items: 272 | yield items 273 | else: 274 | break 275 | 276 | num_tasks = None 277 | if num_items is None: 278 | try: 279 | num_items = len(iterable) 280 | num_tasks = int(math.ceil(float(num_items) / items_per_task)) 281 | except TypeError: 282 | pass 283 | 284 | tasks = ( 285 | Task(run_multiple, (function, values)) for values in grouped()) 286 | job = self.submit(tasks, num_tasks=num_tasks, cache_key=cache_key) 287 | try: 288 | job.wait(poll_seconds=self.poll_seconds) 289 | for result in job.results(): 290 | result.log() 291 | result.raise_if_exception() 292 | for result_item in result.return_value: 293 | yield result_item 294 | finally: 295 | self.mark_jobs_done(job_names=[job.job_name]) 296 | 297 | def mark_jobs_done(self, job_names=None): 298 | status_pages = set() 299 | status_prefixes = naming.status_prefixes(job_names=job_names) 300 | for prefix in status_prefixes: 301 | status_pages.update(storage.list_contents( 302 | self.storage + "/" + prefix)) 303 | for source_object in status_pages: 304 | parsed = naming.JOB_STATUS_PAGE.make_tuple(source_object) 305 | if parsed.status == 'active': 306 | new_parsed = parsed._replace(status="done") 307 | dest_object = naming.JOB_STATUS_PAGE.make_string(new_parsed) 308 | logging.info("Marking job '%s' done: renaming %s -> %s" % ( 309 | parsed.job_name, 310 | source_object, 311 | dest_object)) 312 | storage.move( 313 | self.storage + "/" + source_object, 314 | self.storage + "/" + dest_object) 315 | else: 316 | logging.info("Already marked done: %s" % source_object) 317 | 318 | def cleanup_job(self, job_name): 319 | cache_key = naming.JOB.make_tuple(job_name).cache_key 320 | results = storage.list_contents( 321 | self.storage + 322 | "/" + 323 | naming.task_result_prefix(cache_key)) 324 | inputs = storage.list_contents( 325 | self.storage + 326 | "/" + 327 | naming.task_input_prefix(cache_key)) 328 | logging.info("Cleaning up cache key '%s': %d results, %d inputs." 
% ( 329 | cache_key, len(results), len(inputs))) 330 | 331 | for item in results + inputs: 332 | storage.delete(self.storage + "/" + item) 333 | 334 | self.mark_jobs_done(job_names=[job_name]) 335 | 336 | def job_summary(self, job_names=None, include_done=False): 337 | prefixes = naming.status_prefixes( 338 | job_names=job_names, 339 | formats=["json"], 340 | statuses=(["active"] + (["done"] if include_done else []))) 341 | all_objects = [] 342 | for prefix in prefixes: 343 | all_objects.extend( 344 | storage.list_contents( 345 | self.storage + "/" + prefix)) 346 | logging.debug("Listed %d status pages from prefixes: %s" % ( 347 | len(all_objects), " ".join(prefixes))) 348 | return [ 349 | naming.JOB_STATUS_PAGE.make_tuple(obj) 350 | for obj in sorted(all_objects) 351 | ] 352 | 353 | def cleanup(self): 354 | if self.never_cleanup: 355 | logging.warn("Cleanup disabled; skipping.") 356 | else: 357 | for job in self.submitted_jobs: 358 | logging.info("Cleaning up for job: %s" % job.job_name) 359 | self.cleanup_job(job.job_name) 360 | 361 | def remote_object(self, value): 362 | file_path = ( 363 | self.storage + 364 | "/" + 365 | naming.make_remote_object_name( 366 | cache_key_prefix=self.cache_key_prefix, 367 | node_id=context.node_id(), 368 | object_num=self.next_object_num)) 369 | self.next_object_num += 1 370 | return RemoteObject(file_path=file_path, value=value) 371 | -------------------------------------------------------------------------------- /kubeface/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hammerlab/kubeface/443d7432e6d2f8e4d20b6326e98fabeec7ad68b6/kubeface/commands/__init__.py -------------------------------------------------------------------------------- /kubeface/commands/copy.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copy files, including support for google storage buckets. 3 | ''' 4 | 5 | import sys 6 | import argparse 7 | import logging 8 | 9 | from .. import storage 10 | from ..common import configure_logging 11 | from .. import serialization 12 | 13 | parser = argparse.ArgumentParser(description=__doc__) 14 | 15 | parser.add_argument("source") 16 | parser.add_argument("destination") 17 | 18 | parser.add_argument( 19 | "--no-error", 20 | action="store_true", 21 | default=False, 22 | help="") 23 | 24 | parser.add_argument( 25 | "--quiet", 26 | action="store_true", 27 | default=False, 28 | help="") 29 | 30 | parser.add_argument( 31 | "--verbose", 32 | action="store_true", 33 | default=False, 34 | help="") 35 | 36 | parser.add_argument( 37 | "--print-deserialized", 38 | action="store_true", 39 | default=False, 40 | help="") 41 | 42 | 43 | def run(argv=sys.argv[1:]): 44 | args = parser.parse_args(argv) 45 | configure_logging(args) 46 | 47 | logging.info("Reading: %s" % args.source) 48 | input_handle = storage.get(args.source) 49 | 50 | if args.print_deserialized: 51 | deserialized = serialization.load(input_handle) 52 | input_handle.seek(0) 53 | print(deserialized) 54 | 55 | if args.destination == "-": 56 | print(input_handle.read()) 57 | else: 58 | logging.info("Writing: %s" % args.destination) 59 | storage.put(args.destination, input_handle) 60 | 61 | logging.info("Completed.") 62 | -------------------------------------------------------------------------------- /kubeface/commands/job.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Get info on and manipulate jobs. 
3 | ''' 4 | 5 | import sys 6 | import argparse 7 | import collections 8 | import logging 9 | 10 | from ..client import Client 11 | from ..common import configure_logging 12 | from .. import naming 13 | 14 | parser = argparse.ArgumentParser(description=__doc__) 15 | parser.add_argument("jobs", nargs="*") 16 | parser.add_argument( 17 | "--cleanup", 18 | action="store_true", 19 | default=False) 20 | parser.add_argument( 21 | "--include-done", 22 | action="store_true", 23 | default=False) 24 | 25 | Client.add_args(parser) 26 | 27 | 28 | parser.add_argument( 29 | "--quiet", 30 | action="store_true", 31 | default=False, 32 | help="") 33 | 34 | parser.add_argument( 35 | "--verbose", 36 | action="store_true", 37 | default=False, 38 | help="") 39 | 40 | 41 | def run(argv=sys.argv[1:]): 42 | args = parser.parse_args(argv) 43 | configure_logging(args) 44 | 45 | client = Client.from_args(args) 46 | summary = client.job_summary( 47 | job_names=args.jobs if args.jobs else None, 48 | include_done=args.include_done) 49 | 50 | if not summary: 51 | print("No jobs.") 52 | 53 | jobs_by_cache_key = collections.defaultdict(list) 54 | job_info_by_name = {} 55 | for job_info_tuple in summary: 56 | job_info = job_info_tuple._asdict() 57 | job_name = job_info.pop('job_name') 58 | del job_info['format'] 59 | cache_key = naming.JOB.make_tuple(job_name).cache_key 60 | jobs_by_cache_key[cache_key].append(job_name) 61 | if job_name in job_info_by_name: 62 | logging.warning("Multiple status pages for job: %s: %s %s" % ( 63 | job_name, 64 | job_info['job_status_page_name'], 65 | job_info_by_name[job_name]['job_status_page_name'])) 66 | job_info_by_name[job_name] = job_info 67 | 68 | for cache_key in jobs_by_cache_key: 69 | print("Cache key: %s" % cache_key) 70 | for job_name in jobs_by_cache_key[cache_key]: 71 | info = job_info_by_name[job_name] 72 | print("\t%7s : %s" % (info['status'], job_name)) 73 | print("") 74 | -------------------------------------------------------------------------------- /kubeface/commands/run.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Run a job. 
3 | ''' 4 | 5 | import sys 6 | import argparse 7 | import logging 8 | import subprocess 9 | import numpy 10 | import csv 11 | from functools import partial 12 | 13 | from ..client import Client 14 | from ..common import configure_logging 15 | 16 | EVAL_ENVIRONMENT = { 17 | "numpy": numpy, 18 | } 19 | 20 | parser = argparse.ArgumentParser(description=__doc__) 21 | command_group = parser.add_mutually_exclusive_group() 22 | command_group.add_argument("--shell-command") 23 | command_group.add_argument("--expression") 24 | 25 | parser.add_argument("--generator-expression", required=True) 26 | 27 | parser.add_argument("--out-csv") 28 | 29 | Client.add_args(parser) 30 | 31 | 32 | parser.add_argument( 33 | "--quiet", 34 | action="store_true", 35 | default=False, 36 | help="") 37 | 38 | parser.add_argument( 39 | "--verbose", 40 | action="store_true", 41 | default=False, 42 | help="") 43 | 44 | 45 | def shell_command_task(shell_command, value): 46 | interpolated = shell_command.format(value) 47 | logging.info("Running shell command: %s" % interpolated) 48 | result = subprocess.check_output(interpolated, shell=True) 49 | return (value, result) 50 | 51 | 52 | def expression_task(expression, value): 53 | return (value, eval(expression, EVAL_ENVIRONMENT, {"value": value})) 54 | 55 | 56 | def generator_from_expression(expression): 57 | for value in eval(expression, EVAL_ENVIRONMENT): 58 | yield value 59 | 60 | 61 | def run(argv=sys.argv[1:]): 62 | args = parser.parse_args(argv) 63 | configure_logging(args) 64 | 65 | client = Client.from_args(args) 66 | 67 | if args.shell_command: 68 | task_function = partial(shell_command_task, args.shell_command) 69 | elif args.expression: 70 | task_function = partial(expression_task, args.expression) 71 | else: 72 | parser.error("Must specify --shell-command or --expression") 73 | 74 | if args.generator_expression: 75 | generator = generator_from_expression( 76 | args.generator_expression) 77 | else: 78 | parser.error("Must specify --generator") 79 | 80 | results = client.map(task_function, generator) 81 | 82 | if args.out_csv: 83 | writer = csv.writer(open(args.out_csv, "w")) 84 | else: 85 | writer = csv.writer(sys.stdout) 86 | 87 | writer.writerow(["Value", "Result"]) 88 | 89 | for (value, return_value) in results: 90 | writer.writerow([str(value), str(return_value)]) 91 | 92 | logging.info("Wrote: %s" % (args.out_csv if args.out_csv else "(stdout)")) 93 | -------------------------------------------------------------------------------- /kubeface/commands/run_task.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Run a task. Used internally, not meant to be called by a user. 3 | ''' 4 | 5 | import sys 6 | import argparse 7 | import logging 8 | import tempfile 9 | import math 10 | import signal 11 | import traceback 12 | import os 13 | 14 | from .. 
import storage, serialization 15 | from ..common import configure_logging 16 | from ..context import RUNTIME_CONTEXT 17 | 18 | parser = argparse.ArgumentParser(description=__doc__) 19 | 20 | parser.add_argument("input_path") 21 | parser.add_argument("result_path") 22 | 23 | parser.add_argument( 24 | "--delete-input", 25 | action="store_true", 26 | default=False, 27 | help="Delete input file on success.") 28 | 29 | parser.add_argument( 30 | "--quiet", 31 | action="store_true", 32 | default=False, 33 | help="") 34 | 35 | parser.add_argument( 36 | "--verbose", 37 | action="store_true", 38 | default=False, 39 | help="") 40 | 41 | 42 | def run(argv=sys.argv[1:]): 43 | args = parser.parse_args(argv) 44 | 45 | # On sigusr1 print stack trace 46 | print("To show stack trace, run:\nkill -s USR1 %d" % os.getpid()) 47 | signal.signal(signal.SIGUSR1, lambda sig, frame: traceback.print_stack()) 48 | 49 | configure_logging(args) 50 | 51 | RUNTIME_CONTEXT["node_type"] = "task" 52 | RUNTIME_CONTEXT["task_input_path"] = args.input_path 53 | RUNTIME_CONTEXT["task_result_path"] = args.result_path 54 | 55 | logging.info("Reading: %s" % args.input_path) 56 | input_handle = storage.get(args.input_path) 57 | task = serialization.load(input_handle) 58 | 59 | logging.info("Deserialized task: %s" % task) 60 | logging.info("Running task.") 61 | result = task.run(input_size=input_handle.tell()) 62 | logging.info("Done running task.") 63 | 64 | result_path = args.result_path.format( 65 | result_type=result.result_type, 66 | result_time=int(math.ceil(result.end_time))) 67 | 68 | with tempfile.TemporaryFile( 69 | prefix="kubeface-run-task-result-", suffix=".pkl") as fd: 70 | logging.info("Serializing result.") 71 | serialization.dump(result, fd) 72 | logging.info("Serialized result to %d bytes." % fd.tell()) 73 | fd.seek(0) 74 | logging.info("Writing: %s" % result_path) 75 | storage.put(result_path, fd) 76 | 77 | if args.delete_input: 78 | logging.info("Deleting: %s" % args.input_path) 79 | storage.delete(args.input_path) 80 | 81 | logging.info("Done.") 82 | -------------------------------------------------------------------------------- /kubeface/common.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import subprocess 3 | import sys 4 | 5 | 6 | def check_call(*args, **kwargs): 7 | logging.info("Running: %s %s" % (args, kwargs)) 8 | subprocess.check_call(*args, **kwargs) 9 | 10 | 11 | def configure_logging(args=None, verbose=False): 12 | if verbose or (args is not None and args.verbose): 13 | level = logging.DEBUG 14 | else: 15 | level = logging.INFO 16 | 17 | logging.basicConfig( 18 | format="%(asctime)s.%(msecs)d %(levelname)s %(module)s - %(funcName)s:" 19 | " %(message)s", 20 | datefmt="%Y-%m-%d %H:%M:%S", 21 | stream=sys.stderr, 22 | level=level) 23 | 24 | 25 | def human_readable_memory_size(num, suffix='B'): 26 | # From: http://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size 27 | for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']: 28 | if abs(num) < 1024.0: 29 | return "%3.1f %s%s" % (num, unit, suffix) 30 | num /= 1024.0 31 | return "%.1f %s%s" % (num, 'Yi', suffix) 32 | 33 | 34 | def truncate(s, max_length): 35 | if len(s) < max_length: 36 | return s 37 | return s[:max_length] + "..." 
38 | -------------------------------------------------------------------------------- /kubeface/context.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module defines information that allows code to determine if it is running 3 | on a master Kubeface node (node_type == "master") or as a task 4 | (node_type == "task"). 5 | 6 | This dict defaults to indicating running on a master node, and is updated by 7 | the run-task command with task-specific information. 8 | """ 9 | 10 | from .naming import hash_value 11 | 12 | RUNTIME_CONTEXT = { 13 | "node_type": "master", 14 | "task_input_path": None, 15 | "task_result_path": None, 16 | } 17 | 18 | 19 | def node_id(): 20 | if RUNTIME_CONTEXT["node_type"] == "master": 21 | return "node-master" 22 | return "node-%s" % ( 23 | hash_value( 24 | RUNTIME_CONTEXT["task_result_path"])) 25 | -------------------------------------------------------------------------------- /kubeface/job.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | import tempfile 4 | import collections 5 | 6 | from numpy import percentile, mean 7 | 8 | from .serialization import dump 9 | from . import storage, naming, context 10 | from .status_writer import DefaultStatusWriter 11 | from .common import human_readable_memory_size 12 | from .result import Result 13 | 14 | 15 | class Job(object): 16 | def __init__( 17 | self, 18 | backend, 19 | tasks_iter, 20 | max_simultaneous_tasks, 21 | storage, 22 | cache_key, 23 | num_tasks=None, 24 | wait_to_raise_task_exception=False, 25 | speculation_percent=0, 26 | speculation_runtime_percentile=99, 27 | speculation_max_reruns=0): 28 | 29 | self.backend = backend 30 | self.tasks_iter = tasks_iter 31 | self.max_simultaneous_tasks = max_simultaneous_tasks 32 | self.storage = storage 33 | self.cache_key = cache_key 34 | self.num_tasks = num_tasks 35 | self.wait_to_raise_task_exception = wait_to_raise_task_exception 36 | self.speculation_percent = speculation_percent 37 | self.speculation_runtime_percentile = speculation_runtime_percentile 38 | self.speculation_max_reruns = speculation_max_reruns 39 | 40 | self.job_name = naming.make_job_name( 41 | self.cache_key, node_id=context.node_id()) 42 | self.task_queue_times = collections.defaultdict(list) 43 | self.submitted_tasks = [] 44 | self.reused_tasks = set() 45 | self.completed_tasks = {} 46 | self.running_tasks = set() 47 | self.status_writer = DefaultStatusWriter(storage, self.job_name) 48 | 49 | self.status_writer.print_info() 50 | 51 | self.static_status_dict = { 52 | 'backend': str(self.backend), 53 | 'job_name': self.job_name, 54 | 'cache_key': self.cache_key, 55 | 'max_simultaneous_tasks': self.max_simultaneous_tasks, 56 | 'num_tasks': self.num_tasks, 57 | 'start_time': time.asctime(), 58 | } 59 | 60 | def status_dict(self): 61 | result = dict(self.static_status_dict) 62 | result["submitted_tasks"] = list(self.submitted_tasks) 63 | result["completed_tasks"] = list(self.completed_tasks) 64 | result["running_tasks"] = list(self.running_tasks) 65 | result['reused_tasks'] = list(self.reused_tasks) 66 | return result 67 | 68 | def storage_path(self, filename): 69 | return self.storage + "/" + filename 70 | 71 | def submit_task(self, task_name): 72 | queue_time = int(time.time()) 73 | task_result_template = self.storage_path( 74 | naming.TASK_RESULT.template.format( 75 | task_name=task_name, 76 | attempt_num=len(self.task_queue_times[task_name]), 77 | queue_time=queue_time, 78 | 
result_type="{result_type}", # filled in by worker 79 | result_time="{result_time}")) # filled in by worker 80 | 81 | task_input = self.storage_path( 82 | naming.TASK_INPUT.make_string(task_name=task_name)) 83 | 84 | self.backend.submit_task(task_name, task_input, task_result_template) 85 | self.status_writer.update(self.status_dict()) 86 | self.submitted_tasks.append(task_name) 87 | self.task_queue_times[task_name].append(queue_time) 88 | 89 | def submit_next_task(self): 90 | task_name = None 91 | while task_name is None: 92 | try: 93 | task = next(self.tasks_iter) 94 | except StopIteration: 95 | return False 96 | 97 | task_name = naming.TASK.make_string( 98 | cache_key=self.cache_key, 99 | task_num=len(self.submitted_tasks)) 100 | 101 | if task_name in self.completed_tasks: 102 | completed_task_info = self.completed_tasks[task_name] 103 | logging.info("Using existing result: %s" % ( 104 | completed_task_info['task_result_name'])) 105 | self.reused_tasks.add(task_name) 106 | self.submitted_tasks.append(task_name) 107 | task_name = None 108 | 109 | task_input = self.storage_path( 110 | naming.TASK_INPUT.make_string(task_name=task_name)) 111 | with tempfile.TemporaryFile(prefix="kubeface-upload-") as fd: 112 | dump(task, fd) 113 | size_string = human_readable_memory_size(fd.tell()) 114 | logging.info("Uploading: %s [%s] for task %s" % ( 115 | task_input, 116 | size_string, 117 | task_name)) 118 | fd.seek(0) 119 | storage.put(task_input, fd) 120 | 121 | self.submit_task(task_name) 122 | return True 123 | 124 | def update(self): 125 | completed_task_result_names = storage.list_contents( 126 | self.storage_path( 127 | naming.task_result_prefix(self.cache_key, self.running_tasks))) 128 | for completed_task_result_name in completed_task_result_names: 129 | info = naming.TASK_RESULT.make_tuple(completed_task_result_name) 130 | if info.task_name not in self.completed_tasks: 131 | if info.result_type == 'exception': 132 | result = Result.from_storage( 133 | self.storage_path(completed_task_result_name)) 134 | result.log() 135 | if self.wait_to_raise_task_exception: 136 | logging.warning( 137 | "Waiting for other tasks to run before raising " 138 | "exception.") 139 | else: 140 | result.raise_if_exception() 141 | assert False 142 | self.completed_tasks[info.task_name] = { 143 | 'parsed_result_name': info, 144 | 'task_result_name': completed_task_result_name, 145 | } 146 | 147 | self.running_tasks = set(self.submitted_tasks).difference( 148 | set(self.completed_tasks)) 149 | 150 | def tasks_elegible_for_speculation(self, speculation_runtime_threshold): 151 | # Consider speculating. 152 | elegible_tasks_by_runtime = [ 153 | task_name 154 | for task_name in self.running_tasks 155 | if ( 156 | time.time() - self.task_queue_times[task_name][-1] > 157 | speculation_runtime_threshold) 158 | ] 159 | elegible_tasks = [ 160 | task_name 161 | for task_name in elegible_tasks_by_runtime 162 | if ( 163 | len(self.task_queue_times[task_name]) < 164 | self.speculation_max_reruns) 165 | ] 166 | logging.info( 167 | "%d tasks could be speculatively rerun based " 168 | "on a queue time threshold of %0.2f sec; of " 169 | "these %d are elegible because they have not " 170 | "been run more than %d times." % ( 171 | len(elegible_tasks_by_runtime), 172 | speculation_runtime_threshold, 173 | len(elegible_tasks), 174 | self.speculation_max_reruns)) 175 | return elegible_tasks 176 | 177 | def wait(self, poll_seconds=5.0): 178 | """ 179 | Run all tasks to completion. 
180 | 181 | Speculation algorithm: 182 | - No speculation occurs until all tasks have been submitted and at 183 | least 100 - speculation_percent tasks have completed. 184 | - Once this threshold is reached, tasks are rerun in order, i.e. 185 | based how long they have been queued. 186 | - A task will be rerun when its queue time exceeds 187 | speculation_runtime_percentile of the queue times of the 188 | tasks that completed successfully without speculation. This will 189 | reset its queue time to 0. 190 | - Tasks can be rerun up to speculation_max_reruns times. 191 | - We are still limited by max_simultaneous_tasks. If more than this 192 | number of tasks fail, we won't be able to recover. 193 | """ 194 | 195 | while True: 196 | self.update() 197 | num_to_submit = max( 198 | 0, 199 | self.max_simultaneous_tasks - 200 | len(self.running_tasks)) 201 | if num_to_submit == 0: 202 | time.sleep(poll_seconds) 203 | continue 204 | 205 | logging.info("Submitting %d tasks" % num_to_submit) 206 | if not all(self.submit_next_task() for _ in range(num_to_submit)): 207 | # We've submitted all our tasks. 208 | speculation_runtime_threshold = None 209 | while True: 210 | self.update() 211 | self.status_writer.update(self.status_dict()) 212 | if not self.running_tasks: 213 | return 214 | 215 | if speculation_runtime_threshold is None: 216 | percent_tasks_running = ( 217 | len(self.running_tasks) * 100.0 / 218 | len(self.submitted_tasks)) 219 | if percent_tasks_running < self.speculation_percent: 220 | elapsed_times = [ 221 | int(t["parsed_result_name"].result_time) - 222 | int(t["parsed_result_name"].queue_time) 223 | for t in self.completed_tasks.values() 224 | ] 225 | speculation_runtime_threshold = percentile( 226 | elapsed_times, 227 | self.speculation_runtime_percentile) 228 | logging.info( 229 | "Enabling speculation: %0.2f%% of tasks " 230 | "running. " 231 | "Task queue times (sec): " 232 | "min=%0.1f mean=%0.1f max=%0.1f. Queue time " 233 | "threshold for resubmitting tasks will be " 234 | "%0.0f percentile of these times, which is " 235 | "%0.2f" % ( 236 | percent_tasks_running, 237 | min(elapsed_times), 238 | mean(elapsed_times), 239 | max(elapsed_times), 240 | self.speculation_runtime_percentile, 241 | speculation_runtime_threshold)) 242 | 243 | if speculation_runtime_threshold is not None: 244 | elegible_tasks = self.tasks_elegible_for_speculation( 245 | speculation_runtime_threshold) 246 | 247 | if elegible_tasks: 248 | capacity = max( 249 | 0, 250 | self.max_simultaneous_tasks - sum( 251 | len(self.task_queue_times[task_name]) 252 | for task_name in self.running_tasks)) 253 | to_speculate = elegible_tasks[:capacity] 254 | logging.info( 255 | "Capacity for re-running up to %d tasks. " 256 | "Will speculatively re-run %d tasks." 
% ( 257 | capacity, 258 | len(to_speculate))) 259 | for task_name in to_speculate: 260 | self.submit_task(task_name) 261 | 262 | logging.info("Waiting for %d tasks to complete: %s" % ( 263 | len(self.running_tasks), 264 | " ".join(self.running_tasks))) 265 | time.sleep(poll_seconds) 266 | 267 | def results(self): 268 | self.update() 269 | if self.running_tasks: 270 | raise RuntimeError("Not all tasks have completed") 271 | for task_name in self.submitted_tasks: 272 | result_file = self.storage_path( 273 | self.completed_tasks[task_name]['task_result_name']) 274 | result = Result.from_storage(result_file) 275 | yield result 276 | -------------------------------------------------------------------------------- /kubeface/kubernetes_backend.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import json 3 | import logging 4 | import subprocess 5 | import time 6 | 7 | from .backend import Backend 8 | from .worker_configuration import WorkerConfiguration 9 | from .common import check_call 10 | from .storage import is_google_storage_bucket 11 | from . import naming 12 | 13 | 14 | class KubernetesBackend(Backend): 15 | 16 | @staticmethod 17 | def add_args(parser): 18 | default = KubernetesBackend(worker_configuration=None) 19 | parser.add_argument( 20 | "--kubeface-kubernetes-cluster", 21 | default=default.cluster, 22 | help="Cluster. Default: %(default)s") 23 | parser.add_argument( 24 | "--kubeface-kubernetes-task-resources-cpu", 25 | default=default.task_resources_cpu, 26 | type=int, 27 | help="CPUs per task. Default: %(default)s") 28 | parser.add_argument( 29 | "--kubeface-kubernetes-task-resources-memory-mb", 30 | default=default.task_resources_memory_mb, 31 | type=float, 32 | help="Memory (mb) per task. Default: %(default)s") 33 | parser.add_argument( 34 | "--kubeface-kubernetes-retries", 35 | default=default.retries, 36 | type=int, 37 | help="Max retries for kubernetes commands. Default: %(default)s") 38 | parser.add_argument( 39 | "--kubeface-kubernetes-image-pull-policy", 40 | default=default.image_pull_policy, 41 | choices=("Always", "IfNotPresent", "Never"), 42 | help="Image pull policy. 
Default: %(default)s") 43 | 44 | @staticmethod 45 | def from_args(args): 46 | arg_prefix = "kubeface_kubernetes_" 47 | return KubernetesBackend( 48 | worker_configuration=WorkerConfiguration.from_args(args), 49 | **dict( 50 | (key[len(arg_prefix):], value) 51 | for (key, value) in args._get_kwargs() 52 | if key.startswith(arg_prefix))) 53 | 54 | def __init__( 55 | self, 56 | worker_configuration, 57 | cluster=None, 58 | task_resources_cpu=1, 59 | task_resources_memory_mb=1000.0, 60 | retries=12, 61 | image_pull_policy='Always'): 62 | self.worker_configuration = worker_configuration 63 | self.cluster = cluster 64 | self.task_resources_cpu = task_resources_cpu 65 | self.task_resources_memory_mb = task_resources_memory_mb 66 | self.retries = retries 67 | self.image_pull_policy = image_pull_policy 68 | 69 | def submit_task(self, task_name, task_input, task_output): 70 | specification = self.task_specification( 71 | task_name, 72 | task_input, 73 | task_output) 74 | with tempfile.NamedTemporaryFile( 75 | mode="w+", 76 | prefix="kubeface-kubernetes-%s" % task_name, 77 | suffix=".json") as fd: 78 | json.dump(specification, fd, indent=4) 79 | logging.debug(json.dumps(specification, indent=4)) 80 | fd.flush() 81 | retry_num = 0 82 | while True: 83 | try: 84 | check_call(["kubectl", "create", "-f", fd.name]) 85 | return task_name 86 | except subprocess.CalledProcessError: 87 | logging.warn("Error calling kutectl on spec: \n%s" % ( 88 | json.dumps(specification, indent=4))) 89 | retry_num += 1 90 | if retry_num >= self.retries: 91 | raise 92 | sleep_time = 2.0**retry_num 93 | logging.info("Retry %d / %d. Sleeping for %0.1f sec." % ( 94 | retry_num, self.retries, sleep_time)) 95 | time.sleep(sleep_time) 96 | 97 | def task_specification(self, task_name, task_input, task_output): 98 | task_info = naming.TASK.make_tuple(task_name) 99 | logging.info( 100 | "Generating kubernetes specification for task %d in job %s" % ( 101 | task_info.task_num, task_info.cache_key)) 102 | 103 | sanitized_task_name = naming.sanitize(task_name) 104 | sanitized_cache_key = naming.sanitize(task_info.cache_key) 105 | 106 | result = { 107 | "kind": "Pod", 108 | "apiVersion": "v1", 109 | "metadata": { 110 | "name": "%s-%s" % ( 111 | sanitized_task_name, naming.hash_value(task_output)), 112 | "labels": { 113 | "kubeface_job": sanitized_cache_key, 114 | }, 115 | "namespace": "", 116 | }, 117 | "spec": { 118 | "containers": [ 119 | { 120 | "name": str(task_info.task_num), 121 | "image": self.worker_configuration.image, 122 | "imagePullPolicy": self.image_pull_policy, 123 | "command": [ 124 | "sh", 125 | "-c", 126 | self.worker_configuration.command( 127 | task_input, 128 | task_output), 129 | ], 130 | "resources": { 131 | "requests": { 132 | "cpu": self.task_resources_cpu, 133 | "memory": ( 134 | "%sMi" % 135 | self.task_resources_memory_mb), 136 | }, 137 | }, 138 | }, 139 | ], 140 | "restartPolicy": "Never", 141 | } 142 | } 143 | return result 144 | 145 | @staticmethod 146 | def supports_storage(path): 147 | # kubernetes backend requires bucket storage 148 | return is_google_storage_bucket(path) 149 | -------------------------------------------------------------------------------- /kubeface/local_process_backend.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import subprocess 3 | from copy import copy 4 | 5 | from .backend import Backend 6 | from .worker_configuration import ( 7 | WorkerConfiguration, 8 | DEFAULT as DEFAULT_WORKER_CONFIG 9 | ) 10 | 11 | 12 | class 
LocalProcessBackend(Backend): 13 | @staticmethod 14 | def add_args(parser): 15 | pass 16 | 17 | @staticmethod 18 | def from_args(args): 19 | return LocalProcessBackend( 20 | worker_configuration=WorkerConfiguration.from_args(args)) 21 | 22 | def __init__(self, worker_configuration=DEFAULT_WORKER_CONFIG): 23 | unsupported_worker_configuration_fields = [ 24 | 'image', 25 | 'pip', 26 | 'pip_packages', 27 | 'kubeface_install_command', 28 | ] 29 | bad_fields = worker_configuration.non_default_fields().intersection( 30 | set(unsupported_worker_configuration_fields)) 31 | if bad_fields: 32 | raise ValueError( 33 | "LocalProcessBackend does not handle these worker " 34 | "configuration fields: %s" % ' '.join(bad_fields)) 35 | if worker_configuration.kubeface_install_policy == 'always': 36 | raise ValueError( 37 | "LocalProcessBackend does not support worker configurations " 38 | "with kubeface_install_policy = 'always'") 39 | self.worker_configuration = copy(worker_configuration) 40 | self.worker_configuration.kubeface_install_policy = 'never' 41 | 42 | def submit_task(self, task_name, task_input, task_output): 43 | command = self.worker_configuration.command(task_input, task_output) 44 | logging.debug("Running task '%s': %s" % (task_name, command)) 45 | return subprocess.Popen(command, shell=True) 46 | -------------------------------------------------------------------------------- /kubeface/local_process_docker_backend.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import subprocess 3 | import shlex 4 | import os 5 | 6 | from .backend import Backend 7 | from .worker_configuration import ( 8 | WorkerConfiguration, 9 | DEFAULT as DEFAULT_WORKER_CONFIG 10 | ) 11 | 12 | 13 | DOCKER_MOUNT = "/kubeface-data" 14 | KUBEFACE_MOUNT = "/kubeface-package" 15 | 16 | 17 | class LocalProcessDockerBackend(Backend): 18 | @staticmethod 19 | def add_args(parser): 20 | parser.add_argument( 21 | "--kubeface-local-process-docker-command", 22 | default="docker") 23 | 24 | @staticmethod 25 | def from_args(args): 26 | return LocalProcessDockerBackend( 27 | worker_configuration=WorkerConfiguration.from_args(args), 28 | docker_command=args.kubeface_local_process_docker_command) 29 | 30 | def __init__( 31 | self, 32 | worker_configuration=DEFAULT_WORKER_CONFIG, 33 | docker_command="docker"): 34 | self.worker_configuration = worker_configuration 35 | self.docker_command = docker_command 36 | 37 | def submit_task(self, task_name, task_input, task_output): 38 | volume_mounts = [] # pairs of (host path, docker path) 39 | if not task_input.startswith("gs://"): 40 | # Using a local filesystem as storage, so we'll want to 41 | # mount it on the docker image. 42 | data_dir = os.path.dirname(task_input) 43 | assert os.path.dirname(task_output) == data_dir 44 | 45 | task_input = os.path.join( 46 | DOCKER_MOUNT, os.path.basename(task_input)) 47 | task_output = os.path.join( 48 | DOCKER_MOUNT, os.path.basename(task_output)) 49 | volume_mounts.append((data_dir, DOCKER_MOUNT)) 50 | 51 | # We also mount the kubeface package directory, so it can 52 | # installed on the docker image if desired. 
53 | kubeface_package_dir = os.path.abspath( 54 | os.path.join( 55 | os.path.dirname(__file__), 56 | "..")) 57 | volume_mounts.append((kubeface_package_dir, KUBEFACE_MOUNT)) 58 | 59 | volume_mount_args = [] 60 | for (host_path, docker_path) in volume_mounts: 61 | volume_mount_args.append("-v") 62 | volume_mount_args.append("%s:%s" % (host_path, docker_path)) 63 | 64 | command = ( 65 | shlex.split(self.docker_command) + 66 | ["run"] + 67 | volume_mount_args + 68 | [ 69 | self.worker_configuration.image, 70 | "sh", 71 | "-c", 72 | self.worker_configuration.command(task_input, task_output), 73 | ] 74 | ) 75 | logging.info("Running task '%s': %s" % (task_name, str(command))) 76 | return subprocess.Popen(command) 77 | -------------------------------------------------------------------------------- /kubeface/naming.py: -------------------------------------------------------------------------------- 1 | import socket 2 | from datetime import datetime 3 | import getpass 4 | import hashlib 5 | import time 6 | 7 | from .stringable import Stringable 8 | 9 | JOB = Stringable( 10 | "Job", 11 | "{cache_key}::{node_id}::{randomness}") 12 | 13 | TASK = Stringable( 14 | "Task", 15 | "{cache_key}::{task_num:06d}") 16 | 17 | TASK_INPUT = Stringable( 18 | "TaskInput", 19 | "input::{task_name}") 20 | 21 | TASK_RESULT = Stringable( 22 | "TaskResult", 23 | "result::{task_name}+{attempt_num:d}+{queue_time}+{result_time}+" 24 | "+{result_type}", 25 | valid_values={ 26 | 'result_type': ["value", "exception"], 27 | }) 28 | 29 | JOB_STATUS_PAGE = Stringable( 30 | "JobStatusPage", 31 | "{status}::{format}::{job_name}.{format}", 32 | valid_values={ 33 | 'format': ['html', 'json'], 34 | 'status': ['active', 'done'], 35 | }) 36 | 37 | REMOTE_OBJECT = Stringable( 38 | "RemoteObject", 39 | "object::{cache_key_prefix}::{node_id}::{object_num:d}-{randomness}") 40 | 41 | 42 | def hash_value(s, characters=8): 43 | return hashlib.sha1(str(s).encode()).hexdigest()[:characters] 44 | 45 | 46 | def make_cache_key_prefix(): 47 | cache_key_prefix = "%s-%s-%s-%s" % ( 48 | socket.gethostname()[:8], 49 | getpass.getuser(), 50 | datetime.strftime(datetime.now(), "%Y-%m-%d-%H:%M:%S"), 51 | hash_value(time.time())) 52 | return cache_key_prefix 53 | 54 | 55 | def make_job_name(cache_key, node_id): 56 | return JOB.make_string( 57 | cache_key=cache_key, 58 | node_id=node_id, 59 | randomness=hash_value(time.time())) 60 | 61 | 62 | def make_remote_object_name(cache_key_prefix, node_id, object_num): 63 | return REMOTE_OBJECT.make_string( 64 | cache_key_prefix=cache_key_prefix, 65 | node_id=node_id, 66 | object_num=object_num, 67 | randomness=hash_value(time.time())) 68 | 69 | 70 | def task_result_prefix(cache_key, task_names=[]): 71 | prefix = "result::" + cache_key 72 | if task_names: 73 | better_prefix = TASK_RESULT.prefix(task_name=list(task_names)) 74 | assert better_prefix.startswith(prefix) 75 | return better_prefix 76 | return prefix 77 | 78 | 79 | def task_input_prefix(cache_key): 80 | return "input::" + cache_key 81 | 82 | 83 | def status_prefixes(job_names=None, formats=None, statuses=None): 84 | return JOB_STATUS_PAGE.prefixes( 85 | max_prefixes=4, 86 | job_name=job_names, 87 | format=formats, 88 | status=statuses) 89 | 90 | 91 | def sanitize(name): 92 | return ( 93 | name 94 | .replace(".", "-") 95 | .replace(":", "-") 96 | .replace("_", "-").lower()) 97 | -------------------------------------------------------------------------------- /kubeface/remote_object.py: 
-------------------------------------------------------------------------------- 1 | import logging 2 | import tempfile 3 | from contextlib import closing 4 | 5 | from . import common, serialization, storage 6 | 7 | 8 | class RemoteObject(object): 9 | def __init__(self, file_path, value): 10 | self.file_path = file_path 11 | self._value = value 12 | self.written = False 13 | self.loaded = True 14 | 15 | @property 16 | def value(self): 17 | """ 18 | Value is lazy loaded when it is first accessed. 19 | """ 20 | if not self.loaded: 21 | with closing(storage.get(self.file_path)) as fd: 22 | self._value = serialization.load(fd) 23 | self.loaded = True 24 | return self._value 25 | 26 | def __getstate__(self): 27 | """ 28 | The first time the object is pickled, we write it to file_path. 29 | The pickled representation is just the path to the file. 30 | """ 31 | if not self.written: 32 | assert self.loaded 33 | with tempfile.TemporaryFile(prefix="kubeface-object-") as fd: 34 | serialization.dump(self._value, fd) 35 | logging.info("Writing object (%s): %s" % ( 36 | common.human_readable_memory_size(fd.tell()), 37 | self.file_path)) 38 | fd.seek(0) 39 | storage.put(self.file_path, fd) 40 | self.written = True 41 | return {"file_path": self.file_path} 42 | 43 | def __setstate__(self, state): 44 | assert list(state) == ['file_path'] 45 | self.file_path = state['file_path'] 46 | self._value = None 47 | self.written = True 48 | self.loaded = False 49 | -------------------------------------------------------------------------------- /kubeface/result.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import socket 3 | import logging 4 | import time 5 | import platform 6 | from datetime import timedelta 7 | from contextlib import closing 8 | 9 | from . import storage 10 | from .serialization import load 11 | from .common import human_readable_memory_size 12 | 13 | 14 | def get_process_info(): 15 | # For debugging we record some process info in results. 
16 | return { 17 | 'invocation_args': sys.argv, 18 | 'python_version': sys.version, 19 | 'hostname': socket.gethostname(), 20 | 'platform': platform.platform(), 21 | } 22 | 23 | 24 | class Result(object): 25 | @staticmethod 26 | def from_storage(storage_path): 27 | with closing(storage.get(storage_path)) as handle: 28 | value = load(handle) 29 | assert isinstance(value, Result), type(value) 30 | value.serialization_info["storage_path"] = storage_path 31 | value.serialization_info["result_bytes"] = handle.tell() 32 | return value 33 | 34 | def __init__( 35 | self, 36 | start_time, 37 | end_time, 38 | input_size=None, 39 | exception=None, 40 | exception_traceback_string=None, 41 | return_value=None, 42 | process_info=get_process_info()): 43 | self.input_size = input_size 44 | self.start_time = start_time 45 | self.end_time = end_time 46 | self.exception = exception 47 | self.exception_traceback_string = exception_traceback_string 48 | self.return_value = return_value 49 | self.process_info = process_info 50 | 51 | if exception is not None: 52 | assert return_value is None 53 | assert exception_traceback_string is not None 54 | self.result_type = "exception" 55 | else: 56 | self.result_type = "value" 57 | 58 | self.serialization_info = {} # set upon deserialization 59 | 60 | def run_seconds(self): 61 | return self.end_time - self.start_time 62 | 63 | def description(self, indent=""): 64 | fields = [ 65 | ("result type", self.result_type), 66 | ("start time", time.asctime(time.localtime(self.start_time))), 67 | ("run time", str(timedelta(seconds=self.run_seconds()))), 68 | ("hostname", self.process_info['hostname']), 69 | ("platform", self.process_info['platform']), 70 | ("python version", self.process_info['python_version']), 71 | ("invocation arguments", "\n".join( 72 | self.process_info['invocation_args'])), 73 | ] 74 | if self.input_size: 75 | fields.append( 76 | ("input size", human_readable_memory_size(self.input_size))) 77 | if 'result_bytes' in self.serialization_info: 78 | fields.append( 79 | ("result size", 80 | human_readable_memory_size( 81 | self.serialization_info['result_bytes']))) 82 | 83 | if self.result_type == 'value': 84 | fields.append(("return value type", str(type(self.return_value)))) 85 | else: 86 | fields.extend([ 87 | ("exception", str(self.exception)), 88 | ("traceback", self.exception_traceback_string), 89 | ]) 90 | 91 | max_header_length = max(len(pair[0]) for pair in fields) 92 | row_template = "%" + str(max_header_length) + "s : %s" 93 | 94 | def format_value(s): 95 | return s.replace("\n", "\n" + " " + " " * max_header_length) 96 | 97 | return ( 98 | "\n" + 99 | "\n".join( 100 | row_template % (key, format_value(value)) 101 | for (key, value) in fields) 102 | ).replace("\n", "\n" + indent) 103 | 104 | def log(self): 105 | indent = " * " 106 | if self.result_type == 'value': 107 | logging.debug("Result (success): %s" % ( 108 | self.description(indent=indent))) 109 | else: 110 | logging.error("Result (exception): %s" % ( 111 | self.description(indent=indent))) 112 | 113 | def raise_if_exception(self): 114 | if self.result_type == 'exception': 115 | logging.error("Re-raising exception for task.") 116 | raise self.exception 117 | -------------------------------------------------------------------------------- /kubeface/serialization.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import dill 4 | import dill.detect 5 | 6 | PICKLE_PROTOCOL = 2 7 | 8 | CHECK_SERIALIZATION = False 9 | 10 | 11 | def 
check(obj): 12 | if not CHECK_SERIALIZATION: 13 | return 14 | try: 15 | dill.loads(dill.dumps(obj)) 16 | except Exception as e: 17 | logging.error( 18 | "Couldn't serialize: %s\n'%s'\nBad objects:\n%s" % ( 19 | str(obj), str(e), dill.detect.badobjects(obj, depth=2))) 20 | raise 21 | 22 | 23 | def dumps(obj): 24 | check(obj) 25 | return dill.dumps(obj, protocol=PICKLE_PROTOCOL) 26 | 27 | 28 | def dump(obj, fd): 29 | check(obj) 30 | return dill.dump(obj, fd, protocol=PICKLE_PROTOCOL) 31 | 32 | 33 | def loads(s): 34 | return dill.loads(s) 35 | 36 | 37 | def load(fd): 38 | return dill.load(fd) 39 | -------------------------------------------------------------------------------- /kubeface/status_writer.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | import json 3 | import time 4 | 5 | from . import naming, storage 6 | 7 | 8 | class DefaultStatusWriter(object): 9 | def __init__(self, storage_path, job_name): 10 | self.storage_path = storage_path 11 | self.job_name = job_name 12 | self.json_path = ( 13 | storage_path + 14 | "/" + 15 | naming.JOB_STATUS_PAGE.make_string( 16 | job_name=job_name, 17 | format="json", 18 | status="active")) 19 | self.html_path = ( 20 | storage_path + 21 | "/" + 22 | naming.JOB_STATUS_PAGE.make_string( 23 | job_name=job_name, 24 | format="html", 25 | status="active")) 26 | 27 | def print_info(self): 28 | print("Job status available at:") 29 | print("\t%s" % storage.access_info(self.json_path)) 30 | print("\t%s" % storage.access_info(self.html_path)) 31 | 32 | def make_html(self, status_dict): 33 | d = dict(status_dict) 34 | d["num_running_tasks"] = len(d["running_tasks"]) 35 | d["num_completed_tasks"] = len(d["completed_tasks"]) 36 | d["num_submitted_tasks"] = len(d["submitted_tasks"]) 37 | d["num_reused_tasks"] = len(d["reused_tasks"]) 38 | if d["num_tasks"]: 39 | d["percent_complete"] = ( 40 | d["num_completed_tasks"] * 100.0 / d["num_tasks"]) 41 | else: 42 | d["percent_complete"] = "unknown" 43 | d["num_tasks"] = "unknown" 44 | d["status_time"] = time.asctime() 45 | 46 | return """ 47 | 48 | 49 | Kubeface status: {job_name} 50 | 51 | 52 |

<h1>Kubeface</h1>
53 | <h2>{job_name}</h2>
54 | <table>
55 | <tr><td>Job</td><td>{job_name}</td></tr>
56 | <tr><td>Cache key</td><td>{cache_key}</td></tr>
57 | <tr><td>Backend</td><td>{backend}</td></tr>
58 | <tr><td>Max simultaneous tasks</td><td>{max_simultaneous_tasks}</td></tr>
59 | <tr><td>Start time</td><td>{start_time}</td></tr>
60 | <tr><td>Status time</td><td>{status_time}</td></tr>
61 | </table>
62 |
63 |
64 | <h2>Status</h2>
65 | <table>
66 | <tr><td>Percent complete</td><td>{percent_complete}</td></tr>
67 | <tr><td>Running tasks</td><td>{num_running_tasks}</td></tr>
68 | <tr><td>Completed tasks</td><td>{num_completed_tasks}</td></tr>
69 | <tr><td>Submitted tasks</td><td>{num_submitted_tasks}</td></tr>
70 | <tr><td>Reused tasks</td><td>{num_reused_tasks}</td></tr>
71 | <tr><td>Total tasks</td><td>{num_tasks}</td></tr>
72 | </table>
73 | 74 | 75 | """.format(**d) 76 | 77 | def update(self, status_dict): 78 | storage.put( 79 | self.json_path, 80 | BytesIO(json.dumps(status_dict).encode()), 81 | mime_type="application/json") 82 | storage.put( 83 | self.html_path, 84 | BytesIO(self.make_html(status_dict).encode()), 85 | mime_type="text/html") 86 | -------------------------------------------------------------------------------- /kubeface/storage.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | from . import bucket_storage 4 | 5 | 6 | def is_google_storage_bucket(name): 7 | return name.startswith("gs://") 8 | 9 | 10 | def list_contents(prefix): 11 | if is_google_storage_bucket(prefix): 12 | return bucket_storage.list_contents(prefix) 13 | else: 14 | globbed = glob.glob(prefix + "*") 15 | return [os.path.basename(x) for x in globbed] 16 | 17 | 18 | def put(name, input_handle, readers=[], owners=[], **kwargs): 19 | if is_google_storage_bucket(name): 20 | return bucket_storage.put( 21 | name, input_handle, readers, owners, **kwargs) 22 | 23 | # Local file 24 | with open(name, 'wb') as fd: 25 | fd.write(input_handle.read()) 26 | 27 | 28 | def get(name, output_handle=None): 29 | if is_google_storage_bucket(name): 30 | return bucket_storage.get(name, output_handle) 31 | 32 | # Local file 33 | if output_handle is None: 34 | return open(name, "rb") 35 | 36 | with open(name, "rb") as fd: 37 | output_handle.write(fd.read()) 38 | 39 | return output_handle 40 | 41 | 42 | def delete(name): 43 | if is_google_storage_bucket(name): 44 | return bucket_storage.delete(name) 45 | 46 | os.unlink(name) 47 | 48 | 49 | def move(source, dest): 50 | if is_google_storage_bucket(source): 51 | assert is_google_storage_bucket(dest) 52 | return bucket_storage.move(source, dest) 53 | assert not is_google_storage_bucket(dest) 54 | os.rename(source, dest) 55 | 56 | 57 | def access_info(name): 58 | if is_google_storage_bucket(name): 59 | return bucket_storage.access_info(name) 60 | return name 61 | -------------------------------------------------------------------------------- /kubeface/stringable.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import string 3 | from os.path import commonprefix 4 | 5 | import parse 6 | 7 | FORMATTER = string.Formatter() 8 | 9 | 10 | class Stringable(object): 11 | def __init__(self, name, template, valid_values={}): 12 | self.template_pieces = [] 13 | self.field_names = [] 14 | parsed = FORMATTER.parse(template) 15 | for (literal_text, field_name, format_spec, conversion) in parsed: 16 | assert not conversion 17 | self.template_pieces.append((literal_text, field_name)) 18 | if field_name not in self.field_names: 19 | self.field_names.append(field_name) 20 | 21 | self.name = name 22 | self.template = template 23 | self.compiled_template = parse.compile(template) 24 | self.tuple_class = collections.namedtuple( 25 | self.name, 26 | list(self.field_names)) 27 | 28 | self.valid_values = dict(valid_values) 29 | for key in self.valid_values: 30 | assert key in self.field_names 31 | 32 | def make_tuple(self, string_value=None, **kwargs): 33 | if string_value is not None: 34 | assert not kwargs 35 | parsed = self.compiled_template.parse(string_value) 36 | if parsed is None: 37 | raise ValueError( 38 | "Stringable [%s]: Couldn't parse '%s' according to " 39 | "template '%s'" % ( 40 | self.name, string_value, self.template)) 41 | assert not parsed.fixed 42 | fields_dict = parsed.named 43 | else: 44 | 
fields_dict = kwargs 45 | self.check_fields(**fields_dict) 46 | return self.tuple_class(**fields_dict) 47 | 48 | def check_fields(self, **fields_dict): 49 | assert set(fields_dict) == set(self.field_names), ( 50 | "%s: passed fields %s != expected fields %s" % ( 51 | self.name, set(fields_dict), set(self.field_names))) 52 | for (key, values) in self.valid_values.items(): 53 | if fields_dict[key] not in values: 54 | raise RuntimeError( 55 | "Invalid value %s='%s', must be one of %s." % ( 56 | key, fields_dict[key], ', '.join(values))) 57 | 58 | def make_string(self, tpl=None, **fields_dict): 59 | if tpl is not None: 60 | assert not fields_dict 61 | fields_dict = tpl._asdict() 62 | self.check_fields(**fields_dict) 63 | return self.template.format(**fields_dict) 64 | 65 | def prefix(self, **fields_dict): 66 | (prefix,) = self.prefixes(**fields_dict) 67 | return prefix 68 | 69 | def prefixes(self, max_prefixes=1, **fields_dict): 70 | for (key, value) in fields_dict.items(): 71 | assert key in self.field_names, key 72 | assert value is None or isinstance(value, list), type(value) 73 | 74 | def make_prefixes( 75 | template_pieces, 76 | max_prefixes=max_prefixes, 77 | fields_dict=fields_dict): 78 | result = [[]] 79 | if not template_pieces: 80 | return result 81 | 82 | (literal, field_name) = template_pieces[0] 83 | if literal: 84 | for piece in result: 85 | piece.append(literal) 86 | 87 | values = fields_dict.get(field_name) 88 | if values is None: 89 | values = self.valid_values.get(field_name) 90 | if values is not None: 91 | if len(result) * len(values) > max_prefixes: 92 | common_prefix = commonprefix(values) 93 | for piece in result: 94 | piece.append(common_prefix) 95 | else: 96 | new_result = [] 97 | for value in values: 98 | new_fields_dict = dict(fields_dict) 99 | new_fields_dict[field_name] = [value] 100 | rest = make_prefixes( 101 | template_pieces[1:], 102 | max_prefixes=max_prefixes / ( 103 | len(result) * len(values)), 104 | fields_dict=new_fields_dict) 105 | for some_rest in rest: 106 | new_result.extend( 107 | [x + [value] + some_rest for x in result]) 108 | result = new_result 109 | return result 110 | 111 | prefix_components = make_prefixes(self.template_pieces) 112 | assert len(prefix_components) <= max_prefixes 113 | return [''.join(x) for x in prefix_components] 114 | -------------------------------------------------------------------------------- /kubeface/task.py: -------------------------------------------------------------------------------- 1 | import time 2 | import logging 3 | import types 4 | import traceback 5 | 6 | from .result import Result 7 | 8 | 9 | class Task(object): 10 | def __init__(self, function, args=(), kwargs={}): 11 | self.function = function 12 | self.args = args 13 | self.kwargs = kwargs 14 | 15 | def run(self, input_size=None): 16 | start_time = time.time() 17 | exception = None 18 | exception_traceback_string = None 19 | 20 | try: 21 | return_value = self.function(*self.args, **self.kwargs) 22 | if isinstance(return_value, types.GeneratorType): 23 | return_value = list(return_value) 24 | except Exception as e: 25 | traceback_string = traceback.format_exc() 26 | logging.warn("Task execution raised exception: %s. 
%s" % ( 27 | e, traceback_string)) 28 | exception = e 29 | exception_traceback_string = traceback_string 30 | return_value = None 31 | 32 | return Result( 33 | start_time=start_time, 34 | end_time=time.time(), 35 | exception=exception, 36 | exception_traceback_string=exception_traceback_string, 37 | return_value=return_value, 38 | input_size=input_size) 39 | -------------------------------------------------------------------------------- /kubeface/worker_configuration.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from six.moves import shlex_quote as quote 4 | 5 | 6 | class WorkerConfiguration(object): 7 | @staticmethod 8 | def add_args(parser): 9 | parser.add_argument( 10 | "--kubeface-worker-image", 11 | default=DEFAULT.image) 12 | parser.add_argument( 13 | "--kubeface-worker-path-prefix", 14 | default=DEFAULT.path_prefix) 15 | parser.add_argument( 16 | "--kubeface-worker-pip", 17 | default=DEFAULT.pip) 18 | parser.add_argument( 19 | "--kubeface-worker-pip-packages", 20 | default=DEFAULT.pip_packages, nargs="+") 21 | parser.add_argument( 22 | "--kubeface-worker-kubeface-install-policy", 23 | choices=('if-not-present', 'always', 'never'), 24 | default=DEFAULT.kubeface_install_policy) 25 | parser.add_argument( 26 | "--kubeface-worker-kubeface-install-command", 27 | default=DEFAULT.kubeface_install_command) 28 | 29 | @staticmethod 30 | def from_args(args): 31 | arg_prefix = "kubeface_worker_" 32 | return WorkerConfiguration( 33 | **dict( 34 | (key[len(arg_prefix):], value) 35 | for (key, value) in args._get_kwargs() 36 | if key.startswith(arg_prefix))) 37 | 38 | def __init__( 39 | self, 40 | image='continuumio/anaconda3', 41 | path_prefix='', 42 | pip='pip', 43 | pip_packages=[], 44 | kubeface_install_policy='if-not-present', 45 | 46 | # TODO: this should default to installing the version of kubeface 47 | # running in the current process, not HEAD. 
48 | kubeface_install_command=( 49 | "{pip} install " 50 | "https://github.com/hammerlab/kubeface/archive/master.zip" 51 | )): 52 | 53 | if kubeface_install_policy not in ( 54 | 'if-not-present', 'always', 'never'): 55 | raise ValueError( 56 | "Invalid kubeface_install_policy: %s" 57 | % kubeface_install_policy) 58 | 59 | self.image = image 60 | self.path_prefix = path_prefix 61 | self.pip = pip 62 | self.pip_packages = pip_packages 63 | self.kubeface_install_policy = kubeface_install_policy 64 | self.kubeface_install_command = kubeface_install_command 65 | 66 | def non_default_fields(self): 67 | return set([ 68 | field for field in dir(self) 69 | if getattr(self, field) != getattr(DEFAULT, field) 70 | ]) 71 | 72 | def command(self, task_input, task_output, extra_task_args=[]): 73 | def quote_and_join(arguments): 74 | return " ".join([quote(arg) for arg in arguments]) 75 | 76 | pieces = [] 77 | run_pip = quote(os.path.join(self.path_prefix, 'pip')) 78 | run_task = quote( 79 | os.path.join(self.path_prefix, '_kubeface-run-task')) 80 | kubeface_install_command = self.kubeface_install_command.format( 81 | pip=run_pip) 82 | if self.kubeface_install_policy == 'if-not-present': 83 | # From: http://stackoverflow.com/questions/592620/check-if-a-program-exists-from-a-bash-script 84 | pieces.append("command -v %s || { %s ; } " % ( 85 | run_task, 86 | kubeface_install_command)) 87 | elif self.kubeface_install_policy == 'always': 88 | pieces.append(kubeface_install_command) 89 | if self.pip_packages: 90 | pieces.append("%s install %s" % ( 91 | run_pip, 92 | quote_and_join(self.pip_packages))) 93 | pieces.append( 94 | run_task + 95 | " " + 96 | quote_and_join([ 97 | task_input, 98 | task_output, 99 | "--verbose", 100 | ] + extra_task_args)) 101 | result = " && ".join(pieces) 102 | return result 103 | 104 | 105 | DEFAULT = WorkerConfiguration() 106 | -------------------------------------------------------------------------------- /remote_object_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Kubeface example with remote objects. 3 | 4 | Prepends numbers 1-3 to a big string, showing how to use remote objects to 5 | reduce the size of the uploaded task. 
6 | 7 | Example: 8 | 9 | $ python remote_object_example.py \ 10 | --kubeface-backend local-process \ 11 | --kubeface-storage /tmp 12 | 13 | """ 14 | 15 | import argparse 16 | from collections import Counter 17 | import logging 18 | import sys 19 | 20 | import kubeface 21 | 22 | parser = argparse.ArgumentParser(usage=__doc__) 23 | kubeface.Client.add_args(parser) # Add kubeface arguments 24 | 25 | 26 | def main(argv): 27 | args = parser.parse_args(argv) 28 | logging.basicConfig( 29 | format="%(asctime)s.%(msecs)d %(levelname)s %(module)s - %(funcName)s:" 30 | " %(message)s", 31 | datefmt="%Y-%m-%d %H:%M:%S", 32 | stream=sys.stderr, 33 | level=logging.INFO) 34 | 35 | client = kubeface.Client.from_args(args) 36 | input_values = range(3) 37 | 38 | big_string = "i am a string" * 100000 39 | big_wrapped = client.remote_object(big_string) 40 | 41 | logging.info('Using remote object: note size of uploaded task') 42 | 43 | def my_func_with_remote_object(x): 44 | return str(x) + big_wrapped.data 45 | results = client.map(my_func_with_remote_object, input_values) 46 | for (x, result) in zip(input_values, results): 47 | print("%d, %s" % (x, Counter(result))) 48 | 49 | logging.info( 50 | 'Now running without remote object: see uploaded task size') 51 | 52 | def my_func_without_remote_object(x): 53 | return str(x) + big_string 54 | 55 | results = client.map(my_func_without_remote_object, input_values) 56 | for (x, result) in zip(input_values, results): 57 | print("%d, %s" % (x, Counter(result))) 58 | 59 | 60 | if __name__ == '__main__': 61 | main(sys.argv[1:]) 62 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | try: 2 | from setuptools import setup 3 | except ImportError: 4 | from distutils.core import setup 5 | 6 | version = "0.0.1" 7 | 8 | setup( 9 | name="kubeface", 10 | version=version, 11 | author="Tim O'Donnell", 12 | author_email="timodonnell@gmail.com", 13 | packages=["kubeface", "kubeface.commands"], 14 | url="https://github.com/hammerlab/kubeface", 15 | license="Apache License", 16 | description="Python parallel for loops on kubernetes", 17 | long_description=open('README.md').read(), 18 | download_url='https://github.com/hammerlab/kubeface/tarball/%s' % version, 19 | classifiers=[ 20 | "Development Status :: 1 - Planning", 21 | "Intended Audience :: Developers", 22 | "License :: OSI Approved :: MIT License", 23 | "Programming Language :: Python :: 2", 24 | "Programming Language :: Python :: 2.7", 25 | "Programming Language :: Python :: 3", 26 | "Programming Language :: Python :: 3.4", 27 | ], 28 | entry_points={ 29 | 'console_scripts': [ 30 | 'kubeface-copy = kubeface.commands.copy:run', 31 | 'kubeface-run = kubeface.commands.run:run', 32 | 'kubeface-job = kubeface.commands.job:run', 33 | '_kubeface-run-task = kubeface.commands.run_task:run', 34 | ] 35 | }, 36 | install_requires=[ 37 | "dill>=0.2.5", 38 | "six", 39 | "numpy", 40 | "parse", 41 | "oauth2client==4.0.0", 42 | "google-api-python-client==1.5.5", 43 | "mock", 44 | "nose>=1.3.1", 45 | ] 46 | ) 47 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hammerlab/kubeface/443d7432e6d2f8e4d20b6326e98fabeec7ad68b6/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_client.py: 
-------------------------------------------------------------------------------- 1 | import numpy 2 | import argparse 3 | 4 | from numpy import testing 5 | 6 | from kubeface import ( 7 | client, 8 | local_process_backend, 9 | local_process_docker_backend, 10 | worker_configuration, 11 | serialization, 12 | common) 13 | 14 | from . import util 15 | 16 | common.configure_logging(verbose=True) 17 | 18 | 19 | def client_from_commandline_args(argv): 20 | parser = argparse.ArgumentParser() 21 | client.Client.add_args(parser) 22 | args = parser.parse_args(argv) 23 | return client.Client.from_args(args) 24 | 25 | 26 | def exercise_client(c, low=1, high=10): 27 | # Using division gives us an easy way to test handling of tasks 28 | # that throw division (by making low < 0) so it throws ZeroDivisionError 29 | testing.assert_equal( 30 | list(c.map(lambda x: 2.0 / x, range(low, high))), 31 | 2.0 / numpy.arange(low, high)) 32 | 33 | 34 | @util.with_local_and_bucket_storage 35 | def test_local_process_backend(bucket): 36 | backend = local_process_backend.LocalProcessBackend() 37 | c = client.Client( 38 | backend, 39 | poll_seconds=1.0, 40 | max_simultaneous_tasks=3, 41 | storage=bucket) 42 | exercise_client(c) 43 | 44 | 45 | @util.with_local_storage 46 | def test_local_process_docker_backend(bucket): 47 | worker_config = worker_configuration.WorkerConfiguration( 48 | kubeface_install_command="{pip} install /kubeface-package") 49 | backend = local_process_docker_backend.LocalProcessDockerBackend( 50 | worker_configuration=worker_config) 51 | c = client.Client( 52 | backend, 53 | poll_seconds=1.0, 54 | max_simultaneous_tasks=1, 55 | storage=bucket) 56 | exercise_client(c, high=3) 57 | 58 | 59 | @util.with_local_and_bucket_storage 60 | def test_worker_exception_delayed(bucket): 61 | c = client_from_commandline_args([ 62 | "--kubeface-poll-seconds", "1.1", 63 | "--kubeface-backend", "local-process", 64 | "--kubeface-storage", bucket, 65 | "--kubeface-wait-to-raise-task-exception", 66 | ]) 67 | mapper = c.map(lambda x: 2 / (x - 2), range(10)) 68 | testing.assert_equal(next(mapper), -1) 69 | testing.assert_equal(next(mapper), -2) 70 | testing.assert_equal(len(c.job_summary(include_done=False)), 1) 71 | testing.assert_equal(len(c.job_summary(include_done=True)), 1) 72 | testing.assert_raises(ZeroDivisionError, next, mapper) 73 | testing.assert_equal(len(c.job_summary(include_done=False)), 0) 74 | testing.assert_equal(len(c.job_summary(include_done=True)), 1) 75 | testing.assert_raises(StopIteration, next, mapper) 76 | testing.assert_equal(len(c.job_summary(include_done=False)), 0) 77 | testing.assert_equal(len(c.job_summary(include_done=True)), 1) 78 | 79 | 80 | @util.with_local_and_bucket_storage 81 | def test_worker_exception(bucket): 82 | c = client_from_commandline_args([ 83 | "--kubeface-poll-seconds", "1.1", 84 | "--kubeface-backend", "local-process", 85 | "--kubeface-storage", bucket, 86 | "--kubeface-cache-key-prefix", "foo", 87 | ]) 88 | mapper = c.map(lambda x: 2 / (x - 2), range(10)) 89 | testing.assert_raises(ZeroDivisionError, next, mapper) 90 | 91 | # TODO: in the future we may want reruns to not re-use excpetions. 92 | # Here is a test for that functionality, which is currently not 93 | # implemented. 
94 | # c = client_from_commandline_args([ 95 | # "--kubeface-poll-seconds", "1.1", 96 | # "--kubeface-backend", "local-process", 97 | # "--kubeface-storage", bucket, 98 | # "--kubeface-cache-key-prefix", "foo", 99 | # ]) 100 | # results = list(c.map(lambda x: 2 / (x - 200), range(10))) 101 | # print(results) # should not raise 102 | 103 | 104 | @util.with_local_and_bucket_storage 105 | def test_job_summary(bucket): 106 | c = client_from_commandline_args([ 107 | "--kubeface-poll-seconds", "1.1", 108 | "--kubeface-backend", "local-process", 109 | "--kubeface-storage", bucket, 110 | ]) 111 | 112 | exercise_client(c, high=5) 113 | testing.assert_equal(len(c.job_summary(include_done=False)), 0) 114 | testing.assert_equal(len(c.job_summary(include_done=True)), 1) 115 | 116 | exercise_client(c, high=2) 117 | testing.assert_equal(len(c.job_summary(include_done=False)), 0) 118 | testing.assert_equal(len(c.job_summary(include_done=True)), 2) 119 | 120 | mapper = c.map(lambda x: x + 5, range(10)) 121 | testing.assert_equal(next(mapper), 5) 122 | testing.assert_equal(len(c.job_summary(include_done=False)), 1) 123 | testing.assert_equal(len(c.job_summary(include_done=True)), 3) 124 | testing.assert_equal(list(mapper), numpy.arange(1, 10) + 5) 125 | testing.assert_equal(len(c.job_summary(include_done=False)), 0) 126 | testing.assert_equal(len(c.job_summary(include_done=True)), 3) 127 | 128 | c.cleanup() 129 | testing.assert_equal(len(c.job_summary()), 0) 130 | 131 | 132 | def test_invalid_client(): 133 | with testing.assert_raises(ValueError): 134 | client_from_commandline_args([ 135 | "--kubeface-poll-seconds", "1.1", 136 | "--kubeface-backend", "kubernetes", 137 | "--kubeface-storage", "/tmp", 138 | ]) 139 | 140 | 141 | @util.with_local_and_bucket_storage 142 | def test_remote_object(bucket): 143 | c = client_from_commandline_args([ 144 | "--kubeface-poll-seconds", "1.1", 145 | "--kubeface-backend", "local-process", 146 | "--kubeface-storage", bucket, 147 | ]) 148 | data = numpy.arange(10000)**2 149 | serialized_data = serialization.dumps(data) 150 | testing.assert_equal(serialization.loads(serialized_data), data) 151 | 152 | remote = c.remote_object(data) 153 | serialized_remote = serialization.dumps(remote) 154 | assert len(serialized_remote) < len(serialized_data) / 10 155 | testing.assert_equal(serialization.loads(serialized_remote).value, data) 156 | 157 | 158 | @util.with_local_and_bucket_storage 159 | def test_pickle_client(bucket): 160 | c = client_from_commandline_args([ 161 | "--kubeface-poll-seconds", "1.1", 162 | "--kubeface-backend", "local-process", 163 | "--kubeface-storage", bucket, 164 | ]) 165 | testing.assert_equal( 166 | c.cache_key_prefix, 167 | serialization.loads(serialization.dumps(c)).cache_key_prefix) 168 | 169 | 170 | @util.with_local_and_bucket_storage 171 | def test_return_remote_object(bucket): 172 | c = client_from_commandline_args([ 173 | "--kubeface-poll-seconds", "1.1", 174 | "--kubeface-backend", "local-process", 175 | "--kubeface-storage", bucket, 176 | ]) 177 | mapper = c.map(lambda x: c.remote_object(x**2), range(10)) 178 | obj = next(mapper) 179 | testing.assert_equal(obj.written, True) 180 | testing.assert_equal(obj.loaded, False) 181 | testing.assert_equal(obj.value, 0) 182 | testing.assert_equal(obj.loaded, True) 183 | testing.assert_equal(obj.value, 0) 184 | 185 | obj = next(mapper) 186 | testing.assert_equal(obj.written, True) 187 | testing.assert_equal(obj.loaded, False) 188 | testing.assert_equal(obj.value, 1) 189 | testing.assert_equal(obj.loaded, True) 
190 | testing.assert_equal(obj.value, 1) 191 | 192 | obj = next(mapper) 193 | testing.assert_equal(obj.written, True) 194 | testing.assert_equal(obj.loaded, False) 195 | testing.assert_equal(obj.value, 4) 196 | testing.assert_equal(obj.loaded, True) 197 | testing.assert_equal(obj.value, 4) 198 | -------------------------------------------------------------------------------- /tests/test_job_command.py: -------------------------------------------------------------------------------- 1 | import math 2 | import argparse 3 | import subprocess 4 | from numpy import testing 5 | 6 | from kubeface import ( 7 | client, 8 | common) 9 | 10 | from . import util 11 | 12 | common.configure_logging(verbose=True) 13 | 14 | 15 | def client_from_commandline_args(argv): 16 | parser = argparse.ArgumentParser() 17 | client.Client.add_args(parser) 18 | args = parser.parse_args(argv) 19 | return client.Client.from_args(args) 20 | 21 | 22 | def run_job_command(bucket, argv): 23 | result = subprocess.check_output( 24 | ["kubeface-job", "--kubeface-storage", bucket] + argv).decode() 25 | print(result) 26 | return result 27 | 28 | 29 | def find_line_with(needle, haystack, nth=0): 30 | result = [x for x in haystack.split("\n") if needle in x][nth] 31 | print("Found line: %s" % result) 32 | return result 33 | 34 | 35 | @util.with_local_storage 36 | def test_job_command(bucket): 37 | c = client_from_commandline_args([ 38 | "--kubeface-poll-seconds", "1.1", 39 | "--kubeface-backend", "local-process", 40 | "--kubeface-storage", bucket, 41 | ]) 42 | 43 | mapper = c.map(math.exp, range(10), cache_key='FOOBARBAZ') 44 | testing.assert_equal(next(mapper), 1) 45 | assert 'FOOBARBAZ' in run_job_command(bucket, []) 46 | assert 'active' in ( 47 | find_line_with( 48 | "FOOBARBAZ", 49 | run_job_command(bucket, ["--include-done"]), 50 | nth=1)) 51 | list(mapper) 52 | assert 'FOOBARBAZ' not in run_job_command(bucket, []) 53 | -------------------------------------------------------------------------------- /tests/test_naming.py: -------------------------------------------------------------------------------- 1 | from numpy import testing 2 | 3 | from kubeface import naming 4 | 5 | 6 | def test_basics(): 7 | job = naming.JOB.make_string( 8 | cache_key="foo", node_id="node-master", randomness="123") 9 | print(job) 10 | testing.assert_equal( 11 | naming.JOB.make_string(naming.JOB.make_tuple(job)), 12 | job) 13 | testing.assert_equal( 14 | naming.JOB.prefix(cache_key=["foo"]), 15 | "foo::") 16 | testing.assert_equal( 17 | naming.JOB.prefix(cache_key=["foo", "fob"]), 18 | "fo") 19 | 20 | job_status = naming.JOB_STATUS_PAGE.make_string( 21 | format="json", status="active", job_name="foobar") 22 | testing.assert_equal( 23 | naming.JOB_STATUS_PAGE.make_string( 24 | naming.JOB_STATUS_PAGE.make_tuple(job_status)), 25 | job_status) 26 | testing.assert_equal( 27 | set(naming.JOB_STATUS_PAGE.prefixes( 28 | max_prefixes=2, 29 | status=["active", "done"])), 30 | set(["done::", "active::"])) 31 | testing.assert_equal( 32 | set(naming.JOB_STATUS_PAGE.prefixes( 33 | max_prefixes=4, 34 | status=["active", "done"], 35 | format=["html", "json"])), 36 | set([ 37 | "done::html::", 38 | "active::html::", 39 | "done::json::", 40 | "active::json::"])) 41 | testing.assert_equal( 42 | set(naming.JOB_STATUS_PAGE.prefixes( 43 | max_prefixes=4)), 44 | set([ 45 | "done::html::", 46 | "active::html::", 47 | "done::json::", 48 | "active::json::"])) 49 | testing.assert_equal( 50 | set(naming.JOB_STATUS_PAGE.prefixes( 51 | max_prefixes=4, 52 | job_name=["foo1", 
"foo2"])), 53 | set([ 54 | "done::html::foo", 55 | "active::html::foo", 56 | "done::json::foo", 57 | "active::json::foo"])) 58 | testing.assert_equal( 59 | set(naming.JOB_STATUS_PAGE.prefixes( 60 | max_prefixes=9, 61 | job_name=["foo1", "foo2"])), 62 | set( 63 | [ 64 | "done::html::foo1.html", 65 | "active::html::foo1.html", 66 | "done::json::foo1.json", 67 | "active::json::foo1.json", 68 | "done::html::foo2.html", 69 | "active::html::foo2.html", 70 | "done::json::foo2.json", 71 | "active::json::foo2.json", 72 | ] 73 | )) 74 | -------------------------------------------------------------------------------- /tests/test_storage.py: -------------------------------------------------------------------------------- 1 | import time 2 | from six import BytesIO 3 | from numpy import testing 4 | 5 | from kubeface import bucket_storage, storage 6 | 7 | from .util import with_local_and_bucket_storage 8 | 9 | 10 | def test_url_parse(): 11 | testing.assert_equal( 12 | bucket_storage.split_bucket_and_name("gs://foo/bar"), 13 | ("foo", "bar")) 14 | 15 | testing.assert_equal( 16 | bucket_storage.split_bucket_and_name("gs://foo/bar/baz.txt"), 17 | ("foo", "bar/baz.txt")) 18 | 19 | 20 | @with_local_and_bucket_storage 21 | def test_put_and_get_to_bucket(bucket): 22 | data = "ABCDe" * 1000 23 | data_handle = BytesIO(data.encode("UTF-8")) 24 | file_name = "kubeface-test-%s.txt" % ( 25 | str(time.time()).replace(".", "")) 26 | name = "%s/%s" % (bucket, file_name) 27 | storage.put(name, data_handle) 28 | testing.assert_equal(storage.list_contents(name), [file_name]) 29 | testing.assert_( 30 | file_name in storage.list_contents("%s/kubeface-test-" % bucket)) 31 | 32 | result_handle = storage.get(name) 33 | testing.assert_equal(result_handle.read().decode("UTF-8"), data) 34 | storage.delete(name) 35 | testing.assert_( 36 | file_name not in storage.list_contents("%s/" % bucket)) 37 | 38 | 39 | @with_local_and_bucket_storage 40 | def test_move(bucket): 41 | data = "ABCDe" * 1000 42 | data_handle = BytesIO(data.encode("UTF-8")) 43 | file_name = "kubeface-test-%s.txt" % ( 44 | str(time.time()).replace(".", "")) 45 | name = "%s/%s" % (bucket, file_name) 46 | name2 = "%s/moved-%s" % (bucket, file_name) 47 | storage.put(name, data_handle) 48 | testing.assert_equal(storage.list_contents(name), [file_name]) 49 | storage.move(name, name2) 50 | testing.assert_equal(storage.list_contents(name), []) 51 | testing.assert_equal( 52 | storage.list_contents(name2), 53 | ["moved-%s" % file_name]) 54 | result_handle = storage.get(name2) 55 | testing.assert_equal(result_handle.read().decode("UTF-8"), data) 56 | storage.delete(name2) 57 | testing.assert_( 58 | ("moved-%s" % file_name) not in storage.list_contents("%s/" % bucket)) 59 | -------------------------------------------------------------------------------- /tests/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import tempfile 4 | import time 5 | import logging 6 | 7 | import kubeface 8 | import kubeface.bucket_storage 9 | 10 | 11 | logging.basicConfig(level=logging.DEBUG) 12 | 13 | KEEP_FILES = os.environ.get("KUBEFACE_TEST_KEEP_FILES") 14 | 15 | 16 | def wipe_bucket(bucket_url): 17 | objects = kubeface.bucket_storage.list_contents(bucket_url) 18 | for obj in objects: 19 | kubeface.bucket_storage.delete(bucket_url + "/" + obj) 20 | 21 | 22 | def check_empty(bucket_url): 23 | assert not kubeface.bucket_storage.list_contents(bucket_url) 24 | 25 | 26 | def with_bucket_storage(function): 27 | bucket = 
os.environ.get("KUBEFACE_STORAGE") 28 | if not bucket: 29 | logging.fatal("No bucket defined") 30 | 31 | def test_function(): 32 | # check_empty("gs://" + bucket) 33 | wipe_bucket("gs://" + bucket) 34 | function("gs://" + bucket) 35 | wipe_bucket("gs://" + bucket) 36 | return test_function 37 | 38 | 39 | def with_local_storage(function): 40 | def test_function(): 41 | tempdir = tempfile.mkdtemp(dir='/tmp') 42 | function(tempdir) 43 | if not KEEP_FILES: 44 | shutil.rmtree(tempdir) 45 | return test_function 46 | 47 | 48 | def with_local_and_bucket_storage(function): 49 | bucket = os.environ.get("KUBEFACE_STORAGE") 50 | if not bucket: 51 | logging.warning( 52 | "Set KUBEFACE_STORAGE to run test: %s" % str(function)) 53 | return with_local_storage(function) 54 | 55 | def test_function(): 56 | with_local_storage(function)() 57 | with_bucket_storage(function)() 58 | return test_function 59 | --------------------------------------------------------------------------------