├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── README.md ├── design.md ├── example.py ├── kubeface ├── __init__.py ├── backend.py ├── backends.py ├── bucket_storage.py ├── client.py ├── commands │ ├── __init__.py │ ├── copy.py │ ├── job.py │ ├── run.py │ └── run_task.py ├── common.py ├── context.py ├── job.py ├── kubernetes_backend.py ├── local_process_backend.py ├── local_process_docker_backend.py ├── naming.py ├── remote_object.py ├── result.py ├── serialization.py ├── status_writer.py ├── storage.py ├── stringable.py ├── task.py └── worker_configuration.py ├── remote_object_example.py ├── setup.py └── tests ├── __init__.py ├── test_client.py ├── test_job_command.py ├── test_naming.py ├── test_storage.py └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | language: python 3 | services: 4 | - docker 5 | python: 6 | - "3.6.1" 7 | install: 8 | - pip install -e . 
9 | script: 10 | - nosetests 11 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:cudnn-runtime 2 | 3 | MAINTAINER Tim O'Donnell 4 | 5 | RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections && \ 6 | apt-get clean && \ 7 | apt-get update && \ 8 | apt-get install --yes \ 9 | gfortran \ 10 | git \ 11 | libatlas-base-dev \ 12 | libatlas3gf-base \ 13 | libblas-dev \ 14 | libfreetype6-dev \ 15 | libhdf5-serial-dev \ 16 | liblapack-dev \ 17 | libpng12-dev \ 18 | libxml2-dev \ 19 | libxslt1-dev \ 20 | libyaml-dev \ 21 | libzmq3-dev \ 22 | pkg-config \ 23 | python-virtualenv \ 24 | python3-dev \ 25 | python-dev && \ 26 | apt-get clean && \ 27 | useradd --create-home --home-dir /home/user --shell /bin/bash -G sudo user && \ 28 | echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers 29 | 30 | RUN locale-gen en_US.UTF-8 31 | ENV LANG en_US.UTF-8 32 | ENV LANGUAGE en_US:en 33 | ENV LC_ALL en_US.UTF-8 34 | 35 | USER user 36 | ENV HOME=/home/user 37 | ENV SHELL=/bin/bash 38 | ENV USER=user 39 | WORKDIR /home/user 40 | 41 | # Setup virtual envs and install convenience packages. Note: installing 42 | RUN virtualenv venv-py3 --python=python3 && \ 43 | venv-py3/bin/pip install --upgrade pip && \ 44 | venv-py3/bin/pip install --upgrade \ 45 | numpy \ 46 | bokeh \ 47 | cherrypy \ 48 | jupyter \ 49 | lxml \ 50 | scipy \ 51 | scikit-learn \ 52 | dill \ 53 | seaborn 54 | 55 | ENV PATH /home/user/venv-py3/bin:$PATH 56 | COPY . ./kubeface 57 | RUN venv-py3/bin/pip install ./kubeface 58 | 59 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | kubeface 2 | [![Build Status](https://travis-ci.org/hammerlab/kubeface.svg?branch=master)](https://travis-ci.org/hammerlab/kubeface) 3 | ======== 4 | 5 | Python library for parallel maps running directly on Kubernetes. Intended for running many expensive tasks (minutes in runtime). Alpha stage. Currently supports only Google Cloud. 
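A minimal script looks like this (condensed from [example.py](example.py); the `--kubeface-*` flags it picks up choose the backend and storage location):

```py
import argparse
import kubeface

parser = argparse.ArgumentParser()
kubeface.Client.add_args(parser)  # adds the --kubeface-* options
args = parser.parse_args()

client = kubeface.Client.from_args(args)
for result in client.map(lambda x: x**2, range(10)):
    print(result)
```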
6 | 7 | Overview 8 | ======== 9 | 10 | Kubeface aims for reasonably efficient execution of many long running Python tasks with medium sized (up to a few gigabytes) inputs and outputs. Design choices and assumptions: 11 | 12 | * Each task runs in its own bare kubernetes pod. There is no state shared between tasks 13 | * All communication is through Google Storage Buckets 14 | * Each task's input and output must fit in memory, but we do not assume that more than one task's data fits simultaneously 15 | * Work performed as part of jobs that crash can be re-used for reruns 16 | * We favor debuggability over performance 17 | 18 | The primary motivating application has been neural network model selection for the [MHCflurry](https://github.com/hammerlab/mhcflurry) project. 19 | 20 | See [example.py](example.py) for a simple working example. 21 | 22 | Nomenclature 23 | ------------- 24 | 25 | * **Master:** the Python process the user launches. It uses kubeface to run *jobs* 26 | * **Worker:** a process running external to the master (probably on a cluster) that executes a *task* 27 | * **Job:** Each call to `client.map(...)` creates a *job* 28 | * **Task:** Each invocation of the function given to map is a *task* 29 | 30 | Backends 31 | ------------- 32 | 33 | * The *kubernetes* backend runs tasks on Kubernetes. This is what is used in production 34 | * The *local-process* backend runs tasks as local processes. Useful for development and testing of both kubeface and code that uses it 35 | * The *local-process-docker* backend runs tasks as local processes in a docker container. This is used for testing kubeface 36 | 37 | 38 | Life of a job 39 | ------------- 40 | 41 | If a user calls (where `client` is a [kubeface.Client](kubeface/client.py) instance): 42 | 43 | ```py 44 | client.map(lambda x: x**2, range(10)) 45 | ``` 46 | 47 | This creates a *job* containing 10 *tasks*. The return value is a generator that will yield the square of the numbers 0-9. The job is executed as follows: 48 | 49 | * Submission: for each task: 50 | * an input file containing a pickled (we use the [dill](https://github.com/uqfoundation/dill) library) representation of the task's input is uploaded to cloud storage. In this example the input data is a number 0-9. 51 | * A `kubectl` command is issued that creates a bare pod whose entrypoint (i.e. what runs in the pod) installs kubeface if necessary then calls the command `_kubeface-run-task `. 52 | * The `_kubeface-run-task` command downloads the input file from cloud storage, runs the task, and uploads the result to the specified path. 53 | * After all tasks have been submitted, kubeface waits for all results to appear in cloud storage. It may speculatively re-submit some tasks that appear to be straggling or crashed. 54 | * Once all results are available, each task’s result is read by the master and yielded to the client code 55 | 56 | 57 | Docker images 58 | ------------- 59 | 60 | Kubeface tasks execute in the context of a particular docker image, since they run in a kubernetes pod. You can use any docker image with python installed. If your docker image does not have kubeface installed, then by default kubeface will try to install itself using `pip`. This is inefficient since it will run for every task. If you plan on running many tasks it's a good idea to create your own docker image with kubeface installed. 61 | 62 | Inspecting job status 63 | ---------------------- 64 | Kubeface writes out HTML and JSON status pages to cloud storage and logs to stdout. 
However, the best way to figure out what's going on with your job is to use Kubernetes directly, via `kubectl get pods` and `kubectl logs <pod name>`. 65 | 66 | 67 | Installation 68 | ============ 69 | 70 | From a checkout: 71 | 72 | pip install -e . 73 | 74 | To run the tests: 75 | 76 | # Setting this environment variable is optional. 77 | # If you set it, the tests will run against a real Google Storage bucket. 78 | # See https://developers.google.com/identity/protocols/application-default-credentials#howtheywork; 79 | # you need to get Application Default Credentials before writing to your bucket. 80 | KUBEFACE_STORAGE=gs://kubeface-test # tests will write to gs://kubeface-test. 81 | 82 | # Run tests: 83 | nosetests 84 | 85 | Shell Example 86 | ============= 87 | 88 | The `kubeface-run` command runs a job from the shell, which is useful for testing or simple tasks. 89 | 90 | If you don’t already have a Kubernetes cluster running, use a command like this to start one: 91 | 92 | gcloud config set compute/zone us-east1-c 93 | gcloud components install kubectl # if you haven't already installed kubectl 94 | gcloud container clusters create kubeface-cluster-$(whoami) \ 95 | --scopes storage-full \ 96 | --zone us-east1-c \ 97 | --num-nodes=2 \ 98 | --enable-autoscaling --min-nodes=1 --max-nodes=100 \ 99 | --machine-type=n1-standard-16 100 | 101 | You should see your cluster listed in the Google Cloud console. 102 | 103 | Then run this to set it as the default for your session: 104 | 105 | gcloud config set container/cluster kubeface-cluster-$(whoami) 106 | gcloud container clusters get-credentials kubeface-cluster-$(whoami) 107 | 108 | Now launch a command: 109 | 110 | kubeface-run \ 111 | --expression 'value**2' \ 112 | --generator-expression 'range(10)' \ 113 | --kubeface-max-simultaneous-tasks 10 \ 114 | --kubeface-backend kubernetes \ 115 | --kubeface-worker-image continuumio/anaconda3 \ 116 | --kubeface-kubernetes-task-resources-cpu 1 \ 117 | --kubeface-kubernetes-task-resources-memory-mb 500 \ 118 | --verbose \ 119 | --out-csv /tmp/result.csv 120 | 121 | If you kill the above command, you can run this to kill all the running pods in your cluster: 122 | 123 | kubectl delete pods --all 124 | 125 | When you’re done working, delete your cluster: 126 | 127 | gcloud container clusters delete kubeface-cluster-$(whoami) 128 | 129 | -------------------------------------------------------------------------------- /design.md: -------------------------------------------------------------------------------- 1 | # Design 2 | 3 | ## Motivation 4 | 5 | We would like to run fairly long-running Python tasks over Kubernetes on Google Cloud. 6 | 7 | Two applications we need this for are MHCflurry model selection and data preparation for antigen presentation predictors, where we would like to run some analyses over the full peptidome. 8 | 9 | We have previously experimented with an approach based on running [dask-distributed](https://github.com/dask/distributed) on Kubernetes as described [here](https://github.com/hammerlab/dask-distributed-on-kubernetes). 10 | 11 | However, having long-running server processes as in dask distributed has resulted in reliability issues for us. When results are large the distributed scheduler seems to slow down or crash. Since we don't care about latency, I think it would be less error-prone to run each task in its own Kubernetes job and use Google Buckets to shuffle data around. 12 | 13 | Our MHCflurry code can use any parallel map implementation, see e.g.
[here](https://github.com/hammerlab/mhcflurry/blob/master/mhcflurry/class1_allele_specific/train.py#L308). We should be able to make a library that plugs in there without any significant modification to MHCflurry. 14 | 15 | Design parameters: 16 | * There's a master process, which the user launches. It calls a parallel map implementation to do work on the cluster. 17 | * Tasks are independent, do not communicate 18 | * Long running tasks, say 5 min - 5 hours. 19 | * Many tasks: as many as 10k. 20 | * Significant data exchange. Input and result to *each task* may be as high as 1 GB. Full input dataset to all tasks does not fit in memory on any node. Full result set across tasks 21 | also does not fit. Input and result from any single task fits on all nodes. 22 | * No attempt at recovery if a task throws a Python exception. Kill the whole run. 23 | 24 | The main goal here is simplicity and reliability. We do not care at all about latency; fine if it takes 5 minutes for Kubernetes to launch a task. We want to push all tricky issues, in particular scheduling of tasks and recovery of failed nodes, onto Kubernetes. We should never have two Python processes talking directly to each other. We should only interact with Kubernetes and Google Storage Buckets. 25 | 26 | ## Interface 27 | 28 | This project should expose a library that implements a parallel map, e.g. 29 | 30 | ```python 31 | def parallel_map(func, iterable): 32 | """ 33 | Parallel map. Each invocation of func is run in its own kubernetes Job. 34 | 35 | Returns (func(x) for x in iterable) 36 | """ 37 | ``` 38 | 39 | There is some configuration that is shared across invocations of parallel_map, so it makes sense to put this in a class and then have parallel_map as a method of it, e.g. 40 | 41 | ```python 42 | class Client(object): 43 | def __init__( 44 | self, 45 | image, 46 | bucket, 47 | image_pull_policy="Always", 48 | cluster=None, 49 | available_parallelism=None, 50 | python_path='/usr/bin/env python', 51 | run_locally=False): 52 | """ 53 | Create a client for running tasks on Kubernetes. 54 | 55 | Parameters 56 | -------------- 57 | 58 | image : string 59 | Docker image to use (on docker hub) 60 | 61 | image_pull_policy : string, optional 62 | Kubernetes imagePullPolicy setting. See [1] 63 | 64 | cluster : string 65 | Kubernetes cluster to schedule jobs on 66 | 67 | available_parallelism : int 68 | If specified, max number of jobs to schedule on Kubernetes at once 69 | 70 | python_path : string 71 | Path to Python binary in the image 72 | 73 | run_locally: boolean 74 | Run tasks in the current process. Useful for testing 75 | 76 | [1] http://kubernetes.io/docs/user-guide/images/ 77 | """ 78 | 79 | def parallel_map(self, func, iterable): 80 | ... 81 | ``` 82 | 83 | 84 | ## Implementation 85 | 86 | Possible first-pass implementation. For each task (this is running on the master node): 87 | 88 | * Serialize the function to run and its input (using e.g. [dill](https://github.com/uqfoundation/dill)) 89 | * Copy serialized data to a Google Bucket, give the file a unique name. 90 | * Schedule a Kubernetes job that runs a Python script that downloads the serialized data from Google Bucket, deserializes it, runs the function on the data, and copies the serialized result to a unique filename on the Google Bucket 91 | 92 | Then the master node would poll for the results on Google Bucket, and perhaps issue Kubernetes commands to watch what's been scheduled etc. and report the progress to the user.
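For concreteness, a minimal sketch of the per-task submission step using the command-line tools might look like the following (the names `submit_task` and `run_task.py`, and the bucket layout, are illustrative assumptions, not a finished API; it requires `dill`, `gsutil`, and `kubectl` on the PATH):

```python
# Sketch of the per-task flow described above: serialize the function and its
# input with dill, copy the file to a bucket, and launch a bare pod that runs
# the task and uploads its result.
import subprocess
import tempfile
import uuid

import dill


def submit_task(func, arg, bucket, image, python_path="python"):
    """Serialize (func, arg), upload it to the bucket, and launch a bare pod."""
    task_id = uuid.uuid4().hex
    input_url = "gs://%s/inputs/%s.pkl" % (bucket, task_id)
    result_url = "gs://%s/results/%s.pkl" % (bucket, task_id)

    # Serialize the function and its input, then copy the file to the bucket.
    with tempfile.NamedTemporaryFile(suffix=".pkl") as fd:
        dill.dump((func, arg), fd)
        fd.flush()
        subprocess.check_call(["gsutil", "cp", fd.name, input_url])

    # Schedule a pod whose entrypoint downloads the input, runs the function,
    # and uploads the serialized result. run_task.py is assumed to be baked
    # into the worker image.
    subprocess.check_call([
        "kubectl", "run", "task-%s" % task_id,
        "--image=%s" % image,
        "--restart=Never",
        "--command", "--",
        python_path, "run_task.py", input_url, result_url,
    ])
    return result_url
```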
93 | 94 | We can either issue kubernetes and gsutil commandline calls directly or interact with them through their REST APIs using a project like [pykube](https://github.com/kelproject/pykube). 95 | 96 | Kubernetes [secrets](http://kubernetes.io/docs/user-guide/secrets/) may be an alternative approach to sending each task its input data. 97 | 98 | 99 | ## Unknowns 100 | 101 | * Is Google Bucket going to hold up to having tons of tasks hitting it with downloads and uploads? Is it fast enough? 102 | * Is Kubernetes stable enough? 103 | * How can we test this library without actually using Google Cloud? [Kubernetes on vagrant](https://coreos.com/kubernetes/docs/latest/kubernetes-on-vagrant-single.html) may be relevant here. Not sure what to do about Google Bucket dependency. 104 | 105 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Kubeface simple example. 3 | 4 | Computes the square of numbers 1 .. N, where N is specified on the commandline. 5 | 6 | Example: 7 | 8 | $ python example.py 10 --kubeface-backend local-process --kubeface-storage /tmp 9 | 10 | """ 11 | 12 | import argparse 13 | import sys 14 | 15 | import kubeface 16 | 17 | parser = argparse.ArgumentParser(usage=__doc__) 18 | parser.add_argument("n", type=int) 19 | kubeface.Client.add_args(parser) # Add kubeface arguments 20 | 21 | 22 | def my_function(x): 23 | return x**2 24 | 25 | 26 | def main(argv): 27 | args = parser.parse_args(argv) 28 | client = kubeface.Client.from_args(args) 29 | 30 | input_values = range(1, args.n + 1) 31 | results = client.map(my_function, input_values) 32 | 33 | for (x, result) in zip(input_values, results): 34 | print("%5d**2 = %5d" % (x, result)) 35 | 36 | 37 | if __name__ == '__main__': 38 | main(sys.argv[1:]) 39 | -------------------------------------------------------------------------------- /kubeface/__init__.py: -------------------------------------------------------------------------------- 1 | from .client import Client 2 | from .local_process_backend import LocalProcessBackend 3 | from .local_process_docker_backend import LocalProcessDockerBackend 4 | from .kubernetes_backend import KubernetesBackend 5 | from .worker_configuration import WorkerConfiguration 6 | 7 | 8 | __all__ = [ 9 | "Client", 10 | "LocalProcessBackend", 11 | "LocalProcessDockerBackend", 12 | "KubernetesBackend", 13 | "WorkerConfiguration", 14 | ] 15 | -------------------------------------------------------------------------------- /kubeface/backend.py: -------------------------------------------------------------------------------- 1 | class Backend(object): 2 | def submit_task(self, task_input, task_output): 3 | raise NotImplementedError 4 | 5 | def supports_storage(self, path_or_url): 6 | return True 7 | -------------------------------------------------------------------------------- /kubeface/backends.py: -------------------------------------------------------------------------------- 1 | import collections 2 | from . import local_process_backend 3 | from . import local_process_docker_backend 4 | from . 
import kubernetes_backend 5 | 6 | BACKENDS = collections.OrderedDict([ 7 | ('local-process', local_process_backend.LocalProcessBackend), 8 | ('local-process-docker', 9 | local_process_docker_backend.LocalProcessDockerBackend), 10 | ('kubernetes', kubernetes_backend.KubernetesBackend), 11 | ]) 12 | 13 | 14 | def add_args(parser): 15 | parser.add_argument( 16 | "--kubeface-backend", 17 | choices=tuple(BACKENDS), 18 | default=tuple(BACKENDS)[0]) 19 | 20 | for (backend, klass) in BACKENDS.items(): 21 | klass.add_args(parser) 22 | return parser 23 | 24 | 25 | def backend_from_args(args): 26 | return BACKENDS[args.kubeface_backend].from_args(args) 27 | -------------------------------------------------------------------------------- /kubeface/bucket_storage.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import tempfile 3 | import time 4 | 5 | from googleapiclient import discovery 6 | from googleapiclient import http 7 | 8 | from oauth2client.client import GoogleCredentials 9 | 10 | # Some of this is copied from: 11 | # https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/storage/api/crud_object.py 12 | # and: 13 | # https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/storage/api/list_objects.py 14 | 15 | RETRIES_BEFORE_FAILURE = 12 16 | FIRST_RETRY_SLEEP = 2.0 17 | _SERVICE = None 18 | 19 | 20 | def get_service(): 21 | global _SERVICE 22 | if _SERVICE is None: 23 | _SERVICE = create_service() 24 | return _SERVICE 25 | 26 | 27 | def create_service(): 28 | # Get the application default credentials. When running locally, these are 29 | # available after running `gcloud init`. When running on compute 30 | # engine, these are available from the environment. 31 | credentials = GoogleCredentials.get_application_default() 32 | 33 | # Construct the service object for interacting with the Cloud Storage API - 34 | # the 'storage' service, at version 'v1'. 35 | # You can browse other available api services and versions here: 36 | # http://g.co/dev/api-client-library/python/apis/ 37 | return discovery.build('storage', 'v1', credentials=credentials) 38 | 39 | 40 | def robustify(function): 41 | def robust_function(*args, **kwargs): 42 | error_num = 0 43 | while True: 44 | try: 45 | return function(*args, **kwargs) 46 | except Exception as e: 47 | error_num += 1 48 | logging.warning( 49 | "Exception calling %s: '%s'. " 50 | "This call has failed %d times. Will retry up to " 51 | "%d times." % ( 52 | str(function), 53 | str(e), 54 | error_num, 55 | RETRIES_BEFORE_FAILURE)) 56 | 57 | if error_num > RETRIES_BEFORE_FAILURE: 58 | raise 59 | 60 | sleep_time = FIRST_RETRY_SLEEP**error_num 61 | logging.warn("Sleeping for %0.2f seconds." % sleep_time) 62 | time.sleep(sleep_time) 63 | return robust_function 64 | 65 | 66 | def split_bucket_and_name(url): 67 | if not url.startswith("gs://"): 68 | raise ValueError("Not a gs:// url: %s" % url) 69 | return url[len("gs://"):].split("/", 1) 70 | 71 | 72 | @robustify 73 | def list_contents(prefix): 74 | splitted = split_bucket_and_name(prefix) 75 | if len(splitted) == 1: 76 | (bucket_name, file_name_prefix) = (splitted[0], "") 77 | else: 78 | (bucket_name, file_name_prefix) = splitted 79 | 80 | # Create a request to objects.list to retrieve a list of objects. 
81 | fields_to_return = \ 82 | 'nextPageToken,items(name)' 83 | req = get_service().objects().list( 84 | bucket=bucket_name, 85 | prefix=file_name_prefix, 86 | maxResults=100000, 87 | fields=fields_to_return) 88 | 89 | all_objects = [] 90 | # If you have too many items to list in one request, list_next() will 91 | # automatically handle paging with the pageToken. 92 | while req: 93 | resp = req.execute() 94 | all_objects.extend(resp.get('items', [])) 95 | req = get_service().objects().list_next(req, resp) 96 | return [item['name'] for item in all_objects] 97 | 98 | 99 | @robustify 100 | def move(source, dest): 101 | # From https://cloud.google.com/storage/docs/json_api/v1/objects/rewrite 102 | (bucket_name, source_object) = split_bucket_and_name(source) 103 | (bucket_name2, dest_object) = split_bucket_and_name(dest) 104 | service = get_service() 105 | 106 | request = service.objects().rewrite( 107 | sourceBucket=bucket_name, 108 | sourceObject=source_object, 109 | destinationBucket=bucket_name, 110 | destinationObject=dest_object, 111 | body={}) 112 | request.execute() 113 | 114 | # Delete source. 115 | request = service.objects().delete( 116 | bucket=bucket_name, 117 | object=source_object) 118 | request.execute() 119 | 120 | 121 | @robustify 122 | def put( 123 | name, 124 | input_handle, 125 | readers=[], 126 | owners=[], 127 | mime_type='application/octet-stream'): 128 | input_handle.seek(0) 129 | (bucket_name, file_name) = split_bucket_and_name(name) 130 | 131 | # This is the request body as specified: 132 | # http://g.co/cloud/storage/docs/json_api/v1/objects/insert#request 133 | body = { 134 | 'name': file_name, 135 | } 136 | 137 | # If specified, create the access control objects and add them to the 138 | # request body 139 | if readers or owners: 140 | body['acl'] = [] 141 | 142 | for r in readers: 143 | body['acl'].append({ 144 | 'entity': 'user-%s' % r, 145 | 'role': 'READER', 146 | 'email': r 147 | }) 148 | for o in owners: 149 | body['acl'].append({ 150 | 'entity': 'user-%s' % o, 151 | 'role': 'OWNER', 152 | 'email': o 153 | }) 154 | 155 | # Now insert them into the specified bucket as a media insertion. 156 | req = get_service().objects().insert( 157 | bucket=bucket_name, 158 | body=body, 159 | # You can also just set media_body=filename, but # for the sake of 160 | # demonstration, pass in the more generic file handle, which could 161 | # very well be a StringIO or similar. 
162 | media_body=http.MediaIoBaseUpload(input_handle, mime_type)) 163 | resp = req.execute() 164 | 165 | return resp 166 | 167 | 168 | @robustify 169 | def get(name, output_handle=None): 170 | (bucket_name, file_name) = split_bucket_and_name(name) 171 | 172 | if output_handle is None: 173 | output_handle = tempfile.TemporaryFile( 174 | prefix="kubeface-bucket-storage-", 175 | suffix=".data") 176 | 177 | # Use get_media instead of get to get the actual contents of the object 178 | req = get_service().objects().get_media( 179 | bucket=bucket_name, 180 | object=file_name) 181 | downloader = http.MediaIoBaseDownload(output_handle, req) 182 | 183 | done = False 184 | while done is False: 185 | (status, done) = downloader.next_chunk() 186 | logging.debug("Download {}%.".format(int(status.progress() * 100))) 187 | output_handle.seek(0) 188 | return output_handle 189 | 190 | 191 | @robustify 192 | def delete(name): 193 | (bucket_name, file_name) = split_bucket_and_name(name) 194 | req = get_service().objects().delete(bucket=bucket_name, object=file_name) 195 | return req.execute() 196 | 197 | 198 | def access_info(name): 199 | (bucket_name, file_name) = split_bucket_and_name(name) 200 | return ( 201 | "https://storage.cloud.google.com/" 202 | "{bucket_name}/{file_name}\t[ {name} ]".format( 203 | bucket_name=bucket_name, 204 | file_name=file_name, 205 | name=name)) 206 | -------------------------------------------------------------------------------- /kubeface/client.py: -------------------------------------------------------------------------------- 1 | import math 2 | import logging 3 | import os 4 | 5 | from .remote_object import RemoteObject 6 | from .job import Job 7 | from .task import Task 8 | from . import ( 9 | backends, 10 | worker_configuration, 11 | naming, 12 | context, 13 | storage) 14 | 15 | 16 | def run_multiple(function, values): 17 | return [function(v) for v in values] 18 | 19 | 20 | class Client(object): 21 | """ 22 | User interface to Kubeface. 23 | """ 24 | 25 | @staticmethod 26 | def add_args(parser): 27 | """ 28 | Add commandline arguments to argument parser. 29 | 30 | Parameters 31 | ---------- 32 | parser : argparse.ArgumentParser 33 | """ 34 | group = parser.add_argument_group("kubeface client") 35 | group.add_argument( 36 | "--kubeface-max-simultaneous-tasks", 37 | type=int, 38 | default=10) 39 | group.add_argument( 40 | "--kubeface-poll-seconds", 41 | type=float, 42 | default=30.0) 43 | group.add_argument( 44 | "--kubeface-storage", 45 | default=os.environ.get("KUBEFACE_STORAGE", "gs://kubeface"), 46 | help="Default: %(default)s") 47 | group.add_argument( 48 | "--kubeface-cache-key-prefix") 49 | group.add_argument( 50 | "--kubeface-never-cleanup", 51 | action="store_true", 52 | default=False) 53 | group.add_argument( 54 | "--kubeface-wait-to-raise-task-exception", 55 | action="store_true", 56 | default=False) 57 | group.add_argument( 58 | "--kubeface-speculation-percent", 59 | type=float, 60 | default=20) 61 | group.add_argument( 62 | "--kubeface-speculation-runtime-percentile", 63 | type=float, 64 | default=99) 65 | group.add_argument( 66 | "--kubeface-speculation-max-reruns", 67 | type=int, 68 | default=3) 69 | 70 | worker_configuration.WorkerConfiguration.add_args(group) 71 | backends.add_args(group) 72 | 73 | @staticmethod 74 | def from_args(args): 75 | """ 76 | Instantiate a Client from commandline args. 
77 | 78 | Parameters 79 | ---------- 80 | args : argparse.Namespace 81 | 82 | Returns 83 | ------- 84 | Client 85 | 86 | """ 87 | backend = backends.backend_from_args(args) 88 | if not backend.supports_storage(args.kubeface_storage): 89 | raise ValueError( 90 | "Backend '%s' does not support storage: %s" % ( 91 | args.kubeface_backend, args.kubeface_storage)) 92 | return Client( 93 | backend, 94 | max_simultaneous_tasks=args.kubeface_max_simultaneous_tasks, 95 | poll_seconds=args.kubeface_poll_seconds, 96 | storage=args.kubeface_storage, 97 | cache_key_prefix=args.kubeface_cache_key_prefix, 98 | never_cleanup=args.kubeface_never_cleanup, 99 | wait_to_raise_task_exception=( 100 | args.kubeface_wait_to_raise_task_exception), 101 | speculation_percent=args.kubeface_speculation_percent, 102 | speculation_runtime_percentile=( 103 | args.kubeface_speculation_runtime_percentile), 104 | speculation_max_reruns=args.kubeface_speculation_max_reruns) 105 | 106 | def __init__( 107 | self, 108 | backend, 109 | max_simultaneous_tasks=10, 110 | poll_seconds=30.0, 111 | storage="gs://kubeface", 112 | cache_key_prefix=None, 113 | never_cleanup=False, 114 | wait_to_raise_task_exception=False, 115 | speculation_percent=0, 116 | speculation_runtime_percentile=99, 117 | speculation_max_reruns=1): 118 | """ 119 | Parameters 120 | ---------- 121 | backend : kubeface.Backend 122 | 123 | max_simultaneous_tasks : int 124 | Maximum number of tasks to submit at once. 125 | 126 | poll_seconds : float 127 | How often to poll for task results 128 | 129 | storage : str 130 | Bucket or (for local file process backend) local filesystem path to 131 | write task inputs and outputs. 132 | 133 | cache_key_prefix : str 134 | If you set this to the same value in multiple clients, they will 135 | reuse each other's results. Advanced use only. 136 | 137 | never_cleanup : boolean 138 | Do not cleanup after successful tasks. 139 | 140 | wait_to_raise_task_exception : boolean 141 | If True, all tasks are run before any failing task's exception is 142 | raised. If False, then the exception is raised as soon as it is 143 | received. 144 | 145 | speculation_percent : float 146 | No speculation occurs until all tasks have been submitted and at 147 | least 100 - speculation_percent tasks have completed. So if you set 148 | this to 20 then the last 20% of tasks will be considered for 149 | speculatively rerunning. 150 | 151 | speculation_runtime_percentile : float 152 | A task will be rerun when its queue time exceeds 153 | speculation_runtime_percentile of the queue times of the tasks that 154 | completed successfully without speculation. 155 | 156 | speculation_max_reruns : int 157 | Tasks can be rerun up to speculation_max_reruns times. 
158 | """ 159 | 160 | self.backend = backend 161 | self.max_simultaneous_tasks = max_simultaneous_tasks 162 | self.poll_seconds = poll_seconds 163 | self.storage = storage 164 | self.cache_key_prefix = ( 165 | cache_key_prefix if cache_key_prefix 166 | else naming.make_cache_key_prefix()) 167 | self.never_cleanup = never_cleanup 168 | self.wait_to_raise_task_exception = wait_to_raise_task_exception 169 | self.speculation_percent = speculation_percent 170 | self.speculation_runtime_percentile = speculation_runtime_percentile 171 | self.speculation_max_reruns = speculation_max_reruns 172 | 173 | self.submitted_jobs = [] 174 | self.next_object_num = 1 175 | 176 | def __getstate__(self): 177 | # Don't serialize jobs 178 | d = dict(self.__dict__) 179 | d['submitted_jobs'] = [] 180 | return d 181 | 182 | def next_cache_key(self): 183 | return "%s-%03d" % ( 184 | self.cache_key_prefix, 185 | len(self.submitted_jobs)) 186 | 187 | def submit(self, tasks, num_tasks=None, cache_key=None): 188 | """ 189 | Run a Job. 190 | 191 | Parameters 192 | ---------- 193 | tasks : iterable of kubeface.Task 194 | 195 | num_tasks : int 196 | If tasks has no len(...), for example in the case of a generator, 197 | if you specify num_tasks then your progress output will use that 198 | number of tasks. 199 | 200 | cache_key : str 201 | Advanced use only for reusing pre-existing results. 202 | 203 | Returns 204 | ------- 205 | kubeface.Job 206 | 207 | """ 208 | if num_tasks is None: 209 | try: 210 | num_tasks = len(tasks) 211 | except TypeError: 212 | pass 213 | job = Job( 214 | self.backend, 215 | tasks, 216 | num_tasks=num_tasks, 217 | cache_key=cache_key if cache_key else self.next_cache_key(), 218 | max_simultaneous_tasks=self.max_simultaneous_tasks, 219 | storage=self.storage, 220 | wait_to_raise_task_exception=self.wait_to_raise_task_exception, 221 | speculation_percent=self.speculation_percent, 222 | speculation_runtime_percentile=self.speculation_runtime_percentile, 223 | speculation_max_reruns=self.speculation_max_reruns) 224 | self.submitted_jobs.append(job) 225 | return job 226 | 227 | def map( 228 | self, 229 | function, 230 | iterable, 231 | items_per_task=1, 232 | num_items=None, 233 | cache_key=None): 234 | """ 235 | Parallel map. This is the primary user-facing API. 236 | 237 | Parameters 238 | ---------- 239 | function : callable 240 | Python function to run over each item 241 | 242 | iterable : iterable of object 243 | items to pass to function 244 | 245 | items_per_task : int 246 | If items_per_task is 1 then each item to map over gets its own task. 247 | If it's 10 then the first 10 items are one task, the next 10 are 248 | another, etc. 249 | 250 | num_items : int 251 | If the iterable provided has no len(...) then setting num_items 252 | will give better progress output. Not required in any case though. 253 | 254 | cache_key : str 255 | Advanced use only for reusing pre-existing results. 
256 | 257 | Returns 258 | ------- 259 | generator of task results, in order 260 | 261 | """ 262 | def grouped(): 263 | iterator = iter(iterable) 264 | while True: 265 | items = [] 266 | try: 267 | while len(items) < items_per_task: 268 | items.append(next(iterator)) 269 | except StopIteration: 270 | pass 271 | if items: 272 | yield items 273 | else: 274 | break 275 | 276 | num_tasks = None 277 | if num_items is None: 278 | try: 279 | num_items = len(iterable) 280 | num_tasks = int(math.ceil(float(num_items) / items_per_task)) 281 | except TypeError: 282 | pass 283 | 284 | tasks = ( 285 | Task(run_multiple, (function, values)) for values in grouped()) 286 | job = self.submit(tasks, num_tasks=num_tasks, cache_key=cache_key) 287 | try: 288 | job.wait(poll_seconds=self.poll_seconds) 289 | for result in job.results(): 290 | result.log() 291 | result.raise_if_exception() 292 | for result_item in result.return_value: 293 | yield result_item 294 | finally: 295 | self.mark_jobs_done(job_names=[job.job_name]) 296 | 297 | def mark_jobs_done(self, job_names=None): 298 | status_pages = set() 299 | status_prefixes = naming.status_prefixes(job_names=job_names) 300 | for prefix in status_prefixes: 301 | status_pages.update(storage.list_contents( 302 | self.storage + "/" + prefix)) 303 | for source_object in status_pages: 304 | parsed = naming.JOB_STATUS_PAGE.make_tuple(source_object) 305 | if parsed.status == 'active': 306 | new_parsed = parsed._replace(status="done") 307 | dest_object = naming.JOB_STATUS_PAGE.make_string(new_parsed) 308 | logging.info("Marking job '%s' done: renaming %s -> %s" % ( 309 | parsed.job_name, 310 | source_object, 311 | dest_object)) 312 | storage.move( 313 | self.storage + "/" + source_object, 314 | self.storage + "/" + dest_object) 315 | else: 316 | logging.info("Already marked done: %s" % source_object) 317 | 318 | def cleanup_job(self, job_name): 319 | cache_key = naming.JOB.make_tuple(job_name).cache_key 320 | results = storage.list_contents( 321 | self.storage + 322 | "/" + 323 | naming.task_result_prefix(cache_key)) 324 | inputs = storage.list_contents( 325 | self.storage + 326 | "/" + 327 | naming.task_input_prefix(cache_key)) 328 | logging.info("Cleaning up cache key '%s': %d results, %d inputs." 
% ( 329 | cache_key, len(results), len(inputs))) 330 | 331 | for item in results + inputs: 332 | storage.delete(self.storage + "/" + item) 333 | 334 | self.mark_jobs_done(job_names=[job_name]) 335 | 336 | def job_summary(self, job_names=None, include_done=False): 337 | prefixes = naming.status_prefixes( 338 | job_names=job_names, 339 | formats=["json"], 340 | statuses=(["active"] + (["done"] if include_done else []))) 341 | all_objects = [] 342 | for prefix in prefixes: 343 | all_objects.extend( 344 | storage.list_contents( 345 | self.storage + "/" + prefix)) 346 | logging.debug("Listed %d status pages from prefixes: %s" % ( 347 | len(all_objects), " ".join(prefixes))) 348 | return [ 349 | naming.JOB_STATUS_PAGE.make_tuple(obj) 350 | for obj in sorted(all_objects) 351 | ] 352 | 353 | def cleanup(self): 354 | if self.never_cleanup: 355 | logging.warn("Cleanup disabled; skipping.") 356 | else: 357 | for job in self.submitted_jobs: 358 | logging.info("Cleaning up for job: %s" % job.job_name) 359 | self.cleanup_job(job.job_name) 360 | 361 | def remote_object(self, value): 362 | file_path = ( 363 | self.storage + 364 | "/" + 365 | naming.make_remote_object_name( 366 | cache_key_prefix=self.cache_key_prefix, 367 | node_id=context.node_id(), 368 | object_num=self.next_object_num)) 369 | self.next_object_num += 1 370 | return RemoteObject(file_path=file_path, value=value) 371 | -------------------------------------------------------------------------------- /kubeface/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hammerlab/kubeface/443d7432e6d2f8e4d20b6326e98fabeec7ad68b6/kubeface/commands/__init__.py -------------------------------------------------------------------------------- /kubeface/commands/copy.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copy files, including support for google storage buckets. 3 | ''' 4 | 5 | import sys 6 | import argparse 7 | import logging 8 | 9 | from .. import storage 10 | from ..common import configure_logging 11 | from .. import serialization 12 | 13 | parser = argparse.ArgumentParser(description=__doc__) 14 | 15 | parser.add_argument("source") 16 | parser.add_argument("destination") 17 | 18 | parser.add_argument( 19 | "--no-error", 20 | action="store_true", 21 | default=False, 22 | help="") 23 | 24 | parser.add_argument( 25 | "--quiet", 26 | action="store_true", 27 | default=False, 28 | help="") 29 | 30 | parser.add_argument( 31 | "--verbose", 32 | action="store_true", 33 | default=False, 34 | help="") 35 | 36 | parser.add_argument( 37 | "--print-deserialized", 38 | action="store_true", 39 | default=False, 40 | help="") 41 | 42 | 43 | def run(argv=sys.argv[1:]): 44 | args = parser.parse_args(argv) 45 | configure_logging(args) 46 | 47 | logging.info("Reading: %s" % args.source) 48 | input_handle = storage.get(args.source) 49 | 50 | if args.print_deserialized: 51 | deserialized = serialization.load(input_handle) 52 | input_handle.seek(0) 53 | print(deserialized) 54 | 55 | if args.destination == "-": 56 | print(input_handle.read()) 57 | else: 58 | logging.info("Writing: %s" % args.destination) 59 | storage.put(args.destination, input_handle) 60 | 61 | logging.info("Completed.") 62 | -------------------------------------------------------------------------------- /kubeface/commands/job.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Get info on and manipulate jobs. 
3 | ''' 4 | 5 | import sys 6 | import argparse 7 | import collections 8 | import logging 9 | 10 | from ..client import Client 11 | from ..common import configure_logging 12 | from .. import naming 13 | 14 | parser = argparse.ArgumentParser(description=__doc__) 15 | parser.add_argument("jobs", nargs="*") 16 | parser.add_argument( 17 | "--cleanup", 18 | action="store_true", 19 | default=False) 20 | parser.add_argument( 21 | "--include-done", 22 | action="store_true", 23 | default=False) 24 | 25 | Client.add_args(parser) 26 | 27 | 28 | parser.add_argument( 29 | "--quiet", 30 | action="store_true", 31 | default=False, 32 | help="") 33 | 34 | parser.add_argument( 35 | "--verbose", 36 | action="store_true", 37 | default=False, 38 | help="") 39 | 40 | 41 | def run(argv=sys.argv[1:]): 42 | args = parser.parse_args(argv) 43 | configure_logging(args) 44 | 45 | client = Client.from_args(args) 46 | summary = client.job_summary( 47 | job_names=args.jobs if args.jobs else None, 48 | include_done=args.include_done) 49 | 50 | if not summary: 51 | print("No jobs.") 52 | 53 | jobs_by_cache_key = collections.defaultdict(list) 54 | job_info_by_name = {} 55 | for job_info_tuple in summary: 56 | job_info = job_info_tuple._asdict() 57 | job_name = job_info.pop('job_name') 58 | del job_info['format'] 59 | cache_key = naming.JOB.make_tuple(job_name).cache_key 60 | jobs_by_cache_key[cache_key].append(job_name) 61 | if job_name in job_info_by_name: 62 | logging.warning("Multiple status pages for job: %s: %s %s" % ( 63 | job_name, 64 | job_info['job_status_page_name'], 65 | job_info_by_name[job_name]['job_status_page_name'])) 66 | job_info_by_name[job_name] = job_info 67 | 68 | for cache_key in jobs_by_cache_key: 69 | print("Cache key: %s" % cache_key) 70 | for job_name in jobs_by_cache_key[cache_key]: 71 | info = job_info_by_name[job_name] 72 | print("\t%7s : %s" % (info['status'], job_name)) 73 | print("") 74 | -------------------------------------------------------------------------------- /kubeface/commands/run.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Run a job. 
3 | ''' 4 | 5 | import sys 6 | import argparse 7 | import logging 8 | import subprocess 9 | import numpy 10 | import csv 11 | from functools import partial 12 | 13 | from ..client import Client 14 | from ..common import configure_logging 15 | 16 | EVAL_ENVIRONMENT = { 17 | "numpy": numpy, 18 | } 19 | 20 | parser = argparse.ArgumentParser(description=__doc__) 21 | command_group = parser.add_mutually_exclusive_group() 22 | command_group.add_argument("--shell-command") 23 | command_group.add_argument("--expression") 24 | 25 | parser.add_argument("--generator-expression", required=True) 26 | 27 | parser.add_argument("--out-csv") 28 | 29 | Client.add_args(parser) 30 | 31 | 32 | parser.add_argument( 33 | "--quiet", 34 | action="store_true", 35 | default=False, 36 | help="") 37 | 38 | parser.add_argument( 39 | "--verbose", 40 | action="store_true", 41 | default=False, 42 | help="") 43 | 44 | 45 | def shell_command_task(shell_command, value): 46 | interpolated = shell_command.format(value) 47 | logging.info("Running shell command: %s" % interpolated) 48 | result = subprocess.check_output(interpolated, shell=True) 49 | return (value, result) 50 | 51 | 52 | def expression_task(expression, value): 53 | return (value, eval(expression, EVAL_ENVIRONMENT, {"value": value})) 54 | 55 | 56 | def generator_from_expression(expression): 57 | for value in eval(expression, EVAL_ENVIRONMENT): 58 | yield value 59 | 60 | 61 | def run(argv=sys.argv[1:]): 62 | args = parser.parse_args(argv) 63 | configure_logging(args) 64 | 65 | client = Client.from_args(args) 66 | 67 | if args.shell_command: 68 | task_function = partial(shell_command_task, args.shell_command) 69 | elif args.expression: 70 | task_function = partial(expression_task, args.expression) 71 | else: 72 | parser.error("Must specify --shell-command or --expression") 73 | 74 | if args.generator_expression: 75 | generator = generator_from_expression( 76 | args.generator_expression) 77 | else: 78 | parser.error("Must specify --generator") 79 | 80 | results = client.map(task_function, generator) 81 | 82 | if args.out_csv: 83 | writer = csv.writer(open(args.out_csv, "w")) 84 | else: 85 | writer = csv.writer(sys.stdout) 86 | 87 | writer.writerow(["Value", "Result"]) 88 | 89 | for (value, return_value) in results: 90 | writer.writerow([str(value), str(return_value)]) 91 | 92 | logging.info("Wrote: %s" % (args.out_csv if args.out_csv else "(stdout)")) 93 | -------------------------------------------------------------------------------- /kubeface/commands/run_task.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Run a task. Used internally, not meant to be called by a user. 3 | ''' 4 | 5 | import sys 6 | import argparse 7 | import logging 8 | import tempfile 9 | import math 10 | import signal 11 | import traceback 12 | import os 13 | 14 | from .. 
import storage, serialization 15 | from ..common import configure_logging 16 | from ..context import RUNTIME_CONTEXT 17 | 18 | parser = argparse.ArgumentParser(description=__doc__) 19 | 20 | parser.add_argument("input_path") 21 | parser.add_argument("result_path") 22 | 23 | parser.add_argument( 24 | "--delete-input", 25 | action="store_true", 26 | default=False, 27 | help="Delete input file on success.") 28 | 29 | parser.add_argument( 30 | "--quiet", 31 | action="store_true", 32 | default=False, 33 | help="") 34 | 35 | parser.add_argument( 36 | "--verbose", 37 | action="store_true", 38 | default=False, 39 | help="") 40 | 41 | 42 | def run(argv=sys.argv[1:]): 43 | args = parser.parse_args(argv) 44 | 45 | # On sigusr1 print stack trace 46 | print("To show stack trace, run:\nkill -s USR1 %d" % os.getpid()) 47 | signal.signal(signal.SIGUSR1, lambda sig, frame: traceback.print_stack()) 48 | 49 | configure_logging(args) 50 | 51 | RUNTIME_CONTEXT["node_type"] = "task" 52 | RUNTIME_CONTEXT["task_input_path"] = args.input_path 53 | RUNTIME_CONTEXT["task_result_path"] = args.result_path 54 | 55 | logging.info("Reading: %s" % args.input_path) 56 | input_handle = storage.get(args.input_path) 57 | task = serialization.load(input_handle) 58 | 59 | logging.info("Deserialized task: %s" % task) 60 | logging.info("Running task.") 61 | result = task.run(input_size=input_handle.tell()) 62 | logging.info("Done running task.") 63 | 64 | result_path = args.result_path.format( 65 | result_type=result.result_type, 66 | result_time=int(math.ceil(result.end_time))) 67 | 68 | with tempfile.TemporaryFile( 69 | prefix="kubeface-run-task-result-", suffix=".pkl") as fd: 70 | logging.info("Serializing result.") 71 | serialization.dump(result, fd) 72 | logging.info("Serialized result to %d bytes." % fd.tell()) 73 | fd.seek(0) 74 | logging.info("Writing: %s" % result_path) 75 | storage.put(result_path, fd) 76 | 77 | if args.delete_input: 78 | logging.info("Deleting: %s" % args.input_path) 79 | storage.delete(args.input_path) 80 | 81 | logging.info("Done.") 82 | -------------------------------------------------------------------------------- /kubeface/common.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import subprocess 3 | import sys 4 | 5 | 6 | def check_call(*args, **kwargs): 7 | logging.info("Running: %s %s" % (args, kwargs)) 8 | subprocess.check_call(*args, **kwargs) 9 | 10 | 11 | def configure_logging(args=None, verbose=False): 12 | if verbose or (args is not None and args.verbose): 13 | level = logging.DEBUG 14 | else: 15 | level = logging.INFO 16 | 17 | logging.basicConfig( 18 | format="%(asctime)s.%(msecs)d %(levelname)s %(module)s - %(funcName)s:" 19 | " %(message)s", 20 | datefmt="%Y-%m-%d %H:%M:%S", 21 | stream=sys.stderr, 22 | level=level) 23 | 24 | 25 | def human_readable_memory_size(num, suffix='B'): 26 | # From: http://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size 27 | for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']: 28 | if abs(num) < 1024.0: 29 | return "%3.1f %s%s" % (num, unit, suffix) 30 | num /= 1024.0 31 | return "%.1f %s%s" % (num, 'Yi', suffix) 32 | 33 | 34 | def truncate(s, max_length): 35 | if len(s) < max_length: 36 | return s 37 | return s[:max_length] + "..." 
38 | -------------------------------------------------------------------------------- /kubeface/context.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module defines information that allows code to determine if it is running 3 | on a master Kubeface node (node_type == "master") or as a task 4 | (node_type == "task"). 5 | 6 | This dict defaults to indicating running on a master node, and is updated by 7 | the run-task command with task-specific information. 8 | """ 9 | 10 | from .naming import hash_value 11 | 12 | RUNTIME_CONTEXT = { 13 | "node_type": "master", 14 | "task_input_path": None, 15 | "task_result_path": None, 16 | } 17 | 18 | 19 | def node_id(): 20 | if RUNTIME_CONTEXT["node_type"] == "master": 21 | return "node-master" 22 | return "node-%s" % ( 23 | hash_value( 24 | RUNTIME_CONTEXT["task_result_path"])) 25 | -------------------------------------------------------------------------------- /kubeface/job.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | import tempfile 4 | import collections 5 | 6 | from numpy import percentile, mean 7 | 8 | from .serialization import dump 9 | from . import storage, naming, context 10 | from .status_writer import DefaultStatusWriter 11 | from .common import human_readable_memory_size 12 | from .result import Result 13 | 14 | 15 | class Job(object): 16 | def __init__( 17 | self, 18 | backend, 19 | tasks_iter, 20 | max_simultaneous_tasks, 21 | storage, 22 | cache_key, 23 | num_tasks=None, 24 | wait_to_raise_task_exception=False, 25 | speculation_percent=0, 26 | speculation_runtime_percentile=99, 27 | speculation_max_reruns=0): 28 | 29 | self.backend = backend 30 | self.tasks_iter = tasks_iter 31 | self.max_simultaneous_tasks = max_simultaneous_tasks 32 | self.storage = storage 33 | self.cache_key = cache_key 34 | self.num_tasks = num_tasks 35 | self.wait_to_raise_task_exception = wait_to_raise_task_exception 36 | self.speculation_percent = speculation_percent 37 | self.speculation_runtime_percentile = speculation_runtime_percentile 38 | self.speculation_max_reruns = speculation_max_reruns 39 | 40 | self.job_name = naming.make_job_name( 41 | self.cache_key, node_id=context.node_id()) 42 | self.task_queue_times = collections.defaultdict(list) 43 | self.submitted_tasks = [] 44 | self.reused_tasks = set() 45 | self.completed_tasks = {} 46 | self.running_tasks = set() 47 | self.status_writer = DefaultStatusWriter(storage, self.job_name) 48 | 49 | self.status_writer.print_info() 50 | 51 | self.static_status_dict = { 52 | 'backend': str(self.backend), 53 | 'job_name': self.job_name, 54 | 'cache_key': self.cache_key, 55 | 'max_simultaneous_tasks': self.max_simultaneous_tasks, 56 | 'num_tasks': self.num_tasks, 57 | 'start_time': time.asctime(), 58 | } 59 | 60 | def status_dict(self): 61 | result = dict(self.static_status_dict) 62 | result["submitted_tasks"] = list(self.submitted_tasks) 63 | result["completed_tasks"] = list(self.completed_tasks) 64 | result["running_tasks"] = list(self.running_tasks) 65 | result['reused_tasks'] = list(self.reused_tasks) 66 | return result 67 | 68 | def storage_path(self, filename): 69 | return self.storage + "/" + filename 70 | 71 | def submit_task(self, task_name): 72 | queue_time = int(time.time()) 73 | task_result_template = self.storage_path( 74 | naming.TASK_RESULT.template.format( 75 | task_name=task_name, 76 | attempt_num=len(self.task_queue_times[task_name]), 77 | queue_time=queue_time, 78 | 
result_type="{result_type}", # filled in by worker 79 | result_time="{result_time}")) # filled in by worker 80 | 81 | task_input = self.storage_path( 82 | naming.TASK_INPUT.make_string(task_name=task_name)) 83 | 84 | self.backend.submit_task(task_name, task_input, task_result_template) 85 | self.status_writer.update(self.status_dict()) 86 | self.submitted_tasks.append(task_name) 87 | self.task_queue_times[task_name].append(queue_time) 88 | 89 | def submit_next_task(self): 90 | task_name = None 91 | while task_name is None: 92 | try: 93 | task = next(self.tasks_iter) 94 | except StopIteration: 95 | return False 96 | 97 | task_name = naming.TASK.make_string( 98 | cache_key=self.cache_key, 99 | task_num=len(self.submitted_tasks)) 100 | 101 | if task_name in self.completed_tasks: 102 | completed_task_info = self.completed_tasks[task_name] 103 | logging.info("Using existing result: %s" % ( 104 | completed_task_info['task_result_name'])) 105 | self.reused_tasks.add(task_name) 106 | self.submitted_tasks.append(task_name) 107 | task_name = None 108 | 109 | task_input = self.storage_path( 110 | naming.TASK_INPUT.make_string(task_name=task_name)) 111 | with tempfile.TemporaryFile(prefix="kubeface-upload-") as fd: 112 | dump(task, fd) 113 | size_string = human_readable_memory_size(fd.tell()) 114 | logging.info("Uploading: %s [%s] for task %s" % ( 115 | task_input, 116 | size_string, 117 | task_name)) 118 | fd.seek(0) 119 | storage.put(task_input, fd) 120 | 121 | self.submit_task(task_name) 122 | return True 123 | 124 | def update(self): 125 | completed_task_result_names = storage.list_contents( 126 | self.storage_path( 127 | naming.task_result_prefix(self.cache_key, self.running_tasks))) 128 | for completed_task_result_name in completed_task_result_names: 129 | info = naming.TASK_RESULT.make_tuple(completed_task_result_name) 130 | if info.task_name not in self.completed_tasks: 131 | if info.result_type == 'exception': 132 | result = Result.from_storage( 133 | self.storage_path(completed_task_result_name)) 134 | result.log() 135 | if self.wait_to_raise_task_exception: 136 | logging.warning( 137 | "Waiting for other tasks to run before raising " 138 | "exception.") 139 | else: 140 | result.raise_if_exception() 141 | assert False 142 | self.completed_tasks[info.task_name] = { 143 | 'parsed_result_name': info, 144 | 'task_result_name': completed_task_result_name, 145 | } 146 | 147 | self.running_tasks = set(self.submitted_tasks).difference( 148 | set(self.completed_tasks)) 149 | 150 | def tasks_elegible_for_speculation(self, speculation_runtime_threshold): 151 | # Consider speculating. 152 | elegible_tasks_by_runtime = [ 153 | task_name 154 | for task_name in self.running_tasks 155 | if ( 156 | time.time() - self.task_queue_times[task_name][-1] > 157 | speculation_runtime_threshold) 158 | ] 159 | elegible_tasks = [ 160 | task_name 161 | for task_name in elegible_tasks_by_runtime 162 | if ( 163 | len(self.task_queue_times[task_name]) < 164 | self.speculation_max_reruns) 165 | ] 166 | logging.info( 167 | "%d tasks could be speculatively rerun based " 168 | "on a queue time threshold of %0.2f sec; of " 169 | "these %d are elegible because they have not " 170 | "been run more than %d times." % ( 171 | len(elegible_tasks_by_runtime), 172 | speculation_runtime_threshold, 173 | len(elegible_tasks), 174 | self.speculation_max_reruns)) 175 | return elegible_tasks 176 | 177 | def wait(self, poll_seconds=5.0): 178 | """ 179 | Run all tasks to completion. 
180 | 181 | Speculation algorithm: 182 | - No speculation occurs until all tasks have been submitted and at 183 | least 100 - speculation_percent tasks have completed. 184 | - Once this threshold is reached, tasks are rerun in order, i.e. 185 | based how long they have been queued. 186 | - A task will be rerun when its queue time exceeds 187 | speculation_runtime_percentile of the queue times of the 188 | tasks that completed successfully without speculation. This will 189 | reset its queue time to 0. 190 | - Tasks can be rerun up to speculation_max_reruns times. 191 | - We are still limited by max_simultaneous_tasks. If more than this 192 | number of tasks fail, we won't be able to recover. 193 | """ 194 | 195 | while True: 196 | self.update() 197 | num_to_submit = max( 198 | 0, 199 | self.max_simultaneous_tasks - 200 | len(self.running_tasks)) 201 | if num_to_submit == 0: 202 | time.sleep(poll_seconds) 203 | continue 204 | 205 | logging.info("Submitting %d tasks" % num_to_submit) 206 | if not all(self.submit_next_task() for _ in range(num_to_submit)): 207 | # We've submitted all our tasks. 208 | speculation_runtime_threshold = None 209 | while True: 210 | self.update() 211 | self.status_writer.update(self.status_dict()) 212 | if not self.running_tasks: 213 | return 214 | 215 | if speculation_runtime_threshold is None: 216 | percent_tasks_running = ( 217 | len(self.running_tasks) * 100.0 / 218 | len(self.submitted_tasks)) 219 | if percent_tasks_running < self.speculation_percent: 220 | elapsed_times = [ 221 | int(t["parsed_result_name"].result_time) - 222 | int(t["parsed_result_name"].queue_time) 223 | for t in self.completed_tasks.values() 224 | ] 225 | speculation_runtime_threshold = percentile( 226 | elapsed_times, 227 | self.speculation_runtime_percentile) 228 | logging.info( 229 | "Enabling speculation: %0.2f%% of tasks " 230 | "running. " 231 | "Task queue times (sec): " 232 | "min=%0.1f mean=%0.1f max=%0.1f. Queue time " 233 | "threshold for resubmitting tasks will be " 234 | "%0.0f percentile of these times, which is " 235 | "%0.2f" % ( 236 | percent_tasks_running, 237 | min(elapsed_times), 238 | mean(elapsed_times), 239 | max(elapsed_times), 240 | self.speculation_runtime_percentile, 241 | speculation_runtime_threshold)) 242 | 243 | if speculation_runtime_threshold is not None: 244 | elegible_tasks = self.tasks_elegible_for_speculation( 245 | speculation_runtime_threshold) 246 | 247 | if elegible_tasks: 248 | capacity = max( 249 | 0, 250 | self.max_simultaneous_tasks - sum( 251 | len(self.task_queue_times[task_name]) 252 | for task_name in self.running_tasks)) 253 | to_speculate = elegible_tasks[:capacity] 254 | logging.info( 255 | "Capacity for re-running up to %d tasks. " 256 | "Will speculatively re-run %d tasks." 
% ( 257 | capacity, 258 | len(to_speculate))) 259 | for task_name in to_speculate: 260 | self.submit_task(task_name) 261 | 262 | logging.info("Waiting for %d tasks to complete: %s" % ( 263 | len(self.running_tasks), 264 | " ".join(self.running_tasks))) 265 | time.sleep(poll_seconds) 266 | 267 | def results(self): 268 | self.update() 269 | if self.running_tasks: 270 | raise RuntimeError("Not all tasks have completed") 271 | for task_name in self.submitted_tasks: 272 | result_file = self.storage_path( 273 | self.completed_tasks[task_name]['task_result_name']) 274 | result = Result.from_storage(result_file) 275 | yield result 276 | -------------------------------------------------------------------------------- /kubeface/kubernetes_backend.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import json 3 | import logging 4 | import subprocess 5 | import time 6 | 7 | from .backend import Backend 8 | from .worker_configuration import WorkerConfiguration 9 | from .common import check_call 10 | from .storage import is_google_storage_bucket 11 | from . import naming 12 | 13 | 14 | class KubernetesBackend(Backend): 15 | 16 | @staticmethod 17 | def add_args(parser): 18 | default = KubernetesBackend(worker_configuration=None) 19 | parser.add_argument( 20 | "--kubeface-kubernetes-cluster", 21 | default=default.cluster, 22 | help="Cluster. Default: %(default)s") 23 | parser.add_argument( 24 | "--kubeface-kubernetes-task-resources-cpu", 25 | default=default.task_resources_cpu, 26 | type=int, 27 | help="CPUs per task. Default: %(default)s") 28 | parser.add_argument( 29 | "--kubeface-kubernetes-task-resources-memory-mb", 30 | default=default.task_resources_memory_mb, 31 | type=float, 32 | help="Memory (mb) per task. Default: %(default)s") 33 | parser.add_argument( 34 | "--kubeface-kubernetes-retries", 35 | default=default.retries, 36 | type=int, 37 | help="Max retries for kubernetes commands. Default: %(default)s") 38 | parser.add_argument( 39 | "--kubeface-kubernetes-image-pull-policy", 40 | default=default.image_pull_policy, 41 | choices=("Always", "IfNotPresent", "Never"), 42 | help="Image pull policy. 
Default: %(default)s") 43 | 44 | @staticmethod 45 | def from_args(args): 46 | arg_prefix = "kubeface_kubernetes_" 47 | return KubernetesBackend( 48 | worker_configuration=WorkerConfiguration.from_args(args), 49 | **dict( 50 | (key[len(arg_prefix):], value) 51 | for (key, value) in args._get_kwargs() 52 | if key.startswith(arg_prefix))) 53 | 54 | def __init__( 55 | self, 56 | worker_configuration, 57 | cluster=None, 58 | task_resources_cpu=1, 59 | task_resources_memory_mb=1000.0, 60 | retries=12, 61 | image_pull_policy='Always'): 62 | self.worker_configuration = worker_configuration 63 | self.cluster = cluster 64 | self.task_resources_cpu = task_resources_cpu 65 | self.task_resources_memory_mb = task_resources_memory_mb 66 | self.retries = retries 67 | self.image_pull_policy = image_pull_policy 68 | 69 | def submit_task(self, task_name, task_input, task_output): 70 | specification = self.task_specification( 71 | task_name, 72 | task_input, 73 | task_output) 74 | with tempfile.NamedTemporaryFile( 75 | mode="w+", 76 | prefix="kubeface-kubernetes-%s" % task_name, 77 | suffix=".json") as fd: 78 | json.dump(specification, fd, indent=4) 79 | logging.debug(json.dumps(specification, indent=4)) 80 | fd.flush() 81 | retry_num = 0 82 | while True: 83 | try: 84 | check_call(["kubectl", "create", "-f", fd.name]) 85 | return task_name 86 | except subprocess.CalledProcessError: 87 | logging.warn("Error calling kutectl on spec: \n%s" % ( 88 | json.dumps(specification, indent=4))) 89 | retry_num += 1 90 | if retry_num >= self.retries: 91 | raise 92 | sleep_time = 2.0**retry_num 93 | logging.info("Retry %d / %d. Sleeping for %0.1f sec." % ( 94 | retry_num, self.retries, sleep_time)) 95 | time.sleep(sleep_time) 96 | 97 | def task_specification(self, task_name, task_input, task_output): 98 | task_info = naming.TASK.make_tuple(task_name) 99 | logging.info( 100 | "Generating kubernetes specification for task %d in job %s" % ( 101 | task_info.task_num, task_info.cache_key)) 102 | 103 | sanitized_task_name = naming.sanitize(task_name) 104 | sanitized_cache_key = naming.sanitize(task_info.cache_key) 105 | 106 | result = { 107 | "kind": "Pod", 108 | "apiVersion": "v1", 109 | "metadata": { 110 | "name": "%s-%s" % ( 111 | sanitized_task_name, naming.hash_value(task_output)), 112 | "labels": { 113 | "kubeface_job": sanitized_cache_key, 114 | }, 115 | "namespace": "", 116 | }, 117 | "spec": { 118 | "containers": [ 119 | { 120 | "name": str(task_info.task_num), 121 | "image": self.worker_configuration.image, 122 | "imagePullPolicy": self.image_pull_policy, 123 | "command": [ 124 | "sh", 125 | "-c", 126 | self.worker_configuration.command( 127 | task_input, 128 | task_output), 129 | ], 130 | "resources": { 131 | "requests": { 132 | "cpu": self.task_resources_cpu, 133 | "memory": ( 134 | "%sMi" % 135 | self.task_resources_memory_mb), 136 | }, 137 | }, 138 | }, 139 | ], 140 | "restartPolicy": "Never", 141 | } 142 | } 143 | return result 144 | 145 | @staticmethod 146 | def supports_storage(path): 147 | # kubernetes backend requires bucket storage 148 | return is_google_storage_bucket(path) 149 | -------------------------------------------------------------------------------- /kubeface/local_process_backend.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import subprocess 3 | from copy import copy 4 | 5 | from .backend import Backend 6 | from .worker_configuration import ( 7 | WorkerConfiguration, 8 | DEFAULT as DEFAULT_WORKER_CONFIG 9 | ) 10 | 11 | 12 | class 
LocalProcessBackend(Backend): 13 | @staticmethod 14 | def add_args(parser): 15 | pass 16 | 17 | @staticmethod 18 | def from_args(args): 19 | return LocalProcessBackend( 20 | worker_configuration=WorkerConfiguration.from_args(args)) 21 | 22 | def __init__(self, worker_configuration=DEFAULT_WORKER_CONFIG): 23 | unsupported_worker_configuration_fields = [ 24 | 'image', 25 | 'pip', 26 | 'pip_packages', 27 | 'kubeface_install_command', 28 | ] 29 | bad_fields = worker_configuration.non_default_fields().intersection( 30 | set(unsupported_worker_configuration_fields)) 31 | if bad_fields: 32 | raise ValueError( 33 | "LocalProcessBackend does not handle these worker " 34 | "configuration fields: %s" % ' '.join(bad_fields)) 35 | if worker_configuration.kubeface_install_policy == 'always': 36 | raise ValueError( 37 | "LocalProcessBackend does not support worker configurations " 38 | "with kubeface_install_policy = 'always'") 39 | self.worker_configuration = copy(worker_configuration) 40 | self.worker_configuration.kubeface_install_policy = 'never' 41 | 42 | def submit_task(self, task_name, task_input, task_output): 43 | command = self.worker_configuration.command(task_input, task_output) 44 | logging.debug("Running task '%s': %s" % (task_name, command)) 45 | return subprocess.Popen(command, shell=True) 46 | -------------------------------------------------------------------------------- /kubeface/local_process_docker_backend.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import subprocess 3 | import shlex 4 | import os 5 | 6 | from .backend import Backend 7 | from .worker_configuration import ( 8 | WorkerConfiguration, 9 | DEFAULT as DEFAULT_WORKER_CONFIG 10 | ) 11 | 12 | 13 | DOCKER_MOUNT = "/kubeface-data" 14 | KUBEFACE_MOUNT = "/kubeface-package" 15 | 16 | 17 | class LocalProcessDockerBackend(Backend): 18 | @staticmethod 19 | def add_args(parser): 20 | parser.add_argument( 21 | "--kubeface-local-process-docker-command", 22 | default="docker") 23 | 24 | @staticmethod 25 | def from_args(args): 26 | return LocalProcessDockerBackend( 27 | worker_configuration=WorkerConfiguration.from_args(args), 28 | docker_command=args.kubeface_local_process_docker_command) 29 | 30 | def __init__( 31 | self, 32 | worker_configuration=DEFAULT_WORKER_CONFIG, 33 | docker_command="docker"): 34 | self.worker_configuration = worker_configuration 35 | self.docker_command = docker_command 36 | 37 | def submit_task(self, task_name, task_input, task_output): 38 | volume_mounts = [] # pairs of (host path, docker path) 39 | if not task_input.startswith("gs://"): 40 | # Using a local filesystem as storage, so we'll want to 41 | # mount it on the docker image. 42 | data_dir = os.path.dirname(task_input) 43 | assert os.path.dirname(task_output) == data_dir 44 | 45 | task_input = os.path.join( 46 | DOCKER_MOUNT, os.path.basename(task_input)) 47 | task_output = os.path.join( 48 | DOCKER_MOUNT, os.path.basename(task_output)) 49 | volume_mounts.append((data_dir, DOCKER_MOUNT)) 50 | 51 | # We also mount the kubeface package directory, so it can 52 | # installed on the docker image if desired. 
53 | kubeface_package_dir = os.path.abspath( 54 | os.path.join( 55 | os.path.dirname(__file__), 56 | "..")) 57 | volume_mounts.append((kubeface_package_dir, KUBEFACE_MOUNT)) 58 | 59 | volume_mount_args = [] 60 | for (host_path, docker_path) in volume_mounts: 61 | volume_mount_args.append("-v") 62 | volume_mount_args.append("%s:%s" % (host_path, docker_path)) 63 | 64 | command = ( 65 | shlex.split(self.docker_command) + 66 | ["run"] + 67 | volume_mount_args + 68 | [ 69 | self.worker_configuration.image, 70 | "sh", 71 | "-c", 72 | self.worker_configuration.command(task_input, task_output), 73 | ] 74 | ) 75 | logging.info("Running task '%s': %s" % (task_name, str(command))) 76 | return subprocess.Popen(command) 77 | -------------------------------------------------------------------------------- /kubeface/naming.py: -------------------------------------------------------------------------------- 1 | import socket 2 | from datetime import datetime 3 | import getpass 4 | import hashlib 5 | import time 6 | 7 | from .stringable import Stringable 8 | 9 | JOB = Stringable( 10 | "Job", 11 | "{cache_key}::{node_id}::{randomness}") 12 | 13 | TASK = Stringable( 14 | "Task", 15 | "{cache_key}::{task_num:06d}") 16 | 17 | TASK_INPUT = Stringable( 18 | "TaskInput", 19 | "input::{task_name}") 20 | 21 | TASK_RESULT = Stringable( 22 | "TaskResult", 23 | "result::{task_name}+{attempt_num:d}+{queue_time}+{result_time}+" 24 | "+{result_type}", 25 | valid_values={ 26 | 'result_type': ["value", "exception"], 27 | }) 28 | 29 | JOB_STATUS_PAGE = Stringable( 30 | "JobStatusPage", 31 | "{status}::{format}::{job_name}.{format}", 32 | valid_values={ 33 | 'format': ['html', 'json'], 34 | 'status': ['active', 'done'], 35 | }) 36 | 37 | REMOTE_OBJECT = Stringable( 38 | "RemoteObject", 39 | "object::{cache_key_prefix}::{node_id}::{object_num:d}-{randomness}") 40 | 41 | 42 | def hash_value(s, characters=8): 43 | return hashlib.sha1(str(s).encode()).hexdigest()[:characters] 44 | 45 | 46 | def make_cache_key_prefix(): 47 | cache_key_prefix = "%s-%s-%s-%s" % ( 48 | socket.gethostname()[:8], 49 | getpass.getuser(), 50 | datetime.strftime(datetime.now(), "%Y-%m-%d-%H:%M:%S"), 51 | hash_value(time.time())) 52 | return cache_key_prefix 53 | 54 | 55 | def make_job_name(cache_key, node_id): 56 | return JOB.make_string( 57 | cache_key=cache_key, 58 | node_id=node_id, 59 | randomness=hash_value(time.time())) 60 | 61 | 62 | def make_remote_object_name(cache_key_prefix, node_id, object_num): 63 | return REMOTE_OBJECT.make_string( 64 | cache_key_prefix=cache_key_prefix, 65 | node_id=node_id, 66 | object_num=object_num, 67 | randomness=hash_value(time.time())) 68 | 69 | 70 | def task_result_prefix(cache_key, task_names=[]): 71 | prefix = "result::" + cache_key 72 | if task_names: 73 | better_prefix = TASK_RESULT.prefix(task_name=list(task_names)) 74 | assert better_prefix.startswith(prefix) 75 | return better_prefix 76 | return prefix 77 | 78 | 79 | def task_input_prefix(cache_key): 80 | return "input::" + cache_key 81 | 82 | 83 | def status_prefixes(job_names=None, formats=None, statuses=None): 84 | return JOB_STATUS_PAGE.prefixes( 85 | max_prefixes=4, 86 | job_name=job_names, 87 | format=formats, 88 | status=statuses) 89 | 90 | 91 | def sanitize(name): 92 | return ( 93 | name 94 | .replace(".", "-") 95 | .replace(":", "-") 96 | .replace("_", "-").lower()) 97 | -------------------------------------------------------------------------------- /kubeface/remote_object.py: 
-------------------------------------------------------------------------------- 1 | import logging 2 | import tempfile 3 | from contextlib import closing 4 | 5 | from . import common, serialization, storage 6 | 7 | 8 | class RemoteObject(object): 9 | def __init__(self, file_path, value): 10 | self.file_path = file_path 11 | self._value = value 12 | self.written = False 13 | self.loaded = True 14 | 15 | @property 16 | def value(self): 17 | """ 18 | Value is lazy loaded when it is first accessed. 19 | """ 20 | if not self.loaded: 21 | with closing(storage.get(self.file_path)) as fd: 22 | self._value = serialization.load(fd) 23 | self.loaded = True 24 | return self._value 25 | 26 | def __getstate__(self): 27 | """ 28 | The first time the object is pickled, we write it to file_path. 29 | The pickled representation is just the path to the file. 30 | """ 31 | if not self.written: 32 | assert self.loaded 33 | with tempfile.TemporaryFile(prefix="kubeface-object-") as fd: 34 | serialization.dump(self._value, fd) 35 | logging.info("Writing object (%s): %s" % ( 36 | common.human_readable_memory_size(fd.tell()), 37 | self.file_path)) 38 | fd.seek(0) 39 | storage.put(self.file_path, fd) 40 | self.written = True 41 | return {"file_path": self.file_path} 42 | 43 | def __setstate__(self, state): 44 | assert list(state) == ['file_path'] 45 | self.file_path = state['file_path'] 46 | self._value = None 47 | self.written = True 48 | self.loaded = False 49 | -------------------------------------------------------------------------------- /kubeface/result.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import socket 3 | import logging 4 | import time 5 | import platform 6 | from datetime import timedelta 7 | from contextlib import closing 8 | 9 | from . import storage 10 | from .serialization import load 11 | from .common import human_readable_memory_size 12 | 13 | 14 | def get_process_info(): 15 | # For debugging we record some process info in results. 
16 | return { 17 | 'invocation_args': sys.argv, 18 | 'python_version': sys.version, 19 | 'hostname': socket.gethostname(), 20 | 'platform': platform.platform(), 21 | } 22 | 23 | 24 | class Result(object): 25 | @staticmethod 26 | def from_storage(storage_path): 27 | with closing(storage.get(storage_path)) as handle: 28 | value = load(handle) 29 | assert isinstance(value, Result), type(value) 30 | value.serialization_info["storage_path"] = storage_path 31 | value.serialization_info["result_bytes"] = handle.tell() 32 | return value 33 | 34 | def __init__( 35 | self, 36 | start_time, 37 | end_time, 38 | input_size=None, 39 | exception=None, 40 | exception_traceback_string=None, 41 | return_value=None, 42 | process_info=get_process_info()): 43 | self.input_size = input_size 44 | self.start_time = start_time 45 | self.end_time = end_time 46 | self.exception = exception 47 | self.exception_traceback_string = exception_traceback_string 48 | self.return_value = return_value 49 | self.process_info = process_info 50 | 51 | if exception is not None: 52 | assert return_value is None 53 | assert exception_traceback_string is not None 54 | self.result_type = "exception" 55 | else: 56 | self.result_type = "value" 57 | 58 | self.serialization_info = {} # set upon deserialization 59 | 60 | def run_seconds(self): 61 | return self.end_time - self.start_time 62 | 63 | def description(self, indent=""): 64 | fields = [ 65 | ("result type", self.result_type), 66 | ("start time", time.asctime(time.localtime(self.start_time))), 67 | ("run time", str(timedelta(seconds=self.run_seconds()))), 68 | ("hostname", self.process_info['hostname']), 69 | ("platform", self.process_info['platform']), 70 | ("python version", self.process_info['python_version']), 71 | ("invocation arguments", "\n".join( 72 | self.process_info['invocation_args'])), 73 | ] 74 | if self.input_size: 75 | fields.append( 76 | ("input size", human_readable_memory_size(self.input_size))) 77 | if 'result_bytes' in self.serialization_info: 78 | fields.append( 79 | ("result size", 80 | human_readable_memory_size( 81 | self.serialization_info['result_bytes']))) 82 | 83 | if self.result_type == 'value': 84 | fields.append(("return value type", str(type(self.return_value)))) 85 | else: 86 | fields.extend([ 87 | ("exception", str(self.exception)), 88 | ("traceback", self.exception_traceback_string), 89 | ]) 90 | 91 | max_header_length = max(len(pair[0]) for pair in fields) 92 | row_template = "%" + str(max_header_length) + "s : %s" 93 | 94 | def format_value(s): 95 | return s.replace("\n", "\n" + " " + " " * max_header_length) 96 | 97 | return ( 98 | "\n" + 99 | "\n".join( 100 | row_template % (key, format_value(value)) 101 | for (key, value) in fields) 102 | ).replace("\n", "\n" + indent) 103 | 104 | def log(self): 105 | indent = " * " 106 | if self.result_type == 'value': 107 | logging.debug("Result (success): %s" % ( 108 | self.description(indent=indent))) 109 | else: 110 | logging.error("Result (exception): %s" % ( 111 | self.description(indent=indent))) 112 | 113 | def raise_if_exception(self): 114 | if self.result_type == 'exception': 115 | logging.error("Re-raising exception for task.") 116 | raise self.exception 117 | -------------------------------------------------------------------------------- /kubeface/serialization.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import dill 4 | import dill.detect 5 | 6 | PICKLE_PROTOCOL = 2 7 | 8 | CHECK_SERIALIZATION = False 9 | 10 | 11 | def 
check(obj): 12 | if not CHECK_SERIALIZATION: 13 | return 14 | try: 15 | dill.loads(dill.dumps(obj)) 16 | except Exception as e: 17 | logging.error( 18 | "Couldn't serialize: %s\n'%s'\nBad objects:\n%s" % ( 19 | str(obj), str(e), dill.detect.badobjects(obj, depth=2))) 20 | raise 21 | 22 | 23 | def dumps(obj): 24 | check(obj) 25 | return dill.dumps(obj, protocol=PICKLE_PROTOCOL) 26 | 27 | 28 | def dump(obj, fd): 29 | check(obj) 30 | return dill.dump(obj, fd, protocol=PICKLE_PROTOCOL) 31 | 32 | 33 | def loads(s): 34 | return dill.loads(s) 35 | 36 | 37 | def load(fd): 38 | return dill.load(fd) 39 | -------------------------------------------------------------------------------- /kubeface/status_writer.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | import json 3 | import time 4 | 5 | from . import naming, storage 6 | 7 | 8 | class DefaultStatusWriter(object): 9 | def __init__(self, storage_path, job_name): 10 | self.storage_path = storage_path 11 | self.job_name = job_name 12 | self.json_path = ( 13 | storage_path + 14 | "/" + 15 | naming.JOB_STATUS_PAGE.make_string( 16 | job_name=job_name, 17 | format="json", 18 | status="active")) 19 | self.html_path = ( 20 | storage_path + 21 | "/" + 22 | naming.JOB_STATUS_PAGE.make_string( 23 | job_name=job_name, 24 | format="html", 25 | status="active")) 26 | 27 | def print_info(self): 28 | print("Job status available at:") 29 | print("\t%s" % storage.access_info(self.json_path)) 30 | print("\t%s" % storage.access_info(self.html_path)) 31 | 32 | def make_html(self, status_dict): 33 | d = dict(status_dict) 34 | d["num_running_tasks"] = len(d["running_tasks"]) 35 | d["num_completed_tasks"] = len(d["completed_tasks"]) 36 | d["num_submitted_tasks"] = len(d["submitted_tasks"]) 37 | d["num_reused_tasks"] = len(d["reused_tasks"]) 38 | if d["num_tasks"]: 39 | d["percent_complete"] = ( 40 | d["num_completed_tasks"] * 100.0 / d["num_tasks"]) 41 | else: 42 | d["percent_complete"] = "unknown" 43 | d["num_tasks"] = "unknown" 44 | d["status_time"] = time.asctime() 45 | 46 | return """ 47 | 48 | 49 | Kubeface status: {job_name} 50 | 51 | 52 |

<h1>Kubeface</h1>
53 | <h2>{job_name}</h2>
54 | <table>
55 | <tr><td>Job</td><td>{job_name}</td></tr>
56 | <tr><td>Cache key</td><td>{cache_key}</td></tr>
57 | <tr><td>Backend</td><td>{backend}</td></tr>
58 | <tr><td>Max simultaneous tasks</td><td>{max_simultaneous_tasks}</td></tr>
59 | <tr><td>Start time</td><td>{start_time}</td></tr>
60 | <tr><td>Status time</td><td>{status_time}</td></tr>
61 | </table>
62 |
63 |
64 | <h2>Status</h2>
65 | <table>
66 | <tr><td>Percent complete</td><td>{percent_complete}</td></tr>
67 | <tr><td>Running tasks</td><td>{num_running_tasks}</td></tr>
68 | <tr><td>Completed tasks</td><td>{num_completed_tasks}</td></tr>
69 | <tr><td>Submitted tasks</td><td>{num_submitted_tasks}</td></tr>
70 | <tr><td>Reused tasks</td><td>{num_reused_tasks}</td></tr>
71 | <tr><td>Total tasks</td><td>{num_tasks}</td></tr>
72 | </table>
73 | 74 | 75 | """.format(**d) 76 | 77 | def update(self, status_dict): 78 | storage.put( 79 | self.json_path, 80 | BytesIO(json.dumps(status_dict).encode()), 81 | mime_type="application/json") 82 | storage.put( 83 | self.html_path, 84 | BytesIO(self.make_html(status_dict).encode()), 85 | mime_type="text/html") 86 | -------------------------------------------------------------------------------- /kubeface/storage.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | from . import bucket_storage 4 | 5 | 6 | def is_google_storage_bucket(name): 7 | return name.startswith("gs://") 8 | 9 | 10 | def list_contents(prefix): 11 | if is_google_storage_bucket(prefix): 12 | return bucket_storage.list_contents(prefix) 13 | else: 14 | globbed = glob.glob(prefix + "*") 15 | return [os.path.basename(x) for x in globbed] 16 | 17 | 18 | def put(name, input_handle, readers=[], owners=[], **kwargs): 19 | if is_google_storage_bucket(name): 20 | return bucket_storage.put( 21 | name, input_handle, readers, owners, **kwargs) 22 | 23 | # Local file 24 | with open(name, 'wb') as fd: 25 | fd.write(input_handle.read()) 26 | 27 | 28 | def get(name, output_handle=None): 29 | if is_google_storage_bucket(name): 30 | return bucket_storage.get(name, output_handle) 31 | 32 | # Local file 33 | if output_handle is None: 34 | return open(name, "rb") 35 | 36 | with open(name, "rb") as fd: 37 | output_handle.write(fd.read()) 38 | 39 | return output_handle 40 | 41 | 42 | def delete(name): 43 | if is_google_storage_bucket(name): 44 | return bucket_storage.delete(name) 45 | 46 | os.unlink(name) 47 | 48 | 49 | def move(source, dest): 50 | if is_google_storage_bucket(source): 51 | assert is_google_storage_bucket(dest) 52 | return bucket_storage.move(source, dest) 53 | assert not is_google_storage_bucket(dest) 54 | os.rename(source, dest) 55 | 56 | 57 | def access_info(name): 58 | if is_google_storage_bucket(name): 59 | return bucket_storage.access_info(name) 60 | return name 61 | -------------------------------------------------------------------------------- /kubeface/stringable.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import string 3 | from os.path import commonprefix 4 | 5 | import parse 6 | 7 | FORMATTER = string.Formatter() 8 | 9 | 10 | class Stringable(object): 11 | def __init__(self, name, template, valid_values={}): 12 | self.template_pieces = [] 13 | self.field_names = [] 14 | parsed = FORMATTER.parse(template) 15 | for (literal_text, field_name, format_spec, conversion) in parsed: 16 | assert not conversion 17 | self.template_pieces.append((literal_text, field_name)) 18 | if field_name not in self.field_names: 19 | self.field_names.append(field_name) 20 | 21 | self.name = name 22 | self.template = template 23 | self.compiled_template = parse.compile(template) 24 | self.tuple_class = collections.namedtuple( 25 | self.name, 26 | list(self.field_names)) 27 | 28 | self.valid_values = dict(valid_values) 29 | for key in self.valid_values: 30 | assert key in self.field_names 31 | 32 | def make_tuple(self, string_value=None, **kwargs): 33 | if string_value is not None: 34 | assert not kwargs 35 | parsed = self.compiled_template.parse(string_value) 36 | if parsed is None: 37 | raise ValueError( 38 | "Stringable [%s]: Couldn't parse '%s' according to " 39 | "template '%s'" % ( 40 | self.name, string_value, self.template)) 41 | assert not parsed.fixed 42 | fields_dict = parsed.named 43 | else: 44 | 
fields_dict = kwargs 45 | self.check_fields(**fields_dict) 46 | return self.tuple_class(**fields_dict) 47 | 48 | def check_fields(self, **fields_dict): 49 | assert set(fields_dict) == set(self.field_names), ( 50 | "%s: passed fields %s != expected fields %s" % ( 51 | self.name, set(fields_dict), set(self.field_names))) 52 | for (key, values) in self.valid_values.items(): 53 | if fields_dict[key] not in values: 54 | raise RuntimeError( 55 | "Invalid value %s='%s', must be one of %s." % ( 56 | key, fields_dict[key], ', '.join(values))) 57 | 58 | def make_string(self, tpl=None, **fields_dict): 59 | if tpl is not None: 60 | assert not fields_dict 61 | fields_dict = tpl._asdict() 62 | self.check_fields(**fields_dict) 63 | return self.template.format(**fields_dict) 64 | 65 | def prefix(self, **fields_dict): 66 | (prefix,) = self.prefixes(**fields_dict) 67 | return prefix 68 | 69 | def prefixes(self, max_prefixes=1, **fields_dict): 70 | for (key, value) in fields_dict.items(): 71 | assert key in self.field_names, key 72 | assert value is None or isinstance(value, list), type(value) 73 | 74 | def make_prefixes( 75 | template_pieces, 76 | max_prefixes=max_prefixes, 77 | fields_dict=fields_dict): 78 | result = [[]] 79 | if not template_pieces: 80 | return result 81 | 82 | (literal, field_name) = template_pieces[0] 83 | if literal: 84 | for piece in result: 85 | piece.append(literal) 86 | 87 | values = fields_dict.get(field_name) 88 | if values is None: 89 | values = self.valid_values.get(field_name) 90 | if values is not None: 91 | if len(result) * len(values) > max_prefixes: 92 | common_prefix = commonprefix(values) 93 | for piece in result: 94 | piece.append(common_prefix) 95 | else: 96 | new_result = [] 97 | for value in values: 98 | new_fields_dict = dict(fields_dict) 99 | new_fields_dict[field_name] = [value] 100 | rest = make_prefixes( 101 | template_pieces[1:], 102 | max_prefixes=max_prefixes / ( 103 | len(result) * len(values)), 104 | fields_dict=new_fields_dict) 105 | for some_rest in rest: 106 | new_result.extend( 107 | [x + [value] + some_rest for x in result]) 108 | result = new_result 109 | return result 110 | 111 | prefix_components = make_prefixes(self.template_pieces) 112 | assert len(prefix_components) <= max_prefixes 113 | return [''.join(x) for x in prefix_components] 114 | -------------------------------------------------------------------------------- /kubeface/task.py: -------------------------------------------------------------------------------- 1 | import time 2 | import logging 3 | import types 4 | import traceback 5 | 6 | from .result import Result 7 | 8 | 9 | class Task(object): 10 | def __init__(self, function, args=(), kwargs={}): 11 | self.function = function 12 | self.args = args 13 | self.kwargs = kwargs 14 | 15 | def run(self, input_size=None): 16 | start_time = time.time() 17 | exception = None 18 | exception_traceback_string = None 19 | 20 | try: 21 | return_value = self.function(*self.args, **self.kwargs) 22 | if isinstance(return_value, types.GeneratorType): 23 | return_value = list(return_value) 24 | except Exception as e: 25 | traceback_string = traceback.format_exc() 26 | logging.warn("Task execution raised exception: %s. 
%s" % ( 27 | e, traceback_string)) 28 | exception = e 29 | exception_traceback_string = traceback_string 30 | return_value = None 31 | 32 | return Result( 33 | start_time=start_time, 34 | end_time=time.time(), 35 | exception=exception, 36 | exception_traceback_string=exception_traceback_string, 37 | return_value=return_value, 38 | input_size=input_size) 39 | -------------------------------------------------------------------------------- /kubeface/worker_configuration.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from six.moves import shlex_quote as quote 4 | 5 | 6 | class WorkerConfiguration(object): 7 | @staticmethod 8 | def add_args(parser): 9 | parser.add_argument( 10 | "--kubeface-worker-image", 11 | default=DEFAULT.image) 12 | parser.add_argument( 13 | "--kubeface-worker-path-prefix", 14 | default=DEFAULT.path_prefix) 15 | parser.add_argument( 16 | "--kubeface-worker-pip", 17 | default=DEFAULT.pip) 18 | parser.add_argument( 19 | "--kubeface-worker-pip-packages", 20 | default=DEFAULT.pip_packages, nargs="+") 21 | parser.add_argument( 22 | "--kubeface-worker-kubeface-install-policy", 23 | choices=('if-not-present', 'always', 'never'), 24 | default=DEFAULT.kubeface_install_policy) 25 | parser.add_argument( 26 | "--kubeface-worker-kubeface-install-command", 27 | default=DEFAULT.kubeface_install_command) 28 | 29 | @staticmethod 30 | def from_args(args): 31 | arg_prefix = "kubeface_worker_" 32 | return WorkerConfiguration( 33 | **dict( 34 | (key[len(arg_prefix):], value) 35 | for (key, value) in args._get_kwargs() 36 | if key.startswith(arg_prefix))) 37 | 38 | def __init__( 39 | self, 40 | image='continuumio/anaconda3', 41 | path_prefix='', 42 | pip='pip', 43 | pip_packages=[], 44 | kubeface_install_policy='if-not-present', 45 | 46 | # TODO: this should default to installing the version of kubeface 47 | # running in the current process, not HEAD. 
48 | kubeface_install_command=( 49 | "{pip} install " 50 | "https://github.com/hammerlab/kubeface/archive/master.zip" 51 | )): 52 | 53 | if kubeface_install_policy not in ( 54 | 'if-not-present', 'always', 'never'): 55 | raise ValueError( 56 | "Invalid kubeface_install_policy: %s" 57 | % kubeface_install_policy) 58 | 59 | self.image = image 60 | self.path_prefix = path_prefix 61 | self.pip = pip 62 | self.pip_packages = pip_packages 63 | self.kubeface_install_policy = kubeface_install_policy 64 | self.kubeface_install_command = kubeface_install_command 65 | 66 | def non_default_fields(self): 67 | return set([ 68 | field for field in dir(self) 69 | if getattr(self, field) != getattr(DEFAULT, field) 70 | ]) 71 | 72 | def command(self, task_input, task_output, extra_task_args=[]): 73 | def quote_and_join(arguments): 74 | return " ".join([quote(arg) for arg in arguments]) 75 | 76 | pieces = [] 77 | run_pip = quote(os.path.join(self.path_prefix, 'pip')) 78 | run_task = quote( 79 | os.path.join(self.path_prefix, '_kubeface-run-task')) 80 | kubeface_install_command = self.kubeface_install_command.format( 81 | pip=run_pip) 82 | if self.kubeface_install_policy == 'if-not-present': 83 | # From: http://stackoverflow.com/questions/592620/check-if-a-program-exists-from-a-bash-script 84 | pieces.append("command -v %s || { %s ; } " % ( 85 | run_task, 86 | kubeface_install_command)) 87 | elif self.kubeface_install_policy == 'always': 88 | pieces.append(kubeface_install_command) 89 | if self.pip_packages: 90 | pieces.append("%s install %s" % ( 91 | run_pip, 92 | quote_and_join(self.pip_packages))) 93 | pieces.append( 94 | run_task + 95 | " " + 96 | quote_and_join([ 97 | task_input, 98 | task_output, 99 | "--verbose", 100 | ] + extra_task_args)) 101 | result = " && ".join(pieces) 102 | return result 103 | 104 | 105 | DEFAULT = WorkerConfiguration() 106 | -------------------------------------------------------------------------------- /remote_object_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Kubeface example with remote objects. 3 | 4 | Prepends numbers 1-3 to a big string, showing how to use remote objects to 5 | reduce the size of the uploaded task. 
6 | 7 | Example: 8 | 9 | $ python remote_object_example.py \ 10 | --kubeface-backend local-process \ 11 | --kubeface-storage /tmp 12 | 13 | """ 14 | 15 | import argparse 16 | from collections import Counter 17 | import logging 18 | import sys 19 | 20 | import kubeface 21 | 22 | parser = argparse.ArgumentParser(usage=__doc__) 23 | kubeface.Client.add_args(parser) # Add kubeface arguments 24 | 25 | 26 | def main(argv): 27 | args = parser.parse_args(argv) 28 | logging.basicConfig( 29 | format="%(asctime)s.%(msecs)d %(levelname)s %(module)s - %(funcName)s:" 30 | " %(message)s", 31 | datefmt="%Y-%m-%d %H:%M:%S", 32 | stream=sys.stderr, 33 | level=logging.INFO) 34 | 35 | client = kubeface.Client.from_args(args) 36 | input_values = range(3) 37 | 38 | big_string = "i am a string" * 100000 39 | big_wrapped = client.remote_object(big_string) 40 | 41 | logging.info('Using remote object: note size of uploaded task') 42 | 43 | def my_func_with_remote_object(x): 44 | return str(x) + big_wrapped.data 45 | results = client.map(my_func_with_remote_object, input_values) 46 | for (x, result) in zip(input_values, results): 47 | print("%d, %s" % (x, Counter(result))) 48 | 49 | logging.info( 50 | 'Now running without remote object: see uploaded task size') 51 | 52 | def my_func_without_remote_object(x): 53 | return str(x) + big_string 54 | 55 | results = client.map(my_func_without_remote_object, input_values) 56 | for (x, result) in zip(input_values, results): 57 | print("%d, %s" % (x, Counter(result))) 58 | 59 | 60 | if __name__ == '__main__': 61 | main(sys.argv[1:]) 62 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | try: 2 | from setuptools import setup 3 | except ImportError: 4 | from distutils.core import setup 5 | 6 | version = "0.0.1" 7 | 8 | setup( 9 | name="kubeface", 10 | version=version, 11 | author="Tim O'Donnell", 12 | author_email="timodonnell@gmail.com", 13 | packages=["kubeface", "kubeface.commands"], 14 | url="https://github.com/hammerlab/kubeface", 15 | license="Apache License", 16 | description="Python parallel for loops on kubernetes", 17 | long_description=open('README.md').read(), 18 | download_url='https://github.com/hammerlab/kubeface/tarball/%s' % version, 19 | classifiers=[ 20 | "Development Status :: 1 - Planning", 21 | "Intended Audience :: Developers", 22 | "License :: OSI Approved :: MIT License", 23 | "Programming Language :: Python :: 2", 24 | "Programming Language :: Python :: 2.7", 25 | "Programming Language :: Python :: 3", 26 | "Programming Language :: Python :: 3.4", 27 | ], 28 | entry_points={ 29 | 'console_scripts': [ 30 | 'kubeface-copy = kubeface.commands.copy:run', 31 | 'kubeface-run = kubeface.commands.run:run', 32 | 'kubeface-job = kubeface.commands.job:run', 33 | '_kubeface-run-task = kubeface.commands.run_task:run', 34 | ] 35 | }, 36 | install_requires=[ 37 | "dill>=0.2.5", 38 | "six", 39 | "numpy", 40 | "parse", 41 | "oauth2client==4.0.0", 42 | "google-api-python-client==1.5.5", 43 | "mock", 44 | "nose>=1.3.1", 45 | ] 46 | ) 47 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hammerlab/kubeface/443d7432e6d2f8e4d20b6326e98fabeec7ad68b6/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_client.py: 
-------------------------------------------------------------------------------- 1 | import numpy 2 | import argparse 3 | 4 | from numpy import testing 5 | 6 | from kubeface import ( 7 | client, 8 | local_process_backend, 9 | local_process_docker_backend, 10 | worker_configuration, 11 | serialization, 12 | common) 13 | 14 | from . import util 15 | 16 | common.configure_logging(verbose=True) 17 | 18 | 19 | def client_from_commandline_args(argv): 20 | parser = argparse.ArgumentParser() 21 | client.Client.add_args(parser) 22 | args = parser.parse_args(argv) 23 | return client.Client.from_args(args) 24 | 25 | 26 | def exercise_client(c, low=1, high=10): 27 | # Using division gives us an easy way to test handling of tasks 28 | # that throw division (by making low < 0) so it throws ZeroDivisionError 29 | testing.assert_equal( 30 | list(c.map(lambda x: 2.0 / x, range(low, high))), 31 | 2.0 / numpy.arange(low, high)) 32 | 33 | 34 | @util.with_local_and_bucket_storage 35 | def test_local_process_backend(bucket): 36 | backend = local_process_backend.LocalProcessBackend() 37 | c = client.Client( 38 | backend, 39 | poll_seconds=1.0, 40 | max_simultaneous_tasks=3, 41 | storage=bucket) 42 | exercise_client(c) 43 | 44 | 45 | @util.with_local_storage 46 | def test_local_process_docker_backend(bucket): 47 | worker_config = worker_configuration.WorkerConfiguration( 48 | kubeface_install_command="{pip} install /kubeface-package") 49 | backend = local_process_docker_backend.LocalProcessDockerBackend( 50 | worker_configuration=worker_config) 51 | c = client.Client( 52 | backend, 53 | poll_seconds=1.0, 54 | max_simultaneous_tasks=1, 55 | storage=bucket) 56 | exercise_client(c, high=3) 57 | 58 | 59 | @util.with_local_and_bucket_storage 60 | def test_worker_exception_delayed(bucket): 61 | c = client_from_commandline_args([ 62 | "--kubeface-poll-seconds", "1.1", 63 | "--kubeface-backend", "local-process", 64 | "--kubeface-storage", bucket, 65 | "--kubeface-wait-to-raise-task-exception", 66 | ]) 67 | mapper = c.map(lambda x: 2 / (x - 2), range(10)) 68 | testing.assert_equal(next(mapper), -1) 69 | testing.assert_equal(next(mapper), -2) 70 | testing.assert_equal(len(c.job_summary(include_done=False)), 1) 71 | testing.assert_equal(len(c.job_summary(include_done=True)), 1) 72 | testing.assert_raises(ZeroDivisionError, next, mapper) 73 | testing.assert_equal(len(c.job_summary(include_done=False)), 0) 74 | testing.assert_equal(len(c.job_summary(include_done=True)), 1) 75 | testing.assert_raises(StopIteration, next, mapper) 76 | testing.assert_equal(len(c.job_summary(include_done=False)), 0) 77 | testing.assert_equal(len(c.job_summary(include_done=True)), 1) 78 | 79 | 80 | @util.with_local_and_bucket_storage 81 | def test_worker_exception(bucket): 82 | c = client_from_commandline_args([ 83 | "--kubeface-poll-seconds", "1.1", 84 | "--kubeface-backend", "local-process", 85 | "--kubeface-storage", bucket, 86 | "--kubeface-cache-key-prefix", "foo", 87 | ]) 88 | mapper = c.map(lambda x: 2 / (x - 2), range(10)) 89 | testing.assert_raises(ZeroDivisionError, next, mapper) 90 | 91 | # TODO: in the future we may want reruns to not re-use excpetions. 92 | # Here is a test for that functionality, which is currently not 93 | # implemented. 
94 | # c = client_from_commandline_args([ 95 | # "--kubeface-poll-seconds", "1.1", 96 | # "--kubeface-backend", "local-process", 97 | # "--kubeface-storage", bucket, 98 | # "--kubeface-cache-key-prefix", "foo", 99 | # ]) 100 | # results = list(c.map(lambda x: 2 / (x - 200), range(10))) 101 | # print(results) # should not raise 102 | 103 | 104 | @util.with_local_and_bucket_storage 105 | def test_job_summary(bucket): 106 | c = client_from_commandline_args([ 107 | "--kubeface-poll-seconds", "1.1", 108 | "--kubeface-backend", "local-process", 109 | "--kubeface-storage", bucket, 110 | ]) 111 | 112 | exercise_client(c, high=5) 113 | testing.assert_equal(len(c.job_summary(include_done=False)), 0) 114 | testing.assert_equal(len(c.job_summary(include_done=True)), 1) 115 | 116 | exercise_client(c, high=2) 117 | testing.assert_equal(len(c.job_summary(include_done=False)), 0) 118 | testing.assert_equal(len(c.job_summary(include_done=True)), 2) 119 | 120 | mapper = c.map(lambda x: x + 5, range(10)) 121 | testing.assert_equal(next(mapper), 5) 122 | testing.assert_equal(len(c.job_summary(include_done=False)), 1) 123 | testing.assert_equal(len(c.job_summary(include_done=True)), 3) 124 | testing.assert_equal(list(mapper), numpy.arange(1, 10) + 5) 125 | testing.assert_equal(len(c.job_summary(include_done=False)), 0) 126 | testing.assert_equal(len(c.job_summary(include_done=True)), 3) 127 | 128 | c.cleanup() 129 | testing.assert_equal(len(c.job_summary()), 0) 130 | 131 | 132 | def test_invalid_client(): 133 | with testing.assert_raises(ValueError): 134 | client_from_commandline_args([ 135 | "--kubeface-poll-seconds", "1.1", 136 | "--kubeface-backend", "kubernetes", 137 | "--kubeface-storage", "/tmp", 138 | ]) 139 | 140 | 141 | @util.with_local_and_bucket_storage 142 | def test_remote_object(bucket): 143 | c = client_from_commandline_args([ 144 | "--kubeface-poll-seconds", "1.1", 145 | "--kubeface-backend", "local-process", 146 | "--kubeface-storage", bucket, 147 | ]) 148 | data = numpy.arange(10000)**2 149 | serialized_data = serialization.dumps(data) 150 | testing.assert_equal(serialization.loads(serialized_data), data) 151 | 152 | remote = c.remote_object(data) 153 | serialized_remote = serialization.dumps(remote) 154 | assert len(serialized_remote) < len(serialized_data) / 10 155 | testing.assert_equal(serialization.loads(serialized_remote).value, data) 156 | 157 | 158 | @util.with_local_and_bucket_storage 159 | def test_pickle_client(bucket): 160 | c = client_from_commandline_args([ 161 | "--kubeface-poll-seconds", "1.1", 162 | "--kubeface-backend", "local-process", 163 | "--kubeface-storage", bucket, 164 | ]) 165 | testing.assert_equal( 166 | c.cache_key_prefix, 167 | serialization.loads(serialization.dumps(c)).cache_key_prefix) 168 | 169 | 170 | @util.with_local_and_bucket_storage 171 | def test_return_remote_object(bucket): 172 | c = client_from_commandline_args([ 173 | "--kubeface-poll-seconds", "1.1", 174 | "--kubeface-backend", "local-process", 175 | "--kubeface-storage", bucket, 176 | ]) 177 | mapper = c.map(lambda x: c.remote_object(x**2), range(10)) 178 | obj = next(mapper) 179 | testing.assert_equal(obj.written, True) 180 | testing.assert_equal(obj.loaded, False) 181 | testing.assert_equal(obj.value, 0) 182 | testing.assert_equal(obj.loaded, True) 183 | testing.assert_equal(obj.value, 0) 184 | 185 | obj = next(mapper) 186 | testing.assert_equal(obj.written, True) 187 | testing.assert_equal(obj.loaded, False) 188 | testing.assert_equal(obj.value, 1) 189 | testing.assert_equal(obj.loaded, True) 
190 | testing.assert_equal(obj.value, 1) 191 | 192 | obj = next(mapper) 193 | testing.assert_equal(obj.written, True) 194 | testing.assert_equal(obj.loaded, False) 195 | testing.assert_equal(obj.value, 4) 196 | testing.assert_equal(obj.loaded, True) 197 | testing.assert_equal(obj.value, 4) 198 | -------------------------------------------------------------------------------- /tests/test_job_command.py: -------------------------------------------------------------------------------- 1 | import math 2 | import argparse 3 | import subprocess 4 | from numpy import testing 5 | 6 | from kubeface import ( 7 | client, 8 | common) 9 | 10 | from . import util 11 | 12 | common.configure_logging(verbose=True) 13 | 14 | 15 | def client_from_commandline_args(argv): 16 | parser = argparse.ArgumentParser() 17 | client.Client.add_args(parser) 18 | args = parser.parse_args(argv) 19 | return client.Client.from_args(args) 20 | 21 | 22 | def run_job_command(bucket, argv): 23 | result = subprocess.check_output( 24 | ["kubeface-job", "--kubeface-storage", bucket] + argv).decode() 25 | print(result) 26 | return result 27 | 28 | 29 | def find_line_with(needle, haystack, nth=0): 30 | result = [x for x in haystack.split("\n") if needle in x][nth] 31 | print("Found line: %s" % result) 32 | return result 33 | 34 | 35 | @util.with_local_storage 36 | def test_job_command(bucket): 37 | c = client_from_commandline_args([ 38 | "--kubeface-poll-seconds", "1.1", 39 | "--kubeface-backend", "local-process", 40 | "--kubeface-storage", bucket, 41 | ]) 42 | 43 | mapper = c.map(math.exp, range(10), cache_key='FOOBARBAZ') 44 | testing.assert_equal(next(mapper), 1) 45 | assert 'FOOBARBAZ' in run_job_command(bucket, []) 46 | assert 'active' in ( 47 | find_line_with( 48 | "FOOBARBAZ", 49 | run_job_command(bucket, ["--include-done"]), 50 | nth=1)) 51 | list(mapper) 52 | assert 'FOOBARBAZ' not in run_job_command(bucket, []) 53 | -------------------------------------------------------------------------------- /tests/test_naming.py: -------------------------------------------------------------------------------- 1 | from numpy import testing 2 | 3 | from kubeface import naming 4 | 5 | 6 | def test_basics(): 7 | job = naming.JOB.make_string( 8 | cache_key="foo", node_id="node-master", randomness="123") 9 | print(job) 10 | testing.assert_equal( 11 | naming.JOB.make_string(naming.JOB.make_tuple(job)), 12 | job) 13 | testing.assert_equal( 14 | naming.JOB.prefix(cache_key=["foo"]), 15 | "foo::") 16 | testing.assert_equal( 17 | naming.JOB.prefix(cache_key=["foo", "fob"]), 18 | "fo") 19 | 20 | job_status = naming.JOB_STATUS_PAGE.make_string( 21 | format="json", status="active", job_name="foobar") 22 | testing.assert_equal( 23 | naming.JOB_STATUS_PAGE.make_string( 24 | naming.JOB_STATUS_PAGE.make_tuple(job_status)), 25 | job_status) 26 | testing.assert_equal( 27 | set(naming.JOB_STATUS_PAGE.prefixes( 28 | max_prefixes=2, 29 | status=["active", "done"])), 30 | set(["done::", "active::"])) 31 | testing.assert_equal( 32 | set(naming.JOB_STATUS_PAGE.prefixes( 33 | max_prefixes=4, 34 | status=["active", "done"], 35 | format=["html", "json"])), 36 | set([ 37 | "done::html::", 38 | "active::html::", 39 | "done::json::", 40 | "active::json::"])) 41 | testing.assert_equal( 42 | set(naming.JOB_STATUS_PAGE.prefixes( 43 | max_prefixes=4)), 44 | set([ 45 | "done::html::", 46 | "active::html::", 47 | "done::json::", 48 | "active::json::"])) 49 | testing.assert_equal( 50 | set(naming.JOB_STATUS_PAGE.prefixes( 51 | max_prefixes=4, 52 | job_name=["foo1", 
"foo2"])), 53 | set([ 54 | "done::html::foo", 55 | "active::html::foo", 56 | "done::json::foo", 57 | "active::json::foo"])) 58 | testing.assert_equal( 59 | set(naming.JOB_STATUS_PAGE.prefixes( 60 | max_prefixes=9, 61 | job_name=["foo1", "foo2"])), 62 | set( 63 | [ 64 | "done::html::foo1.html", 65 | "active::html::foo1.html", 66 | "done::json::foo1.json", 67 | "active::json::foo1.json", 68 | "done::html::foo2.html", 69 | "active::html::foo2.html", 70 | "done::json::foo2.json", 71 | "active::json::foo2.json", 72 | ] 73 | )) 74 | -------------------------------------------------------------------------------- /tests/test_storage.py: -------------------------------------------------------------------------------- 1 | import time 2 | from six import BytesIO 3 | from numpy import testing 4 | 5 | from kubeface import bucket_storage, storage 6 | 7 | from .util import with_local_and_bucket_storage 8 | 9 | 10 | def test_url_parse(): 11 | testing.assert_equal( 12 | bucket_storage.split_bucket_and_name("gs://foo/bar"), 13 | ("foo", "bar")) 14 | 15 | testing.assert_equal( 16 | bucket_storage.split_bucket_and_name("gs://foo/bar/baz.txt"), 17 | ("foo", "bar/baz.txt")) 18 | 19 | 20 | @with_local_and_bucket_storage 21 | def test_put_and_get_to_bucket(bucket): 22 | data = "ABCDe" * 1000 23 | data_handle = BytesIO(data.encode("UTF-8")) 24 | file_name = "kubeface-test-%s.txt" % ( 25 | str(time.time()).replace(".", "")) 26 | name = "%s/%s" % (bucket, file_name) 27 | storage.put(name, data_handle) 28 | testing.assert_equal(storage.list_contents(name), [file_name]) 29 | testing.assert_( 30 | file_name in storage.list_contents("%s/kubeface-test-" % bucket)) 31 | 32 | result_handle = storage.get(name) 33 | testing.assert_equal(result_handle.read().decode("UTF-8"), data) 34 | storage.delete(name) 35 | testing.assert_( 36 | file_name not in storage.list_contents("%s/" % bucket)) 37 | 38 | 39 | @with_local_and_bucket_storage 40 | def test_move(bucket): 41 | data = "ABCDe" * 1000 42 | data_handle = BytesIO(data.encode("UTF-8")) 43 | file_name = "kubeface-test-%s.txt" % ( 44 | str(time.time()).replace(".", "")) 45 | name = "%s/%s" % (bucket, file_name) 46 | name2 = "%s/moved-%s" % (bucket, file_name) 47 | storage.put(name, data_handle) 48 | testing.assert_equal(storage.list_contents(name), [file_name]) 49 | storage.move(name, name2) 50 | testing.assert_equal(storage.list_contents(name), []) 51 | testing.assert_equal( 52 | storage.list_contents(name2), 53 | ["moved-%s" % file_name]) 54 | result_handle = storage.get(name2) 55 | testing.assert_equal(result_handle.read().decode("UTF-8"), data) 56 | storage.delete(name2) 57 | testing.assert_( 58 | ("moved-%s" % file_name) not in storage.list_contents("%s/" % bucket)) 59 | -------------------------------------------------------------------------------- /tests/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import tempfile 4 | import time 5 | import logging 6 | 7 | import kubeface 8 | import kubeface.bucket_storage 9 | 10 | 11 | logging.basicConfig(level=logging.DEBUG) 12 | 13 | KEEP_FILES = os.environ.get("KUBEFACE_TEST_KEEP_FILES") 14 | 15 | 16 | def wipe_bucket(bucket_url): 17 | objects = kubeface.bucket_storage.list_contents(bucket_url) 18 | for obj in objects: 19 | kubeface.bucket_storage.delete(bucket_url + "/" + obj) 20 | 21 | 22 | def check_empty(bucket_url): 23 | assert not kubeface.bucket_storage.list_contents(bucket_url) 24 | 25 | 26 | def with_bucket_storage(function): 27 | bucket = 
os.environ.get("KUBEFACE_STORAGE") 28 | if not bucket: 29 | logging.fatal("No bucket defined") 30 | 31 | def test_function(): 32 | # check_empty("gs://" + bucket) 33 | wipe_bucket("gs://" + bucket) 34 | function("gs://" + bucket) 35 | wipe_bucket("gs://" + bucket) 36 | return test_function 37 | 38 | 39 | def with_local_storage(function): 40 | def test_function(): 41 | tempdir = tempfile.mkdtemp(dir='/tmp') 42 | function(tempdir) 43 | if not KEEP_FILES: 44 | shutil.rmtree(tempdir) 45 | return test_function 46 | 47 | 48 | def with_local_and_bucket_storage(function): 49 | bucket = os.environ.get("KUBEFACE_STORAGE") 50 | if not bucket: 51 | logging.warning( 52 | "Set KUBEFACE_STORAGE to run test: %s" % str(function)) 53 | return with_local_storage(function) 54 | 55 | def test_function(): 56 | with_local_storage(function)() 57 | with_bucket_storage(function)() 58 | return test_function 59 | --------------------------------------------------------------------------------