├── .gitignore ├── LICENSE.txt ├── Makefile ├── README.md ├── cloudtools ├── __init__.py ├── __main__.py ├── cluster_config.py ├── connect.py ├── describe.py ├── diagnose.py ├── init_notebook.py ├── latest.py ├── list_clusters.py ├── modify.py ├── safe_call.py ├── start.py ├── stop.py ├── submit.py └── utils.py ├── cluster-sanity-check-0.1.py ├── cluster-sanity-check-0.2.py ├── hail-ci-build-image ├── hail-ci-build.sh ├── hail-ci-deploy.sh ├── pr-builder └── Dockerfile ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.pyc 3 | dist/ 4 | build/ 5 | *.egg-info/ 6 | test.py 7 | *~ 8 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017, cloudtools contributors. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: hail-ci-build-image 2 | 3 | BUILD_IMAGE_SHORT_NAME = cloud-tools-pr-builder 4 | 5 | latest-hail-ci-build-image: 6 | cd pr-builder && docker build . -t ${BUILD_IMAGE_SHORT_NAME} 7 | 8 | hail-ci-build-image: HASH = $(shell docker images -q --no-trunc ${BUILD_IMAGE_SHORT_NAME} | head -n 1 | sed -e 's,[^:]*:,,') 9 | hail-ci-build-image: latest-hail-ci-build-image 10 | docker tag ${BUILD_IMAGE_SHORT_NAME} ${BUILD_IMAGE_SHORT_NAME}:${HASH} 11 | 12 | push-hail-ci-build-image: HASH = $(shell docker images -q --no-trunc ${BUILD_IMAGE_SHORT_NAME} | head -n 1 | sed -e 's,[^:]*:,,') 13 | push-hail-ci-build-image: hail-ci-build-image 14 | docker tag ${BUILD_IMAGE_SHORT_NAME}:${HASH} gcr.io/broad-ctsa/${BUILD_IMAGE_SHORT_NAME}:${HASH} 15 | docker push gcr.io/broad-ctsa/${BUILD_IMAGE_SHORT_NAME}:${HASH} 16 | echo gcr.io/broad-ctsa/${BUILD_IMAGE_SHORT_NAME}:${HASH} > hail-ci-build-image 17 | 18 | deploy: 19 | rm -f dist/* 20 | python2 setup.py bdist_wheel 21 | python3 setup.py sdist bdist_wheel 22 | twine upload dist/* 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deprecation Notice 2 | 3 | cloudtools has been deprecated in favor of the `hailctl dataproc` 4 | command-line utility. 
See the [forum thread](https://discuss.hail.is/t/new-command-line-utility-hailctl/981) 5 | for more information. 6 | 7 | # cloudtools 8 | 9 | [![PyPI](https://img.shields.io/pypi/v/cloudtools.svg)]() 10 | 11 | cloudtools is a small collection of command line tools intended to make using [Hail](https://hail.is) on clusters running in Google Cloud's Dataproc service simpler. 12 | 13 | These tools are written in Python and mostly function as wrappers around the `gcloud` suite of command line tools included in the Google Cloud SDK. 14 | 15 | ## Installation 16 | 17 | Prerequisites: 18 | - Mac OS X 19 | - Python 2 or 3 20 | - [Google Cloud SDK](https://cloud.google.com/sdk/docs/quickstart-mac-os-x) 21 | - (Optional) Google Chrome installed in the (default) location `/Applications/Google Chrome.app/Contents/MacOS/Google Chrome` 22 | 23 | cloudtools can be installed from the Python package index using the pip installer: `pip install cloudtools` 24 | 25 | To update to the latest version: `pip install cloudtools --upgrade` 26 | 27 | ## Usage 28 | 29 | All functionality in cloudtools is accessed through the `cluster` module. 30 | 31 | There are 7 commands within the `cluster` module: 32 | - `cluster start <name> [args]` 33 | - `cluster submit <name> [args]` 34 | - `cluster connect <name> [args]` 35 | - `cluster modify <name> [args]` 36 | - `cluster diagnose <name> [args]` 37 | - `cluster stop <name>` 38 | - `cluster list` 39 | 40 | where `<name>` is the required, user-supplied name of the Dataproc cluster. 41 | 42 | **REMINDER:** Don't forget to shut down your cluster when you're done! You can do this using `cluster stop <name>`, through the Google Cloud Console, or using the Google Cloud SDK directly with `gcloud dataproc clusters delete <name>`. 43 | 44 | ## Examples 45 | 46 | ### Script submission 47 | 48 | One way to use the Dataproc service is to write complete Python scripts that use Hail, and then submit those scripts to the Dataproc cluster. An example of using cloudtools to interact with Dataproc in this way would be: 49 | ``` 50 | $ cluster start testcluster -p 6 51 | ...wait for cluster to start... 52 | $ cluster submit testcluster myhailscript.py 53 | ...Hail job output... 54 | Job [...] finished successfully. 55 | ``` 56 | where `myhailscript.py` lives on your computer in your current working directory and looks something like: 57 | ``` 58 | import hail as hl 59 | hl.init() 60 | ... 61 | ``` 62 | 63 | This snippet starts a cluster named "testcluster" with 1 master machine, 2 worker machines (the minimum/default), and 6 additional preemptible worker machines. Then, after the cluster is started (this can take a few minutes), a Hail script is submitted to the cluster "testcluster". 64 | 65 | You can also pass arguments to the Hail script using the `--args` argument: 66 | ``` 67 | $ cluster submit testcluster myhailscript.py --args "arg1 arg2" 68 | ``` 69 | If `myhailscript.py` is 70 | ``` 71 | import sys 72 | print('First argument: ', sys.argv[1]) 73 | print('Second argument: ', sys.argv[2]) 74 | ``` 75 | the submitted job would print 76 | ``` 77 | First argument: arg1 78 | Second argument: arg2 79 | ``` 80 | 81 | ### Interactive Hail with Jupyter Notebooks 82 | 83 | Another way to use the Dataproc service is through a Jupyter notebook running on the cluster's master machine. By default, `cluster start <name>` sets up and starts a Jupyter server process - complete with a Hail kernel - on the master machine of the cluster.
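Tying the script-submission example to the `--args` mechanism above, here is a slightly fuller, hypothetical `myhailscript.py` that reads its input and output paths from the quoted argument string; the bucket paths are placeholders:
```
import sys
import hail as hl

# Submitted with:
#   cluster submit testcluster myhailscript.py --args "gs://my-bucket/data.tsv gs://my-bucket/data.ht"
# The quoted --args string is split on whitespace and arrives in sys.argv.
input_path, output_path = sys.argv[1], sys.argv[2]

hl.init()
ht = hl.import_table(input_path, impute=True)  # import a TSV as a Hail Table, inferring column types
ht.describe()                                  # print the inferred schema to the job log
ht.write(output_path, overwrite=True)          # write the Table back to the bucket
```
The same code works unchanged in a notebook cell on the cluster, described next.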
84 | 85 | To use Hail in a Jupyter notebook, you'll need to have Google Chrome installed on your computer as described in the installation section above. Then, use 86 | ``` 87 | cluster connect testcluster notebook 88 | ``` 89 | 90 | to open a connection to the cluster "testcluster" through Chrome. 91 | 92 | A new browser will open with the address `localhost:8123` -- this is port 8123 on the cluster's master machine, which is where the Jupyter notebook server is running. You should see the Google Storage home directory of the project your cluster was launched in, with all of the project's buckets listed. 93 | 94 | Select the bucket you'd like to work in, and you should see all of the files and directories in that bucket. You can either resume working on an existing `.ipynb` file in the bucket, or create a new Hail notebook by selecting `Hail` from the `New` notebook drop-down in the upper-right corner. 95 | 96 | From the notebook, you can use Hail the same way that you would in a complete job script: 97 | ``` 98 | import hail as hl 99 | hl.init() 100 | ... 101 | ``` 102 | To read or write files stored in a Google bucket outside of Hail-specific commands, use Hail's `hadoop_read()` and `hadoop_write()` helper functions. For example, to read in a TSV file from Google storage to a pandas dataframe: 103 | ``` 104 | import hail as hl 105 | import pandas as pd 106 | 107 | hl.init() 108 | 109 | with hl.hadoop_open('gs://mybucket/mydata.tsv', 'r') as f: 110 | df = pd.read_table(f) 111 | ``` 112 | 113 | When you save your notebooks using either `File -> Save and Checkpoint` or `command + s`, they'll be saved automatically to the bucket you're working in. 114 | 115 | ### Monitoring Hail jobs 116 | 117 | While your job is running, you can monitor its progress through the Spark Web UI running on the cluster's master machine at port 4040. To connect to the SparkUI from your local machine, use 118 | ``` 119 | cluster connect testcluster ui 120 | ``` 121 | If you've attempted to start multiple Hail/Spark contexts, you may find that the web UI for a particular job is accessible through ports 4041 or 4042 instead. To connect to these ports, use 122 | ``` 123 | cluster connect testcluster ui1 124 | ``` 125 | to connect to 4041, or 126 | ``` 127 | cluster connect testcluster ui2 128 | ``` 129 | to connect to 4042. 130 | 131 | To view details on a job that has completed, you can access the Spark history server running on port 18080 with 132 | ``` 133 | cluster connect testcluster spark-history 134 | ``` 135 | 136 | ### Module usage 137 | 138 | ``` 139 | $ cluster -h 140 | usage: cluster [-h] {start,submit,connect,diagnose,stop} ... 141 | 142 | Deploy and monitor Google Dataproc clusters to use with Hail. 143 | 144 | positional arguments: 145 | {start,submit,connect,diagnose,stop} 146 | start Start a Dataproc cluster configured for Hail. 147 | submit Submit a Python script to a running Dataproc cluster. 148 | connect Connect to a running Dataproc cluster. 149 | diagnose Diagnose problems in a Dataproc cluster. 150 | stop Shut down a Dataproc cluster. 
151 | 152 | optional arguments: 153 | -h, --help show this help message and exit 154 | ``` 155 | 156 | ``` 157 | $ cluster start -h 158 | usage: cluster start [-h] [--hash HASH] [--spark {2.0.2,2.2.0}] 159 | [--version {0.1,0.2}] 160 | [--master-machine-type MASTER_MACHINE_TYPE] 161 | [--master-memory-fraction MASTER_MEMORY_FRACTION] 162 | [--master-boot-disk-size MASTER_BOOT_DISK_SIZE] 163 | [--num-master-local-ssds NUM_MASTER_LOCAL_SSDS] 164 | [--num-preemptible-workers NUM_PREEMPTIBLE_WORKERS] 165 | [--num-worker-local-ssds NUM_WORKER_LOCAL_SSDS] 166 | [--num-workers NUM_WORKERS] 167 | [--preemptible-worker-boot-disk-size PREEMPTIBLE_WORKER_BOOT_DISK_SIZE] 168 | [--worker-boot-disk-size WORKER_BOOT_DISK_SIZE] 169 | [--worker-machine-type WORKER_MACHINE_TYPE] [--zone ZONE] 170 | [--properties PROPERTIES] [--metadata METADATA] 171 | [--packages PACKAGES] [--jar JAR] [--zip ZIP] 172 | [--init INIT] [--init_timeout INIT_TIMEOUT] [--vep] [--dry-run] 173 | name 174 | Start a Dataproc cluster configured for Hail. 175 | 176 | positional arguments: 177 | name Cluster name. 178 | 179 | optional arguments: 180 | -h, --help show this help message and exit 181 | --hash HASH Hail build to use for notebook initialization 182 | (default: latest). 183 | --spark {2.0.2,2.2.0} 184 | Spark version used to build Hail (default: 2.2.0) 185 | --version {0.1,0.2} 186 | Hail version to use (default: 0.2). 187 | --master-machine-type MASTER_MACHINE_TYPE, --master MASTER_MACHINE_TYPE, -m MASTER_MACHINE_TYPE 188 | Master machine type (default: n1-highmem-8). 189 | --master-memory-fraction MASTER_MEMORY_FRACTION 190 | Fraction of master memory allocated to the JVM. Use a 191 | smaller value to reserve more memory for Python. 192 | (default: 0.8) 193 | --master-boot-disk-size MASTER_BOOT_DISK_SIZE 194 | Disk size of master machine, in GB (default: 100). 195 | --num-master-local-ssds NUM_MASTER_LOCAL_SSDS 196 | Number of local SSDs to attach to the master machine 197 | (default: 0). 198 | --num-preemptible-workers NUM_PREEMPTIBLE_WORKERS, --n-pre-workers NUM_PREEMPTIBLE_WORKERS, -p NUM_PREEMPTIBLE_WORKERS 199 | Number of preemptible worker machines (default: 0). 200 | --num-worker-local-ssds NUM_WORKER_LOCAL_SSDS 201 | Number of local SSDs to attach to each worker machine 202 | (default: 0). 203 | --num-workers NUM_WORKERS, --n-workers NUM_WORKERS, -w NUM_WORKERS 204 | Number of worker machines (default: 2). 205 | --preemptible-worker-boot-disk-size PREEMPTIBLE_WORKER_BOOT_DISK_SIZE 206 | Disk size of preemptible machines, in GB (default: 207 | 40). 208 | --worker-boot-disk-size WORKER_BOOT_DISK_SIZE 209 | Disk size of worker machines, in GB (default: 40). 210 | --worker-machine-type WORKER_MACHINE_TYPE, --worker WORKER_MACHINE_TYPE 211 | Worker machine type (default: n1-standard-8, or 212 | n1-highmem-8 with --vep). 213 | --zone ZONE Compute zone for the cluster (default: us-central1-b). 214 | --properties PROPERTIES 215 | Additional configuration properties for the cluster 216 | --metadata METADATA Comma-separated list of metadata to add: 217 | KEY1=VALUE1,KEY2=VALUE2... 218 | --packages PACKAGES, --pkgs PACKAGES 219 | Comma-separated list of Python packages to be 220 | installed on the master node. 221 | --jar JAR Hail jar to use for Jupyter notebook. 222 | --zip ZIP Hail zip to use for Jupyter notebook. 223 | --init INIT Comma-separated list of init scripts to run. 
224 | --init_timeout INIT_TIMEOUT 225 | Flag to specify a timeout period for the 226 | initialization action 227 | --vep Configure the cluster to run VEP. 228 | --dry-run Print gcloud dataproc command, but don't run it.``` 229 | 230 | ``` 231 | 232 | ``` 233 | $ cluster submit -h 234 | usage: cluster submit [-h] [--properties PROPERTIES] 235 | [--args ARGS] 236 | name script 237 | 238 | Submit a Python script to a running Dataproc cluster. 239 | 240 | positional arguments: 241 | name Cluster name. 242 | script 243 | 244 | optional arguments: 245 | -h, --help show this help message and exit 246 | --properties PROPERTIES, -p PROPERTIES 247 | Extra Spark properties to set. 248 | --args ARGS Quoted string of arguments to pass to the Hail script 249 | being submitted. 250 | ``` 251 | 252 | ``` 253 | $ cluster connect -h 254 | usage: cluster connect [-h] [--port PORT] [--zone ZONE] 255 | name 256 | {notebook,nb,spark-ui,ui,spark-ui1,ui1,spark-ui2,ui2,spark-history,hist} 257 | 258 | Connect to a running Dataproc cluster. 259 | 260 | positional arguments: 261 | name Cluster name. 262 | {notebook,nb,spark-ui,ui,spark-ui1,ui1,spark-ui2,ui2,spark-history,hist} 263 | Web service to launch. 264 | 265 | optional arguments: 266 | -h, --help show this help message and exit 267 | --port PORT, -p PORT Local port to use for SSH tunnel to master node 268 | (default: 10000). 269 | --zone ZONE, -z ZONE Compute zone for Dataproc cluster (default: us- 270 | central1-b). 271 | ``` 272 | 273 | ``` 274 | $ cluster modify -h 275 | usage: cluster modify [-h] [--jar JAR] [--zip ZIP] [--num-workers NUM_WORKERS] 276 | [--num-preemptible-workers NUM_PREEMPTIBLE_WORKERS] 277 | [--graceful-decommission-timeout GRACEFUL_DECOMMISSION_TIMEOUT] 278 | [--max-idle MAX_IDLE] [--dry-run] [--zone ZONE] 279 | name 280 | 281 | Modify active Dataproc clusters. 282 | 283 | positional arguments: 284 | name Cluster name. 285 | 286 | optional arguments: 287 | -h, --help show this help message and exit 288 | --jar JAR New Hail JAR. 289 | --zip ZIP New Hail ZIP. 290 | --num-workers NUM_WORKERS, --n-workers NUM_WORKERS, -w NUM_WORKERS 291 | New number of worker machines (min. 2). 292 | --num-preemptible-workers NUM_PREEMPTIBLE_WORKERS, --n-pre-workers NUM_PREEMPTIBLE_WORKERS, -p NUM_PREEMPTIBLE_WORKERS 293 | New number of preemptible worker machines. 294 | --graceful-decommission-timeout GRACEFUL_DECOMMISSION_TIMEOUT, --graceful GRACEFUL_DECOMMISSION_TIMEOUT 295 | If set, cluster size downgrade will use graceful 296 | decommissionnig with the given timeout (e.g. "60m"). 297 | --max-idle MAX_IDLE New maximum idle time before shutdown (e.g. "60m"). 298 | --dry-run Print gcloud dataproc command, but don't run it. 299 | --zone ZONE, -z ZONE Compute zone for Dataproc cluster (default: us- 300 | central1-b). 301 | ``` 302 | 303 | ``` 304 | $ cluster diagnose -h 305 | usage: cluster diagnose [-h] --dest DEST [--hail-log HAIL_LOG] [--overwrite] 306 | [--no-diagnose] [--compress] 307 | [--workers [WORKERS [WORKERS ...]]] [--take TAKE] 308 | name 309 | 310 | Diagnose problems in a Dataproc cluster. 311 | 312 | positional arguments: 313 | name Cluster name. 314 | 315 | optional arguments: 316 | -h, --help show this help message and exit 317 | --dest DEST, -d DEST Directory for diagnose output -- must be local. 318 | --hail-log HAIL_LOG, -l HAIL_LOG 319 | Path for hail.log file. 320 | --overwrite Delete dest directory before adding new files. 321 | --no-diagnose Do not run gcloud dataproc clusters diagnose. 322 | --compress, -z GZIP all files. 
323 | --workers [WORKERS [WORKERS ...]] 324 | Specific workers to get log files from. 325 | --take TAKE Only download logs from the first N workers. 326 | ``` 327 | 328 | ``` 329 | $ cluster stop -h 330 | usage: cluster stop [-h] name 331 | 332 | Shut down a Dataproc cluster. 333 | 334 | positional arguments: 335 | name Cluster name. 336 | 337 | optional arguments: 338 | -h, --help show this help message and exit 339 | ``` 340 | 341 | ``` 342 | cluster list -h 343 | usage: cluster list [-h] 344 | 345 | List active Dataproc clusters. 346 | 347 | optional arguments: 348 | -h, --help show this help message and exit 349 | ``` 350 | 351 | ## Deploying 352 | ``` 353 | TWINE_USERNAME=username TWINE_PASSWORD=password make deploy 354 | ``` 355 | 356 | ## Creating the k8s Deployment Secrets 357 | 358 | pypi-username and pypi-password are files containing credentials sufficiently 359 | privileged to publish cloudtools 360 | 361 | ``` 362 | kubectl create secret generic \ 363 | ci-deploy-0-1--nealelab-cloudtools \ 364 | --from-file=secrets/pypi-username \ 365 | --from-file=secrets/pypi-password 366 | ``` 367 | -------------------------------------------------------------------------------- /cloudtools/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '4.2.0' 2 | -------------------------------------------------------------------------------- /cloudtools/__main__.py: -------------------------------------------------------------------------------- 1 | from .utils import decode 2 | import argparse 3 | import sys 4 | from cloudtools import start 5 | from cloudtools import submit 6 | from cloudtools import connect 7 | from cloudtools import diagnose 8 | from cloudtools import stop 9 | from cloudtools import list_clusters 10 | from cloudtools import modify 11 | from cloudtools import describe 12 | from cloudtools import latest 13 | from cloudtools import __version__ 14 | 15 | 16 | def main(): 17 | main_parser = argparse.ArgumentParser(description='Deploy and monitor Google Dataproc clusters to use with Hail.') 18 | subs = main_parser.add_subparsers() 19 | 20 | start_parser = subs.add_parser('start', 21 | help='Start a Dataproc cluster configured for Hail.', 22 | description='Start a Dataproc cluster configured for Hail.') 23 | submit_parser = subs.add_parser('submit', 24 | help='Submit a Python script to a running Dataproc cluster.', 25 | description='Submit a Python script to a running Dataproc cluster.') 26 | connect_parser = subs.add_parser('connect', 27 | help='Connect to a running Dataproc cluster.', 28 | description='Connect to a running Dataproc cluster.') 29 | diagnose_parser = subs.add_parser('diagnose', 30 | help='Diagnose problems in a Dataproc cluster.', 31 | description='Diagnose problems in a Dataproc cluster.') 32 | stop_parser = subs.add_parser('stop', 33 | help='Shut down a Dataproc cluster.', 34 | description='Shut down a Dataproc cluster.') 35 | 36 | list_parser = subs.add_parser('list', 37 | help='List active Dataproc clusters.', 38 | description='List active Dataproc clusters.') 39 | 40 | modify_parser = subs.add_parser('modify', 41 | help='Modify active Dataproc clusters.', 42 | description='Modify active Dataproc clusters.') 43 | 44 | describe_parser = subs.add_parser('describe', 45 | help='Gather information about a hail file (including the schema)', 46 | description='Gather information about a hail file (including the schema)') 47 | 48 | latest_parser = subs.add_parser('latest', 49 | help='Find the newest deployed SHA and 
the locations of the newest JARs and ZIPs', 50 | description='Find the newest deployed SHA and the locations of the newest JARs and ZIPs') 51 | 52 | start_parser.set_defaults(module='start') 53 | start.init_parser(start_parser) 54 | 55 | submit_parser.set_defaults(module='submit') 56 | submit.init_parser(submit_parser) 57 | 58 | connect_parser.set_defaults(module='connect') 59 | connect.init_parser(connect_parser) 60 | 61 | diagnose_parser.set_defaults(module='diagnose') 62 | diagnose.init_parser(diagnose_parser) 63 | 64 | stop_parser.set_defaults(module='stop') 65 | stop.init_parser(stop_parser) 66 | 67 | list_parser.set_defaults(module='list') 68 | 69 | modify_parser.set_defaults(module='modify') 70 | modify.init_parser(modify_parser) 71 | 72 | describe_parser.set_defaults(module='describe') 73 | describe.init_parser(describe_parser) 74 | 75 | latest_parser.set_defaults(module='latest') 76 | latest.init_parser(latest_parser) 77 | 78 | if len(sys.argv) == 1: 79 | main_parser.print_help() 80 | sys.exit(0) 81 | 82 | args, pass_through_args = main_parser.parse_known_args() 83 | 84 | if args.module == 'start': 85 | start.main(args) 86 | 87 | elif args.module == 'submit': 88 | submit.main(args, pass_through_args) 89 | 90 | elif args.module == 'connect': 91 | connect.main(args) 92 | 93 | elif args.module == 'diagnose': 94 | diagnose.main(args) 95 | 96 | elif args.module == 'stop': 97 | stop.main(args) 98 | 99 | elif args.module == 'list': 100 | list_clusters.main(args) 101 | 102 | elif args.module == 'modify': 103 | modify.main(args) 104 | 105 | elif args.module == 'describe': 106 | describe.main(args) 107 | 108 | elif args.module == 'latest': 109 | latest.main(args) 110 | 111 | 112 | if __name__ == '__main__': 113 | import sys 114 | sys.stderr.write("cloudtools is now deprecated in favor of the 'hailctl dataproc' utility.\n" 115 | " For more information, see: " 116 | "https://discuss.hail.is/t/new-command-line-utility-hailctl/981\n") 117 | main() 118 | -------------------------------------------------------------------------------- /cloudtools/cluster_config.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | class ClusterConfig: 4 | def __init__(self, json_str): 5 | params = json.loads(json_str) 6 | self.vars = params['vars'] 7 | self.flags = params['flags'] 8 | 9 | def extend_flag(self, flag, values): 10 | if flag not in self.flags: 11 | self.flags[flag] = values 12 | elif isinstance(self.flags[flag], list): 13 | assert isinstance(values, list) 14 | self.flags[flag].extend(values) 15 | else: 16 | assert isinstance(self.flags[flag], dict) 17 | assert isinstance(values, dict) 18 | self.flags[flag].update(values) 19 | 20 | def parse_and_extend(self, flag, values): 21 | values = dict(tuple(pair.split('=')) for pair in values.split(',') if '=' in pair) 22 | self.extend_flag(flag, values) 23 | 24 | def format(self, obj): 25 | if isinstance(obj, dict): 26 | return self.format(['{}={}'.format(k, v) for k, v in obj.items()]) 27 | if isinstance(obj, list): 28 | return self.format(','.join(obj)) 29 | else: 30 | return str(obj).format(**self.vars) 31 | 32 | def jar(self): 33 | return self.flags['metadata']['JAR'].format(**self.vars) 34 | 35 | def zip(self): 36 | return self.flags['metadata']['ZIP'].format(**self.vars) 37 | 38 | def configure(self, sha, spark): 39 | self.vars['spark'] = spark 40 | image = self.vars['supported_spark'].get(spark) 41 | if image is None: 42 | raise ValueError( 43 | 'Incompatible spark version {spark}, compatible 
versions are: {compat}'.format( 44 | spark=spark, compat=list(self.vars['supported_spark']))) 45 | self.vars['image'] = image 46 | self.vars['hash'] = sha 47 | 48 | def get_command(self, name): 49 | flags = ['--{}={}'.format(f, self.format(v)) for f, v in self.flags.items()] 50 | return ['gcloud', 51 | 'dataproc', 52 | 'clusters', 53 | 'create', 54 | name] + flags 55 | -------------------------------------------------------------------------------- /cloudtools/connect.py: -------------------------------------------------------------------------------- 1 | import subprocess as sp 2 | import os 3 | from .safe_call import safe_call 4 | 5 | def init_parser(parser): 6 | parser.add_argument('name', type=str, help='Cluster name.') 7 | parser.add_argument('service', type=str, 8 | choices=['notebook', 'nb', 'spark-ui', 'ui', 'spark-ui1', 'ui1', 9 | 'spark-ui2', 'ui2', 'spark-history', 'hist'], 10 | help='Web service to launch.') 11 | parser.add_argument('--port', '-p', default='10000', type=str, 12 | help='Local port to use for SSH tunnel to master node (default: %(default)s).') 13 | parser.add_argument('--zone', '-z', default='us-central1-b', type=str, 14 | help='Compute zone for Dataproc cluster (default: %(default)s).') 15 | 16 | def main(args): 17 | print("Connecting to cluster '{}'...".format(args.name)) 18 | 19 | # shortcut mapping 20 | shortcut = { 21 | 'ui': 'spark-ui', 22 | 'ui1': 'spark-ui1', 23 | 'ui2': 'spark-ui2', 24 | 'hist': 'history', 25 | 'nb': 'notebook' 26 | } 27 | 28 | service = args.service 29 | if service in shortcut: 30 | service = shortcut[service] 31 | 32 | # Dataproc port mapping 33 | dataproc_ports = { 34 | 'spark-ui': 4040, 35 | 'spark-ui1': 4041, 36 | 'spark-ui2': 4042, 37 | 'spark-history': 18080, 38 | 'notebook': 8123 39 | } 40 | connect_port = dataproc_ports[service] 41 | 42 | # open SSH tunnel to master node 43 | sp.check_call( 44 | ['gcloud', 45 | 'compute', 46 | 'ssh', 47 | '{}-m'.format(args.name), 48 | '--zone={}'.format(args.zone), 49 | '--ssh-flag=-D {}'.format(args.port), 50 | '--ssh-flag=-N', 51 | '--ssh-flag=-f', 52 | '--ssh-flag=-n'], 53 | stderr=sp.STDOUT 54 | ) 55 | 56 | # open Chrome with SOCKS proxy configuration 57 | with open(os.devnull, 'w') as f: 58 | sp.Popen([ 59 | r'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', 60 | 'http://localhost:{}'.format(connect_port), 61 | '--proxy-server=socks5://localhost:{}'.format(args.port), 62 | '--host-resolver-rules=MAP * 0.0.0.0 , EXCLUDE localhost', 63 | '--proxy-bypass-list=<-loopback>', # https://chromium.googlesource.com/chromium/src/+/da790f920bbc169a6805a4fb83b4c2ab09532d91 64 | '--user-data-dir=/tmp/' 65 | ], stdout=f, stderr=f) 66 | -------------------------------------------------------------------------------- /cloudtools/describe.py: -------------------------------------------------------------------------------- 1 | import json 2 | from zlib import decompress, MAX_WBITS 3 | from subprocess import check_output 4 | from statistics import median, mean, stdev 5 | from collections import OrderedDict 6 | 7 | SECTION_SEPARATOR = '-'*40 8 | IDENT = ' '*4 9 | 10 | def parse_schema(s): 11 | def parse_type(s, end_delimiter, element_type): 12 | keys = [] 13 | values = [] 14 | i = 0 15 | while i < len(s): 16 | if s[i] == end_delimiter: 17 | if s[:i]: 18 | values.append(s[:i]) 19 | if element_type in ['Array', 'Set', 'Dict']: 20 | return {'type': element_type, 'value': values}, s[i + 1:] 21 | else: 22 | return {'type': element_type, 'value': OrderedDict(zip(keys, values))}, s[i + 1:] 23 | elif 
s[i] == ':': 24 | keys.append(s[:i]) 25 | s = s[i + 1:] 26 | i = 0 27 | elif s[i] == '{': 28 | struct, s = parse_type(s[i + 1:], '}', s[:i]) 29 | values.append(struct) 30 | i = 0 31 | elif s[i] == '[': 32 | arr, s = parse_type(s[i + 1:], ']', s[:i] if s[:i] else 'Array') 33 | values.append(arr) 34 | i = 0 35 | elif s[i] == ',': 36 | if s[:i]: 37 | values.append(s[:i]) 38 | s = s[i + 1:] 39 | i = 0 40 | else: 41 | i += 1 42 | 43 | raise Exception('End of {} not found'.format(element_type)) 44 | 45 | start_schema_index = s.index('{') 46 | return parse_type(s[start_schema_index+1:], "}", s[:start_schema_index])[0] 47 | 48 | 49 | def type_str(t, depth=1): 50 | NAME_MAP = { 51 | 'Boolean': 'bool', 52 | 'String': 'str' 53 | } 54 | 55 | def element_str(e): 56 | if isinstance(e, dict): 57 | if e['type'] == 'Struct': 58 | return "struct {{\n{}\n{}}}".format( 59 | type_str(e['value'], depth + 1), 60 | (IDENT * depth) 61 | ) 62 | else: 63 | return "{}<{}>".format( 64 | e['type'].lower(), 65 | ", ".join([element_str(x) for x in e['value']]) 66 | ) 67 | else: 68 | return NAME_MAP.get(e, e).lower().replace('(', '<').replace(')', '>') 69 | 70 | return "\n".join( 71 | "{}'{}': {}".format(IDENT * depth, k, element_str(v)) 72 | for k, v in t.items() 73 | ) 74 | 75 | def key_str(k): 76 | if isinstance(k, dict): 77 | return '[{}]'.format(', '.join([key_str(x) for x in k['value']])) 78 | else: 79 | return "'{}'".format(k) 80 | 81 | 82 | def get_partitions_info_str(j): 83 | partitions = j['components']['partition_counts']['counts'] 84 | partitions_info = { 85 | 'Partitions': len(partitions), 86 | 'Rows': sum(partitions), 87 | 'Empty partitions': len([p for p in partitions if p == 0]) 88 | } 89 | if partitions_info['Partitions'] > 1: 90 | partitions_info.update({ 91 | 'Min(rows/partition)': min(partitions), 92 | 'Max(rows/partition)': max(partitions), 93 | 'Median(rows/partition)': median(partitions), 94 | 'Mean(rows/partition)': int(mean(partitions)), 95 | 'StdDev(rows/partition)': int(stdev(partitions)) 96 | }) 97 | 98 | 99 | return "\n{}".format(IDENT).join(['{}: {}'.format(k, v) for k, v in partitions_info.items()]) 100 | 101 | 102 | def init_parser(parser): 103 | # arguments with default parameters 104 | parser.add_argument('file', type=str, help='Path to hail file (either MatrixTable or Table).') 105 | 106 | def main(args): 107 | 108 | command = ['gsutil'] if args.file.startswith('gs://') else [] 109 | 110 | j = json.loads( 111 | decompress( 112 | check_output(command + ['cat', args.file + '/metadata.json.gz']), 113 | 16+MAX_WBITS 114 | ) 115 | ) 116 | 117 | # Get the file schema 118 | file_schema = parse_schema(j[[k for k in j.keys() if k.endswith('type')][0]]) 119 | 120 | # Print file information 121 | print(SECTION_SEPARATOR) 122 | print('File Type: {}'.format(file_schema['type'])) 123 | print(IDENT + get_partitions_info_str(j)) 124 | 125 | # Print global fields 126 | print(SECTION_SEPARATOR) 127 | print('Global fields:') 128 | print(type_str(file_schema['value']['global']['value'])) 129 | 130 | # Print column fields if present 131 | if 'col' in file_schema['value']: 132 | print(SECTION_SEPARATOR) 133 | print('Column fields:') 134 | print(type_str(file_schema['value']['col']['value'])) 135 | 136 | # Print row fields 137 | print(SECTION_SEPARATOR) 138 | print('Row fields:') 139 | print(type_str(file_schema['value']['row']['value'])) 140 | 141 | # Print entry fields if present 142 | if 'entry' in file_schema['value']: 143 | print(SECTION_SEPARATOR) 144 | print('Entry fields:') 145 | 
print(type_str(file_schema['value']['entry']['value'])) 146 | 147 | # Print keys 148 | print(SECTION_SEPARATOR) 149 | if 'col_key' in file_schema['value']: 150 | print("Column key: {}".format(key_str(file_schema['value']['col_key']))) 151 | print("Row key: {}".format(key_str(file_schema['value']['row_key']))) 152 | else: 153 | print("Key: {}".format(key_str(file_schema['value']['key']))) 154 | print(SECTION_SEPARATOR) 155 | 156 | # Check for _SUCCESS 157 | try: 158 | check_output(command + ['ls', args.file + '/_SUCCESS']) 159 | except: 160 | print("\033[;1m\033[1;31mCould not find _SUCCESS for file: {}\nThis file will not work.\033[0m".format(args.file)) 161 | -------------------------------------------------------------------------------- /cloudtools/diagnose.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | from subprocess import call, Popen, PIPE 4 | 5 | 6 | def init_parser(parser): 7 | parser.add_argument('name', type=str, help='Cluster name.') 8 | parser.add_argument('--dest', '-d', required=True, type=str, help="Directory for diagnose output -- must be local.") 9 | parser.add_argument('--hail-log', '-l', required=False, type=str, default='/home/hail/hail.log', 10 | help="Path for hail.log file.") 11 | parser.add_argument('--overwrite', required=False, action='store_true', 12 | help="Delete dest directory before adding new files.") 13 | parser.add_argument('--no-diagnose', required=False, action='store_true', 14 | help="Do not run gcloud dataproc clusters diagnose.") 15 | parser.add_argument('--compress', '-z', required=False, action='store_true', help="GZIP all files.") 16 | parser.add_argument('--workers', required=False, nargs='*', help="Specific workers to get log files from.") 17 | parser.add_argument('--take', required=False, type=int, default=None, 18 | help="Only download logs from the first N workers.") 19 | 20 | 21 | def main(args): 22 | print("Diagnosing cluster '{}'...".format(args.name)) 23 | 24 | 25 | is_local = not args.dest.startswith("gs://") 26 | 27 | if args.overwrite: 28 | if is_local: 29 | call('rm -r {dir}'.format(dir=args.dest), shell=True) 30 | else: 31 | call('gsutil -m rm -r {dir}'.format(dir=args.dest), shell=True) 32 | 33 | 34 | master_dest = args.dest.rstrip('/') + "/master/" 35 | worker_dest = args.dest.rstrip('/') + "/workers/" 36 | 37 | if is_local: 38 | call('mkdir -p {dir}'.format(dir=master_dest), shell=True) 39 | call('mkdir -p {dir}'.format(dir=worker_dest), shell=True) 40 | 41 | desc = json.loads(Popen('gcloud dataproc clusters describe {name} --format json'.format(name=args.name), 42 | shell=True, 43 | stdout=PIPE, 44 | stderr=PIPE).communicate()[0].strip()) 45 | 46 | config = desc['config'] 47 | 48 | master = config['masterConfig']['instanceNames'][0] 49 | try: 50 | workers = config['workerConfig']['instanceNames'] + config['secondaryWorkerConfig']['instanceNames'] 51 | except KeyError: 52 | workers = config['workerConfig']['instanceNames'] 53 | zone = re.search('zones\/(?P<zone>\S+)$', config['gceClusterConfig']['zoneUri']).group('zone') 54 | 55 | if args.workers: 56 | invalid_workers = set(args.workers).difference(set(workers)) 57 | assert len(invalid_workers) == 0, "Non-existent workers specified: " + ", ".join(invalid_workers) 58 | workers = args.workers 59 | 60 | if args.take: 61 | assert args.take > 0 and args.take <= len(workers), "Number of workers to take must be in the range of [1, nWorkers]. Found {}.".format(args.take)
62 | workers = workers[:args.take] 63 | 64 | 65 | def gcloud_ssh(remote, command): 66 | return 'gcloud compute ssh {remote} --zone {zone} --command "{command}"'.format(remote=remote, zone=zone, command=command) 67 | 68 | 69 | def gcloud_copy_files(remote, src, dest): 70 | return 'gcloud compute copy-files {remote}:{src} {dest} --zone {zone}'.format(remote=remote, src=src, dest=dest, zone=zone) 71 | 72 | 73 | def gsutil_cp(src, dest): 74 | return 'gsutil -m cp -r {src} {dest}'.format(src=src, dest=dest) 75 | 76 | 77 | def copy_files_tmp(remote, files, dest, tmp): 78 | init_cmd = ['mkdir -p {tmp}; rm -r {tmp}/*'.format(tmp=tmp)] 79 | 80 | copy_tmp_cmds = ['sudo cp -r {file} {tmp}'.format(file=file, tmp=tmp) for file in files] 81 | copy_tmp_cmds.append('sudo chmod -R 777 {tmp}'.format(tmp=tmp)) 82 | 83 | if args.compress: 84 | copy_tmp_cmds.append('sudo find ' + tmp + ' -type f ! -name \'*.gz\' -exec gzip "{}" \;') 85 | 86 | call(gcloud_ssh(remote, '; '.join(init_cmd + copy_tmp_cmds)), shell=True) 87 | 88 | if not is_local: 89 | copy_dest_cmd = gcloud_ssh(remote, 'gsutil -m cp -r {tmp} {dest}'.format(tmp=tmp, dest=dest)) 90 | else: 91 | copy_dest_cmd = gcloud_copy_files(remote, tmp, dest) 92 | 93 | call(copy_dest_cmd, shell=True) 94 | 95 | 96 | if not args.no_diagnose: 97 | diagnose_tar_path = re.search('Diagnostic results saved in: (?Pgs:\/\/\S+diagnostic\.tar)', 98 | str(Popen('gcloud dataproc clusters diagnose {name}'.format(name=args.name), 99 | shell=True, 100 | stdout=PIPE, 101 | stderr=PIPE).communicate())).group('tarfile') 102 | 103 | call(gsutil_cp(diagnose_tar_path, args.dest), shell=True) 104 | 105 | 106 | master_log_files = [ '/var/log/hive/hive-*', 107 | '/var/log/google-dataproc-agent.0.log', 108 | '/var/log/dataproc-initialization-script-0.log', 109 | '/var/log/hadoop-mapreduce/mapred-mapred-historyserver*', 110 | '/var/log/hadoop-hdfs/*-m.*', 111 | '/var/log/hadoop-yarn/yarn-yarn-resourcemanager-*-m.*', 112 | args.hail_log 113 | ] 114 | 115 | copy_files_tmp(master, master_log_files, master_dest, '/tmp/' + master + '/') 116 | 117 | 118 | worker_log_files = ['/var/log/hadoop-hdfs/hadoop-hdfs-datanode-*.*', 119 | '/var/log/dataproc-startup-script.log', 120 | '/var/log/hadoop-yarn/yarn-yarn-nodemanager-*.*'] 121 | 122 | for worker in workers: 123 | copy_files_tmp(worker, worker_log_files, worker_dest, '/tmp/' + worker + '/') 124 | copy_files_tmp(worker, ['/var/log/hadoop-yarn/userlogs/'], args.dest, '/tmp/hadoop-yarn/') 125 | -------------------------------------------------------------------------------- /cloudtools/init_notebook.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import json 3 | import os 4 | import subprocess as sp 5 | import sys 6 | from subprocess import check_output 7 | 8 | 9 | if sys.version_info >= (3,0): 10 | decode = lambda s: s.decode() 11 | # Python 3 check_output returns a byte string 12 | else: 13 | # In Python 2, bytes and str are the same 14 | decode = lambda s: s 15 | 16 | if sys.version_info >= (3,7): 17 | def safe_call(*args): 18 | sp.run(args, capture_output=True, check=True) 19 | else: 20 | def safe_call(*args): 21 | try: 22 | sp.check_output(args, stderr=sp.STDOUT) 23 | except sp.CalledProcessError as e: 24 | print(decode(e.output)) 25 | raise e 26 | 27 | def get_metadata(key): 28 | return decode(check_output(['/usr/share/google/get_metadata_value', 'attributes/{}'.format(key)])) 29 | 30 | def mkdir_if_not_exists(path): 31 | try: 32 | os.makedirs(path) 33 | except OSError as e: 34 | 
if e.errno != os.errno.EEXIST: 35 | raise 36 | 37 | # get role of machine (master or worker) 38 | role = get_metadata('dataproc-role') 39 | 40 | if role == 'Master': 41 | # additional packages to install 42 | conda_pkgs = [ 43 | 'mkl<2020', 44 | 'numpy<2', 45 | 'scipy<2', 46 | # pandas uses minor version for backwards incompatible changes 47 | # https://pandas.pydata.org/pandas-docs/version/0.22/whatsnew.html 48 | 'pandas<0.24' 49 | ] 50 | pip_pkgs = [ 51 | 'seaborn<0.10', 52 | 'decorator==4.2.1', 53 | 'parsimonious<0.9', 54 | 'ipywidgets<8', 55 | 'jupyter_console<5', 56 | 'nbconvert<6', 57 | 'notebook<6', 58 | 'qtconsole<5', 59 | 'jupyter', 'tornado<6', # https://github.com/hail-is/hail/issues/5505 60 | 'lxml<5', 61 | 'jupyter-spark<0.5', 62 | 'bokeh<0.14', 63 | 'google-cloud==0.32.0', 64 | 'jgscm<0.2' 65 | ] 66 | if sys.version_info < (3,5): 67 | pip_pkgs.extend([ 68 | 'matplotlib<3', 69 | # ipython 6 requires python>=3.3 70 | 'ipython<6', 71 | # the jupyter metapackage has no version dependencies, so it always 72 | # pulls latest ipykernel, ipykernel >=5 requires python>=3.4 73 | 'ipykernel<5', 74 | ]) 75 | else: 76 | pip_pkgs.extend([ 77 | 'matplotlib<4', 78 | 'ipython<7', 79 | 'ipykernel<6', 80 | ]) 81 | 82 | # add user-requested packages 83 | try: 84 | user_pkgs = get_metadata('PKGS') 85 | except: 86 | pass 87 | else: 88 | pip_pkgs.extend(user_pkgs.split('|')) 89 | 90 | safe_call('/opt/conda/bin/conda', 'update', 'setuptools') 91 | 92 | print('conda packages are {}'.format(conda_pkgs)) 93 | command = ['/opt/conda/bin/conda', 'install'] 94 | command.extend(conda_pkgs) 95 | safe_call(*command) 96 | 97 | print('pip packages are {}'.format(pip_pkgs)) 98 | command = ['/opt/conda/bin/pip', 'install'] 99 | command.extend(pip_pkgs) 100 | safe_call(*command) 101 | 102 | py4j = decode(check_output('ls /usr/lib/spark/python/lib/py4j*', shell=True).strip()) 103 | 104 | print('getting metadata') 105 | 106 | jar_path = get_metadata('JAR') 107 | zip_path = get_metadata('ZIP') 108 | 109 | print('copying jar and zip') 110 | safe_call('gsutil', 'cp', jar_path, '/home/hail/hail.jar') 111 | safe_call('gsutil', 'cp', zip_path, '/home/hail/hail.zip') 112 | 113 | env_to_set = { 114 | 'PYTHONHASHSEED': '0', 115 | 'PYTHONPATH': 116 | '/usr/lib/spark/python/:{}:/home/hail/hail.zip'.format(py4j), 117 | 'SPARK_HOME': '/usr/lib/spark/', 118 | 'PYSPARK_PYTHON': '/opt/conda/bin/python', 119 | 'PYSPARK_DRIVER_PYTHON': '/opt/conda/bin/python' 120 | } 121 | 122 | print('setting environment') 123 | 124 | for e, value in env_to_set.items(): 125 | safe_call('/bin/sh', '-c', 126 | 'echo "export {}={}" | tee -a /etc/environment /usr/lib/spark/conf/spark-env.sh'.format(e, value)) 127 | 128 | conf_to_set = [ 129 | 'spark.jars=/home/hail/hail.jar', 130 | 'spark.executorEnv.PYTHONHASHSEED=0', 131 | 'spark.submit.pyFiles=/home/hail/hail.zip', 132 | 'spark.driver.extraClassPath=/home/hail/hail.jar', 133 | 'spark.executor.extraClassPath=./hail.jar' 134 | ] 135 | 136 | print('setting spark-defaults.conf') 137 | 138 | for c in conf_to_set: 139 | safe_call('/bin/sh', '-c', 'echo "{}" >> /etc/spark/conf/spark-defaults.conf'.format(c)) 140 | 141 | # modify custom Spark conf file to reference Hail jar and zip 142 | 143 | # create Jupyter kernel spec file 144 | kernel = { 145 | 'argv': [ 146 | '/opt/conda/bin/python', 147 | '-m', 148 | 'ipykernel', 149 | '-f', 150 | '{connection_file}' 151 | ], 152 | 'display_name': 'Hail', 153 | 'language': 'python', 154 | 'env': env_to_set 155 | } 156 | 157 | # write kernel spec file to default 
Jupyter kernel directory 158 | mkdir_if_not_exists('/opt/conda/share/jupyter/kernels/hail/') 159 | with open('/opt/conda/share/jupyter/kernels/hail/kernel.json', 'w') as f: 160 | json.dump(kernel, f) 161 | 162 | # create Jupyter configuration file 163 | mkdir_if_not_exists('/opt/conda/etc/jupyter/') 164 | with open('/opt/conda/etc/jupyter/jupyter_notebook_config.py', 'w') as f: 165 | opts = [ 166 | 'c.Application.log_level = "DEBUG"', 167 | 'c.NotebookApp.ip = "127.0.0.1"', 168 | 'c.NotebookApp.open_browser = False', 169 | 'c.NotebookApp.port = 8123', 170 | 'c.NotebookApp.token = ""', 171 | 'c.NotebookApp.contents_manager_class = "jgscm.GoogleStorageContentManager"' 172 | ] 173 | f.write('\n'.join(opts) + '\n') 174 | 175 | # setup jupyter-spark extension 176 | safe_call('/opt/conda/bin/jupyter', 'serverextension', 'enable', '--user', '--py', 'jupyter_spark') 177 | safe_call('/opt/conda/bin/jupyter', 'nbextension', 'install', '--user', '--py', 'jupyter_spark') 178 | safe_call('/opt/conda/bin/jupyter', 'nbextension', 'enable', '--user', '--py', 'jupyter_spark') 179 | safe_call('/opt/conda/bin/jupyter', 'nbextension', 'enable', '--user', '--py', 'widgetsnbextension') 180 | 181 | # create systemd service file for Jupyter notebook server process 182 | with open('/lib/systemd/system/jupyter.service', 'w') as f: 183 | opts = [ 184 | '[Unit]', 185 | 'Description=Jupyter Notebook', 186 | 'After=hadoop-yarn-resourcemanager.service', 187 | '[Service]', 188 | 'Type=simple', 189 | 'User=root', 190 | 'Group=root', 191 | 'WorkingDirectory=/home/hail/', 192 | 'ExecStart=/opt/conda/bin/python /opt/conda/bin/jupyter notebook --allow-root', 193 | 'Restart=always', 194 | 'RestartSec=1', 195 | '[Install]', 196 | 'WantedBy=multi-user.target' 197 | ] 198 | f.write('\n'.join(opts) + '\n') 199 | 200 | # add Jupyter service to autorun and start it 201 | safe_call('systemctl', 'daemon-reload') 202 | safe_call('systemctl', 'enable', 'jupyter') 203 | safe_call('service', 'jupyter', 'start') 204 | -------------------------------------------------------------------------------- /cloudtools/latest.py: -------------------------------------------------------------------------------- 1 | from .utils import latest_sha, load_config 2 | 3 | def init_parser(parser): 4 | parser.add_argument('version', type=str, choices=['0.1', '0.2'], 5 | help='Hail version to use (default: %(default)s).') 6 | parser.add_argument('spark', type=str, 7 | help='Spark version used to build Hail (default: 2.2.0 for 0.2 and 2.0.2 for 0.1)') 8 | parser.add_argument('--sha', action='store_true', help="Print the newest deployed SHA.") 9 | parser.add_argument('--jar', action='store_true', help="Print the location of the newest deployed jar.") 10 | parser.add_argument('--zip', action='store_true', help="Print the location of the newest deployed zip.") 11 | 12 | def main(args): 13 | sha = latest_sha(args.version, args.spark) 14 | if args.sha: 15 | print(sha) 16 | if args.jar or args.zip: 17 | config = load_config(sha, args.version) 18 | config.configure(sha, args.spark) 19 | if args.jar: 20 | print(config.jar()) 21 | if args.zip: 22 | print(config.zip()) 23 | -------------------------------------------------------------------------------- /cloudtools/list_clusters.py: -------------------------------------------------------------------------------- 1 | from subprocess import check_call 2 | 3 | def main(args): 4 | check_call(['gcloud', 'dataproc', 'clusters', 'list']) 5 | -------------------------------------------------------------------------------- 
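Note that `latest.py` above relies on `latest_sha` and `load_config` from `utils.py`, which is not included in this excerpt. Judging from `cluster_config.py`, `load_config` must return a `ClusterConfig` built from a JSON document with `vars` and `flags` keys. Below is a minimal sketch of how such a configuration is consumed; the bucket path, image string, and hash are invented for illustration:
```
import json
from cloudtools.cluster_config import ClusterConfig

# Hypothetical config in the shape cluster_config.py expects: a 'vars' dict used for
# string formatting and a 'flags' dict that becomes --key=value arguments to
# `gcloud dataproc clusters create`.
config_json = json.dumps({
    'vars': {'supported_spark': {'2.2.0': '1.2-deb9'}},
    'flags': {
        'image-version': '{image}',
        'metadata': {'JAR': 'gs://example-bucket/hail-{hash}.jar',
                     'ZIP': 'gs://example-bucket/hail-{hash}.zip'},
    },
})

conf = ClusterConfig(config_json)
conf.configure(sha='0123456789ab', spark='2.2.0')   # fills in vars['hash'] and vars['image']
print(conf.jar())                       # gs://example-bucket/hail-0123456789ab.jar
print(conf.zip())                       # gs://example-bucket/hail-0123456789ab.zip
print(conf.get_command('testcluster'))  # ['gcloud', 'dataproc', 'clusters', 'create', 'testcluster', ...]
```
`start.py`, further below, drives the same object: it extends `flags` with the command-line options and then runs the list returned by `get_command`.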
/cloudtools/modify.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from subprocess import check_call 3 | 4 | 5 | def init_parser(parser): 6 | parser.add_argument('name', type=str, help='Cluster name.') 7 | parser.add_argument('--jar', type=str, help='New Hail JAR.') 8 | parser.add_argument('--zip', type=str, help='New Hail ZIP.') 9 | parser.add_argument('--num-workers', '--n-workers', '-w', type=int, 10 | help='New number of worker machines (min. 2).') 11 | parser.add_argument('--num-preemptible-workers', '--n-pre-workers', '-p', type=int, 12 | help='New number of preemptible worker machines.') 13 | parser.add_argument('--graceful-decommission-timeout', '--graceful', type=str, 14 | help='If set, cluster size downgrade will use graceful decommissionnig with the given timeout (e.g. "60m").') 15 | parser.add_argument('--max-idle', type=str, help='New maximum idle time before shutdown (e.g. "60m").') 16 | parser.add_argument('--dry-run', action='store_true', help="Print gcloud dataproc command, but don't run it.") 17 | parser.add_argument('--zone', '-z', default='us-central1-b', type=str, 18 | help='Compute zone for Dataproc cluster (default: %(default)s).') 19 | 20 | def main(args): 21 | 22 | modify_args = [] 23 | if args.num_workers is not None: 24 | modify_args.append('--num-workers={}'.format(args.num_workers)) 25 | 26 | if args.num_preemptible_workers is not None: 27 | modify_args.append('--num-preemptible-workers={}'.format(args.num_preemptible_workers)) 28 | 29 | if args.graceful_decommission_timeout: 30 | if not modify_args: 31 | sys.exit("Error: Cannot use --graceful-decommission-timeout without resizing the cluster.") 32 | modify_args.append('--graceful-decommission-timeout={}'.format(args.graceful_decommission_timeout)) 33 | 34 | if args.max_idle: 35 | modify_args.append('--max-idle={}'.format(args.max_idle)) 36 | 37 | if modify_args: 38 | cmd = [ 39 | 'gcloud', 40 | 'dataproc', 41 | 'clusters', 42 | 'update', 43 | args.name] + modify_args 44 | 45 | if args.max_idle or args.graceful_decommission_timeout: 46 | cmd.insert(1, 'beta') 47 | 48 | # print underlying gcloud command 49 | print('gcloud update config command:') 50 | print(' '.join(cmd[:5]) + ' \\\n ' + ' \\\n '.join(cmd[5:])) 51 | 52 | # Update cluster 53 | if not args.dry_run: 54 | print("Updating cluster '{}'...".format(args.name)) 55 | check_call(cmd) 56 | 57 | if (args.jar is not None): 58 | print('gcloud jar update command(s):') 59 | _scp_and_sudo_move(args.jar, args.name, '/home/hail/hail.jar', args.zone) 60 | if (args.zip is not None): 61 | print('gcloud zip update command(s):') 62 | _scp_and_sudo_move(args.zip, args.name, '/home/hail/hail.zip', args.zone) 63 | 64 | 65 | # user doesn't have access to /home/hail/ so we copy then use sudo 66 | def _scp_and_sudo_move(source, destination_host, destination, zone): 67 | cmds = [] 68 | if source.startswith("gs://"): 69 | cmds.append([ 70 | 'gcloud', 71 | 'compute', 72 | 'ssh', 73 | '{}-m'.format(destination_host), 74 | '--zone={}'.format(zone), 75 | '--', 76 | 'sudo gsutil cp {} {}'.format(source, destination) 77 | ]) 78 | else: 79 | cmds.extend([ 80 | [ 81 | 'gcloud', 82 | 'compute', 83 | 'scp', 84 | '--zone={}'.format(zone), 85 | source, 86 | '{}-m:/tmp/foo'.format(destination_host) 87 | ], 88 | [ 89 | 'gcloud', 90 | 'compute', 91 | 'ssh', 92 | '{}-m'.format(destination_host), 93 | '--zone={}'.format(zone), 94 | '--', 95 | 'sudo mv /tmp/foo {}'.format(destination) 96 | ] 97 | ]) 98 | 99 | for cmd in cmds: 100 | 
print(cmd) 101 | check_call(cmd) 102 | -------------------------------------------------------------------------------- /cloudtools/safe_call.py: -------------------------------------------------------------------------------- 1 | from .utils import decode 2 | import subprocess as sp 3 | import sys 4 | 5 | def safe_call(*args): 6 | '''only print output on error''' 7 | try: 8 | sp.check_output(args, stderr=sp.STDOUT) 9 | except sp.CalledProcessError as e: 10 | print(decode(e.output)) 11 | raise e 12 | -------------------------------------------------------------------------------- /cloudtools/start.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import sys 4 | import re 5 | from subprocess import check_call 6 | from .utils import latest_sha, load_config, load_config_file 7 | 8 | COMPATIBILITY_VERSION = 1 9 | init_script = 'gs://hail-common/cloudtools/init_notebook{}.py'.format(COMPATIBILITY_VERSION) 10 | 11 | # master machine type to memory map, used for setting spark.driver.memory property 12 | machine_mem = { 13 | 'n1-standard-1': 3.75, 14 | 'n1-standard-2': 7.5, 15 | 'n1-standard-4': 15, 16 | 'n1-standard-8': 30, 17 | 'n1-standard-16': 60, 18 | 'n1-standard-32': 120, 19 | 'n1-standard-64': 240, 20 | 'n1-highmem-2': 13, 21 | 'n1-highmem-4': 26, 22 | 'n1-highmem-8': 52, 23 | 'n1-highmem-16': 104, 24 | 'n1-highmem-32': 208, 25 | 'n1-highmem-64': 416, 26 | 'n1-highcpu-2': 1.8, 27 | 'n1-highcpu-4': 3.6, 28 | 'n1-highcpu-8': 7.2, 29 | 'n1-highcpu-16': 14.4, 30 | 'n1-highcpu-32': 28.8, 31 | 'n1-highcpu-64': 57.6 32 | } 33 | 34 | 35 | def init_parser(parser): 36 | parser.add_argument('name', type=str, help='Cluster name.') 37 | 38 | # arguments with default parameters 39 | parser.add_argument('--hash', default='latest', type=str, 40 | help='Hail build to use for notebook initialization (default: %(default)s).') 41 | parser.add_argument('--spark', type=str, 42 | help='Spark version used to build Hail (default: 2.4.0 for 0.2 and 2.0.2 for 0.1)') 43 | parser.add_argument('--version', default='0.2', type=str, choices=['0.1', '0.2'], 44 | help='Hail version to use (default: %(default)s).') 45 | parser.add_argument('--master-machine-type', '--master', '-m', default='n1-highmem-8', type=str, 46 | help='Master machine type (default: %(default)s).') 47 | parser.add_argument('--master-memory-fraction', default=0.8, type=float, 48 | help='Fraction of master memory allocated to the JVM. ' 49 | 'Use a smaller value to reserve more memory ' 50 | 'for Python. 
(default: %(default)s)') 51 | parser.add_argument('--master-boot-disk-size', default=100, type=int, 52 | help='Disk size of master machine, in GB (default: %(default)s).') 53 | parser.add_argument('--num-master-local-ssds', default=0, type=int, 54 | help='Number of local SSDs to attach to the master machine (default: %(default)s).') 55 | parser.add_argument('--num-preemptible-workers', '--n-pre-workers', '-p', default=0, type=int, 56 | help='Number of preemptible worker machines (default: %(default)s).') 57 | parser.add_argument('--num-worker-local-ssds', default=0, type=int, 58 | help='Number of local SSDs to attach to each worker machine (default: %(default)s).') 59 | parser.add_argument('--num-workers', '--n-workers', '-w', default=2, type=int, 60 | help='Number of worker machines (default: %(default)s).') 61 | parser.add_argument('--preemptible-worker-boot-disk-size', default=40, type=int, 62 | help='Disk size of preemptible machines, in GB (default: %(default)s).') 63 | parser.add_argument('--worker-boot-disk-size', default=40, type=int, 64 | help='Disk size of worker machines, in GB (default: %(default)s).') 65 | parser.add_argument('--worker-machine-type', '--worker', 66 | help='Worker machine type (default: n1-standard-8, or n1-highmem-8 with --vep).') 67 | parser.add_argument('--zone', default='us-central1-b', 68 | help='Compute zone for the cluster (default: %(default)s).') 69 | parser.add_argument('--properties', 70 | help='Additional configuration properties for the cluster') 71 | parser.add_argument('--metadata', 72 | help='Comma-separated list of metadata to add: KEY1=VALUE1,KEY2=VALUE2...') 73 | parser.add_argument('--packages', '--pkgs', 74 | help='Comma-separated list of Python packages to be installed on the master node.') 75 | parser.add_argument('--project', help='Google Cloud project to start cluster (defaults to currently set project).') 76 | parser.add_argument('--configuration', help='Google Cloud configuration to start cluster (defaults to currently set configuration).') 77 | parser.add_argument('--max-idle', type=str, help='If specified, maximum idle time before shutdown (e.g. 60m).') 78 | parser.add_argument('--max-age', type=str, help='If specified, maximum age before shutdown (e.g. 
60m).') 79 | parser.add_argument('--bucket', type=str, help='The Google Cloud Storage bucket to use for cluster staging (just the bucket name, no gs:// prefix).') 80 | 81 | # specify custom Hail jar and zip 82 | parser.add_argument('--jar', help='Hail jar to use for Jupyter notebook.') 83 | parser.add_argument('--zip', help='Hail zip to use for Jupyter notebook.') 84 | 85 | # initialization action flags 86 | parser.add_argument('--init', default='', help='Comma-separated list of init scripts to run.') 87 | parser.add_argument('--init_timeout', default='20m', help='Flag to specify a timeout period for the initialization action') 88 | parser.add_argument('--vep', action='store_true', help='Configure the cluster to run VEP.') 89 | parser.add_argument('--vep-reference', default='GRCh37', help='Set the reference genome version for VEP.', 90 | choices=['GRCh37', 'GRCh38']) 91 | parser.add_argument('--dry-run', action='store_true', help="Print gcloud dataproc command, but don't run it.") 92 | 93 | # custom config file 94 | parser.add_argument('--config-file', help='Pass in a custom json file to load configurations.') 95 | 96 | 97 | def main(args): 98 | if not args.spark: 99 | args.spark = '2.4.0' if args.version == '0.2' else '2.0.2' 100 | 101 | if args.hash == 'latest': 102 | hash = latest_sha(args.version, args.spark) 103 | else: 104 | hash_length = len(args.hash) 105 | if hash_length < 12: 106 | raise ValueError('--hash expects a 12 character git commit hash, received {}'.format(args.hash)) 107 | elif hash_length > 12: 108 | print('--hash expects a 12 character git commit hash, I will truncate this longer hash to tweleve characters: {}'.format(args.hash), 109 | file=sys.stderr) 110 | hash = args.hash[0:12] 111 | else: 112 | hash = args.hash 113 | 114 | if not args.config_file: 115 | conf = load_config(hash, args.version) 116 | else: 117 | conf = load_config_file(args.config_file) 118 | 119 | if args.spark not in conf.vars['supported_spark'].keys(): 120 | sys.stderr.write("ERROR: Hail version '{}' requires one of Spark {}." 
121 | .format(args.version, ','.join(conf.vars['supported_spark'].keys()))) 122 | sys.exit(1) 123 | conf.configure(hash, args.spark) 124 | 125 | # parse Spark and HDFS configuration parameters, combine into properties argument 126 | conf.extend_flag('properties', 127 | { 128 | 'dataproc:dataproc.logging.stackdriver.enable': 'false', 129 | 'dataproc:dataproc.monitoring.stackdriver.enable': 'false' 130 | }) 131 | if args.properties: 132 | conf.parse_and_extend('properties', args.properties) 133 | 134 | # default to highmem machines if using VEP 135 | if not args.worker_machine_type: 136 | args.worker_machine_type = 'n1-highmem-8' if args.vep else 'n1-standard-8' 137 | 138 | # default initialization script to start up cluster with 139 | conf.extend_flag('initialization-actions', 140 | ['gs://dataproc-initialization-actions/conda/bootstrap-conda.sh', 141 | init_script]) 142 | # add VEP init script 143 | if args.vep: 144 | if args.version == '0.1': 145 | vep_init = 'gs://hail-common/vep/vep/vep85-init.sh' 146 | else: 147 | vep_init = 'gs://hail-common/vep/vep/vep{vep_version}-loftee-1.0-{vep_ref}-init-docker.sh'.format( 148 | vep_version=85 if args.vep_reference == 'GRCh37' else 95, 149 | vep_ref=args.vep_reference) 150 | conf.extend_flag('initialization-actions', [vep_init]) 151 | # add custom init scripts 152 | if args.init: 153 | conf.extend_flag('initialization-actions', args.init.split(',')) 154 | 155 | if args.jar and args.zip: 156 | conf.extend_flag('metadata', {'JAR': args.jar, 'ZIP': args.zip}) 157 | elif args.jar or args.zip: 158 | sys.stderr.write('ERROR: pass both --jar and --zip or neither') 159 | sys.exit(1) 160 | 161 | if args.metadata: 162 | conf.parse_and_extend('metadata', args.metadata) 163 | # if Python packages requested, add metadata variable 164 | if args.packages: 165 | metadata_pkgs = conf.flags['metadata'].get('PKGS') 166 | packages = [] 167 | split_regex = r'[|,]' 168 | if metadata_pkgs: 169 | packages.extend(re.split(split_regex, metadata_pkgs)) 170 | 171 | packages.extend(re.split(split_regex, args.packages)) 172 | conf.extend_flag('metadata', {'PKGS': '|'.join(packages)}) 173 | 174 | conf.vars['driver_memory'] = str(int(machine_mem[args.master_machine_type] * args.master_memory_fraction)) 175 | conf.flags['master-machine-type'] = args.master_machine_type 176 | conf.flags['master-boot-disk-size'] = '{}GB'.format(args.master_boot_disk_size) 177 | conf.flags['num-master-local-ssds'] = args.num_master_local_ssds 178 | conf.flags['num-preemptible-workers'] = args.num_preemptible_workers 179 | conf.flags['num-worker-local-ssds'] = args.num_worker_local_ssds 180 | conf.flags['num-workers'] = args.num_workers 181 | conf.flags['preemptible-worker-boot-disk-size']='{}GB'.format(args.preemptible_worker_boot_disk_size) 182 | conf.flags['worker-boot-disk-size'] = args.worker_boot_disk_size 183 | conf.flags['worker-machine-type'] = args.worker_machine_type 184 | conf.flags['zone'] = args.zone 185 | conf.flags['initialization-action-timeout'] = args.init_timeout 186 | if args.configuration: 187 | conf.flags['configuration'] = args.configuration 188 | if args.project: 189 | conf.flags['project'] = args.project 190 | if args.bucket: 191 | conf.flags['bucket'] = args.bucket 192 | 193 | # command to start cluster 194 | cmd = conf.get_command(args.name) 195 | 196 | if args.max_idle or args.max_age: 197 | cmd.insert(1, 'beta') 198 | if args.max_idle: 199 | cmd.append('--max-idle={}'.format(args.max_idle)) 200 | if args.max_age: 201 | cmd.append('--max-age={}'.format(args.max_age)) 202 
| 203 | # print underlying gcloud command 204 | print(' '.join(cmd[:5]) + ' \\\n ' + ' \\\n '.join(cmd[5:])) 205 | 206 | # spin up cluster 207 | if not args.dry_run: 208 | print("Starting cluster '{}'...".format(args.name)) 209 | check_call(cmd) 210 | -------------------------------------------------------------------------------- /cloudtools/stop.py: -------------------------------------------------------------------------------- 1 | from subprocess import check_call 2 | 3 | def init_parser(parser): 4 | parser.add_argument('name', type=str, help='Cluster name.') 5 | parser.add_argument('--async', action='store_true', help="Do not wait for cluster deletion.") 6 | 7 | def main(args): 8 | print("Stopping cluster '{}'...".format(args.name)) 9 | 10 | cmd = ['gcloud', 'dataproc', 'clusters', 'delete', '--quiet', args.name] 11 | if vars(args)['async']: 12 | cmd.append('--async') 13 | 14 | check_call(cmd) 15 | -------------------------------------------------------------------------------- /cloudtools/submit.py: -------------------------------------------------------------------------------- 1 | from subprocess import check_call 2 | import os 3 | import tempfile 4 | import zipfile 5 | 6 | try: 7 | standard_scripts = os.environ['HAIL_SCRIPTS'].split(':') 8 | except Exception: 9 | standard_scripts = None 10 | 11 | 12 | def init_parser(parser): 13 | parser.add_argument('name', type=str, help='Cluster name.') 14 | parser.add_argument('script', type=str) 15 | parser.add_argument('--files', required=False, type=str, help='Comma-separated list of files to add to the working directory of the Hail application.') 16 | parser.add_argument('--pyfiles', required=False, type=str, help='Comma-separated list of files (or directories with python files) to add to the PYTHONPATH.') 17 | parser.add_argument('--properties', '-p', required=False, type=str, help='Extra Spark properties to set.') 18 | 19 | 20 | def main(args, pass_through_args): 21 | print("Submitting to cluster '{}'...".format(args.name)) 22 | 23 | # create files argument 24 | files = '' 25 | if args.files: 26 | files = args.files 27 | pyfiles = [] 28 | if args.pyfiles: 29 | pyfiles.extend(args.pyfiles.split(',')) 30 | if standard_scripts: 31 | pyfiles.extend(standard_scripts) 32 | if pyfiles: 33 | tfile = tempfile.mkstemp(suffix='.zip', prefix='pyscripts_')[1] 34 | zipf = zipfile.ZipFile(tfile, 'w', zipfile.ZIP_DEFLATED) 35 | for hail_script_entry in pyfiles: 36 | if hail_script_entry.endswith('.py'): 37 | zipf.write(hail_script_entry, arcname=os.path.basename(hail_script_entry)) 38 | else: 39 | for root, _, pyfiles_walk in os.walk(hail_script_entry): 40 | for pyfile in pyfiles_walk: 41 | if pyfile.endswith('.py'): 42 | zipf.write(os.path.join(root, pyfile), 43 | os.path.relpath(os.path.join(root, pyfile), 44 | os.path.join(hail_script_entry, '..'))) 45 | zipf.close() 46 | pyfiles = tfile 47 | else: 48 | pyfiles = '' 49 | 50 | # create properties argument 51 | properties = '' 52 | if args.properties: 53 | properties = args.properties 54 | 55 | # pyspark submit command 56 | cmd = [ 57 | 'gcloud', 58 | 'dataproc', 59 | 'jobs', 60 | 'submit', 61 | 'pyspark', 62 | args.script, 63 | '--cluster={}'.format(args.name), 64 | '--files={}'.format(files), 65 | '--py-files={}'.format(pyfiles), 66 | '--properties={}'.format(properties) 67 | ] 68 | 69 | # append arguments to pass to the Hail script 70 | if pass_through_args is not None: 71 | cmd.append('--') 72 | cmd.extend(pass_through_args) 73 | 74 | # print underlying gcloud command 75 | print('gcloud command:') 
76 | print(' '.join(cmd[:6]) + ' \\\n ' + ' \\\n '.join(cmd[6:])) 77 | 78 | # submit job 79 | check_call(cmd) 80 | -------------------------------------------------------------------------------- /cloudtools/utils.py: -------------------------------------------------------------------------------- 1 | from .cluster_config import ClusterConfig 2 | import subprocess as sp 3 | import sys 4 | from . import __version__ 5 | 6 | 7 | if sys.version_info >= (3,0): 8 | decode = lambda s: s.decode() 9 | # Python 3 check_output returns a byte string 10 | else: 11 | # In Python 2, bytes and str are the same 12 | decode = lambda s: s 13 | 14 | def latest_sha(version, spark): 15 | cloudtools_version = __version__.strip().split('.') 16 | hash_file = 'gs://hail-common/builds/{}/latest-hash/cloudtools-{}-spark-{}.txt'.format( 17 | version, 18 | cloudtools_version[0], 19 | spark) 20 | return decode(sp.check_output(['gsutil', 'cat', hash_file]).strip()) 21 | 22 | 23 | def get_config_filename(sha, version): 24 | fname = 'gs://hail-common/builds/{version}/config/hail-config-{version}-{hash}.json'.format( 25 | version=version, hash=sha) 26 | if sp.call(['gsutil', '-q', 'stat', fname]) != 0: 27 | return 'gs://hail-common/builds/{version}/config/hail-config-{version}-default.json'.format( 28 | version=version) 29 | return fname 30 | 31 | 32 | def load_config_file(fname): 33 | if fname.startswith('gs://'): 34 | return ClusterConfig(sp.check_output(['gsutil', 'cat', fname]).strip()) 35 | return ClusterConfig(sp.check_output(['cat', fname]).strip()) 36 | 37 | 38 | def load_config(sha, version): 39 | return load_config_file(get_config_filename(sha, version)) 40 | -------------------------------------------------------------------------------- /cluster-sanity-check-0.1.py: -------------------------------------------------------------------------------- 1 | from hail import * 2 | 3 | hc = HailContext() 4 | vds = hc.import_vcf('gs://hail-1kg/1kg_coreexome.vcf.bgz') 5 | vds.count_variants() 6 | -------------------------------------------------------------------------------- /cluster-sanity-check-0.2.py: -------------------------------------------------------------------------------- 1 | import hail as hl 2 | 3 | mt = hl.import_vcf('gs://hail-1kg/1kg_coreexome.vcf.bgz') 4 | mt = mt.annotate_rows(x = 5) 5 | mt._force_count_rows() 6 | -------------------------------------------------------------------------------- /hail-ci-build-image: -------------------------------------------------------------------------------- 1 | gcr.io/broad-ctsa/cloud-tools-pr-builder:fb9831faa58bc1d436b0774bd7fcca9eaaed52d42a22ad60a32c6503333ff515 2 | -------------------------------------------------------------------------------- /hail-ci-build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | gcloud auth activate-service-account \ 6 | hail-ci-0-1@broad-ctsa.iam.gserviceaccount.com \ 7 | --key-file=/secrets/hail-ci-0-1.key 8 | 9 | gcloud config set project broad-ctsa 10 | 11 | shutdown_cluster() { 12 | trap "" INT TERM 13 | set +e 14 | for CLUSTER_NAME in $CLUSTERS 15 | do 16 | time gcloud dataproc clusters delete ${CLUSTER_NAME} --async 17 | done 18 | } 19 | trap shutdown_cluster EXIT 20 | 21 | trap "exit 42" INT TERM 22 | 23 | for PY_VERSION in 2 3 24 | do 25 | PIP=pip${PY_VERSION} 26 | 27 | $PIP install ./ 28 | 29 | # remove any symlinks to a default python 30 | ls -al $(which python) 31 | rm -rf $(which python) 32 | 33 | CLUSTER_NAME_0_2=cloudtools-ci-$(LC_CTYPE=C 
LC_ALL=C tr -dc 'a-z0-9' < /dev/urandom | head -c 8) 34 | CLUSTER_NAME_0_1=cloudtools-ci-$(LC_CTYPE=C LC_ALL=C tr -dc 'a-z0-9' < /dev/urandom | head -c 8) 35 | 36 | CLUSTERS="${CLUSTERS} ${CLUSTER_NAME_0_2} ${CLUSTER_NAME_0_1}" 37 | 38 | # check binary exists 39 | time cluster start --help 40 | 41 | SHA=$(cluster latest --sha 0.1 2.0.2) 42 | JAR=$(cluster latest --jar 0.1 2.0.2) 43 | ZIP=$(cluster latest --zip 0.1 2.0.2) 44 | gsutil ls $JAR 45 | gsutil ls $ZIP 46 | SHA=$(cluster latest --sha 0.2 2.2.0) 47 | JAR=$(cluster latest --jar 0.2 2.2.0) 48 | ZIP=$(cluster latest --zip 0.2 2.2.0) 49 | gsutil ls $JAR 50 | gsutil ls $ZIP 51 | 52 | # check 0.2 cluster starts and runs hail 53 | time cluster start ${CLUSTER_NAME_0_2} \ 54 | --version 0.2 \ 55 | --spark 2.2.0 \ 56 | --max-idle 40m \ 57 | --bucket=hail-ci-0-1-dataproc-staging-bucket 58 | time cluster submit ${CLUSTER_NAME_0_2} \ 59 | cluster-sanity-check-0.2.py 60 | time cluster stop --async ${CLUSTER_NAME_0_2} 61 | 62 | # check 0.1 cluster starts and runs hail 63 | time cluster start ${CLUSTER_NAME_0_1} \ 64 | --version 0.1 \ 65 | --spark 2.0.2 \ 66 | --max-idle 40m \ 67 | --bucket=hail-ci-0-1-dataproc-staging-bucket 68 | time cluster submit ${CLUSTER_NAME_0_1} \ 69 | cluster-sanity-check-0.1.py 70 | time cluster stop --async ${CLUSTER_NAME_0_1} 71 | 72 | yes | $PIP uninstall cloudtools 73 | done 74 | -------------------------------------------------------------------------------- /hail-ci-deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | PUBLISHED=$(pip --no-cache-dir search cloudtools | grep '^cloudtools' | sed 's/cloudtools (//' | sed 's/).*//') 6 | CURRENT=$(grep '__version__ = ' cloudtools/__init__.py | sed -E "s/.*__version__ = \'(.*)\'/\1/") 7 | 8 | if [[ "${PUBLISHED}" != "${CURRENT}" ]] 9 | then 10 | echo deploying ${CURRENT}, was ${PUBLISHED} 11 | set +x 12 | export TWINE_USERNAME=$(cat /secrets/pypi-username) 13 | export TWINE_PASSWORD=$(cat /secrets/pypi-password) 14 | set -x 15 | make deploy 16 | else 17 | echo nothing to do ${PUBLISHED} == ${CURRENT} 18 | fi 19 | -------------------------------------------------------------------------------- /pr-builder/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM alpine:3.8 2 | 3 | RUN apk --no-cache add \ 4 | python2 \ 5 | py2-pip \ 6 | python3 \ 7 | py3-pip \ 8 | bash \ 9 | git \ 10 | openssh \ 11 | curl \ 12 | make \ 13 | && \ 14 | pip2 --no-cache-dir install --upgrade twine wheel \ 15 | && \ 16 | pip3 --no-cache-dir install --upgrade twine wheel 17 | 18 | # this seems easier than getting the keys right for apt 19 | # 20 | # source: https://cloud.google.com/storage/docs/gsutil_install#linux 21 | RUN /bin/sh -c 'curl https://sdk.cloud.google.com | bash' && \ 22 | /root/google-cloud-sdk/bin/gcloud components install beta 23 | ENV PATH $PATH:/root/google-cloud-sdk/bin 24 | 25 | VOLUME /secrets 26 | WORKDIR / 27 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from cloudtools import __version__ 3 | 4 | setup(name='cloudtools', 5 | version=__version__, 6 | 
description='Collection of utilities for working on the Google Cloud Platform.', 7 | url='https://github.com/Nealelab/cloudtools', 8 | author='Liam Abbott', 9 | author_email='labbott@broadinstitute.org', 10 | license='MIT', 11 | classifiers=[ 12 | 'Development Status :: 3 - Alpha', 13 | 'License :: OSI Approved :: MIT License', 14 | 'Programming Language :: Python :: 3.6' 15 | ], 16 | keywords='google cloud dataproc spark jupyter hail notebook ipython', 17 | packages=['cloudtools'], 18 | install_requires=[ 19 | 'statistics;python_version<"3.4"', 20 | ], 21 | entry_points={ 22 | 'console_scripts': [ 23 | 'cluster = cloudtools.__main__:main' 24 | ] 25 | }, 26 | ) 27 | --------------------------------------------------------------------------------
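
A note on `ClusterConfig`: `cloudtools/start.py` and `cloudtools/utils.py` above drive cluster creation through a `ClusterConfig` object (`configure`, `extend_flag`, `parse_and_extend`, `get_command`, plus the `flags` and `vars` dicts). The snippet below is a minimal illustrative sketch of that interface, reconstructed purely from those call sites; the method bodies are assumptions for illustration and are not the actual `cloudtools/cluster_config.py`.

```
import json


class ClusterConfigSketch:
    """Illustrative stand-in for cloudtools.cluster_config.ClusterConfig.

    Method names and signatures mirror the call sites in start.py and utils.py;
    all bodies are assumptions, not the real implementation.
    """

    def __init__(self, json_text):
        # utils.py builds the config from the JSON text of a hail-config-*.json file.
        config = json.loads(json_text)
        self.vars = config.get('vars', {})    # e.g. 'supported_spark', 'driver_memory'
        self.flags = config.get('flags', {})  # per-flag values passed to gcloud

    def configure(self, sha, spark):
        # Assumed: record the Hail build hash and Spark version for later substitution.
        self.vars['hash'] = sha
        self.vars['spark'] = spark

    def extend_flag(self, flag, values):
        # start.py passes either a dict ('metadata', 'properties') or a list
        # ('initialization-actions'); merge into any existing value for that flag.
        if isinstance(values, dict):
            self.flags.setdefault(flag, {}).update(values)
        else:
            self.flags.setdefault(flag, []).extend(values)

    def parse_and_extend(self, flag, arg_string):
        # Assumed format: comma-separated key=value pairs, as gcloud accepts them.
        pairs = dict(kv.split('=', 1) for kv in arg_string.split(','))
        self.extend_flag(flag, pairs)

    def get_command(self, name):
        # Assumed: produce the argv list that start.py prints and hands to check_call.
        cmd = ['gcloud', 'dataproc', 'clusters', 'create', name]
        for flag, value in self.flags.items():
            if isinstance(value, dict):
                value = ','.join('{}={}'.format(k, v) for k, v in value.items())
            elif isinstance(value, list):
                value = ','.join(value)
            cmd.append('--{}={}'.format(flag, value))
        return cmd
```

Under these assumptions, `ClusterConfigSketch('{"vars": {}, "flags": {}}').get_command('mycluster')` (cluster name hypothetical) returns `['gcloud', 'dataproc', 'clusters', 'create', 'mycluster']`, which is consistent with the five-token prefix that `start.py` prints via `' '.join(cmd[:5])` and with the `beta` token being inserted at index 1 when `--max-idle` or `--max-age` is set.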