├── tests ├── __init__.py ├── data │ ├── dir │ │ └── test.jar │ ├── episodes.avro │ └── episodes.py ├── test_pricing.py ├── test_parser.py ├── test_poll.py └── test_sparksteps.py ├── sparksteps ├── __init__.py ├── poll.py ├── cluster.py ├── steps.py ├── pricing.py └── __main__.py ├── docs ├── overview.rst ├── modules.rst ├── index.rst ├── sparksteps.rst ├── Makefile ├── make.bat └── conf.py ├── .coveragerc ├── examples ├── episodes.avro ├── lib │ └── spark-avro_2.10-2.0.2-custom.jar ├── episodes.py └── wordcount.py ├── MANIFEST.in ├── requirements.txt ├── setup.cfg ├── .github └── workflows │ └── tests.yml ├── bootstrap └── install-jupyter-notebook.sh ├── tox.ini ├── setup.py ├── Makefile ├── CHANGELOG.rst ├── .gitignore ├── CODE_OF_CONDUCT.md ├── README.rst └── LICENSE /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sparksteps/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/data/dir/test.jar: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/overview.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | # .coveragerc to control coverage.py 2 | [run] 3 | branch=False 4 | source=sparksteps 5 | -------------------------------------------------------------------------------- /examples/episodes.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jwplayer/sparksteps/HEAD/examples/episodes.avro -------------------------------------------------------------------------------- /tests/data/episodes.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jwplayer/sparksteps/HEAD/tests/data/episodes.avro -------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | sparksteps 2 | ========== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 4 6 | 7 | sparksteps 8 | -------------------------------------------------------------------------------- /examples/lib/spark-avro_2.10-2.0.2-custom.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jwplayer/sparksteps/HEAD/examples/lib/spark-avro_2.10-2.0.2-custom.jar -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE README.rst CHANGELOG.rst requirements.txt tox.ini 2 | prune examples* 3 | prune bootstrap* 4 | recursive-include tests * 5 | recursive-exclude docs/_build * -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | boto3==1.14.46 2 | botocore==1.17.46 3 | docutils==0.15.2 4 | jmespath==0.10.0 5 | polling==0.3.0 6 | python-dateutil==2.8.1 7 | s3transfer==0.3.3 8 | six==1.15.0 9 | urllib3==1.26.5 10 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | SparkSteps: Launch Spark jobs on AWS EMR 2 | ======================================== 3 | 4 | .. only:: html 5 | 6 | :Release: |version| 7 | :Date: |today| 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | 12 | overview 13 | 14 | 15 | Indices and tables 16 | ================== 17 | 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | * :ref:`search` 21 | 22 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file=README.rst 3 | 4 | [aliases] 5 | test=pytest 6 | 7 | [tool:pytest] 8 | addopts=-vv --flake8 -m 'not integration' 9 | markers= 10 | integration 11 | 12 | [flake8] 13 | max-line-length=120 14 | exclude= 15 | .git, 16 | __pycache__, 17 | docs/conf.py, 18 | old, 19 | build, 20 | dist 21 | 22 | [bdist_wheel] 23 | universal=1 24 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | on: 3 | push: 4 | branches: 5 | - master 6 | pull_request: 7 | jobs: 8 | test: 9 | name: Test sparksteps on Python ${{ matrix.python_version }} 10 | strategy: 11 | matrix: 12 | python_version: [3.6, 3.7, 3.8] 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: Set up Python ${{ matrix.python_version }} 17 | uses: actions/setup-python@v2 18 | with: 19 | python-version: ${{ matrix.python_version }} 20 | - name: Run tests on Python ${{ matrix.python_version }} 21 | run: make test 22 | -------------------------------------------------------------------------------- /docs/sparksteps.rst: -------------------------------------------------------------------------------- 1 | sparksteps package 2 | ================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | sparksteps.cluster module 8 | ------------------------- 9 | 10 | .. automodule:: sparksteps.cluster 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | sparksteps.pricing module 16 | ------------------------- 17 | 18 | .. 
automodule:: sparksteps.pricing 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | sparksteps.steps module 24 | ----------------------- 25 | 26 | .. automodule:: sparksteps.steps 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | 32 | Module contents 33 | --------------- 34 | 35 | .. automodule:: sparksteps 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: 39 | -------------------------------------------------------------------------------- /bootstrap/install-jupyter-notebook.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x -e 3 | 4 | 5 | #Installing iPython Notebook 6 | if grep isMaster /mnt/var/lib/info/instance.json | grep true; 7 | then 8 | cd /home/hadoop 9 | sudo pip install virtualenv 10 | mkdir Jupyter 11 | cd Jupyter 12 | /usr/bin/virtualenv -p /usr/bin/python2.7 venv 13 | source venv/bin/activate 14 | 15 | #Install jupyter and dependency 16 | pip install --upgrade pip 17 | pip install jupyter requests numpy matplotlib s3cmd 18 | 19 | #Create profile 20 | # jupyter profile create default 21 | jupyter notebook --generate-config 22 | 23 | #Run on master /slave based on configuration 24 | 25 | echo "c = get_config()" > /home/hadoop/.jupyter/jupyter_notebook_config.py 26 | echo "c.NotebookApp.ip = '*'" >> /home/hadoop/.jupyter/jupyter_notebook_config.py 27 | echo "c.NotebookApp.open_browser = False" >> /home/hadoop/.jupyter/jupyter_notebook_config.py 28 | echo "c.NotebookApp.port = 8192" >> /home/hadoop/.jupyter/jupyter_notebook_config.py 29 | 30 | fi 31 | -------------------------------------------------------------------------------- /examples/episodes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """episodes.py test script. 3 | 4 | Prompt parameters: 5 | help (-h): argparse help 6 | input (-i): input path 7 | 8 | Examples: 9 | $ spark-submit \ 10 | --packages com.databricks:spark-avro_2.10:2.0.1 \ 11 | episodes.py \ 12 | --input episodes.avro 13 | 14 | """ 15 | 16 | import ntpath 17 | from argparse import RawDescriptionHelpFormatter, ArgumentParser 18 | import subprocess 19 | 20 | from pyspark import SparkContext 21 | from pyspark.sql import SQLContext 22 | 23 | parser = ArgumentParser(description=__doc__, 24 | formatter_class=RawDescriptionHelpFormatter) 25 | parser.add_argument('--input', '-i', required=True) 26 | args = parser.parse_args() 27 | 28 | if __name__ == "__main__": 29 | in_path = args.input 30 | 31 | filename = ntpath.basename(in_path) 32 | subprocess.call(["hadoop", "fs", "-put", in_path, filename]) 33 | 34 | sc = SparkContext(appName="Episodes") 35 | sqlContext = SQLContext(sc) 36 | df = sqlContext.read.format("com.databricks.spark.avro").load(filename) 37 | df.first() 38 | -------------------------------------------------------------------------------- /tests/data/episodes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """episodes.py test script. 
3 | 4 | Prompt parameters: 5 | help (-h): argparse help 6 | input (-i): input path 7 | 8 | Examples: 9 | $ spark-submit \ 10 | --packages com.databricks:spark-avro_2.10:2.0.1 \ 11 | episodes.py \ 12 | --input episodes.avro 13 | 14 | """ 15 | 16 | import ntpath 17 | from argparse import RawDescriptionHelpFormatter, ArgumentParser 18 | import subprocess 19 | 20 | from pyspark import SparkContext 21 | from pyspark.sql import SQLContext 22 | 23 | parser = ArgumentParser(description=__doc__, 24 | formatter_class=RawDescriptionHelpFormatter) 25 | parser.add_argument('--input', '-i', required=True) 26 | args = parser.parse_args() 27 | 28 | if __name__ == "__main__": 29 | in_path = args.input 30 | 31 | filename = ntpath.basename(in_path) 32 | subprocess.call(["hadoop", "fs", "-put", in_path, filename]) 33 | 34 | sc = SparkContext(appName="Episodes") 35 | sqlContext = SQLContext(sc) 36 | df = sqlContext.read.format("com.databricks.spark.avro").load(filename) 37 | df.first() 38 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = docs,flake8,py36,py37,py38 3 | skipsdist=True 4 | 5 | [testenv] 6 | usedevelop=True 7 | deps=-r{toxinidir}/requirements.txt 8 | commands= 9 | {envpython} setup.py test 10 | setenv= 11 | PYTHONWARNINGS=always::DeprecationWarning 12 | passenv=TRAVIS 13 | 14 | [testenv:flake8] 15 | usedevelop=False 16 | skip_install=True 17 | deps= 18 | flake8 19 | commands= 20 | flake8 --version 21 | flake8 setup.py sparksteps 22 | 23 | [testenv:docs] 24 | # only work if first run python setup.py develop 25 | commands= 26 | rm -rf {toxinidir}/docs/_build 27 | make -C {toxinidir}/docs html 28 | whitelist_externals= 29 | rm 30 | make 31 | 32 | [testenv:upload] 33 | deps=wheel 34 | twine 35 | commands= 36 | python setup.py clean --all rotate -k - -m .whl,.tar.gz,.zip 37 | python setup.py -q egg_info 38 | python setup.py -q sdist --formats zip bdist_wheel register 39 | 40 | [testenv:dist] 41 | deps= wheel 42 | whitelist_externals = rm 43 | commands= 44 | python setup.py -q clean --all 45 | python setup.py -q rotate -k 0 -m .egg,.zip,.whl,.tar.gz 46 | python setup.py -q egg_info 47 | python setup.py -q sdist --formats zip,bztar bdist_wheel upload 48 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Distutils setup file, used to install or test 'sparksteps'.""" 3 | import textwrap 4 | 5 | from setuptools import setup, find_packages 6 | 7 | with open('README.rst') as f: 8 | readme = f.read() 9 | 10 | setup( 11 | name='sparksteps', 12 | description='Workflow tool to launch Spark jobs on AWS EMR', 13 | long_description=readme, 14 | packages=find_packages(exclude=['tests', 'examples', 'bootstrap']), 15 | use_scm_version=True, 16 | author='Kamil Sindi', 17 | author_email='kamil@jwplayer.com', 18 | url='https://github.com/jwplayer/sparksteps', 19 | keywords='aws emr pyspark spark boto'.split(), 20 | license='Apache License 2.0', 21 | install_requires=[ 22 | 'boto3>=1.3.1', 23 | 'polling==0.3.0' 24 | ], 25 | setup_requires=[ 26 | 'setuptools_scm', 27 | 'sphinx_rtd_theme', 28 | ], 29 | include_package_data=True, 30 | zip_safe=False, 31 | entry_points={ 32 | 'console_scripts': [ 33 | 'sparksteps=sparksteps.__main__:main' 34 | ] 35 | }, 36 | classifiers=textwrap.dedent(""" 37 | Development Status :: 4 - Beta 
38 | Intended Audience :: Developers 39 | License :: OSI Approved :: Apache Software License 40 | Environment :: Console 41 | Programming Language :: Python :: 3.6 42 | Programming Language :: Python :: 3.7 43 | Programming Language :: Python :: 3.8 44 | """).strip().splitlines(), 45 | python_requires='>=3.6' 46 | ) 47 | -------------------------------------------------------------------------------- /examples/wordcount.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | from __future__ import print_function 19 | 20 | import sys 21 | from operator import add 22 | 23 | from pyspark.sql import SparkSession 24 | 25 | 26 | if __name__ == "__main__": 27 | if len(sys.argv) != 2: 28 | print("Usage: wordcount ", file=sys.stderr) 29 | exit(-1) 30 | 31 | spark = SparkSession\ 32 | .builder\ 33 | .appName("PythonWordCount")\ 34 | .getOrCreate() 35 | 36 | lines = spark.read.text(sys.argv[1]).rdd.map(lambda r: r[0]) 37 | counts = lines.flatMap(lambda x: x.split(' ')) \ 38 | .map(lambda x: (x, 1)) \ 39 | .reduceByKey(add) 40 | output = counts.collect() 41 | for (word, count) in output: 42 | print("%s: %i" % (word, count)) 43 | 44 | spark.stop() 45 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean-pyc clean-build docs clean build install install-all version 2 | 3 | help: 4 | @echo "clean-build - remove build artifacts" 5 | @echo "clean-test - remove Python file artifacts" 6 | @echo "clean-eggs - remove cached eggs" 7 | @echo "build - build package" 8 | @echo "lint - check style with flake8" 9 | @echo "test - run tests quickly with the default Python" 10 | @echo "test-all - run tests on every Python version with tox" 11 | @echo "docs - generate Sphinx HTML documentation, including API docs" 12 | @echo "release - package and upload a release" 13 | @echo "dist - package" 14 | 15 | clean: clean-build clean-test clean-eggs 16 | rm -rf htmlcov/ 17 | 18 | clean-build: 19 | rm -rf build/ 20 | rm -rf dist/ 21 | rm -rf *.egg-info 22 | 23 | .PHONY: clean-test 24 | clean-test: 25 | find . 
| grep -E "(__pycache__|\.pyc|\.pyo$$)" | xargs rm -rf 26 | rm -rf .pytest_cache/ 27 | 28 | .PHONY: clean-eggs 29 | clean-eggs: 30 | rm -rf .eggs/ 31 | 32 | .PHONY: build 33 | build: clean-build clean-eggs 34 | python3 setup.py build_ext --inplace 35 | 36 | install: clean-build 37 | python3 setup.py install 38 | 39 | install-all: 40 | pip install -e .[all] 41 | 42 | lint: 43 | pytest --flake8 sparksteps tests 44 | 45 | test: install-all 46 | pip install -U \ 47 | pytest \ 48 | pytest-flake8 \ 49 | moto 50 | python3 -m pytest 51 | 52 | test-all: 53 | tox 54 | 55 | version: 56 | python setup.py --version 57 | 58 | docs: 59 | rm -f docs/sparksteps.rst 60 | rm -f docs/modules.rst 61 | sphinx-apidoc -o docs/ sparksteps 62 | $(MAKE) -C docs clean 63 | $(MAKE) -C docs html 64 | xdg-open docs/_build/html/index.html 65 | 66 | .PHONY: release 67 | release: clean build 68 | python3 setup.py sdist bdist_wheel 69 | twine check dist/* 70 | twine upload --verbose dist/* 71 | 72 | .PHONY: dist 73 | dist: clean build 74 | python3 setup.py sdist bdist_wheel 75 | twine check dist/* 76 | -------------------------------------------------------------------------------- /sparksteps/poll.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Utilities for polling for cluster status to determine if it's in a terminal state. 4 | """ 5 | import logging 6 | from polling import poll 7 | 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | NON_TERMINAL_STATES = frozenset(['PENDING', 'RUNNING', 'CONTINUE', 'CANCEL_PENDING']) 12 | FAILED_STATE = frozenset(['CANCELLED', 'FAILED', 'INTERRUPTED']) 13 | 14 | 15 | def failure_message_from_response(response): 16 | """ 17 | Given EMR response, returns a descriptive error message 18 | """ 19 | fail_details = response['Step']['Status'].get('FailureDetails') 20 | if fail_details: 21 | return 'for reason {} with message {} and log file {}'\ 22 | .format( 23 | fail_details.get('Reason'), 24 | fail_details.get('Message'), 25 | fail_details.get('LogFile') 26 | ) 27 | 28 | 29 | def is_step_complete(emr_client, jobflow_id, step_id): 30 | """ 31 | Will query EMR for step status, returns True if complete, False otherwise 32 | """ 33 | response = emr_client.describe_step(ClusterId=jobflow_id, StepId=step_id) 34 | 35 | if not response['ResponseMetadata']['HTTPStatusCode'] == 200: 36 | logger.info('Bad HTTP response: %s', response) 37 | return False 38 | 39 | state = response['Step']['Status']['State'] 40 | logger.info('Job flow currently %s', state) 41 | 42 | if state in NON_TERMINAL_STATES: 43 | return False 44 | 45 | if state in FAILED_STATE: 46 | final_message = 'EMR job failed' 47 | failure_message = failure_message_from_response(response) 48 | if failure_message: 49 | final_message += ' ' + failure_message 50 | raise Exception(final_message) 51 | 52 | return True 53 | 54 | 55 | def wait_for_step_complete(emr_client, jobflow_id, step_id, sleep_interval_s): 56 | """ 57 | Will poll EMR until provided step has a terminal status 58 | """ 59 | poll( 60 | is_step_complete, 61 | args=(emr_client, jobflow_id, step_id), 62 | step=sleep_interval_s, 63 | poll_forever=True 64 | ) 65 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | .. 
:changelog: 2 | 3 | Changelog 4 | ========= 5 | 6 | Releases 7 | -------- 8 | 9 | v3.0.1 (2020-12-23) 10 | ~~~~~~~~~~~~~~~~~~~ 11 | 12 | * Fixed an issue where `get_bid_price` would always base the instance bid price on the zone with the lowest current instance price, even though the cluster may not be launched in that AZ. 13 | 14 | v3.0.0 (2020-08-20) 15 | ~~~~~~~~~~~~~~~~~~~ 16 | 17 | * Fix `determine_best_price` returning a spot price that would be below the current spot price in some conditions. 18 | * Dropped support for Python 3.5. 19 | 20 | 21 | v2.2.1 (2019-11-04) 22 | ~~~~~~~~~~~~~~~~~~~ 23 | 24 | * Fix `get_demand_price` returning 0.00 for various instance types. 25 | 26 | 27 | v2.2.0 (2019-09-19) 28 | ~~~~~~~~~~~~~~~~~~~ 29 | 30 | * Support S3 paths in the `uploads` CLI option. A copy step will be added to the EMR cluster which will copy into /home/hadoop from the provided remote path. 31 | * Add option `--service-role` to configure EMR service role beyond the default `EMR_DefaultRole`. 32 | 33 | 34 | v2.1.0 (2019-08-27) 35 | ~~~~~~~~~~~~~~~~~~~ 36 | 37 | * Add `wait` CLI option. When `--wait` is passed, waits for EMR cluster steps to complete before application exits, sleeping 150 seconds (default) between each poll attempt. An optional integer value can be passed to specify the polling interval to use, in seconds. 38 | 39 | 40 | v2.0.0 (2019-07-31) 41 | ~~~~~~~~~~~~~~~~~~~ 42 | 43 | * Add `s3-path` CLI argument to optionally configure the path prefix used when writing sparksteps related assets such as sources (file uploads) and logs. 44 | 45 | **NOTE:** This is a backwards incompatible change as `sources/` and `logs/` are now written to the location specified by the `s3-path` argument. 46 | Prior to this change logs were written to `s3://S3_BUCKET/logs/sparksteps` and uploads to `s3://S3_BUCKET/sparksteps/sources`. 47 | 48 | 49 | v1.1.1 (2019-07-22) 50 | ~~~~~~~~~~~~~~~~~~~ 51 | 52 | * Raise an error if one of the file or directory paths provided do not exist 53 | 54 | 55 | v1.1.0 (2019-07-13) 56 | ~~~~~~~~~~~~~~~~~~~ 57 | 58 | * Add `jobflow_role` CLI argument to configure cluster EC2 Instance Profile 59 | * Add `app-list` CLI argument to configure list of Applications installed on cluster 60 | 61 | 62 | v1.0.0 (2019-07-03) 63 | ~~~~~~~~~~~~~~~~~~~ 64 | 65 | * Drop support for Python 2 66 | * `defaults` CLI parameter value schema to support arbitrary classifications 67 | 68 | 69 | v0.4.0 (2017-01-03) 70 | ~~~~~~~~~~~~~~~~~~~ 71 | 72 | * First upload to PyPI. 73 | -------------------------------------------------------------------------------- /tests/test_pricing.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit/Integration Tests for the pricing module. 3 | 4 | Integration Tests (i.e tests that perform actual queries / make HTTP requests) 5 | are marked appropriately using PyTest markers. 6 | """ 7 | import pytest 8 | 9 | import boto3 10 | 11 | from sparksteps.pricing import get_bid_price, get_demand_price, determine_best_price, Zone 12 | 13 | # The price for an m4.large on-demand Linux instance in us-east-1. 14 | M4_LARGE_OD_PRICE = 0.100000 15 | 16 | 17 | @pytest.fixture 18 | def ec2(): 19 | """ 20 | In order to test pricing mechanics, we need to be able to make actual requests AWS. 21 | Since we're actually communicating with AWS here this makes this tests using this fixture 22 | more of an integration test than a unit test. 
23 | """ 24 | client = boto3.client('ec2') 25 | return client 26 | 27 | 28 | @pytest.fixture 29 | def pricing_client(): 30 | """ 31 | Boto3 Pricing Client. 32 | """ 33 | return boto3.client('pricing') 34 | 35 | 36 | @pytest.mark.integration 37 | class TestPricingIntegration: 38 | def test_get_demand_price(self, pricing_client): 39 | price = get_demand_price(pricing_client, 'm4.large') 40 | # Note: this test assumes that AWS doesn't 41 | # change their on-demand price. 42 | assert price == M4_LARGE_OD_PRICE 43 | 44 | def test_get_bid_price(self, ec2, pricing_client): 45 | bid_price, is_spot = get_bid_price(ec2, pricing_client, 'm4.large') 46 | if is_spot: 47 | assert bid_price > 0. 48 | else: 49 | assert bid_price == get_demand_price('us-east-1', 'm4.large') 50 | 51 | 52 | class TestPricing: 53 | def test_determine_best_spot_price(self): 54 | aws_zone = Zone('us-east-1d', 0.90, 0.83, (0.9+0.83) / 2, 0.8617) 55 | # on-demand price for c5d.9xlarge nodes in us-east-1 56 | demand_price = 1.728 57 | bid_price, use_spot = determine_best_price(demand_price, aws_zone) 58 | assert use_spot is True 59 | assert bid_price > aws_zone.current 60 | 61 | def test_determine_best_price(self): 62 | demand_price = 1.728 63 | aws_zone = Zone('us-east-1a', demand_price, demand_price, demand_price, demand_price) 64 | # If the spot price is very close to the on-demand price, 65 | # then we should just be using on-demand pricing instead. 66 | bid_price, use_spot = determine_best_price(demand_price, aws_zone) 67 | assert use_spot is False 68 | assert bid_price == demand_price 69 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 92 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 93 | 94 | # User-specific stuff: 95 | .idea/workspace.xml 96 | .idea/tasks.xml 97 | .idea/dictionaries 98 | .idea/vcs.xml 99 | .idea/jsLibraryMappings.xml 100 | 101 | # Sensitive or high-churn files: 102 | .idea/dataSources.ids 103 | .idea/dataSources.xml 104 | .idea/dataSources.local.xml 105 | .idea/sqlDataSources.xml 106 | .idea/dynamic.xml 107 | .idea/uiDesigner.xml 108 | 109 | # Gradle: 110 | .idea/gradle.xml 111 | .idea/libraries 112 | 113 | # Mongo Explorer plugin: 114 | .idea/mongoSettings.xml 115 | 116 | ## File-based project format: 117 | *.iws 118 | 119 | ## Plugin-specific files: 120 | 121 | # IntelliJ 122 | /out/ 123 | 124 | # mpeltonen/sbt-idea plugin 125 | .idea_modules/ 126 | 127 | # JIRA plugin 128 | atlassian-ide-plugin.xml 129 | 130 | # Crashlytics plugin (for Android Studio and IntelliJ) 131 | com_crashlytics_export_strings.xml 132 | crashlytics.properties 133 | crashlytics-build.properties 134 | fabric.properties 135 | 136 | !examples/lib 137 | 138 | .idea/ 139 | -------------------------------------------------------------------------------- /tests/test_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Test Parser.""" 3 | import shlex 4 | from sparksteps import __main__ 5 | 6 | 7 | def test_parser(): 8 | parser = __main__.create_parser() 9 | cmd_args_str = """episodes.py \ 10 | --jobflow-role MyCustomRole \ 11 | --s3-bucket my-bucket \ 12 | --aws-region us-east-1 \ 13 | --release-label emr-4.7.0 \ 14 | --uploads examples/dir examples/episodes.avro \ 15 | --submit-args="--jars /home/hadoop/lib/spark-avro_2.10-2.0.2.jar" \ 16 | --app-args="--input /home/hadoop/episodes.avro" \ 17 | --app-list Hadoop Hive Spark \ 18 | --num-core 1 \ 19 | --tags Name=MyName CostCenter=MyCostCenter \ 20 | --defaults spark-defaults key=value another_key=another_value \ 21 | --maximize-resource-allocation \ 22 | --debug \ 23 | --wait 24 | """ 25 | args = __main__.parse_cli_args(parser, args=shlex.split(cmd_args_str)) 26 | assert args['app'] == 'episodes.py' 27 | assert args['jobflow_role'] == 'MyCustomRole' 28 | assert args['s3_bucket'] == 'my-bucket' 29 | assert args['app_args'] == ['--input', '/home/hadoop/episodes.avro'] 30 | assert args['app_list'] == ['Hadoop', 'Hive', 'Spark'] 31 | assert args['debug'] is True 32 | assert args['defaults'] == ['spark-defaults', 'key=value', 'another_key=another_value'] 33 | assert 
args['instance_type_master'] == 'm4.large' 34 | assert args['release_label'] == 'emr-4.7.0' 35 | assert args['submit_args'] == ['--jars', 36 | '/home/hadoop/lib/spark-avro_2.10-2.0.2.jar'] 37 | assert args['uploads'] == ['examples/dir', 'examples/episodes.avro'] 38 | assert args['tags'] == ['Name=MyName', 'CostCenter=MyCostCenter'] 39 | assert args['maximize_resource_allocation'] is True 40 | assert args['num_core'] == 1 41 | assert args['wait'] == 150 42 | 43 | 44 | def test_parser_with_bootstrap(): 45 | parser = __main__.create_parser() 46 | cmd_args_str = """episodes.py \ 47 | --s3-bucket my-bucket \ 48 | --aws-region us-east-1 \ 49 | --release-label emr-4.7.0 \ 50 | --uploads examples/dir examples/episodes.avro \ 51 | --submit-args="--jars /home/hadoop/lib/spark-avro_2.10-2.0.2.jar" \ 52 | --app-args="--input /home/hadoop/episodes.avro" \ 53 | --num-core 1 \ 54 | --tags Name=MyName CostCenter=MyCostCenter \ 55 | --defaults spark-defaults key=value another_key=another_value \ 56 | --bootstrap-script s3://bucket/bootstrap-actions.sh \ 57 | --debug 58 | """ 59 | args = __main__.parse_cli_args(parser, args=shlex.split(cmd_args_str)) 60 | assert args['app'] == 'episodes.py' 61 | assert args['s3_bucket'] == 'my-bucket' 62 | assert args['app_args'] == ['--input', '/home/hadoop/episodes.avro'] 63 | assert args['debug'] is True 64 | assert args['defaults'] == ['spark-defaults', 'key=value', 'another_key=another_value'] 65 | assert args['instance_type_master'] == 'm4.large' 66 | assert args['release_label'] == 'emr-4.7.0' 67 | assert args['submit_args'] == ['--jars', 68 | '/home/hadoop/lib/spark-avro_2.10-2.0.2.jar'] 69 | assert args['uploads'] == ['examples/dir', 'examples/episodes.avro'] 70 | assert args['tags'] == ['Name=MyName', 'CostCenter=MyCostCenter'] 71 | assert args['bootstrap_script'] == 's3://bucket/bootstrap-actions.sh' 72 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, political orientation, ideology, or sexual identity and orientation. 6 | 7 | We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community. 8 | 9 | ## Our Standards 10 | 11 | Examples of behavior that contributes to a positive environment for our community include: 12 | 13 | * Demonstrating empathy and kindness toward other people 14 | * Being respectful of differing opinions, viewpoints, and experiences 15 | * Giving and gracefully accepting constructive feedback 16 | * Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience 17 | * Focusing on what is best not just for us as individuals, but for the overall community 18 | * Communicate generously. Assume that contributors lack context rather than experience. 
19 | 20 | Examples of unacceptable behavior include: 21 | * The use of sexualized language or imagery, and sexual attention or advances of any kind 22 | * Trolling, insulting or derogatory comments, and personal or political attacks 23 | * Public or private harassment 24 | * Publishing others’ private information, such as a physical or email address, without their explicit permission 25 | * Other conduct which could reasonably be considered inappropriate in a professional setting 26 | 27 | ## Enforcement Responsibilities 28 | 29 | Project maintainers are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, and will communicate reasons for moderation decisions when appropriate. 32 | 33 | ## Scope 34 | 35 | This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. Examples of representing our community include using an official email address, posting via an official social media account, or acting as an appointed representative at an online or offline event. 36 | 37 | ## Enforcement 38 | 39 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the project maintainers responsible for enforcement at conduct@jwplayer.com. All complaints will be reviewed and investigated promptly and fairly. 40 | 41 | All project maintainers are obligated to respect the privacy and security of the reporter of any incident. 42 | 43 | ## Attribution 44 | 45 | This Code of Conduct is adapted from the Contributor Covenant, version 2.0, 46 | available at https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 47 | 48 | Community Impact Guidelines were inspired by Mozilla’s code of conduct enforcement ladder. 49 | 50 | For answers to common questions about this code of conduct, see the FAQ at 51 | https://www.contributor-covenant.org/faq. Translations are available at https://www.contributor-covenant.org/translations. 
52 | -------------------------------------------------------------------------------- /tests/test_poll.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Test Poll logic.""" 3 | import os 4 | import pytest 5 | import boto3 6 | 7 | from unittest.mock import MagicMock, patch 8 | 9 | from moto import mock_emr, mock_s3 10 | from moto.emr.models import emr_backends 11 | 12 | from sparksteps.cluster import emr_config 13 | from sparksteps.poll import failure_message_from_response, is_step_complete, wait_for_step_complete 14 | 15 | 16 | @pytest.fixture(scope='function') 17 | def aws_credentials(): 18 | """ 19 | Mocked AWS Credentials for moto to prevent impact to real infrastructure 20 | """ 21 | os.environ['AWS_ACCESS_KEY_ID'] = 'testing' 22 | os.environ['AWS_SECRET_ACCESS_KEY'] = 'testing' 23 | os.environ['AWS_SECURITY_TOKEN'] = 'testing' 24 | os.environ['AWS_SESSION_TOKEN'] = 'testing' 25 | 26 | 27 | @pytest.fixture(scope='function') 28 | def emr_client(aws_credentials): 29 | with mock_emr(): 30 | yield boto3.client('emr', region_name='us-east-1') 31 | 32 | 33 | @pytest.fixture(scope='function') 34 | def s3_client(aws_credentials): 35 | with mock_s3(): 36 | yield boto3.client('s3', region_name='us-east-1') 37 | 38 | 39 | def test_failure_message_from_response(): 40 | """ 41 | Ensure failure_message_from_response returns expected string 42 | """ 43 | mock_aws_response = { 44 | 'Step': { 45 | 'Status': { 46 | 'FailureDetails': { 47 | 'Reason': 'error-reason', 48 | 'Message': 'error-message', 49 | 'LogFile': '/path/to/logfile' 50 | } 51 | } 52 | } 53 | } 54 | expected = 'for reason error-reason with message error-message and log file /path/to/logfile' 55 | actual = failure_message_from_response(mock_aws_response) 56 | assert expected == actual, 'Mismatch, Expected: {}, Actual: {}'.format(expected, actual) 57 | 58 | del mock_aws_response['Step']['Status']['FailureDetails'] 59 | assert failure_message_from_response(mock_aws_response) is None, \ 60 | 'Expected None when FailureDetails key is missing from response.' 
61 | 62 | 63 | def set_step_state(step_id, cluster_id, new_state): 64 | """ 65 | Helper to update the state of a step 66 | """ 67 | for step in emr_backends['us-east-1'].clusters[cluster_id].steps: 68 | if step.id == step_id: 69 | step.state = new_state 70 | 71 | 72 | def test_is_step_complete(emr_client, s3_client): 73 | """ 74 | Ensure is_step_complete returns expected boolean value 75 | """ 76 | cluster_config = emr_config( 77 | 'emr-5.2.0', 78 | instance_type_master='m4.large', 79 | jobflow_role='MyCustomRole', 80 | keep_alive=False, 81 | instance_type_core='m4.2xlarge', 82 | instance_type_task='m4.2xlarge', 83 | num_core=1, 84 | num_task=1, 85 | bid_price_task='0.1', 86 | maximize_resource_allocation=True, 87 | name='Test SparkSteps', 88 | app_list=['hadoop', 'hive', 'spark'] 89 | ) 90 | response = emr_client.run_job_flow(**cluster_config) 91 | cluster_id = response['JobFlowId'] 92 | 93 | test_step = { 94 | 'Name': 'test-step', 95 | 'ActionOnFailure': 'CANCEL_AND_WAIT', 96 | 'HadoopJarStep': { 97 | 'Jar': 'command-runner.jar', 98 | 'Args': ['state-pusher-script'] 99 | } 100 | } 101 | response = emr_client.add_job_flow_steps(JobFlowId=cluster_id, Steps=[test_step]) 102 | last_step_id = response['StepIds'][-1] 103 | 104 | # while the step state is non-terminal is_step_complete should return False 105 | for state in ['PENDING', 'RUNNING', 'CONTINUE', 'CANCEL_PENDING']: 106 | set_step_state(last_step_id, cluster_id, state) 107 | assert not is_step_complete(emr_client, cluster_id, last_step_id), \ 108 | 'Expected last step to not be complete when step state is {}'.format(state) 109 | 110 | # when last step is in a terminal state (completed), is_step_complete should return True 111 | set_step_state(last_step_id, cluster_id, 'COMPLETED') 112 | assert is_step_complete(emr_client, cluster_id, last_step_id), \ 113 | 'Expected last step to be complete when last step state is {}'.format('COMPLETED') 114 | 115 | # when last step is in a failed state, is_step_complete should raise a helpful exception 116 | for state in ['CANCELLED', 'FAILED', 'INTERRUPTED']: 117 | set_step_state(last_step_id, cluster_id, state) 118 | try: 119 | is_step_complete(emr_client, cluster_id, last_step_id) 120 | assert False, \ 121 | 'Expected an exception to be raised when the last step is in {} state'.format(state) 122 | except Exception as e: 123 | assert 'EMR job failed' == str(e), 'Exception message not as expected' 124 | 125 | 126 | def test_wait_for_step_complete(): 127 | """ 128 | Ensure polling.poll is called with expected arguments 129 | """ 130 | with patch('sparksteps.poll.poll') as mock_poll: 131 | mock_emr = MagicMock() 132 | jobflow_id = 'fake-jobflow-id' 133 | step_id = 'fake-step-id' 134 | wait_for_step_complete(mock_emr, jobflow_id, step_id, 1) 135 | mock_poll.assert_called_once_with( 136 | is_step_complete, args=(mock_emr, jobflow_id, step_id), step=1, poll_forever=True) 137 | -------------------------------------------------------------------------------- /sparksteps/cluster.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Create EMR cluster.""" 3 | import os 4 | import getpass 5 | import logging 6 | import datetime 7 | 8 | from sparksteps import steps 9 | 10 | DEFAULT_JOBFLOW_ROLE = 'EMR_EC2_DefaultRole' 11 | DEFAULT_SERVICE_ROLE = 'EMR_DefaultRole' 12 | DEFAULT_APP_LIST = ['Hadoop', 'Spark'] 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | username = getpass.getuser() 17 | 18 | 19 | def parse_tags(raw_tags_list): 20 | 
"""Parse AWS tags. 21 | 22 | Examples: 23 | >>> from pprint import pprint 24 | >>> pprint(parse_tags(['name="Peanut Pug"', 'age=5'])) 25 | [{'Key': 'name', 'Value': '"Peanut Pug"'}, {'Key': 'age', 'Value': '5'}] 26 | """ 27 | tags_dict_list = [] 28 | for raw_tag in raw_tags_list: 29 | if raw_tag.find('=') == -1: 30 | key, value = raw_tag, '' 31 | else: 32 | key, value = raw_tag.split('=', 1) 33 | tags_dict_list.append({'Key': key, 'Value': value}) 34 | 35 | return tags_dict_list 36 | 37 | 38 | def parse_conf(raw_conf_list): 39 | """Parse configuration items.""" 40 | 41 | defaults = [] 42 | classification = None 43 | 44 | for token in raw_conf_list: 45 | if '=' in token: 46 | key, value = token.split('=', 1) 47 | classification['Properties'][key] = value 48 | else: 49 | if classification: 50 | defaults.append(classification) 51 | classification = { 52 | 'Classification': token, 53 | 'Properties': {} 54 | } 55 | 56 | if classification: 57 | defaults.append(classification) 58 | 59 | return defaults 60 | 61 | 62 | def parse_apps(raw_app_list): 63 | """ 64 | Given a list of app name strings, 65 | returns formatted application configuration value. 66 | 67 | Examples: 68 | >>> from pprint import pprint 69 | >>> pprint(parse_apps(['hadoop', 'spark'])) 70 | [{'Name': 'Hadoop', 'Name': 'Spark'}] 71 | """ 72 | return sorted( 73 | [{'Name': app_name.capitalize()} for app_name in set(raw_app_list)], 74 | key=lambda x: x['Name']) 75 | 76 | 77 | def emr_config(release_label, keep_alive=False, **kw): 78 | timestamp = datetime.datetime.now().replace(microsecond=0) 79 | config = dict( 80 | Name="{} SparkStep Task [{}]".format(username, timestamp), 81 | ReleaseLabel=release_label, 82 | Instances={ 83 | 'InstanceGroups': [], 84 | 'KeepJobFlowAliveWhenNoSteps': keep_alive, 85 | 'TerminationProtected': False, 86 | }, 87 | Applications=parse_apps(kw.get('app_list', DEFAULT_APP_LIST)), 88 | VisibleToAllUsers=True, 89 | JobFlowRole=kw.get('jobflow_role', DEFAULT_JOBFLOW_ROLE), 90 | ServiceRole=kw.get('service_role', DEFAULT_SERVICE_ROLE) 91 | ) 92 | 93 | for instance_group in ('master', 'core', 'task'): 94 | num_instances = kw.get('num_{}'.format(instance_group), 0) 95 | if instance_group != 'master' and not num_instances: 96 | # We don't need this instance group. 
97 | continue 98 | 99 | instance_type = kw.get('instance_type_{}'.format(instance_group)) 100 | if not instance_type: 101 | raise ValueError('{} nodes specified without instance type.'.format( 102 | instance_group.capitalize())) 103 | 104 | instance_group_config = { 105 | 'Name': '{} Node{}'.format(instance_group.capitalize(), 106 | 's' if instance_group != 'master' else ''), 107 | 'Market': 'ON_DEMAND', 108 | 'InstanceRole': instance_group.upper(), 109 | 'InstanceType': instance_type, 110 | 'InstanceCount': 1 if instance_group == 'master' else num_instances 111 | } 112 | 113 | bid_price = kw.get('bid_price_{}'.format(instance_group)) 114 | if bid_price: 115 | instance_group_config['Market'] = 'SPOT' 116 | instance_group_config['BidPrice'] = bid_price 117 | 118 | ebs_volume_size = kw.get('ebs_volume_size_{}'.format(instance_group), 0) 119 | if ebs_volume_size: 120 | ebs_configuration = { 121 | 'EbsBlockDeviceConfigs': [{ 122 | 'VolumeSpecification': { 123 | 'VolumeType': kw.get('ebs_volume_type_{}'.format(instance_group)), 124 | 'SizeInGB': ebs_volume_size 125 | }, 126 | 'VolumesPerInstance': kw.get('ebs_volumes_per_{}'.format(instance_group), 1) 127 | }], 128 | 'EbsOptimized': kw.get('ebs_optimized_{}'.format(instance_group), False) 129 | } 130 | instance_group_config['EbsConfiguration'] = ebs_configuration 131 | config['Instances']['InstanceGroups'].append(instance_group_config) 132 | 133 | if kw.get('name'): 134 | config['Name'] = kw['name'] 135 | if kw.get('ec2_key'): 136 | config['Instances']['Ec2KeyName'] = kw['ec2_key'] 137 | if kw.get('ec2_subnet_id'): 138 | config['Instances']['Ec2SubnetId'] = kw['ec2_subnet_id'] 139 | if kw.get('debug', False) and kw.get('s3_bucket'): 140 | config['LogUri'] = os.path.join('s3://', kw['s3_bucket'], kw['s3_path'], 'logs/') 141 | config['Steps'] = [steps.DebugStep().step] 142 | if kw.get('tags'): 143 | config['Tags'] = parse_tags(kw['tags']) 144 | if kw.get('defaults'): 145 | config['Configurations'] = parse_conf(kw['defaults']) 146 | if kw.get('maximize_resource_allocation'): 147 | configurations = config.get('Configurations', []) 148 | configurations.append({ 149 | 'Classification': 'spark', 150 | 'Properties': {'maximizeResourceAllocation': 'true'} 151 | }) 152 | config['Configurations'] = configurations 153 | if kw.get('bootstrap_script'): 154 | config['BootstrapActions'] = [{'Name': 'bootstrap', 155 | 'ScriptBootstrapAction': {'Path': kw['bootstrap_script']}}] 156 | 157 | return config 158 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Spark Steps 2 | =========== 3 | 4 | .. image:: https://github.com/jwplayer/sparksteps/workflows/Tests/badge.svg?branch=master 5 | :target: https://github.com/jwplayer/sparksteps/actions?query=workflow%3ATests+branch%3Amaster 6 | :alt: Build Status 7 | 8 | .. image:: https://readthedocs.org/projects/spark-steps/badge/?version=latest 9 | :target: http://spark-steps.readthedocs.io/en/latest/?badge=latest 10 | :alt: Documentation Status 11 | 12 | SparkSteps allows you to configure your EMR cluster and upload your 13 | spark script and its dependencies via AWS S3. All you need to do is 14 | define an S3 bucket. 
15 | 16 | Install 17 | ------- 18 | 19 | :: 20 | 21 | pip install sparksteps 22 | 23 | CLI Options 24 | ----------- 25 | 26 | :: 27 | 28 | Prompt parameters: 29 | app main spark script for submit spark (required) 30 | app-args: arguments passed to main spark script 31 | app-list: Space delimited list of applications to be installed on the EMR cluster (Default: Hadoop Spark) 32 | aws-region: AWS region name 33 | bid-price: specify bid price for task nodes 34 | bootstrap-script: include a bootstrap script (s3 path) 35 | cluster-id: job flow id of existing cluster to submit to 36 | debug: allow debugging of cluster 37 | defaults: cluster configurations of the form " key1=val1 key2=val2 ..." 38 | dynamic-pricing-master: use spot pricing for the master nodes. 39 | dynamic-pricing-core: use spot pricing for the core nodes. 40 | dynamic-pricing-task: use spot pricing for the task nodes. 41 | ebs-volume-size-core: size of the EBS volume to attach to core nodes in GiB. 42 | ebs-volume-type-core: type of the EBS volume to attach to core nodes (supported: [standard, gp2, io1]). 43 | ebs-volumes-per-core: the number of EBS volumes to attach per core node. 44 | ebs-optimized-core: whether to use EBS optimized volumes for core nodes. 45 | ebs-volume-size-task: size of the EBS volume to attach to task nodes in GiB. 46 | ebs-volume-type-task: type of the EBS volume to attach to task nodes. 47 | ebs-volumes-per-task: the number of EBS volumes to attach per task node. 48 | ebs-optimized-task: whether to use EBS optimized volumes for task nodes. 49 | ec2-key: name of the Amazon EC2 key pair 50 | ec2-subnet-id: Amazon VPC subnet id 51 | help (-h): argparse help 52 | jobflow-role: Amazon EC2 instance profile name to use (Default: EMR_EC2_DefaultRole) 53 | service-role: AWS IAM service role to use for EMR (Default: EMR_DefaultRole) 54 | keep-alive: whether to keep the EMR cluster alive when there are no steps 55 | log-level (-l): logging level (default=INFO) 56 | instance-type-master: instance type of of master host (default='m4.large') 57 | instance-type-core: instance type of the core nodes, must be set when num-core > 0 58 | instance-type-task: instance type of the task nodes, must be set when num-task > 0 59 | maximize-resource-allocation: sets the maximizeResourceAllocation property for the cluster to true when supplied. 
60 | name: specify cluster name 61 | num-core: number of core nodes 62 | num-task: number of task nodes 63 | release-label: EMR release label 64 | s3-bucket: name of s3 bucket to upload spark file (required) 65 | s3-path: path within s3-bucket to use when writing assets 66 | s3-dist-cp: s3-dist-cp step after spark job is done 67 | submit-args: arguments passed to spark-submit 68 | tags: EMR cluster tags of the form "key1=value1 key2=value2" 69 | uploads: files to upload to /home/hadoop/ in master instance 70 | wait: poll until all steps are complete (or error) 71 | 72 | Example 73 | ------- 74 | 75 | :: 76 | 77 | AWS_S3_BUCKET = 78 | cd sparksteps/ 79 | sparksteps examples/episodes.py \ 80 | --s3-bucket $AWS_S3_BUCKET \ 81 | --aws-region us-east-1 \ 82 | --release-label emr-4.7.0 \ 83 | --uploads examples/lib examples/episodes.avro \ 84 | --submit-args="--deploy-mode client --jars /home/hadoop/lib/spark-avro_2.10-2.0.2-custom.jar" \ 85 | --app-args="--input /home/hadoop/episodes.avro" \ 86 | --tags Application="Spark Steps" \ 87 | --debug 88 | 89 | The above example creates an EMR cluster of 1 node with default instance 90 | type *m4.large*, uploads the pyspark script episodes.py and its 91 | dependencies to the specified S3 bucket and copies the file from S3 to 92 | the cluster. Each operation is defined as an EMR "step" that you can 93 | monitor in EMR. The final step is to run the spark application with 94 | submit args that includes a custom spark-avro package and app args 95 | "--input". 96 | 97 | Run Spark Job on Existing Cluster 98 | --------------------------------- 99 | 100 | You can use the option ``--cluster-id`` to specify a cluster to upload 101 | and run the Spark job. This is especially helpful for debugging. 102 | 103 | Dynamic Pricing 104 | ----------------------- 105 | 106 | Use CLI option ``--dynamic-pricing-`` to allow sparksteps to dynamically 107 | determine the best bid price for EMR instances within a certain instance group. 108 | 109 | Currently the algorithm looks back at spot history over the last 12 110 | hours and calculates ``min(0.8 * on_demand_price, 1.2 * max_spot_price)`` to 111 | determine bid price. That said, if the current spot price is over 80% of 112 | the on-demand cost, then on-demand instances are used to be 113 | conservative. 
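For illustration, using hypothetical prices (an on-demand price of $0.10/hr, a maximum spot price of $0.05/hr over the last 12 hours, and a current spot price of $0.04/hr): the current spot price is below 80% of the on-demand price ($0.08), so spot instances are requested with the following bid price:

::

    min(0.8 * 0.10, 1.2 * 0.05) = min(0.08, 0.06) = $0.06/hr

If the current spot price were $0.08/hr or higher, on-demand instances would be used instead.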
114 | 115 | 116 | Testing 117 | ------- 118 | 119 | :: 120 | 121 | make test 122 | 123 | Blog 124 | ---- 125 | Read more about sparksteps in our blog post here: 126 | https://www.jwplayer.com/blog/sparksteps/ 127 | 128 | License 129 | ------- 130 | 131 | Apache License 2.0 132 | -------------------------------------------------------------------------------- /sparksteps/steps.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Create EMR steps and upload files.""" 3 | import os 4 | import tempfile 5 | import zipfile 6 | from urllib.parse import urlparse 7 | 8 | REMOTE_DIR = '/home/hadoop/' 9 | 10 | 11 | def get_basename(path): 12 | return os.path.basename(os.path.normpath(path)) 13 | 14 | 15 | def ls_recursive(dirname): 16 | """Recursively list files in a directory.""" 17 | for (dirpath, dirnames, filenames) in os.walk(os.path.expanduser(dirname)): 18 | for f in filenames: 19 | yield os.path.join(dirpath, f) 20 | 21 | 22 | def zip_to_s3(s3_resource, dirpath, bucket, key): 23 | """Zip folder and upload to S3.""" 24 | with tempfile.SpooledTemporaryFile() as tmp: 25 | with zipfile.ZipFile(tmp, 'w', zipfile.ZIP_DEFLATED) as archive: 26 | for fpath in ls_recursive(dirpath): 27 | archive.write(fpath, get_basename(fpath)) 28 | tmp.seek(0) # Reset file pointer 29 | response = s3_resource.Bucket(bucket).put_object(Key=key, Body=tmp) 30 | return response 31 | 32 | 33 | def parse_s3_path(s3_path): 34 | """Return bucket, path, and filename of an S3 path""" 35 | parsed = urlparse(s3_path, allow_fragments=False) 36 | bucket = parsed.netloc 37 | path, filename = parsed.path.rsplit('/', 1) 38 | path = path[1:] if path.startswith('/') else path 39 | return bucket, path, filename 40 | 41 | 42 | class CmdStep(object): 43 | on_failure = 'CANCEL_AND_WAIT' 44 | 45 | @property 46 | def step_name(self): 47 | raise NotImplementedError() 48 | 49 | @property 50 | def cmd(self): 51 | raise NotImplementedError() 52 | 53 | @property 54 | def step(self): 55 | return { 56 | 'Name': self.step_name, 57 | 'ActionOnFailure': self.on_failure, 58 | 'HadoopJarStep': { 59 | 'Jar': 'command-runner.jar', 60 | 'Args': self.cmd 61 | } 62 | } 63 | 64 | 65 | class CopyStep(CmdStep): 66 | def __init__(self, bucket, path, filename): 67 | self.bucket = bucket 68 | self.path = path 69 | self.filename = filename 70 | 71 | @property 72 | def step_name(self): 73 | return "Copy {}".format(self.filename) 74 | 75 | @property 76 | def cmd(self): 77 | return ['aws', 's3', 'cp', self.s3_uri, REMOTE_DIR] 78 | 79 | @property 80 | def key(self): 81 | return os.path.join(self.path, self.filename) 82 | 83 | @property 84 | def s3_uri(self): 85 | return os.path.join('s3://', self.bucket, self.key) 86 | 87 | 88 | class DebugStep(CmdStep): 89 | on_failure = 'TERMINATE_CLUSTER' 90 | 91 | @property 92 | def step_name(self): 93 | return "Setup - debug" 94 | 95 | @property 96 | def cmd(self): 97 | return ['state-pusher-script'] 98 | 99 | 100 | class SparkStep(CmdStep): 101 | def __init__(self, app_path, submit_args=None, app_args=None): 102 | self.app = get_basename(app_path) 103 | self.submit_args = submit_args or [] 104 | self.app_args = app_args or [] 105 | 106 | @property 107 | def step_name(self): 108 | return "Run {}".format(self.app) 109 | 110 | @property 111 | def cmd(self): 112 | return (['spark-submit'] + self.submit_args + [self.remote_app] + 113 | self.app_args) 114 | 115 | @property 116 | def remote_app(self): 117 | return os.path.join(REMOTE_DIR, self.app) 118 | 119 | 120 | class 
UnzipStep(CmdStep): 121 | def __init__(self, dirpath): 122 | self.dirpath = dirpath 123 | 124 | @property 125 | def step_name(self): 126 | return "Unzip {}".format(self.zipfile) 127 | 128 | @property 129 | def cmd(self): 130 | return ['unzip', '-o', self.remote_zipfile, '-d', self.remote_dirpath] 131 | 132 | @property 133 | def zipfile(self): 134 | return self.dirname + '.zip' 135 | 136 | @property 137 | def remote_zipfile(self): 138 | return os.path.join(REMOTE_DIR, self.zipfile) 139 | 140 | @property 141 | def dirname(self): 142 | return get_basename(self.dirpath) 143 | 144 | @property 145 | def remote_dirpath(self): 146 | return os.path.join(REMOTE_DIR, self.dirname) 147 | 148 | 149 | class S3DistCp(CmdStep): 150 | on_failure = 'CONTINUE' 151 | 152 | def __init__(self, s3_dist_cp): 153 | self.s3_dist_cp = s3_dist_cp 154 | 155 | @property 156 | def step_name(self): 157 | return "S3DistCp step" 158 | 159 | @property 160 | def cmd(self): 161 | return ['s3-dist-cp'] + self.s3_dist_cp 162 | 163 | 164 | def get_download_steps(s3_resource, bucket, bucket_path, src_path): 165 | """ 166 | Return list of step instances necessary to download file/directory resources onto the EMR master node. 167 | May upload local files and directories to S3 to make them available to EMR. 168 | """ 169 | steps = [] 170 | basename = get_basename(src_path) 171 | 172 | # Location where files will be copied to be made accessible by EMR 173 | default_dest_path = os.path.join(bucket_path, 'sources') 174 | 175 | if src_path.startswith('s3://'): 176 | # S3 file, simply add the Copy EMR step, 177 | # no intermediate S3 file is necessary as it's already on S3 178 | steps.append(CopyStep(*parse_s3_path(src_path))) 179 | elif os.path.isdir(src_path): 180 | # Directory, will zip and push to S3 first before adding EMR copy/unzip step 181 | basename = basename + '.zip' 182 | dest_path = os.path.join(default_dest_path, basename) 183 | zip_to_s3(s3_resource, src_path, bucket, key=dest_path) 184 | copy_step = CopyStep(bucket, default_dest_path, basename) 185 | steps.extend([copy_step, UnzipStep(src_path)]) 186 | elif os.path.isfile(src_path): 187 | # File, upload to S3 and add copy step 188 | dest_path = os.path.join(default_dest_path, basename) 189 | s3_resource.meta.client.upload_file(src_path, bucket, dest_path) 190 | copy_step = CopyStep(bucket, default_dest_path, basename) 191 | steps.append(copy_step) 192 | else: 193 | raise FileNotFoundError( 194 | '{} does not exist (does not reference a valid file or path).' 
195 | .format(src_path)) 196 | return steps 197 | 198 | 199 | def setup_steps(s3, bucket, bucket_path, app_path, submit_args=None, app_args=None, 200 | uploads=None, s3_dist_cp=None): 201 | cmd_steps = [] 202 | paths = uploads or [] 203 | paths.append(app_path) 204 | 205 | for src_path in paths: 206 | cmd_steps.extend(get_download_steps(s3, bucket, bucket_path, src_path)) 207 | 208 | cmd_steps.append(SparkStep(app_path, submit_args, app_args)) 209 | 210 | if s3_dist_cp is not None: 211 | cmd_steps.append(S3DistCp(s3_dist_cp)) 212 | 213 | return [s.step for s in cmd_steps] 214 | -------------------------------------------------------------------------------- /sparksteps/pricing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Get optimal pricing for EC2 instances.""" 3 | import json 4 | import datetime 5 | import itertools 6 | import logging 7 | import collections 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | SPOT_DEMAND_THRESHOLD_FACTOR = 0.8 12 | SPOT_PRICE_LOOKBACK = 12 # hours 13 | 14 | Zone = collections.namedtuple('Zone', 'name max min mean current') 15 | Spot = collections.namedtuple('Spot', 'availability_zone timestamp price') 16 | 17 | EC2_PRICE_FILTER_TEMPLATE = ''' 18 | [ 19 | {{"Field": "tenancy", "Value": "shared", "Type": "TERM_MATCH"}}, 20 | {{"Field": "operatingSystem", "Value": "{operating_sytem}", "Type": "TERM_MATCH"}}, 21 | {{"Field": "preInstalledSw", "Value": "NA", "Type": "TERM_MATCH"}}, 22 | {{"Field": "instanceType", "Value": "{instance_type}", "Type": "TERM_MATCH"}}, 23 | {{"Field": "location", "Value": "{region}", "Type": "TERM_MATCH"}}, 24 | {{"Field": "licenseModel", "Value": "No License required", "Type": "TERM_MATCH"}}, 25 | {{"Field": "usagetype", "Value": "BoxUsage:{instance_type}", "Type": "TERM_MATCH"}} 26 | ] 27 | ''' 28 | 29 | 30 | def get_demand_price(pricing_client, instance_type, region='US East (N. Virginia)', operating_system='Linux'): 31 | """ 32 | Retrieves the on-demand price for a particular EC2 instance type in the specified region. 33 | This function does not take reserved instance pricing into account. 34 | 35 | Args: 36 | pricing_client: Boto3 Pricing client. 37 | instance_type (str): The type of the instance. 38 | region: The region to get the price for, this must be the human-readable name of the region! 39 | operating_system: The operating system of the instance, this must be the human-readable name! 40 | """ 41 | if '-' in region: 42 | # TODO (rikheijdens): Perhaps we could map these using information from botocore/data/endpoints.json. 43 | raise ValueError('get_demand_price() requires the human-readable name of the region to be supplied.') 44 | 45 | filter_template = EC2_PRICE_FILTER_TEMPLATE.format( 46 | operating_sytem=operating_system, instance_type=instance_type, region=region) 47 | data = pricing_client.get_products(ServiceCode='AmazonEC2', Filters=json.loads(filter_template)) 48 | on_demand = json.loads(data['PriceList'][0])['terms']['OnDemand'] 49 | index_1 = list(on_demand)[0] 50 | index_2 = list(on_demand[index_1]['priceDimensions'])[0] 51 | return float(on_demand[index_1]['priceDimensions'][index_2]['pricePerUnit']['USD']) 52 | 53 | 54 | def get_availability_zone(ec2_client, subnet_id): 55 | """ 56 | Returns the availability zone associated with the provided `subnet_id`. 57 | 58 | Args: 59 | ec2_client: Boto3 EC2 client. 60 | subnet_id (str): The identifier of the subnet to look the associated AZ up for. 
61 | 62 | Returns: 63 | str: The availability zone of the associated subnet, or None if it could not be determined. 64 | """ 65 | response = ec2_client.describe_subnets(SubnetIds=[subnet_id]) 66 | subnets = response.get('Subnets', []) 67 | for s in subnets: 68 | if s['SubnetId'] == subnet_id: 69 | return s['AvailabilityZone'] 70 | # Could not determine the associated AZ. 71 | return None 72 | 73 | 74 | def get_spot_price_history(ec2_client, instance_type, lookback=1): 75 | """Return spot price history for the specified instance type. 76 | 77 | Args: 78 | ec2_client: EC2 client 79 | instance_type (str): get results by the specified instance type 80 | lookback (int): number of hours to look back for spot history 81 | 82 | Returns: 83 | list: spot price history records for the instance type. 84 | """ 85 | end = datetime.datetime.utcnow() 86 | start = end - datetime.timedelta(hours=lookback) 87 | 88 | response = ec2_client.describe_spot_price_history( 89 | StartTime=start, 90 | EndTime=end, 91 | InstanceTypes=[ 92 | instance_type, 93 | ], 94 | ProductDescriptions=[ 95 | 'Linux/UNIX (Amazon VPC)', 96 | 'Linux/UNIX', 97 | ], 98 | ) 99 | return response['SpotPriceHistory'] 100 | 101 | 102 | def price_by_zone(price_history): 103 | prices = [Spot(d['AvailabilityZone'], d['Timestamp'], float(d['SpotPrice'])) 104 | for d in price_history] 105 | g = itertools.groupby(sorted(prices), key=lambda x: x.availability_zone) 106 | result = {key: list(grp) for key, grp in g} 107 | return result 108 | 109 | 110 | def get_zone_profile(zone_history): 111 | zone_prices = {k: [x.price for x in v] for k, v in zone_history.items()} 112 | return [Zone(k, max(v), min(v), sum(v) / len(v), v[-1]) 113 | for k, v in zone_prices.items()] 114 | 115 | 116 | def determine_best_price(demand_price, aws_zone): 117 | """Calculate optimal bid price. 118 | 119 | Args: 120 | demand_price (float): on-demand cost of AWS instance 121 | aws_zone (Zone): AWS zone namedtuple ('name max min mean current') 122 | 123 | Returns: 124 | float: bid price 125 | bool: boolean to use spot pricing 126 | """ 127 | if aws_zone.current >= demand_price * SPOT_DEMAND_THRESHOLD_FACTOR: 128 | return demand_price, False 129 | # We always bid higher than the maximum current spot price for a particular instance type 130 | # in order to make it less likely that our clusters will be shut down. 131 | return min(1.2 * aws_zone.max, demand_price * SPOT_DEMAND_THRESHOLD_FACTOR), True 132 | 133 | 134 | def get_bid_price(ec2_client, pricing_client, instance_type, availability_zone=None): 135 | """Determine AWS bid price. 136 | 137 | Args: 138 | ec2_client: boto3 EC2 client pricing_client: boto3 Pricing client 139 | instance_type: EC2 instance type 140 | availability_zone: The availability zone the instance should be launched in, 141 | if not provided an AZ is automatically selected. 142 | 143 | Returns: 144 | float: bid price, bool: is_spot 145 | 146 | Examples: 147 | >>> import boto3 148 | >>> ec2 = boto3.client('ec2', region_name='us-east-1') >>> pricing = boto3.client('pricing', region_name='us-east-1') 149 | >>> print(get_bid_price(ec2, pricing, 'm3.2xlarge')) 150 | """ 151 | history = get_spot_price_history(ec2_client, instance_type, SPOT_PRICE_LOOKBACK) 152 | by_zone = price_by_zone(history) 153 | if availability_zone is not None and availability_zone not in by_zone: 154 | # Unable to determine the spot price because no information was available for the 155 | # desired AZ.
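# In that case, fall back to the on-demand price (rounded to two decimal places) and report that spot pricing should not be used.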
156 | logger.info( 157 | "Unable to determine the spot price for %s instances in %s because no " 158 | "zone information was available.", instance_type, availability_zone) 159 | return round(get_demand_price(pricing_client, instance_type), 2), False 160 | 161 | if availability_zone: 162 | # Consider only the AZ in which we expect to launch instances. 163 | zone_profile = get_zone_profile({availability_zone: by_zone[availability_zone]}) 164 | else: 165 | # Consider all AZ's. 166 | zone_profile = get_zone_profile(by_zone) 167 | best_zone = min(zone_profile, key=lambda x: x.max) 168 | demand_price = get_demand_price(pricing_client, instance_type) 169 | bid_price, is_spot = determine_best_price(demand_price, best_zone) 170 | bid_price_rounded = round(bid_price, 2) # AWS requires max 3 decimal places 171 | return bid_price_rounded, is_spot 172 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 16 | 17 | .PHONY: help 18 | help: 19 | @echo "Please use \`make ' where is one of" 20 | @echo " html to make standalone HTML files" 21 | @echo " dirhtml to make HTML files named index.html in directories" 22 | @echo " singlehtml to make a single large HTML file" 23 | @echo " pickle to make pickle files" 24 | @echo " json to make JSON files" 25 | @echo " htmlhelp to make HTML files and a HTML help project" 26 | @echo " qthelp to make HTML files and a qthelp project" 27 | @echo " applehelp to make an Apple Help Book" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " epub3 to make an epub3" 31 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 32 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 33 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 34 | @echo " text to make text files" 35 | @echo " man to make manual pages" 36 | @echo " texinfo to make Texinfo files" 37 | @echo " info to make Texinfo files and run them through makeinfo" 38 | @echo " gettext to make PO message catalogs" 39 | @echo " changes to make an overview of all changed/added/deprecated items" 40 | @echo " xml to make Docutils-native XML files" 41 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 42 | @echo " linkcheck to check all external links for integrity" 43 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 44 | @echo " coverage to run coverage check of the documentation (if enabled)" 45 | @echo " dummy to check syntax errors of document sources" 46 | 47 | .PHONY: clean 48 | clean: 49 | rm -rf $(BUILDDIR)/* 50 | 51 | .PHONY: html 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 
56 | 57 | .PHONY: dirhtml 58 | dirhtml: 59 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 60 | @echo 61 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 62 | 63 | .PHONY: singlehtml 64 | singlehtml: 65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 66 | @echo 67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 68 | 69 | .PHONY: pickle 70 | pickle: 71 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 72 | @echo 73 | @echo "Build finished; now you can process the pickle files." 74 | 75 | .PHONY: json 76 | json: 77 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 78 | @echo 79 | @echo "Build finished; now you can process the JSON files." 80 | 81 | .PHONY: htmlhelp 82 | htmlhelp: 83 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 84 | @echo 85 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 86 | ".hhp project file in $(BUILDDIR)/htmlhelp." 87 | 88 | .PHONY: qthelp 89 | qthelp: 90 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 91 | @echo 92 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 93 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 94 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/sparksteps.qhcp" 95 | @echo "To view the help file:" 96 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/sparksteps.qhc" 97 | 98 | .PHONY: applehelp 99 | applehelp: 100 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 101 | @echo 102 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 103 | @echo "N.B. You won't be able to view it unless you put it in" \ 104 | "~/Library/Documentation/Help or install it in your application" \ 105 | "bundle." 106 | 107 | .PHONY: devhelp 108 | devhelp: 109 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 110 | @echo 111 | @echo "Build finished." 112 | @echo "To view the help file:" 113 | @echo "# mkdir -p $$HOME/.local/share/devhelp/sparksteps" 114 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/sparksteps" 115 | @echo "# devhelp" 116 | 117 | .PHONY: epub 118 | epub: 119 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 120 | @echo 121 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 122 | 123 | .PHONY: epub3 124 | epub3: 125 | $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 126 | @echo 127 | @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." 128 | 129 | .PHONY: latex 130 | latex: 131 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 132 | @echo 133 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 134 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 135 | "(use \`make latexpdf' here to do that automatically)." 136 | 137 | .PHONY: latexpdf 138 | latexpdf: 139 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 140 | @echo "Running LaTeX files through pdflatex..." 141 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 142 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 143 | 144 | .PHONY: latexpdfja 145 | latexpdfja: 146 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 147 | @echo "Running LaTeX files through platex and dvipdfmx..." 148 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 149 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 
150 | 151 | .PHONY: text 152 | text: 153 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 154 | @echo 155 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 156 | 157 | .PHONY: man 158 | man: 159 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 160 | @echo 161 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 162 | 163 | .PHONY: texinfo 164 | texinfo: 165 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 166 | @echo 167 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 168 | @echo "Run \`make' in that directory to run these through makeinfo" \ 169 | "(use \`make info' here to do that automatically)." 170 | 171 | .PHONY: info 172 | info: 173 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 174 | @echo "Running Texinfo files through makeinfo..." 175 | make -C $(BUILDDIR)/texinfo info 176 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 177 | 178 | .PHONY: gettext 179 | gettext: 180 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 181 | @echo 182 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 183 | 184 | .PHONY: changes 185 | changes: 186 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 187 | @echo 188 | @echo "The overview file is in $(BUILDDIR)/changes." 189 | 190 | .PHONY: linkcheck 191 | linkcheck: 192 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 193 | @echo 194 | @echo "Link check complete; look for any errors in the above output " \ 195 | "or in $(BUILDDIR)/linkcheck/output.txt." 196 | 197 | .PHONY: doctest 198 | doctest: 199 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 200 | @echo "Testing of doctests in the sources finished, look at the " \ 201 | "results in $(BUILDDIR)/doctest/output.txt." 202 | 203 | .PHONY: coverage 204 | coverage: 205 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 206 | @echo "Testing of coverage in the sources finished, look at the " \ 207 | "results in $(BUILDDIR)/coverage/python.txt." 208 | 209 | .PHONY: xml 210 | xml: 211 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 212 | @echo 213 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 214 | 215 | .PHONY: pseudoxml 216 | pseudoxml: 217 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 218 | @echo 219 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 220 | 221 | .PHONY: dummy 222 | dummy: 223 | $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy 224 | @echo 225 | @echo "Build finished. Dummy builder generates no files." 226 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. 
singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. epub3 to make an epub3 31 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 32 | echo. text to make text files 33 | echo. man to make manual pages 34 | echo. texinfo to make Texinfo files 35 | echo. gettext to make PO message catalogs 36 | echo. changes to make an overview over all changed/added/deprecated items 37 | echo. xml to make Docutils-native XML files 38 | echo. pseudoxml to make pseudoxml-XML files for display purposes 39 | echo. linkcheck to check all external links for integrity 40 | echo. doctest to run all doctests embedded in the documentation if enabled 41 | echo. coverage to run coverage check of the documentation if enabled 42 | echo. dummy to check syntax errors of document sources 43 | goto end 44 | ) 45 | 46 | if "%1" == "clean" ( 47 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 48 | del /q /s %BUILDDIR%\* 49 | goto end 50 | ) 51 | 52 | 53 | REM Check if sphinx-build is available and fallback to Python version if any 54 | %SPHINXBUILD% 1>NUL 2>NUL 55 | if errorlevel 9009 goto sphinx_python 56 | goto sphinx_ok 57 | 58 | :sphinx_python 59 | 60 | set SPHINXBUILD=python -m sphinx.__init__ 61 | %SPHINXBUILD% 2> nul 62 | if errorlevel 9009 ( 63 | echo. 64 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 65 | echo.installed, then set the SPHINXBUILD environment variable to point 66 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 67 | echo.may add the Sphinx directory to PATH. 68 | echo. 69 | echo.If you don't have Sphinx installed, grab it from 70 | echo.http://sphinx-doc.org/ 71 | exit /b 1 72 | ) 73 | 74 | :sphinx_ok 75 | 76 | 77 | if "%1" == "html" ( 78 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 79 | if errorlevel 1 exit /b 1 80 | echo. 81 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 82 | goto end 83 | ) 84 | 85 | if "%1" == "dirhtml" ( 86 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 87 | if errorlevel 1 exit /b 1 88 | echo. 89 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 90 | goto end 91 | ) 92 | 93 | if "%1" == "singlehtml" ( 94 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 95 | if errorlevel 1 exit /b 1 96 | echo. 97 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 98 | goto end 99 | ) 100 | 101 | if "%1" == "pickle" ( 102 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 103 | if errorlevel 1 exit /b 1 104 | echo. 105 | echo.Build finished; now you can process the pickle files. 106 | goto end 107 | ) 108 | 109 | if "%1" == "json" ( 110 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 111 | if errorlevel 1 exit /b 1 112 | echo. 113 | echo.Build finished; now you can process the JSON files. 114 | goto end 115 | ) 116 | 117 | if "%1" == "htmlhelp" ( 118 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 119 | if errorlevel 1 exit /b 1 120 | echo. 121 | echo.Build finished; now you can run HTML Help Workshop with the ^ 122 | .hhp project file in %BUILDDIR%/htmlhelp. 
123 | goto end 124 | ) 125 | 126 | if "%1" == "qthelp" ( 127 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 128 | if errorlevel 1 exit /b 1 129 | echo. 130 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 131 | .qhcp project file in %BUILDDIR%/qthelp, like this: 132 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\sparksteps.qhcp 133 | echo.To view the help file: 134 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\sparksteps.ghc 135 | goto end 136 | ) 137 | 138 | if "%1" == "devhelp" ( 139 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 140 | if errorlevel 1 exit /b 1 141 | echo. 142 | echo.Build finished. 143 | goto end 144 | ) 145 | 146 | if "%1" == "epub" ( 147 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 148 | if errorlevel 1 exit /b 1 149 | echo. 150 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 151 | goto end 152 | ) 153 | 154 | if "%1" == "epub3" ( 155 | %SPHINXBUILD% -b epub3 %ALLSPHINXOPTS% %BUILDDIR%/epub3 156 | if errorlevel 1 exit /b 1 157 | echo. 158 | echo.Build finished. The epub3 file is in %BUILDDIR%/epub3. 159 | goto end 160 | ) 161 | 162 | if "%1" == "latex" ( 163 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 164 | if errorlevel 1 exit /b 1 165 | echo. 166 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 167 | goto end 168 | ) 169 | 170 | if "%1" == "latexpdf" ( 171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 172 | cd %BUILDDIR%/latex 173 | make all-pdf 174 | cd %~dp0 175 | echo. 176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 177 | goto end 178 | ) 179 | 180 | if "%1" == "latexpdfja" ( 181 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 182 | cd %BUILDDIR%/latex 183 | make all-pdf-ja 184 | cd %~dp0 185 | echo. 186 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 187 | goto end 188 | ) 189 | 190 | if "%1" == "text" ( 191 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 192 | if errorlevel 1 exit /b 1 193 | echo. 194 | echo.Build finished. The text files are in %BUILDDIR%/text. 195 | goto end 196 | ) 197 | 198 | if "%1" == "man" ( 199 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 200 | if errorlevel 1 exit /b 1 201 | echo. 202 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 203 | goto end 204 | ) 205 | 206 | if "%1" == "texinfo" ( 207 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 208 | if errorlevel 1 exit /b 1 209 | echo. 210 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 211 | goto end 212 | ) 213 | 214 | if "%1" == "gettext" ( 215 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 216 | if errorlevel 1 exit /b 1 217 | echo. 218 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 219 | goto end 220 | ) 221 | 222 | if "%1" == "changes" ( 223 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 224 | if errorlevel 1 exit /b 1 225 | echo. 226 | echo.The overview file is in %BUILDDIR%/changes. 227 | goto end 228 | ) 229 | 230 | if "%1" == "linkcheck" ( 231 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 232 | if errorlevel 1 exit /b 1 233 | echo. 234 | echo.Link check complete; look for any errors in the above output ^ 235 | or in %BUILDDIR%/linkcheck/output.txt. 236 | goto end 237 | ) 238 | 239 | if "%1" == "doctest" ( 240 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 241 | if errorlevel 1 exit /b 1 242 | echo. 
243 | echo.Testing of doctests in the sources finished, look at the ^ 244 | results in %BUILDDIR%/doctest/output.txt. 245 | goto end 246 | ) 247 | 248 | if "%1" == "coverage" ( 249 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage 250 | if errorlevel 1 exit /b 1 251 | echo. 252 | echo.Testing of coverage in the sources finished, look at the ^ 253 | results in %BUILDDIR%/coverage/python.txt. 254 | goto end 255 | ) 256 | 257 | if "%1" == "xml" ( 258 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 259 | if errorlevel 1 exit /b 1 260 | echo. 261 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 262 | goto end 263 | ) 264 | 265 | if "%1" == "pseudoxml" ( 266 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 267 | if errorlevel 1 exit /b 1 268 | echo. 269 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 270 | goto end 271 | ) 272 | 273 | if "%1" == "dummy" ( 274 | %SPHINXBUILD% -b dummy %ALLSPHINXOPTS% %BUILDDIR%/dummy 275 | if errorlevel 1 exit /b 1 276 | echo. 277 | echo.Build finished. Dummy builder generates no files. 278 | goto end 279 | ) 280 | 281 | :end 282 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # sparksteps documentation build configuration file, created by 5 | # sphinx-quickstart on Sun Jan 1 10:41:08 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | # import os 21 | # import sys 22 | # sys.path.insert(0, os.path.abspath('.')) 23 | from pkg_resources import get_distribution 24 | 25 | import sphinx_rtd_theme 26 | 27 | # -- General configuration ------------------------------------------------ 28 | 29 | # If your documentation needs a minimal Sphinx version, state it here. 30 | # 31 | # needs_sphinx = '1.0' 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 35 | # ones. 36 | extensions = [ 37 | 'sphinx.ext.autodoc', 38 | 'sphinx.ext.doctest', 39 | 'sphinx.ext.todo', 40 | 'sphinx.ext.coverage', 41 | 'sphinx.ext.viewcode', 42 | ] 43 | 44 | # Add any paths that contain templates here, relative to this directory. 45 | templates_path = ['_templates'] 46 | 47 | # The suffix(es) of source filenames. 48 | # You can specify multiple suffix as a list of string: 49 | # 50 | # source_suffix = ['.rst', '.md'] 51 | source_suffix = '.rst' 52 | 53 | # The encoding of source files. 54 | # 55 | # source_encoding = 'utf-8-sig' 56 | 57 | # The master toctree document. 58 | master_doc = 'index' 59 | 60 | # General information about the project. 
61 | project = 'SparkSteps' 62 | copyright = '2017, JW Player' 63 | author = 'Kamil Sindi' 64 | 65 | # The version info for the project you're documenting, acts as replacement for 66 | # |version| and |release|, also used in various other places throughout the 67 | # built documents. 68 | # 69 | # The full version, including alpha/beta/rc tags. 70 | release = get_distribution('sparksteps').version 71 | # The short X.Y version. 72 | version = '.'.join(release.split('.')[:2]) 73 | 74 | # The language for content autogenerated by Sphinx. Refer to documentation 75 | # for a list of supported languages. 76 | # 77 | # This is also used if you do content translation via gettext catalogs. 78 | # Usually you set "language" from the command line for these cases. 79 | language = None 80 | 81 | # There are two options for replacing |today|: either, you set today to some 82 | # non-false value, then it is used: 83 | # 84 | # today = '' 85 | # 86 | # Else, today_fmt is used as the format for a strftime call. 87 | # 88 | # today_fmt = '%B %d, %Y' 89 | 90 | # List of patterns, relative to source directory, that match files and 91 | # directories to ignore when looking for source files. 92 | # This patterns also effect to html_static_path and html_extra_path 93 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 94 | 95 | # The reST default role (used for this markup: `text`) to use for all 96 | # documents. 97 | # 98 | # default_role = None 99 | 100 | # If true, '()' will be appended to :func: etc. cross-reference text. 101 | # 102 | # add_function_parentheses = True 103 | 104 | # If true, the current module name will be prepended to all description 105 | # unit titles (such as .. function::). 106 | # 107 | # add_module_names = True 108 | 109 | # If true, sectionauthor and moduleauthor directives will be shown in the 110 | # output. They are ignored by default. 111 | # 112 | # show_authors = False 113 | 114 | # The name of the Pygments (syntax highlighting) style to use. 115 | pygments_style = 'sphinx' 116 | 117 | # A list of ignored prefixes for module index sorting. 118 | # modindex_common_prefix = [] 119 | 120 | # If true, keep warnings as "system message" paragraphs in the built documents. 121 | # keep_warnings = False 122 | 123 | # If true, `todo` and `todoList` produce output, else they produce nothing. 124 | todo_include_todos = True 125 | 126 | 127 | # -- Options for HTML output ---------------------------------------------- 128 | 129 | # The theme to use for HTML and HTML Help pages. See the documentation for 130 | # a list of builtin themes. 131 | # 132 | html_theme = 'sphinx_rtd_theme' 133 | 134 | # Theme options are theme-specific and customize the look and feel of a theme 135 | # further. For a list of options available for each theme, see the 136 | # documentation. 137 | # 138 | html_theme_options = { 139 | 'collapse_navigation': False, 140 | 'display_version': False, 141 | } 142 | 143 | # Add any paths that contain custom themes here, relative to this directory. 144 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 145 | 146 | # The name for this set of Sphinx documents. 147 | # " v documentation" by default. 148 | # 149 | # html_title = 'sparksteps v0.3.0' 150 | 151 | # A shorter title for the navigation bar. Default is the same as html_title. 152 | # 153 | # html_short_title = None 154 | 155 | # The name of an image file (relative to this directory) to place at the top 156 | # of the sidebar. 
157 | # 158 | # html_logo = None 159 | 160 | # The name of an image file (relative to this directory) to use as a favicon of 161 | # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 162 | # pixels large. 163 | # 164 | # html_favicon = None 165 | 166 | # Add any paths that contain custom static files (such as style sheets) here, 167 | # relative to this directory. They are copied after the builtin static files, 168 | # so a file named "default.css" will overwrite the builtin "default.css". 169 | html_static_path = ['_static'] 170 | 171 | # Add any extra paths that contain custom files (such as robots.txt or 172 | # .htaccess) here, relative to this directory. These files are copied 173 | # directly to the root of the documentation. 174 | # 175 | # html_extra_path = [] 176 | 177 | # If not None, a 'Last updated on:' timestamp is inserted at every page 178 | # bottom, using the given strftime format. 179 | # The empty string is equivalent to '%b %d, %Y'. 180 | # 181 | # html_last_updated_fmt = None 182 | 183 | # If true, SmartyPants will be used to convert quotes and dashes to 184 | # typographically correct entities. 185 | # 186 | # html_use_smartypants = True 187 | 188 | # Custom sidebar templates, maps document names to template names. 189 | # 190 | # html_sidebars = {} 191 | 192 | # Additional templates that should be rendered to pages, maps page names to 193 | # template names. 194 | # 195 | # html_additional_pages = {} 196 | 197 | # If false, no module index is generated. 198 | # 199 | # html_domain_indices = True 200 | 201 | # If false, no index is generated. 202 | # 203 | # html_use_index = True 204 | 205 | # If true, the index is split into individual pages for each letter. 206 | # 207 | # html_split_index = False 208 | 209 | # If true, links to the reST sources are added to the pages. 210 | # 211 | # html_show_sourcelink = True 212 | 213 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 214 | # 215 | # html_show_sphinx = True 216 | 217 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 218 | # 219 | # html_show_copyright = True 220 | 221 | # If true, an OpenSearch description file will be output, and all pages will 222 | # contain a tag referring to it. The value of this option must be the 223 | # base URL from which the finished HTML is served. 224 | # 225 | # html_use_opensearch = '' 226 | 227 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 228 | # html_file_suffix = None 229 | 230 | # Language to be used for generating the HTML full-text search index. 231 | # Sphinx supports the following languages: 232 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' 233 | # 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr', 'zh' 234 | # 235 | # html_search_language = 'en' 236 | 237 | # A dictionary with options for the search language support, empty by default. 238 | # 'ja' uses this config value. 239 | # 'zh' user can custom change `jieba` dictionary path. 240 | # 241 | # html_search_options = {'type': 'default'} 242 | 243 | # The name of a javascript file (relative to the configuration directory) that 244 | # implements a search results scorer. If empty, the default will be used. 245 | # 246 | # html_search_scorer = 'scorer.js' 247 | 248 | # Output file base name for HTML help builder. 
249 | htmlhelp_basename = 'sparkstepsdoc' 250 | 251 | # -- Options for LaTeX output --------------------------------------------- 252 | 253 | latex_elements = { 254 | # The paper size ('letterpaper' or 'a4paper'). 255 | # 256 | # 'papersize': 'letterpaper', 257 | 258 | # The font size ('10pt', '11pt' or '12pt'). 259 | # 260 | # 'pointsize': '10pt', 261 | 262 | # Additional stuff for the LaTeX preamble. 263 | # 264 | # 'preamble': '', 265 | 266 | # Latex figure (float) alignment 267 | # 268 | # 'figure_align': 'htbp', 269 | } 270 | 271 | # Grouping the document tree into LaTeX files. List of tuples 272 | # (source start file, target name, title, 273 | # author, documentclass [howto, manual, or own class]). 274 | latex_documents = [ 275 | (master_doc, 'sparksteps.tex', 'SparkSteps Documentation', 276 | 'Kamil Sindi', 'manual'), 277 | ] 278 | 279 | # The name of an image file (relative to this directory) to place at the top of 280 | # the title page. 281 | # 282 | # latex_logo = None 283 | 284 | # For "manual" documents, if this is true, then toplevel headings are parts, 285 | # not chapters. 286 | # 287 | # latex_use_parts = False 288 | 289 | # If true, show page references after internal links. 290 | # 291 | # latex_show_pagerefs = False 292 | 293 | # If true, show URL addresses after external links. 294 | # 295 | # latex_show_urls = False 296 | 297 | # Documents to append as an appendix to all manuals. 298 | # 299 | # latex_appendices = [] 300 | 301 | # It false, will not define \strong, \code, itleref, \crossref ... but only 302 | # \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clash with user added 303 | # packages. 304 | # 305 | # latex_keep_old_macro_names = True 306 | 307 | # If false, no module index is generated. 308 | # 309 | # latex_domain_indices = True 310 | 311 | 312 | # -- Options for manual page output --------------------------------------- 313 | 314 | # One entry per manual page. List of tuples 315 | # (source start file, name, description, authors, manual section). 316 | man_pages = [ 317 | (master_doc, 'SparkSteps', 'SparkSteps Documentation', 318 | [author], 1) 319 | ] 320 | 321 | # If true, show URL addresses after external links. 322 | # 323 | # man_show_urls = False 324 | 325 | 326 | # -- Options for Texinfo output ------------------------------------------- 327 | 328 | # Grouping the document tree into Texinfo files. List of tuples 329 | # (source start file, target name, title, author, 330 | # dir menu entry, description, category) 331 | texinfo_documents = [ 332 | (master_doc, 'SparkSteps', 'SparkSteps Documentation', 333 | author, 'SparkSteps', 'Workflow tool to launch Spark jobs on AWS EMR.', 334 | 'Miscellaneous'), 335 | ] 336 | 337 | # Documents to append as an appendix to all manuals. 338 | # 339 | # texinfo_appendices = [] 340 | 341 | # If false, no module index is generated. 342 | # 343 | # texinfo_domain_indices = True 344 | 345 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 346 | # 347 | # texinfo_show_urls = 'footnote' 348 | 349 | # If true, do not generate a @detailmenu in the "Top" node's menu. 350 | # 351 | # texinfo_no_detailmenu = False 352 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2016 JW Player 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /sparksteps/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """Create Spark cluster on EMR. 4 | 5 | Prompt parameters: 6 | app main spark script for submit spark (required) 7 | app-args: arguments passed to main spark script 8 | app-list: Applications to be installed on the EMR cluster (Default: Hadoop Spark) 9 | aws-region: AWS region name 10 | bid-price: specify bid price for task nodes 11 | bootstrap-script: include a bootstrap script (s3 path) 12 | cluster-id: job flow id of existing cluster to submit to 13 | debug: allow debugging of cluster 14 | defaults: cluster configurations of the form " key1=val1 key2=val2 ..." 15 | dynamic-pricing-master: use spot pricing for the master nodes. 16 | dynamic-pricing-core: use spot pricing for the core nodes. 17 | dynamic-pricing-task: use spot pricing for the task nodes. 18 | ebs-volume-size-core: size of the EBS volume to attach to core nodes in GiB. 19 | ebs-volume-type-core: type of the EBS volume to attach to core nodes (supported: [standard, gp2, io1]). 20 | ebs-volumes-per-core: the number of EBS volumes to attach per core node. 21 | ebs-optimized-core: whether to use EBS optimized volumes for core nodes. 22 | ebs-volume-size-task: size of the EBS volume to attach to task nodes in GiB. 23 | ebs-volume-type-task: type of the EBS volume to attach to task nodes. 24 | ebs-volumes-per-task: the number of EBS volumes to attach per task node. 25 | ebs-optimized-task: whether to use EBS optimized volumes for task nodes. 
26 | ec2-key: name of the Amazon EC2 key pair 27 | ec2-subnet-id: Amazon VPC subnet id 28 | help (-h): argparse help 29 | jobflow-role: Amazon EC2 instance profile name to use (Default: EMR_EC2_DefaultRole) 30 | service-role: AWS IAM service role to use for EMR (Default: EMR_DefaultRole) 31 | keep-alive: whether to keep the EMR cluster alive when there are no steps 32 | log-level (-l): logging level (default=INFO) 33 | instance-type-master: instance type of the master host (default='m4.large') 34 | instance-type-core: instance type of the core nodes, must be set when num-core > 0 35 | instance-type-task: instance type of the task nodes, must be set when num-task > 0 36 | maximize-resource-allocation: sets the maximizeResourceAllocation property for the cluster to true when supplied. 37 | name: specify cluster name 38 | num-core: number of core nodes 39 | num-task: number of task nodes 40 | release-label: EMR release label 41 | s3-bucket: name of s3 bucket to upload spark file (required) 42 | s3-path: path (key prefix) within s3-bucket to use when uploading spark file 43 | s3-dist-cp: s3-dist-cp step after spark job is done 44 | submit-args: arguments passed to spark-submit 45 | tags: EMR cluster tags of the form "key1=value1 key2=value2" 46 | uploads: files to upload to /home/hadoop/ in master instance 47 | wait: poll until all steps are complete (or error) 48 | 49 | Examples: 50 | sparksteps examples/episodes.py \ 51 | --s3-bucket $AWS_S3_BUCKET \ 52 | --aws-region us-east-1 \ 53 | --release-label emr-4.7.0 \ 54 | --uploads examples/lib examples/episodes.avro \ 55 | --submit-args="--jars /home/hadoop/lib/spark-avro_2.10-2.0.2-custom.jar" \ 56 | --app-args="--input /home/hadoop/episodes.avro" \ 57 | --num-core 1 \ 58 | --instance-type-core m4.large \ 59 | --debug 60 | 61 | """ 62 | from __future__ import print_function 63 | 64 | import json 65 | import shlex 66 | import logging 67 | import argparse 68 | 69 | import boto3 70 | 71 | from sparksteps import steps 72 | from sparksteps import cluster 73 | from sparksteps import pricing 74 | from sparksteps.cluster import DEFAULT_APP_LIST, DEFAULT_JOBFLOW_ROLE, DEFAULT_SERVICE_ROLE 75 | from sparksteps.poll import wait_for_step_complete 76 | 77 | logger = logging.getLogger(__name__) 78 | LOGFORMAT = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s' 79 | DEFAULT_SLEEP_INTERVAL_SECONDS = 150 80 | 81 | 82 | def create_parser(): 83 | parser = argparse.ArgumentParser( 84 | description=__doc__, 85 | formatter_class=argparse.RawDescriptionHelpFormatter 86 | ) 87 | 88 | parser.add_argument('app', metavar='FILE') 89 | parser.add_argument('--app-args', type=shlex.split) 90 | parser.add_argument('--app-list', nargs='*', default=DEFAULT_APP_LIST) 91 | parser.add_argument('--aws-region', required=True) 92 | parser.add_argument('--bid-price') 93 | parser.add_argument('--bootstrap-script') 94 | parser.add_argument('--cluster-id') 95 | parser.add_argument('--debug', action='store_true') 96 | parser.add_argument('--defaults', nargs='*') 97 | parser.add_argument('--ec2-key') 98 | parser.add_argument('--ec2-subnet-id') 99 | parser.add_argument('--jobflow-role', default=DEFAULT_JOBFLOW_ROLE) 100 | parser.add_argument('--service-role', default=DEFAULT_SERVICE_ROLE) 101 | parser.add_argument('--keep-alive', action='store_true') 102 | parser.add_argument('--log-level', '-l', type=str.upper, default='INFO') 103 | parser.add_argument('--name') 104 | parser.add_argument('--num-core', type=int) 105 | parser.add_argument('--num-task', type=int) 106 |
parser.add_argument('--release-label', required=True) 107 | parser.add_argument('--s3-bucket', required=True) 108 | parser.add_argument('--s3-path', default='sparksteps/') 109 | parser.add_argument('--s3-dist-cp', type=shlex.split) 110 | parser.add_argument('--submit-args', type=shlex.split) 111 | parser.add_argument('--tags', nargs='*') 112 | parser.add_argument('--uploads', nargs='*') 113 | parser.add_argument('--maximize-resource-allocation', action='store_true') 114 | # TODO: wrap lines below in a for loop? 115 | parser.add_argument('--instance-type-master', default='m4.large') 116 | parser.add_argument('--instance-type-core') 117 | parser.add_argument('--instance-type-task') 118 | parser.add_argument('--dynamic-pricing-master', action='store_true') 119 | parser.add_argument('--dynamic-pricing-core', action='store_true') 120 | parser.add_argument('--dynamic-pricing-task', action='store_true') 121 | 122 | # EBS configuration 123 | parser.add_argument('--ebs-volume-size-core', type=int, default=0) 124 | parser.add_argument('--ebs-volume-type-core', type=str, default='standard') 125 | parser.add_argument('--ebs-volumes-per-core', type=int, default=1) 126 | parser.add_argument('--ebs-optimized-core', action='store_true') 127 | 128 | parser.add_argument('--ebs-volume-size-task', type=int, default=0) 129 | parser.add_argument('--ebs-volume-type-task', type=str, default='standard') 130 | parser.add_argument('--ebs-volumes-per-task', type=int, default=1) 131 | parser.add_argument('--ebs-optimized-task', action='store_true') 132 | 133 | # Wait configuration 134 | parser.add_argument('--wait', type=int, nargs='?', default=False) 135 | 136 | # Deprecated arguments 137 | parser.add_argument('--master') 138 | parser.add_argument('--slave') 139 | parser.add_argument('--dynamic-pricing', action='store_true') 140 | 141 | return parser 142 | 143 | 144 | def parse_cli_args(parser, args=None): 145 | """ 146 | Utilizes `parser` to parse command line variables and logs. 147 | """ 148 | args = vars(parser.parse_args(args)) 149 | 150 | # Perform sanitization on any arguments 151 | if args['s3_path'] and args['s3_path'].startswith('/'): 152 | raise ValueError( 153 | f"Provided value for s3-path \"{args['s3_path']}\" cannot have leading \"/\" character.") 154 | 155 | if args['wait'] is None: 156 | args['wait'] = DEFAULT_SLEEP_INTERVAL_SECONDS 157 | 158 | return args 159 | 160 | 161 | def determine_prices(args, ec2, pricing_client): 162 | """ 163 | Checks `args` in order to determine whether spot pricing should be 164 | used for instance groups within the EMR cluster, and if this is the 165 | case attempts to determine the optimal bid price. 166 | """ 167 | # Check if we need to do anything 168 | pricing_properties = ( 169 | 'dynamic_pricing_master', 'dynamic_pricing_core', 'dynamic_pricing_task') 170 | if not any([x in args for x in pricing_properties]): 171 | return args 172 | 173 | availability_zone = None 174 | subnet_id = args.get('ec2_subnet_id') 175 | if subnet_id: 176 | # We need to determine the AZ associated with the provided EC2 subnet ID 177 | # in order to look up spot prices in the correct region. 178 | availability_zone = pricing.get_availability_zone(ec2, subnet_id) 179 | if not availability_zone: 180 | logger.info("Could not determine availability zone for subnet '%s'", subnet_id) 181 | 182 | # Mutate a copy of args. 183 | args = args.copy() 184 | 185 | # Determine bid prices for the instance types for which we want to 186 | # use bid pricing. 
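# Illustrative walk-through (hypothetical flag values): with --dynamic-pricing-core and --instance-type-core m4.2xlarge,
# price_property 'dynamic_pricing_core' maps to instance_type_key 'instance_type_core'; if the spot market is cheap
# enough, args['bid_price_core'] is set to the computed bid (as a string), otherwise no bid is set and the core
# instance group stays on demand.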
187 | for price_property in pricing_properties: 188 | if price_property not in args: 189 | continue 190 | 191 | if args[price_property]: 192 | instance_type_key = price_property.replace( 193 | 'dynamic_pricing', 'instance_type') 194 | instance_type = args[instance_type_key] 195 | instance_group = price_property.replace('dynamic_pricing_', '') 196 | # TODO (rikheijdens): optimize by caching instance prices 197 | # between instance groups? 198 | bid_price, is_spot = pricing.get_bid_price(ec2, pricing_client, instance_type, availability_zone) 199 | if is_spot: 200 | logger.info("Using spot pricing with a bid price of $%.2f" 201 | " for %s instances in the %s instance group.", 202 | bid_price, instance_type, 203 | instance_group) 204 | bid_key = price_property.replace('dynamic_pricing', 'bid_price') 205 | args[bid_key] = str(bid_price) 206 | else: 207 | logger.info("Spot price for %s in the %s instance group too high." 208 | " Using on-demand price of $%.2f", 209 | instance_type, instance_group, bid_price) 210 | return args 211 | 212 | 213 | def main(): 214 | args_dict = parse_cli_args(create_parser()) 215 | print("Args: ", args_dict) 216 | 217 | numeric_level = getattr(logging, args_dict['log_level'], None) 218 | logging.basicConfig(format=LOGFORMAT) 219 | logging.getLogger('sparksteps').setLevel(numeric_level) 220 | 221 | client = boto3.client('emr', region_name=args_dict['aws_region']) 222 | s3 = boto3.resource('s3') 223 | 224 | cluster_id = args_dict.get('cluster_id') 225 | if cluster_id is None: 226 | logger.info("Launching cluster...") 227 | ec2_client = boto3.client('ec2', region_name=args_dict['aws_region']) 228 | pricing_client = boto3.client('pricing', region_name=args_dict['aws_region']) 229 | args_dict = determine_prices(args_dict, ec2_client, pricing_client) 230 | cluster_config = cluster.emr_config(**args_dict) 231 | response = client.run_job_flow(**cluster_config) 232 | cluster_id = response['JobFlowId'] 233 | logger.info("Cluster ID: %s", cluster_id) 234 | 235 | emr_steps = steps.setup_steps(s3, 236 | args_dict['s3_bucket'], 237 | args_dict['s3_path'], 238 | args_dict['app'], 239 | args_dict['submit_args'], 240 | args_dict['app_args'], 241 | args_dict['uploads'], 242 | args_dict['s3_dist_cp']) 243 | 244 | response = client.add_job_flow_steps(JobFlowId=cluster_id, Steps=emr_steps) 245 | 246 | try: 247 | step_ids = json.dumps(response['StepIds']) 248 | except KeyError: 249 | step_ids = 'Invalid response' 250 | args_dict['wait'] = False 251 | logger.info("Step IDs: %s", step_ids) 252 | 253 | sleep_interval = args_dict.get('wait') 254 | if sleep_interval: 255 | last_step_id = response['StepIds'][-1] 256 | logger.info('Polling until step {last_step} is complete using a sleep interval of {interval} seconds...' 
257 | .format(last_step=last_step_id, interval=sleep_interval)) 258 | wait_for_step_complete(client, cluster_id, last_step_id, sleep_interval_s=int(sleep_interval)) 259 | -------------------------------------------------------------------------------- /tests/test_sparksteps.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Test SparkSteps.""" 3 | import shlex 4 | import os.path 5 | 6 | import boto3 7 | import moto 8 | 9 | from sparksteps.cluster import emr_config 10 | from sparksteps.steps import setup_steps, S3DistCp 11 | 12 | TEST_BUCKET = 'sparksteps-test' 13 | TEST_BUCKET_PATH = 'sparksteps/' 14 | AWS_REGION_NAME = 'us-east-1' 15 | 16 | DIR_PATH = os.path.dirname(os.path.realpath(__file__)) 17 | DATA_DIR = os.path.join(DIR_PATH, 'data') 18 | LIB_DIR = os.path.join(DATA_DIR, 'dir') 19 | EPISODES_APP = os.path.join(DATA_DIR, 'episodes.py') 20 | EPISODES_AVRO = os.path.join(DATA_DIR, 'episodes.avro') 21 | 22 | 23 | @moto.mock_emr 24 | def test_emr_cluster_config(): 25 | config = emr_config('emr-5.2.0', 26 | instance_type_master='m4.large', 27 | jobflow_role='MyCustomRole', 28 | service_role='MyServiceRole', 29 | keep_alive=False, 30 | instance_type_core='m4.2xlarge', 31 | instance_type_task='m4.2xlarge', 32 | num_core=1, 33 | num_task=1, 34 | bid_price_task='0.1', 35 | maximize_resource_allocation=True, 36 | name="Test SparkSteps", 37 | app_list=['hadoop', 'hive', 'spark']) 38 | assert config == {'Instances': 39 | {'InstanceGroups': [{'InstanceCount': 1, # NOQA: E127 40 | 'InstanceRole': 'MASTER', 41 | 'InstanceType': 'm4.large', 42 | 'Market': 'ON_DEMAND', 43 | 'Name': 'Master Node'}, 44 | {'InstanceCount': 1, 45 | 'InstanceRole': 'CORE', 46 | 'InstanceType': 'm4.2xlarge', 47 | 'Market': 'ON_DEMAND', 48 | 'Name': 'Core Nodes'}, 49 | {'BidPrice': '0.1', 50 | 'InstanceCount': 1, 51 | 'InstanceRole': 'TASK', 52 | 'InstanceType': 'm4.2xlarge', 53 | 'Market': 'SPOT', 54 | 'Name': 'Task Nodes'}], 55 | 'KeepJobFlowAliveWhenNoSteps': False, 56 | 'TerminationProtected': False 57 | }, 58 | 'Applications': [{'Name': 'Hadoop'}, {'Name': 'Hive'}, {'Name': 'Spark'}], 59 | 'Name': 'Test SparkSteps', 60 | 'JobFlowRole': 'MyCustomRole', 61 | 'ServiceRole': 'MyServiceRole', 62 | 'ReleaseLabel': 'emr-5.2.0', 63 | 'VisibleToAllUsers': True, 64 | 'Configurations': [{'Classification': 'spark', 65 | 'Properties': {'maximizeResourceAllocation': 'true'}}] 66 | } 67 | 68 | client = boto3.client('emr', region_name=AWS_REGION_NAME) 69 | client.run_job_flow(**config) 70 | 71 | 72 | @moto.mock_emr 73 | def test_emr_cluster_config_with_bootstrap(): 74 | config = emr_config('emr-5.2.0', 75 | instance_type_master='m4.large', 76 | keep_alive=False, 77 | instance_type_core='m4.2xlarge', 78 | instance_type_task='m4.2xlarge', 79 | num_core=1, 80 | num_task=1, 81 | bid_price_task='0.1', 82 | name="Test SparkSteps", 83 | bootstrap_script='s3://bucket/bootstrap-actions.sh') 84 | assert config == {'Instances': 85 | {'InstanceGroups': [{'InstanceCount': 1, # NOQA: E127 86 | 'InstanceRole': 'MASTER', 87 | 'InstanceType': 'm4.large', 88 | 'Market': 'ON_DEMAND', 89 | 'Name': 'Master Node'}, 90 | {'InstanceCount': 1, 91 | 'InstanceRole': 'CORE', 92 | 'InstanceType': 'm4.2xlarge', 93 | 'Market': 'ON_DEMAND', 94 | 'Name': 'Core Nodes'}, 95 | {'BidPrice': '0.1', 96 | 'InstanceCount': 1, 97 | 'InstanceRole': 'TASK', 98 | 'InstanceType': 'm4.2xlarge', 99 | 'Market': 'SPOT', 100 | 'Name': 'Task Nodes'}], 101 | 'KeepJobFlowAliveWhenNoSteps': False, 102 | 
'TerminationProtected': False 103 | }, 104 | 'Applications': [{'Name': 'Hadoop'}, {'Name': 'Spark'}], 105 | 'BootstrapActions': [{'Name': 'bootstrap', 106 | 'ScriptBootstrapAction': {'Path': 's3://bucket/bootstrap-actions.sh'}}], 107 | 'Name': 'Test SparkSteps', 108 | 'JobFlowRole': 'EMR_EC2_DefaultRole', 109 | 'ReleaseLabel': 'emr-5.2.0', 110 | 'VisibleToAllUsers': True, 111 | 'ServiceRole': 'EMR_DefaultRole'} 112 | 113 | client = boto3.client('emr', region_name=AWS_REGION_NAME) 114 | client.run_job_flow(**config) 115 | 116 | 117 | @moto.mock_emr 118 | def test_emr_cluster_config_with_defaults(): 119 | config = emr_config('emr-5.2.0', 120 | instance_type_master='m4.large', 121 | keep_alive=False, 122 | instance_type_core='m4.2xlarge', 123 | instance_type_task='m4.2xlarge', 124 | num_core=1, 125 | num_task=1, 126 | bid_price_task='0.1', 127 | name="Test SparkSteps", 128 | defaults=['spark-defaults', 'spark.speculation=false', 129 | 'yarn-site', 'yarn.nodemanager.vmem-check-enabled=true']) 130 | print(config['Configurations']) 131 | assert config == { 132 | 'Instances': { 133 | 'InstanceGroups': [{'InstanceCount': 1, # NOQA: E127 134 | 'InstanceRole': 'MASTER', 135 | 'InstanceType': 'm4.large', 136 | 'Market': 'ON_DEMAND', 137 | 'Name': 'Master Node'}, 138 | {'InstanceCount': 1, 139 | 'InstanceRole': 'CORE', 140 | 'InstanceType': 'm4.2xlarge', 141 | 'Market': 'ON_DEMAND', 142 | 'Name': 'Core Nodes'}, 143 | {'BidPrice': '0.1', 144 | 'InstanceCount': 1, 145 | 'InstanceRole': 'TASK', 146 | 'InstanceType': 'm4.2xlarge', 147 | 'Market': 'SPOT', 148 | 'Name': 'Task Nodes'}], 149 | 'KeepJobFlowAliveWhenNoSteps': False, 150 | 'TerminationProtected': False 151 | }, 152 | 'Applications': [{'Name': 'Hadoop'}, {'Name': 'Spark'}], 153 | 'Configurations': [ 154 | { 155 | 'Classification': 'spark-defaults', 156 | 'Properties': { 157 | 'spark.speculation': 'false' 158 | } 159 | }, 160 | { 161 | 'Classification': 'yarn-site', 162 | 'Properties': { 163 | 'yarn.nodemanager.vmem-check-enabled': 'true' 164 | } 165 | } 166 | ], 167 | 'Name': 'Test SparkSteps', 168 | 'JobFlowRole': 'EMR_EC2_DefaultRole', 169 | 'ReleaseLabel': 'emr-5.2.0', 170 | 'VisibleToAllUsers': True, 171 | 'ServiceRole': 'EMR_DefaultRole' 172 | } 173 | 174 | client = boto3.client('emr', region_name=AWS_REGION_NAME) 175 | client.run_job_flow(**config) 176 | 177 | 178 | def test_emr_spot_cluster(): 179 | config = emr_config('emr-5.2.0', 180 | instance_type_master='m4.large', 181 | keep_alive=False, 182 | instance_type_core='c3.8xlarge', 183 | instance_type_task='c3.8xlarge', 184 | num_core=2, 185 | num_task=4, 186 | bid_price_master='0.05', 187 | bid_price_core='0.25', 188 | bid_price_task='0.1', 189 | name="Test SparkSteps", 190 | bootstrap_script='s3://bucket/bootstrap-actions.sh') 191 | assert config == {'Instances': 192 | {'InstanceGroups': [{'InstanceCount': 1, # NOQA: E127 193 | 'InstanceRole': 'MASTER', 194 | 'InstanceType': 'm4.large', 195 | 'Market': 'SPOT', 196 | 'BidPrice': '0.05', 197 | 'Name': 'Master Node'}, 198 | {'BidPrice': '0.25', 199 | 'InstanceCount': 2, 200 | 'InstanceRole': 'CORE', 201 | 'InstanceType': 'c3.8xlarge', 202 | 'Market': 'SPOT', 203 | 'Name': 'Core Nodes'}, 204 | {'BidPrice': '0.1', 205 | 'InstanceCount': 4, 206 | 'InstanceRole': 'TASK', 207 | 'InstanceType': 'c3.8xlarge', 208 | 'Market': 'SPOT', 209 | 'Name': 'Task Nodes'}], 210 | 'KeepJobFlowAliveWhenNoSteps': False, 211 | 'TerminationProtected': False 212 | }, 213 | 'Applications': [{'Name': 'Hadoop'}, {'Name': 'Spark'}], 214 | 'BootstrapActions': [{'Name': 
'bootstrap', 215 | 'ScriptBootstrapAction': {'Path': 's3://bucket/bootstrap-actions.sh'}}], 216 | 'Name': 'Test SparkSteps', 217 | 'JobFlowRole': 'EMR_EC2_DefaultRole', 218 | 'ReleaseLabel': 'emr-5.2.0', 219 | 'VisibleToAllUsers': True, 220 | 'ServiceRole': 'EMR_DefaultRole'} 221 | 222 | 223 | def test_emr_ebs_storage(): 224 | config = emr_config('emr-5.2.0', 225 | instance_type_master='m4.large', 226 | keep_alive=False, 227 | instance_type_core='c3.8xlarge', 228 | instance_type_task='c3.8xlarge', 229 | ebs_volume_size_core=100, 230 | ebs_volume_type_core='gp2', 231 | ebs_volumes_per_core=2, 232 | ebs_volume_size_task=10, 233 | ebs_volume_type_task='io1', 234 | ebs_optimized_task=True, 235 | num_core=2, 236 | num_task=4, 237 | bid_price_master='0.05', 238 | bid_price_core='0.25', 239 | bid_price_task='0.1', 240 | name="Test SparkSteps", 241 | bootstrap_script='s3://bucket/bootstrap-actions.sh') 242 | assert config == {'Instances': 243 | {'InstanceGroups': [{'InstanceCount': 1, # NOQA: E127 244 | 'InstanceRole': 'MASTER', 245 | 'InstanceType': 'm4.large', 246 | 'Market': 'SPOT', 247 | 'BidPrice': '0.05', 248 | 'Name': 'Master Node'}, 249 | {'BidPrice': '0.25', 250 | 'InstanceCount': 2, 251 | 'InstanceRole': 'CORE', 252 | 'InstanceType': 'c3.8xlarge', 253 | 'Market': 'SPOT', 254 | 'Name': 'Core Nodes', 255 | 'EbsConfiguration': { 256 | 'EbsBlockDeviceConfigs': [{ 257 | 'VolumeSpecification': { 258 | 'VolumeType': 'gp2', 259 | 'SizeInGB': 100 260 | }, 261 | 'VolumesPerInstance': 2 262 | }], 263 | 'EbsOptimized': False 264 | }}, 265 | {'BidPrice': '0.1', 266 | 'InstanceCount': 4, 267 | 'InstanceRole': 'TASK', 268 | 'InstanceType': 'c3.8xlarge', 269 | 'Market': 'SPOT', 270 | 'Name': 'Task Nodes', 271 | 'EbsConfiguration': { 272 | 'EbsBlockDeviceConfigs': [{ 273 | 'VolumeSpecification': { 274 | 'VolumeType': 'io1', 275 | 'SizeInGB': 10 276 | }, 277 | 'VolumesPerInstance': 1 278 | }], 279 | 'EbsOptimized': True 280 | }}], 281 | 'KeepJobFlowAliveWhenNoSteps': False, 282 | 'TerminationProtected': False 283 | }, 284 | 'Applications': [{'Name': 'Hadoop'}, {'Name': 'Spark'}], 285 | 'BootstrapActions': [{'Name': 'bootstrap', 286 | 'ScriptBootstrapAction': {'Path': 's3://bucket/bootstrap-actions.sh'}}], 287 | 'Name': 'Test SparkSteps', 288 | 'JobFlowRole': 'EMR_EC2_DefaultRole', 289 | 'ReleaseLabel': 'emr-5.2.0', 290 | 'VisibleToAllUsers': True, 291 | 'ServiceRole': 'EMR_DefaultRole'} 292 | 293 | 294 | @moto.mock_s3 295 | def test_setup_steps(): 296 | s3 = boto3.resource('s3', region_name=AWS_REGION_NAME) 297 | s3.create_bucket(Bucket=TEST_BUCKET) 298 | steps = (setup_steps(s3, 299 | TEST_BUCKET, 300 | TEST_BUCKET_PATH, 301 | EPISODES_APP, 302 | submit_args="--jars /home/hadoop/dir/test.jar".split(), 303 | app_args="--input /home/hadoop/episodes.avro".split(), 304 | uploads=[LIB_DIR, EPISODES_AVRO]) 305 | ) 306 | assert steps == [ 307 | {'HadoopJarStep': {'Jar': 'command-runner.jar', 308 | 'Args': ['aws', 's3', 'cp', 309 | 's3://sparksteps-test/sparksteps/sources/dir.zip', 310 | '/home/hadoop/']}, 311 | 'ActionOnFailure': 'CANCEL_AND_WAIT', 312 | 'Name': 'Copy dir.zip'}, 313 | {'HadoopJarStep': {'Jar': 'command-runner.jar', 314 | 'Args': ['unzip', '-o', '/home/hadoop/dir.zip', 315 | '-d', '/home/hadoop/dir']}, 316 | 'ActionOnFailure': 'CANCEL_AND_WAIT', 317 | 'Name': 'Unzip dir.zip'}, 318 | {'HadoopJarStep': {'Jar': 'command-runner.jar', 319 | 'Args': ['aws', 's3', 'cp', 320 | 's3://sparksteps-test/sparksteps/sources/episodes.avro', 321 | '/home/hadoop/']}, 322 | 'ActionOnFailure': 'CANCEL_AND_WAIT', 
323 | 'Name': 'Copy episodes.avro'}, 324 | {'HadoopJarStep': {'Jar': 'command-runner.jar', 325 | 'Args': ['aws', 's3', 'cp', 326 | 's3://sparksteps-test/sparksteps/sources/episodes.py', 327 | '/home/hadoop/']}, 328 | 'ActionOnFailure': 'CANCEL_AND_WAIT', 'Name': 'Copy episodes.py'}, 329 | {'HadoopJarStep': {'Jar': 'command-runner.jar', 330 | 'Args': ['spark-submit', '--jars', 331 | '/home/hadoop/dir/test.jar', 332 | '/home/hadoop/episodes.py', '--input', 333 | '/home/hadoop/episodes.avro']}, 334 | 'ActionOnFailure': 'CANCEL_AND_WAIT', 335 | 'Name': 'Run episodes.py'}] 336 | 337 | 338 | @moto.mock_s3 339 | def test_setup_steps_non_existing_upload_file(): 340 | s3 = boto3.resource('s3', region_name=AWS_REGION_NAME) 341 | s3.create_bucket(Bucket=TEST_BUCKET) 342 | dne_file_path = os.path.join(DATA_DIR, 'does_not_exist.jar') 343 | try: 344 | setup_steps(s3, 345 | TEST_BUCKET, 346 | TEST_BUCKET_PATH, 347 | EPISODES_APP, 348 | submit_args="--jars /home/hadoop/dir/test.jar".split(), 349 | app_args="--input /home/hadoop/episodes.avro".split(), 350 | uploads=[dne_file_path]) 351 | except FileNotFoundError as e: 352 | assert str(e) == '{} does not exist (does not reference a valid file or path).'.format(dne_file_path) 353 | return 354 | assert False, 'Expected FileNotFoundError to be raised when `--uploads` parameter contains path to non-existing file or directory.' # NOQA: E501 355 | 356 | 357 | @moto.mock_s3 358 | def test_setup_steps_with_bucket_path(): 359 | s3 = boto3.resource('s3', region_name=AWS_REGION_NAME) 360 | s3.create_bucket(Bucket=TEST_BUCKET) 361 | steps = (setup_steps(s3, 362 | TEST_BUCKET, 363 | 'custom/path/prefix/', 364 | EPISODES_APP, 365 | submit_args="--jars /home/hadoop/dir/test.jar".split(), 366 | app_args="--input /home/hadoop/episodes.avro".split(), 367 | uploads=[LIB_DIR, EPISODES_AVRO, 's3://custom-bucket/custom/path/s3_file.py']) 368 | ) 369 | assert steps == [ 370 | {'HadoopJarStep': {'Jar': 'command-runner.jar', 371 | 'Args': ['aws', 's3', 'cp', 372 | 's3://sparksteps-test/custom/path/prefix/sources/dir.zip', 373 | '/home/hadoop/']}, 374 | 'ActionOnFailure': 'CANCEL_AND_WAIT', 375 | 'Name': 'Copy dir.zip'}, 376 | {'HadoopJarStep': {'Jar': 'command-runner.jar', 377 | 'Args': ['unzip', '-o', '/home/hadoop/dir.zip', 378 | '-d', '/home/hadoop/dir']}, 379 | 'ActionOnFailure': 'CANCEL_AND_WAIT', 380 | 'Name': 'Unzip dir.zip'}, 381 | {'HadoopJarStep': {'Jar': 'command-runner.jar', 382 | 'Args': ['aws', 's3', 'cp', 383 | 's3://sparksteps-test/custom/path/prefix/sources/episodes.avro', 384 | '/home/hadoop/']}, 385 | 'ActionOnFailure': 'CANCEL_AND_WAIT', 386 | 'Name': 'Copy episodes.avro'}, 387 | {'HadoopJarStep': {'Jar': 'command-runner.jar', 388 | 'Args': ['aws', 's3', 'cp', 389 | 's3://custom-bucket/custom/path/s3_file.py', 390 | '/home/hadoop/']}, 391 | 'ActionOnFailure': 'CANCEL_AND_WAIT', 'Name': 'Copy s3_file.py'}, 392 | {'HadoopJarStep': {'Jar': 'command-runner.jar', 393 | 'Args': ['aws', 's3', 'cp', 394 | 's3://sparksteps-test/custom/path/prefix/sources/episodes.py', 395 | '/home/hadoop/']}, 396 | 'ActionOnFailure': 'CANCEL_AND_WAIT', 'Name': 'Copy episodes.py'}, 397 | {'HadoopJarStep': {'Jar': 'command-runner.jar', 398 | 'Args': ['spark-submit', '--jars', 399 | '/home/hadoop/dir/test.jar', 400 | '/home/hadoop/episodes.py', '--input', 401 | '/home/hadoop/episodes.avro']}, 402 | 'ActionOnFailure': 'CANCEL_AND_WAIT', 403 | 'Name': 'Run episodes.py'}] 404 | 405 | 406 | def test_s3_dist_cp_step(): 407 | splitted = shlex.split( 408 | "--s3Endpoint=s3.amazonaws.com 
--src=s3://mybucket/logs/j-3GYXXXXXX9IOJ/node/ --dest=hdfs:///output --srcPattern=.*[a-zA-Z,]+") # NOQA: E501 409 | assert S3DistCp(splitted).step == { 410 | 'ActionOnFailure': 'CONTINUE', 411 | 'HadoopJarStep': { 412 | 'Args': ['s3-dist-cp', 413 | '--s3Endpoint=s3.amazonaws.com', 414 | '--src=s3://mybucket/logs/j-3GYXXXXXX9IOJ/node/', 415 | '--dest=hdfs:///output', 416 | '--srcPattern=.*[a-zA-Z,]+'], 417 | 'Jar': 'command-runner.jar'}, 418 | 'Name': 'S3DistCp step' 419 | } 420 | --------------------------------------------------------------------------------
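The tests above exercise the same building blocks that sparksteps/__main__.py wires together: emr_config() produces a run_job_flow payload, setup_steps() turns a local application, its uploads, and spark-submit arguments into EMR step definitions, and S3DistCp wraps an s3-dist-cp invocation. The sketch below shows one way those pieces could be driven directly from Python instead of the command line. It is a minimal illustration, not code from the repository: the region, bucket name, S3 prefix, local file paths, and the `from sparksteps.poll import wait_for_step_complete` import path are assumptions made for the example.

# example_driver.py -- hypothetical script mirroring the flow of sparksteps.__main__.main()
import shlex

import boto3

from sparksteps.cluster import emr_config
from sparksteps.steps import setup_steps, S3DistCp
from sparksteps.poll import wait_for_step_complete  # import path assumed for this sketch

emr = boto3.client('emr', region_name='us-east-1')      # region chosen for illustration
s3 = boto3.resource('s3', region_name='us-east-1')

# Build the run_job_flow payload; the keyword arguments mirror the test cases above.
config = emr_config('emr-5.2.0',
                    instance_type_master='m4.large',
                    instance_type_core='m4.2xlarge',
                    instance_type_task='m4.2xlarge',
                    num_core=1,
                    num_task=1,
                    bid_price_task='0.1',
                    keep_alive=False,
                    name='Example SparkSteps cluster')
cluster_id = emr.run_job_flow(**config)['JobFlowId']

# Upload the application and its dependencies, and build the copy/unzip/spark-submit steps.
# Bucket, prefix, and local paths below are placeholders.
emr_steps = setup_steps(s3,
                        'my-bucket',
                        'sparksteps/',
                        'episodes.py',
                        submit_args=shlex.split('--jars /home/hadoop/dir/test.jar'),
                        app_args=shlex.split('--input /home/hadoop/episodes.avro'),
                        uploads=['dir', 'episodes.avro'])

# Optionally append an s3-dist-cp step, built the same way test_s3_dist_cp_step() builds one.
emr_steps.append(S3DistCp(shlex.split(
    '--src=hdfs:///output --dest=s3://my-bucket/output')).step)

response = emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=emr_steps)

# Poll until the last submitted step finishes, as main() does when a wait interval is given.
wait_for_step_complete(emr, cluster_id, response['StepIds'][-1], sleep_interval_s=60)

Run against real AWS credentials this would launch a billable cluster (terminated after the steps finish, since keep_alive=False), which is why the tests above exercise emr_config() and setup_steps() against moto's mocked EMR and S3 endpoints instead.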