├── tests ├── __init__.py ├── data │ ├── dir │ │ └── test.jar │ ├── episodes.avro │ └── episodes.py ├── test_pricing.py ├── test_parser.py ├── test_poll.py └── test_sparksteps.py ├── sparksteps ├── __init__.py ├── poll.py ├── cluster.py ├── steps.py ├── pricing.py └── __main__.py ├── docs ├── overview.rst ├── modules.rst ├── index.rst ├── sparksteps.rst ├── Makefile ├── make.bat └── conf.py ├── .coveragerc ├── examples ├── episodes.avro ├── lib │ └── spark-avro_2.10-2.0.2-custom.jar ├── episodes.py └── wordcount.py ├── MANIFEST.in ├── requirements.txt ├── setup.cfg ├── .github └── workflows │ └── tests.yml ├── bootstrap └── install-jupyter-notebook.sh ├── tox.ini ├── setup.py ├── Makefile ├── CHANGELOG.rst ├── .gitignore ├── CODE_OF_CONDUCT.md ├── README.rst └── LICENSE /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sparksteps/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/data/dir/test.jar: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/overview.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | # .coveragerc to control coverage.py 2 | [run] 3 | branch=False 4 | source=sparksteps 5 | -------------------------------------------------------------------------------- /examples/episodes.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jwplayer/sparksteps/HEAD/examples/episodes.avro -------------------------------------------------------------------------------- /tests/data/episodes.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jwplayer/sparksteps/HEAD/tests/data/episodes.avro -------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | sparksteps 2 | ========== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 4 6 | 7 | sparksteps 8 | -------------------------------------------------------------------------------- /examples/lib/spark-avro_2.10-2.0.2-custom.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jwplayer/sparksteps/HEAD/examples/lib/spark-avro_2.10-2.0.2-custom.jar -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE README.rst CHANGELOG.rst requirements.txt tox.ini 2 | prune examples* 3 | prune bootstrap* 4 | recursive-include tests * 5 | recursive-exclude docs/_build * -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | boto3==1.14.46 2 | botocore==1.17.46 3 | docutils==0.15.2 4 | jmespath==0.10.0 5 | polling==0.3.0 6 | python-dateutil==2.8.1 7 | s3transfer==0.3.3 8 | six==1.15.0 9 | urllib3==1.26.5 10 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | SparkSteps: Launch Spark jobs on AWS EMR 2 | ======================================== 3 | 4 | .. only:: html 5 | 6 | :Release: |version| 7 | :Date: |today| 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | 12 | overview 13 | 14 | 15 | Indices and tables 16 | ================== 17 | 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | * :ref:`search` 21 | 22 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file=README.rst 3 | 4 | [aliases] 5 | test=pytest 6 | 7 | [tool:pytest] 8 | addopts=-vv --flake8 -m 'not integration' 9 | markers= 10 | integration 11 | 12 | [flake8] 13 | max-line-length=120 14 | exclude= 15 | .git, 16 | __pycache__, 17 | docs/conf.py, 18 | old, 19 | build, 20 | dist 21 | 22 | [bdist_wheel] 23 | universal=1 24 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | on: 3 | push: 4 | branches: 5 | - master 6 | pull_request: 7 | jobs: 8 | test: 9 | name: Test sparksteps on Python ${{ matrix.python_version }} 10 | strategy: 11 | matrix: 12 | python_version: [3.6, 3.7, 3.8] 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: Set up Python ${{ matrix.python_version }} 17 | uses: actions/setup-python@v2 18 | with: 19 | python-version: ${{ matrix.python_version }} 20 | - name: Run tests on Python ${{ matrix.python_version }} 21 | run: make test 22 | -------------------------------------------------------------------------------- /docs/sparksteps.rst: -------------------------------------------------------------------------------- 1 | sparksteps package 2 | ================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | sparksteps.cluster module 8 | ------------------------- 9 | 10 | .. automodule:: sparksteps.cluster 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | sparksteps.pricing module 16 | ------------------------- 17 | 18 | .. 
automodule:: sparksteps.pricing 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | sparksteps.steps module 24 | ----------------------- 25 | 26 | .. automodule:: sparksteps.steps 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | 32 | Module contents 33 | --------------- 34 | 35 | .. automodule:: sparksteps 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: 39 | -------------------------------------------------------------------------------- /bootstrap/install-jupyter-notebook.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x -e 3 | 4 | 5 | #Installing iPython Notebook 6 | if grep isMaster /mnt/var/lib/info/instance.json | grep true; 7 | then 8 | cd /home/hadoop 9 | sudo pip install virtualenv 10 | mkdir Jupyter 11 | cd Jupyter 12 | /usr/bin/virtualenv -p /usr/bin/python2.7 venv 13 | source venv/bin/activate 14 | 15 | #Install jupyter and dependency 16 | pip install --upgrade pip 17 | pip install jupyter requests numpy matplotlib s3cmd 18 | 19 | #Create profile 20 | # jupyter profile create default 21 | jupyter notebook --generate-config 22 | 23 | #Run on master /slave based on configuration 24 | 25 | echo "c = get_config()" > /home/hadoop/.jupyter/jupyter_notebook_config.py 26 | echo "c.NotebookApp.ip = '*'" >> /home/hadoop/.jupyter/jupyter_notebook_config.py 27 | echo "c.NotebookApp.open_browser = False" >> /home/hadoop/.jupyter/jupyter_notebook_config.py 28 | echo "c.NotebookApp.port = 8192" >> /home/hadoop/.jupyter/jupyter_notebook_config.py 29 | 30 | fi 31 | -------------------------------------------------------------------------------- /examples/episodes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """episodes.py test script. 3 | 4 | Prompt parameters: 5 | help (-h): argparse help 6 | input (-i): input path 7 | 8 | Examples: 9 | $ spark-submit \ 10 | --packages com.databricks:spark-avro_2.10:2.0.1 \ 11 | episodes.py \ 12 | --input episodes.avro 13 | 14 | """ 15 | 16 | import ntpath 17 | from argparse import RawDescriptionHelpFormatter, ArgumentParser 18 | import subprocess 19 | 20 | from pyspark import SparkContext 21 | from pyspark.sql import SQLContext 22 | 23 | parser = ArgumentParser(description=__doc__, 24 | formatter_class=RawDescriptionHelpFormatter) 25 | parser.add_argument('--input', '-i', required=True) 26 | args = parser.parse_args() 27 | 28 | if __name__ == "__main__": 29 | in_path = args.input 30 | 31 | filename = ntpath.basename(in_path) 32 | subprocess.call(["hadoop", "fs", "-put", in_path, filename]) 33 | 34 | sc = SparkContext(appName="Episodes") 35 | sqlContext = SQLContext(sc) 36 | df = sqlContext.read.format("com.databricks.spark.avro").load(filename) 37 | df.first() 38 | -------------------------------------------------------------------------------- /tests/data/episodes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """episodes.py test script. 
3 | 4 | Prompt parameters: 5 | help (-h): argparse help 6 | input (-i): input path 7 | 8 | Examples: 9 | $ spark-submit \ 10 | --packages com.databricks:spark-avro_2.10:2.0.1 \ 11 | episodes.py \ 12 | --input episodes.avro 13 | 14 | """ 15 | 16 | import ntpath 17 | from argparse import RawDescriptionHelpFormatter, ArgumentParser 18 | import subprocess 19 | 20 | from pyspark import SparkContext 21 | from pyspark.sql import SQLContext 22 | 23 | parser = ArgumentParser(description=__doc__, 24 | formatter_class=RawDescriptionHelpFormatter) 25 | parser.add_argument('--input', '-i', required=True) 26 | args = parser.parse_args() 27 | 28 | if __name__ == "__main__": 29 | in_path = args.input 30 | 31 | filename = ntpath.basename(in_path) 32 | subprocess.call(["hadoop", "fs", "-put", in_path, filename]) 33 | 34 | sc = SparkContext(appName="Episodes") 35 | sqlContext = SQLContext(sc) 36 | df = sqlContext.read.format("com.databricks.spark.avro").load(filename) 37 | df.first() 38 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = docs,flake8,py36,py37,py38 3 | skipsdist=True 4 | 5 | [testenv] 6 | usedevelop=True 7 | deps=-r{toxinidir}/requirements.txt 8 | commands= 9 | {envpython} setup.py test 10 | setenv= 11 | PYTHONWARNINGS=always::DeprecationWarning 12 | passenv=TRAVIS 13 | 14 | [testenv:flake8] 15 | usedevelop=False 16 | skip_install=True 17 | deps= 18 | flake8 19 | commands= 20 | flake8 --version 21 | flake8 setup.py sparksteps 22 | 23 | [testenv:docs] 24 | # only work if first run python setup.py develop 25 | commands= 26 | rm -rf {toxinidir}/docs/_build 27 | make -C {toxinidir}/docs html 28 | whitelist_externals= 29 | rm 30 | make 31 | 32 | [testenv:upload] 33 | deps=wheel 34 | twine 35 | commands= 36 | python setup.py clean --all rotate -k - -m .whl,.tar.gz,.zip 37 | python setup.py -q egg_info 38 | python setup.py -q sdist --formats zip bdist_wheel register 39 | 40 | [testenv:dist] 41 | deps= wheel 42 | whitelist_externals = rm 43 | commands= 44 | python setup.py -q clean --all 45 | python setup.py -q rotate -k 0 -m .egg,.zip,.whl,.tar.gz 46 | python setup.py -q egg_info 47 | python setup.py -q sdist --formats zip,bztar bdist_wheel upload 48 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Distutils setup file, used to install or test 'sparksteps'.""" 3 | import textwrap 4 | 5 | from setuptools import setup, find_packages 6 | 7 | with open('README.rst') as f: 8 | readme = f.read() 9 | 10 | setup( 11 | name='sparksteps', 12 | description='Workflow tool to launch Spark jobs on AWS EMR', 13 | long_description=readme, 14 | packages=find_packages(exclude=['tests', 'examples', 'bootstrap']), 15 | use_scm_version=True, 16 | author='Kamil Sindi', 17 | author_email='kamil@jwplayer.com', 18 | url='https://github.com/jwplayer/sparksteps', 19 | keywords='aws emr pyspark spark boto'.split(), 20 | license='Apache License 2.0', 21 | install_requires=[ 22 | 'boto3>=1.3.1', 23 | 'polling==0.3.0' 24 | ], 25 | setup_requires=[ 26 | 'setuptools_scm', 27 | 'sphinx_rtd_theme', 28 | ], 29 | include_package_data=True, 30 | zip_safe=False, 31 | entry_points={ 32 | 'console_scripts': [ 33 | 'sparksteps=sparksteps.__main__:main' 34 | ] 35 | }, 36 | classifiers=textwrap.dedent(""" 37 | Development Status :: 4 - Beta 
38 | Intended Audience :: Developers 39 | License :: OSI Approved :: Apache Software License 40 | Environment :: Console 41 | Programming Language :: Python :: 3.6 42 | Programming Language :: Python :: 3.7 43 | Programming Language :: Python :: 3.8 44 | """).strip().splitlines(), 45 | python_requires='>=3.6' 46 | ) 47 | -------------------------------------------------------------------------------- /examples/wordcount.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | from __future__ import print_function 19 | 20 | import sys 21 | from operator import add 22 | 23 | from pyspark.sql import SparkSession 24 | 25 | 26 | if __name__ == "__main__": 27 | if len(sys.argv) != 2: 28 | print("Usage: wordcount ", file=sys.stderr) 29 | exit(-1) 30 | 31 | spark = SparkSession\ 32 | .builder\ 33 | .appName("PythonWordCount")\ 34 | .getOrCreate() 35 | 36 | lines = spark.read.text(sys.argv[1]).rdd.map(lambda r: r[0]) 37 | counts = lines.flatMap(lambda x: x.split(' ')) \ 38 | .map(lambda x: (x, 1)) \ 39 | .reduceByKey(add) 40 | output = counts.collect() 41 | for (word, count) in output: 42 | print("%s: %i" % (word, count)) 43 | 44 | spark.stop() 45 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean-pyc clean-build docs clean build install install-all version 2 | 3 | help: 4 | @echo "clean-build - remove build artifacts" 5 | @echo "clean-test - remove Python file artifacts" 6 | @echo "clean-eggs - remove cached eggs" 7 | @echo "build - build package" 8 | @echo "lint - check style with flake8" 9 | @echo "test - run tests quickly with the default Python" 10 | @echo "test-all - run tests on every Python version with tox" 11 | @echo "docs - generate Sphinx HTML documentation, including API docs" 12 | @echo "release - package and upload a release" 13 | @echo "dist - package" 14 | 15 | clean: clean-build clean-test clean-eggs 16 | rm -rf htmlcov/ 17 | 18 | clean-build: 19 | rm -rf build/ 20 | rm -rf dist/ 21 | rm -rf *.egg-info 22 | 23 | .PHONY: clean-test 24 | clean-test: 25 | find . 
| grep -E "(__pycache__|\.pyc|\.pyo$$)" | xargs rm -rf 26 | rm -rf .pytest_cache/ 27 | 28 | .PHONY: clean-eggs 29 | clean-eggs: 30 | rm -rf .eggs/ 31 | 32 | .PHONY: build 33 | build: clean-build clean-eggs 34 | python3 setup.py build_ext --inplace 35 | 36 | install: clean-build 37 | python3 setup.py install 38 | 39 | install-all: 40 | pip install -e .[all] 41 | 42 | lint: 43 | pytest --flake8 sparksteps tests 44 | 45 | test: install-all 46 | pip install -U \ 47 | pytest \ 48 | pytest-flake8 \ 49 | moto 50 | python3 -m pytest 51 | 52 | test-all: 53 | tox 54 | 55 | version: 56 | python setup.py --version 57 | 58 | docs: 59 | rm -f docs/sparksteps.rst 60 | rm -f docs/modules.rst 61 | sphinx-apidoc -o docs/ sparksteps 62 | $(MAKE) -C docs clean 63 | $(MAKE) -C docs html 64 | xdg-open docs/_build/html/index.html 65 | 66 | .PHONY: release 67 | release: clean build 68 | python3 setup.py sdist bdist_wheel 69 | twine check dist/* 70 | twine upload --verbose dist/* 71 | 72 | .PHONY: dist 73 | dist: clean build 74 | python3 setup.py sdist bdist_wheel 75 | twine check dist/* 76 | -------------------------------------------------------------------------------- /sparksteps/poll.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Utilities for polling for cluster status to determine if it's in a terminal state. 4 | """ 5 | import logging 6 | from polling import poll 7 | 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | NON_TERMINAL_STATES = frozenset(['PENDING', 'RUNNING', 'CONTINUE', 'CANCEL_PENDING']) 12 | FAILED_STATE = frozenset(['CANCELLED', 'FAILED', 'INTERRUPTED']) 13 | 14 | 15 | def failure_message_from_response(response): 16 | """ 17 | Given EMR response, returns a descriptive error message 18 | """ 19 | fail_details = response['Step']['Status'].get('FailureDetails') 20 | if fail_details: 21 | return 'for reason {} with message {} and log file {}'\ 22 | .format( 23 | fail_details.get('Reason'), 24 | fail_details.get('Message'), 25 | fail_details.get('LogFile') 26 | ) 27 | 28 | 29 | def is_step_complete(emr_client, jobflow_id, step_id): 30 | """ 31 | Will query EMR for step status, returns True if complete, False otherwise 32 | """ 33 | response = emr_client.describe_step(ClusterId=jobflow_id, StepId=step_id) 34 | 35 | if not response['ResponseMetadata']['HTTPStatusCode'] == 200: 36 | logger.info('Bad HTTP response: %s', response) 37 | return False 38 | 39 | state = response['Step']['Status']['State'] 40 | logger.info('Job flow currently %s', state) 41 | 42 | if state in NON_TERMINAL_STATES: 43 | return False 44 | 45 | if state in FAILED_STATE: 46 | final_message = 'EMR job failed' 47 | failure_message = failure_message_from_response(response) 48 | if failure_message: 49 | final_message += ' ' + failure_message 50 | raise Exception(final_message) 51 | 52 | return True 53 | 54 | 55 | def wait_for_step_complete(emr_client, jobflow_id, step_id, sleep_interval_s): 56 | """ 57 | Will poll EMR until provided step has a terminal status 58 | """ 59 | poll( 60 | is_step_complete, 61 | args=(emr_client, jobflow_id, step_id), 62 | step=sleep_interval_s, 63 | poll_forever=True 64 | ) 65 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | .. 
:changelog: 2 | 3 | Changelog 4 | ========= 5 | 6 | Releases 7 | -------- 8 | 9 | v3.0.1 (2020-12-23) 10 | ~~~~~~~~~~~~~~~~~~~ 11 | 12 | * Fixed an issue where `get_bid_price` would always base the instance bid price on the zone with the lowest current instance price, even though the cluster may not be launched in that AZ. 13 | 14 | v3.0.0 (2020-08-20) 15 | ~~~~~~~~~~~~~~~~~~~ 16 | 17 | * Fix `determine_best_price` returning a spot price that would be below the current spot price in some conditions. 18 | * Dropped support for Python 3.5. 19 | 20 | 21 | v2.2.1 (2019-11-04) 22 | ~~~~~~~~~~~~~~~~~~~ 23 | 24 | * Fix `get_demand_price` returning 0.00 for various instance types. 25 | 26 | 27 | v2.2.0 (2019-09-19) 28 | ~~~~~~~~~~~~~~~~~~~ 29 | 30 | * Support S3 paths in the `uploads` CLI option. A copy step will be added to the EMR cluster which will copy into /home/hadoop from the provided remote path. 31 | * Add option `--service-role` to configure EMR service role beyond the default `EMR_DefaultRole`. 32 | 33 | 34 | v2.1.0 (2019-08-27) 35 | ~~~~~~~~~~~~~~~~~~~ 36 | 37 | * Add `wait` CLI option. When `--wait` is passed, waits for EMR cluster steps to complete before application exits, sleeping 150 seconds (default) between each poll attempt. An optional integer value can be passed to specify the polling interval to use, in seconds. 38 | 39 | 40 | v2.0.0 (2019-07-31) 41 | ~~~~~~~~~~~~~~~~~~~ 42 | 43 | * Add `s3-path` CLI argument to optionally configure the path prefix used when writing sparksteps related assets such as sources (file uploads) and logs. 44 | 45 | **NOTE:** This is a backwards incompatible change as `sources/` and `logs/` are now written to the location specified by the `s3-path` argument. 46 | Prior to this change logs were written to `s3://S3_BUCKET/logs/sparksteps` and uploads to `s3://S3_BUCKET/sparksteps/sources`. 47 | 48 | 49 | v1.1.1 (2019-07-22) 50 | ~~~~~~~~~~~~~~~~~~~ 51 | 52 | * Raise an error if one of the file or directory paths provided do not exist 53 | 54 | 55 | v1.1.0 (2019-07-13) 56 | ~~~~~~~~~~~~~~~~~~~ 57 | 58 | * Add `jobflow_role` CLI argument to configure cluster EC2 Instance Profile 59 | * Add `app-list` CLI argument to configure list of Applications installed on cluster 60 | 61 | 62 | v1.0.0 (2019-07-03) 63 | ~~~~~~~~~~~~~~~~~~~ 64 | 65 | * Drop support for Python 2 66 | * `defaults` CLI parameter value schema to support arbitrary classifications 67 | 68 | 69 | v0.4.0 (2017-01-03) 70 | ~~~~~~~~~~~~~~~~~~~ 71 | 72 | * First upload to PyPI. 73 | -------------------------------------------------------------------------------- /tests/test_pricing.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit/Integration Tests for the pricing module. 3 | 4 | Integration Tests (i.e tests that perform actual queries / make HTTP requests) 5 | are marked appropriately using PyTest markers. 6 | """ 7 | import pytest 8 | 9 | import boto3 10 | 11 | from sparksteps.pricing import get_bid_price, get_demand_price, determine_best_price, Zone 12 | 13 | # The price for an m4.large on-demand Linux instance in us-east-1. 14 | M4_LARGE_OD_PRICE = 0.100000 15 | 16 | 17 | @pytest.fixture 18 | def ec2(): 19 | """ 20 | In order to test pricing mechanics, we need to be able to make actual requests AWS. 21 | Since we're actually communicating with AWS here this makes this tests using this fixture 22 | more of an integration test than a unit test. 
23 | """ 24 | client = boto3.client('ec2') 25 | return client 26 | 27 | 28 | @pytest.fixture 29 | def pricing_client(): 30 | """ 31 | Boto3 Pricing Client. 32 | """ 33 | return boto3.client('pricing') 34 | 35 | 36 | @pytest.mark.integration 37 | class TestPricingIntegration: 38 | def test_get_demand_price(self, pricing_client): 39 | price = get_demand_price(pricing_client, 'm4.large') 40 | # Note: this test assumes that AWS doesn't 41 | # change their on-demand price. 42 | assert price == M4_LARGE_OD_PRICE 43 | 44 | def test_get_bid_price(self, ec2, pricing_client): 45 | bid_price, is_spot = get_bid_price(ec2, pricing_client, 'm4.large') 46 | if is_spot: 47 | assert bid_price > 0. 48 | else: 49 | assert bid_price == get_demand_price('us-east-1', 'm4.large') 50 | 51 | 52 | class TestPricing: 53 | def test_determine_best_spot_price(self): 54 | aws_zone = Zone('us-east-1d', 0.90, 0.83, (0.9+0.83) / 2, 0.8617) 55 | # on-demand price for c5d.9xlarge nodes in us-east-1 56 | demand_price = 1.728 57 | bid_price, use_spot = determine_best_price(demand_price, aws_zone) 58 | assert use_spot is True 59 | assert bid_price > aws_zone.current 60 | 61 | def test_determine_best_price(self): 62 | demand_price = 1.728 63 | aws_zone = Zone('us-east-1a', demand_price, demand_price, demand_price, demand_price) 64 | # If the spot price is very close to the on-demand price, 65 | # then we should just be using on-demand pricing instead. 66 | bid_price, use_spot = determine_best_price(demand_price, aws_zone) 67 | assert use_spot is False 68 | assert bid_price == demand_price 69 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 92 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 93 | 94 | # User-specific stuff: 95 | .idea/workspace.xml 96 | .idea/tasks.xml 97 | .idea/dictionaries 98 | .idea/vcs.xml 99 | .idea/jsLibraryMappings.xml 100 | 101 | # Sensitive or high-churn files: 102 | .idea/dataSources.ids 103 | .idea/dataSources.xml 104 | .idea/dataSources.local.xml 105 | .idea/sqlDataSources.xml 106 | .idea/dynamic.xml 107 | .idea/uiDesigner.xml 108 | 109 | # Gradle: 110 | .idea/gradle.xml 111 | .idea/libraries 112 | 113 | # Mongo Explorer plugin: 114 | .idea/mongoSettings.xml 115 | 116 | ## File-based project format: 117 | *.iws 118 | 119 | ## Plugin-specific files: 120 | 121 | # IntelliJ 122 | /out/ 123 | 124 | # mpeltonen/sbt-idea plugin 125 | .idea_modules/ 126 | 127 | # JIRA plugin 128 | atlassian-ide-plugin.xml 129 | 130 | # Crashlytics plugin (for Android Studio and IntelliJ) 131 | com_crashlytics_export_strings.xml 132 | crashlytics.properties 133 | crashlytics-build.properties 134 | fabric.properties 135 | 136 | !examples/lib 137 | 138 | .idea/ 139 | -------------------------------------------------------------------------------- /tests/test_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Test Parser.""" 3 | import shlex 4 | from sparksteps import __main__ 5 | 6 | 7 | def test_parser(): 8 | parser = __main__.create_parser() 9 | cmd_args_str = """episodes.py \ 10 | --jobflow-role MyCustomRole \ 11 | --s3-bucket my-bucket \ 12 | --aws-region us-east-1 \ 13 | --release-label emr-4.7.0 \ 14 | --uploads examples/dir examples/episodes.avro \ 15 | --submit-args="--jars /home/hadoop/lib/spark-avro_2.10-2.0.2.jar" \ 16 | --app-args="--input /home/hadoop/episodes.avro" \ 17 | --app-list Hadoop Hive Spark \ 18 | --num-core 1 \ 19 | --tags Name=MyName CostCenter=MyCostCenter \ 20 | --defaults spark-defaults key=value another_key=another_value \ 21 | --maximize-resource-allocation \ 22 | --debug \ 23 | --wait 24 | """ 25 | args = __main__.parse_cli_args(parser, args=shlex.split(cmd_args_str)) 26 | assert args['app'] == 'episodes.py' 27 | assert args['jobflow_role'] == 'MyCustomRole' 28 | assert args['s3_bucket'] == 'my-bucket' 29 | assert args['app_args'] == ['--input', '/home/hadoop/episodes.avro'] 30 | assert args['app_list'] == ['Hadoop', 'Hive', 'Spark'] 31 | assert args['debug'] is True 32 | assert args['defaults'] == ['spark-defaults', 'key=value', 'another_key=another_value'] 33 | assert 
args['instance_type_master'] == 'm4.large' 34 | assert args['release_label'] == 'emr-4.7.0' 35 | assert args['submit_args'] == ['--jars', 36 | '/home/hadoop/lib/spark-avro_2.10-2.0.2.jar'] 37 | assert args['uploads'] == ['examples/dir', 'examples/episodes.avro'] 38 | assert args['tags'] == ['Name=MyName', 'CostCenter=MyCostCenter'] 39 | assert args['maximize_resource_allocation'] is True 40 | assert args['num_core'] == 1 41 | assert args['wait'] == 150 42 | 43 | 44 | def test_parser_with_bootstrap(): 45 | parser = __main__.create_parser() 46 | cmd_args_str = """episodes.py \ 47 | --s3-bucket my-bucket \ 48 | --aws-region us-east-1 \ 49 | --release-label emr-4.7.0 \ 50 | --uploads examples/dir examples/episodes.avro \ 51 | --submit-args="--jars /home/hadoop/lib/spark-avro_2.10-2.0.2.jar" \ 52 | --app-args="--input /home/hadoop/episodes.avro" \ 53 | --num-core 1 \ 54 | --tags Name=MyName CostCenter=MyCostCenter \ 55 | --defaults spark-defaults key=value another_key=another_value \ 56 | --bootstrap-script s3://bucket/bootstrap-actions.sh \ 57 | --debug 58 | """ 59 | args = __main__.parse_cli_args(parser, args=shlex.split(cmd_args_str)) 60 | assert args['app'] == 'episodes.py' 61 | assert args['s3_bucket'] == 'my-bucket' 62 | assert args['app_args'] == ['--input', '/home/hadoop/episodes.avro'] 63 | assert args['debug'] is True 64 | assert args['defaults'] == ['spark-defaults', 'key=value', 'another_key=another_value'] 65 | assert args['instance_type_master'] == 'm4.large' 66 | assert args['release_label'] == 'emr-4.7.0' 67 | assert args['submit_args'] == ['--jars', 68 | '/home/hadoop/lib/spark-avro_2.10-2.0.2.jar'] 69 | assert args['uploads'] == ['examples/dir', 'examples/episodes.avro'] 70 | assert args['tags'] == ['Name=MyName', 'CostCenter=MyCostCenter'] 71 | assert args['bootstrap_script'] == 's3://bucket/bootstrap-actions.sh' 72 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, political orientation, ideology, or sexual identity and orientation. 6 | 7 | We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community. 8 | 9 | ## Our Standards 10 | 11 | Examples of behavior that contributes to a positive environment for our community include: 12 | 13 | * Demonstrating empathy and kindness toward other people 14 | * Being respectful of differing opinions, viewpoints, and experiences 15 | * Giving and gracefully accepting constructive feedback 16 | * Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience 17 | * Focusing on what is best not just for us as individuals, but for the overall community 18 | * Communicate generously. Assume that contributors lack context rather than experience. 
19 | 20 | Examples of unacceptable behavior include: 21 | * The use of sexualized language or imagery, and sexual attention or advances of any kind 22 | * Trolling, insulting or derogatory comments, and personal or political attacks 23 | * Public or private harassment 24 | * Publishing others’ private information, such as a physical or email address, without their explicit permission 25 | * Other conduct which could reasonably be considered inappropriate in a professional setting 26 | 27 | ## Enforcement Responsibilities 28 | 29 | Project maintainers are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, and will communicate reasons for moderation decisions when appropriate. 32 | 33 | ## Scope 34 | 35 | This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. Examples of representing our community include using an official email address, posting via an official social media account, or acting as an appointed representative at an online or offline event. 36 | 37 | ## Enforcement 38 | 39 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the project maintainers responsible for enforcement at conduct@jwplayer.com. All complaints will be reviewed and investigated promptly and fairly. 40 | 41 | All project maintainers are obligated to respect the privacy and security of the reporter of any incident. 42 | 43 | ## Attribution 44 | 45 | This Code of Conduct is adapted from the Contributor Covenant, version 2.0, 46 | available at https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 47 | 48 | Community Impact Guidelines were inspired by Mozilla’s code of conduct enforcement ladder. 49 | 50 | For answers to common questions about this code of conduct, see the FAQ at 51 | https://www.contributor-covenant.org/faq. Translations are available at https://www.contributor-covenant.org/translations. 
52 | -------------------------------------------------------------------------------- /tests/test_poll.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Test Poll logic.""" 3 | import os 4 | import pytest 5 | import boto3 6 | 7 | from unittest.mock import MagicMock, patch 8 | 9 | from moto import mock_emr, mock_s3 10 | from moto.emr.models import emr_backends 11 | 12 | from sparksteps.cluster import emr_config 13 | from sparksteps.poll import failure_message_from_response, is_step_complete, wait_for_step_complete 14 | 15 | 16 | @pytest.fixture(scope='function') 17 | def aws_credentials(): 18 | """ 19 | Mocked AWS Credentials for moto to prevent impact to real infrastructure 20 | """ 21 | os.environ['AWS_ACCESS_KEY_ID'] = 'testing' 22 | os.environ['AWS_SECRET_ACCESS_KEY'] = 'testing' 23 | os.environ['AWS_SECURITY_TOKEN'] = 'testing' 24 | os.environ['AWS_SESSION_TOKEN'] = 'testing' 25 | 26 | 27 | @pytest.fixture(scope='function') 28 | def emr_client(aws_credentials): 29 | with mock_emr(): 30 | yield boto3.client('emr', region_name='us-east-1') 31 | 32 | 33 | @pytest.fixture(scope='function') 34 | def s3_client(aws_credentials): 35 | with mock_s3(): 36 | yield boto3.client('s3', region_name='us-east-1') 37 | 38 | 39 | def test_failure_message_from_response(): 40 | """ 41 | Ensure failure_message_from_response returns expected string 42 | """ 43 | mock_aws_response = { 44 | 'Step': { 45 | 'Status': { 46 | 'FailureDetails': { 47 | 'Reason': 'error-reason', 48 | 'Message': 'error-message', 49 | 'LogFile': '/path/to/logfile' 50 | } 51 | } 52 | } 53 | } 54 | expected = 'for reason error-reason with message error-message and log file /path/to/logfile' 55 | actual = failure_message_from_response(mock_aws_response) 56 | assert expected == actual, 'Mismatch, Expected: {}, Actual: {}'.format(expected, actual) 57 | 58 | del mock_aws_response['Step']['Status']['FailureDetails'] 59 | assert failure_message_from_response(mock_aws_response) is None, \ 60 | 'Expected None when FailureDetails key is missing from response.' 
61 | 62 | 63 | def set_step_state(step_id, cluster_id, new_state): 64 | """ 65 | Helper to update the state of a step 66 | """ 67 | for step in emr_backends['us-east-1'].clusters[cluster_id].steps: 68 | if step.id == step_id: 69 | step.state = new_state 70 | 71 | 72 | def test_is_step_complete(emr_client, s3_client): 73 | """ 74 | Ensure is_step_complete returns expected boolean value 75 | """ 76 | cluster_config = emr_config( 77 | 'emr-5.2.0', 78 | instance_type_master='m4.large', 79 | jobflow_role='MyCustomRole', 80 | keep_alive=False, 81 | instance_type_core='m4.2xlarge', 82 | instance_type_task='m4.2xlarge', 83 | num_core=1, 84 | num_task=1, 85 | bid_price_task='0.1', 86 | maximize_resource_allocation=True, 87 | name='Test SparkSteps', 88 | app_list=['hadoop', 'hive', 'spark'] 89 | ) 90 | response = emr_client.run_job_flow(**cluster_config) 91 | cluster_id = response['JobFlowId'] 92 | 93 | test_step = { 94 | 'Name': 'test-step', 95 | 'ActionOnFailure': 'CANCEL_AND_WAIT', 96 | 'HadoopJarStep': { 97 | 'Jar': 'command-runner.jar', 98 | 'Args': ['state-pusher-script'] 99 | } 100 | } 101 | response = emr_client.add_job_flow_steps(JobFlowId=cluster_id, Steps=[test_step]) 102 | last_step_id = response['StepIds'][-1] 103 | 104 | # while the step state is non-terminal is_step_complete should return False 105 | for state in ['PENDING', 'RUNNING', 'CONTINUE', 'CANCEL_PENDING']: 106 | set_step_state(last_step_id, cluster_id, state) 107 | assert not is_step_complete(emr_client, cluster_id, last_step_id), \ 108 | 'Expected last step to not be complete when step state is {}'.format(state) 109 | 110 | # when last step is in a terminal state (completed), is_step_complete should return True 111 | set_step_state(last_step_id, cluster_id, 'COMPLETED') 112 | assert is_step_complete(emr_client, cluster_id, last_step_id), \ 113 | 'Expected last step to be complete when last step state is {}'.format('COMPLETED') 114 | 115 | # when last step is in a failed state, is_step_complete should raise a helpful exception 116 | for state in ['CANCELLED', 'FAILED', 'INTERRUPTED']: 117 | set_step_state(last_step_id, cluster_id, state) 118 | try: 119 | is_step_complete(emr_client, cluster_id, last_step_id) 120 | assert False, \ 121 | 'Expected an exception to be raised when the last step is in {} state'.format(state) 122 | except Exception as e: 123 | assert 'EMR job failed' == str(e), 'Exception message not as expected' 124 | 125 | 126 | def test_wait_for_step_complete(): 127 | """ 128 | Ensure polling.poll is called with expected arguments 129 | """ 130 | with patch('sparksteps.poll.poll') as mock_poll: 131 | mock_emr = MagicMock() 132 | jobflow_id = 'fake-jobflow-id' 133 | step_id = 'fake-step-id' 134 | wait_for_step_complete(mock_emr, jobflow_id, step_id, 1) 135 | mock_poll.assert_called_once_with( 136 | is_step_complete, args=(mock_emr, jobflow_id, step_id), step=1, poll_forever=True) 137 | -------------------------------------------------------------------------------- /sparksteps/cluster.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Create EMR cluster.""" 3 | import os 4 | import getpass 5 | import logging 6 | import datetime 7 | 8 | from sparksteps import steps 9 | 10 | DEFAULT_JOBFLOW_ROLE = 'EMR_EC2_DefaultRole' 11 | DEFAULT_SERVICE_ROLE = 'EMR_DefaultRole' 12 | DEFAULT_APP_LIST = ['Hadoop', 'Spark'] 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | username = getpass.getuser() 17 | 18 | 19 | def parse_tags(raw_tags_list): 20 | 
"""Parse AWS tags. 21 | 22 | Examples: 23 | >>> from pprint import pprint 24 | >>> pprint(parse_tags(['name="Peanut Pug"', 'age=5'])) 25 | [{'Key': 'name', 'Value': '"Peanut Pug"'}, {'Key': 'age', 'Value': '5'}] 26 | """ 27 | tags_dict_list = [] 28 | for raw_tag in raw_tags_list: 29 | if raw_tag.find('=') == -1: 30 | key, value = raw_tag, '' 31 | else: 32 | key, value = raw_tag.split('=', 1) 33 | tags_dict_list.append({'Key': key, 'Value': value}) 34 | 35 | return tags_dict_list 36 | 37 | 38 | def parse_conf(raw_conf_list): 39 | """Parse configuration items.""" 40 | 41 | defaults = [] 42 | classification = None 43 | 44 | for token in raw_conf_list: 45 | if '=' in token: 46 | key, value = token.split('=', 1) 47 | classification['Properties'][key] = value 48 | else: 49 | if classification: 50 | defaults.append(classification) 51 | classification = { 52 | 'Classification': token, 53 | 'Properties': {} 54 | } 55 | 56 | if classification: 57 | defaults.append(classification) 58 | 59 | return defaults 60 | 61 | 62 | def parse_apps(raw_app_list): 63 | """ 64 | Given a list of app name strings, 65 | returns formatted application configuration value. 66 | 67 | Examples: 68 | >>> from pprint import pprint 69 | >>> pprint(parse_apps(['hadoop', 'spark'])) 70 | [{'Name': 'Hadoop', 'Name': 'Spark'}] 71 | """ 72 | return sorted( 73 | [{'Name': app_name.capitalize()} for app_name in set(raw_app_list)], 74 | key=lambda x: x['Name']) 75 | 76 | 77 | def emr_config(release_label, keep_alive=False, **kw): 78 | timestamp = datetime.datetime.now().replace(microsecond=0) 79 | config = dict( 80 | Name="{} SparkStep Task [{}]".format(username, timestamp), 81 | ReleaseLabel=release_label, 82 | Instances={ 83 | 'InstanceGroups': [], 84 | 'KeepJobFlowAliveWhenNoSteps': keep_alive, 85 | 'TerminationProtected': False, 86 | }, 87 | Applications=parse_apps(kw.get('app_list', DEFAULT_APP_LIST)), 88 | VisibleToAllUsers=True, 89 | JobFlowRole=kw.get('jobflow_role', DEFAULT_JOBFLOW_ROLE), 90 | ServiceRole=kw.get('service_role', DEFAULT_SERVICE_ROLE) 91 | ) 92 | 93 | for instance_group in ('master', 'core', 'task'): 94 | num_instances = kw.get('num_{}'.format(instance_group), 0) 95 | if instance_group != 'master' and not num_instances: 96 | # We don't need this instance group. 
97 | continue 98 | 99 | instance_type = kw.get('instance_type_{}'.format(instance_group)) 100 | if not instance_type: 101 | raise ValueError('{} nodes specified without instance type.'.format( 102 | instance_group.capitalize())) 103 | 104 | instance_group_config = { 105 | 'Name': '{} Node{}'.format(instance_group.capitalize(), 106 | 's' if instance_group != 'master' else ''), 107 | 'Market': 'ON_DEMAND', 108 | 'InstanceRole': instance_group.upper(), 109 | 'InstanceType': instance_type, 110 | 'InstanceCount': 1 if instance_group == 'master' else num_instances 111 | } 112 | 113 | bid_price = kw.get('bid_price_{}'.format(instance_group)) 114 | if bid_price: 115 | instance_group_config['Market'] = 'SPOT' 116 | instance_group_config['BidPrice'] = bid_price 117 | 118 | ebs_volume_size = kw.get('ebs_volume_size_{}'.format(instance_group), 0) 119 | if ebs_volume_size: 120 | ebs_configuration = { 121 | 'EbsBlockDeviceConfigs': [{ 122 | 'VolumeSpecification': { 123 | 'VolumeType': kw.get('ebs_volume_type_{}'.format(instance_group)), 124 | 'SizeInGB': ebs_volume_size 125 | }, 126 | 'VolumesPerInstance': kw.get('ebs_volumes_per_{}'.format(instance_group), 1) 127 | }], 128 | 'EbsOptimized': kw.get('ebs_optimized_{}'.format(instance_group), False) 129 | } 130 | instance_group_config['EbsConfiguration'] = ebs_configuration 131 | config['Instances']['InstanceGroups'].append(instance_group_config) 132 | 133 | if kw.get('name'): 134 | config['Name'] = kw['name'] 135 | if kw.get('ec2_key'): 136 | config['Instances']['Ec2KeyName'] = kw['ec2_key'] 137 | if kw.get('ec2_subnet_id'): 138 | config['Instances']['Ec2SubnetId'] = kw['ec2_subnet_id'] 139 | if kw.get('debug', False) and kw.get('s3_bucket'): 140 | config['LogUri'] = os.path.join('s3://', kw['s3_bucket'], kw['s3_path'], 'logs/') 141 | config['Steps'] = [steps.DebugStep().step] 142 | if kw.get('tags'): 143 | config['Tags'] = parse_tags(kw['tags']) 144 | if kw.get('defaults'): 145 | config['Configurations'] = parse_conf(kw['defaults']) 146 | if kw.get('maximize_resource_allocation'): 147 | configurations = config.get('Configurations', []) 148 | configurations.append({ 149 | 'Classification': 'spark', 150 | 'Properties': {'maximizeResourceAllocation': 'true'} 151 | }) 152 | config['Configurations'] = configurations 153 | if kw.get('bootstrap_script'): 154 | config['BootstrapActions'] = [{'Name': 'bootstrap', 155 | 'ScriptBootstrapAction': {'Path': kw['bootstrap_script']}}] 156 | 157 | return config 158 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Spark Steps 2 | =========== 3 | 4 | .. image:: https://github.com/jwplayer/sparksteps/workflows/Tests/badge.svg?branch=master 5 | :target: https://github.com/jwplayer/sparksteps/actions?query=workflow%3ATests+branch%3Amaster 6 | :alt: Build Status 7 | 8 | .. image:: https://readthedocs.org/projects/spark-steps/badge/?version=latest 9 | :target: http://spark-steps.readthedocs.io/en/latest/?badge=latest 10 | :alt: Documentation Status 11 | 12 | SparkSteps allows you to configure your EMR cluster and upload your 13 | spark script and its dependencies via AWS S3. All you need to do is 14 | define an S3 bucket. 
15 | 16 | Install 17 | ------- 18 | 19 | :: 20 | 21 | pip install sparksteps 22 | 23 | CLI Options 24 | ----------- 25 | 26 | :: 27 | 28 | Prompt parameters: 29 | app main spark script for submit spark (required) 30 | app-args: arguments passed to main spark script 31 | app-list: Space delimited list of applications to be installed on the EMR cluster (Default: Hadoop Spark) 32 | aws-region: AWS region name 33 | bid-price: specify bid price for task nodes 34 | bootstrap-script: include a bootstrap script (s3 path) 35 | cluster-id: job flow id of existing cluster to submit to 36 | debug: allow debugging of cluster 37 | defaults: cluster configurations of the form " key1=val1 key2=val2 ..." 38 | dynamic-pricing-master: use spot pricing for the master nodes. 39 | dynamic-pricing-core: use spot pricing for the core nodes. 40 | dynamic-pricing-task: use spot pricing for the task nodes. 41 | ebs-volume-size-core: size of the EBS volume to attach to core nodes in GiB. 42 | ebs-volume-type-core: type of the EBS volume to attach to core nodes (supported: [standard, gp2, io1]). 43 | ebs-volumes-per-core: the number of EBS volumes to attach per core node. 44 | ebs-optimized-core: whether to use EBS optimized volumes for core nodes. 45 | ebs-volume-size-task: size of the EBS volume to attach to task nodes in GiB. 46 | ebs-volume-type-task: type of the EBS volume to attach to task nodes. 47 | ebs-volumes-per-task: the number of EBS volumes to attach per task node. 48 | ebs-optimized-task: whether to use EBS optimized volumes for task nodes. 49 | ec2-key: name of the Amazon EC2 key pair 50 | ec2-subnet-id: Amazon VPC subnet id 51 | help (-h): argparse help 52 | jobflow-role: Amazon EC2 instance profile name to use (Default: EMR_EC2_DefaultRole) 53 | service-role: AWS IAM service role to use for EMR (Default: EMR_DefaultRole) 54 | keep-alive: whether to keep the EMR cluster alive when there are no steps 55 | log-level (-l): logging level (default=INFO) 56 | instance-type-master: instance type of of master host (default='m4.large') 57 | instance-type-core: instance type of the core nodes, must be set when num-core > 0 58 | instance-type-task: instance type of the task nodes, must be set when num-task > 0 59 | maximize-resource-allocation: sets the maximizeResourceAllocation property for the cluster to true when supplied. 
60 | name: specify cluster name 61 | num-core: number of core nodes 62 | num-task: number of task nodes 63 | release-label: EMR release label 64 | s3-bucket: name of s3 bucket to upload spark file (required) 65 | s3-path: path within s3-bucket to use when writing assets 66 | s3-dist-cp: s3-dist-cp step after spark job is done 67 | submit-args: arguments passed to spark-submit 68 | tags: EMR cluster tags of the form "key1=value1 key2=value2" 69 | uploads: files to upload to /home/hadoop/ in master instance 70 | wait: poll until all steps are complete (or error) 71 | 72 | Example 73 | ------- 74 | 75 | :: 76 | 77 | AWS_S3_BUCKET = 78 | cd sparksteps/ 79 | sparksteps examples/episodes.py \ 80 | --s3-bucket $AWS_S3_BUCKET \ 81 | --aws-region us-east-1 \ 82 | --release-label emr-4.7.0 \ 83 | --uploads examples/lib examples/episodes.avro \ 84 | --submit-args="--deploy-mode client --jars /home/hadoop/lib/spark-avro_2.10-2.0.2-custom.jar" \ 85 | --app-args="--input /home/hadoop/episodes.avro" \ 86 | --tags Application="Spark Steps" \ 87 | --debug 88 | 89 | The above example creates an EMR cluster of 1 node with default instance 90 | type *m4.large*, uploads the pyspark script episodes.py and its 91 | dependencies to the specified S3 bucket and copies the file from S3 to 92 | the cluster. Each operation is defined as an EMR "step" that you can 93 | monitor in EMR. The final step is to run the spark application with 94 | submit args that includes a custom spark-avro package and app args 95 | "--input". 96 | 97 | Run Spark Job on Existing Cluster 98 | --------------------------------- 99 | 100 | You can use the option ``--cluster-id`` to specify a cluster to upload 101 | and run the Spark job. This is especially helpful for debugging. 102 | 103 | Dynamic Pricing 104 | ----------------------- 105 | 106 | Use CLI option ``--dynamic-pricing-`` to allow sparksteps to dynamically 107 | determine the best bid price for EMR instances within a certain instance group. 108 | 109 | Currently the algorithm looks back at spot history over the last 12 110 | hours and calculates ``min(0.8 * on_demand_price, 1.2 * max_spot_price)`` to 111 | determine bid price. That said, if the current spot price is over 80% of 112 | the on-demand cost, then on-demand instances are used to be 113 | conservative. 
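For illustration, using hypothetical prices (an on-demand price of $0.10/hr, a maximum spot price of $0.05/hr over the last 12 hours, and a current spot price of $0.04/hr): the current spot price is below 80% of the on-demand price ($0.08), so spot instances are requested with the following bid price:

::

    min(0.8 * 0.10, 1.2 * 0.05) = min(0.08, 0.06) = $0.06/hr

If the current spot price were $0.08/hr or higher, on-demand instances would be used instead.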
114 | 115 | 116 | Testing 117 | ------- 118 | 119 | :: 120 | 121 | make test 122 | 123 | Blog 124 | ---- 125 | Read more about sparksteps in our blog post here: 126 | https://www.jwplayer.com/blog/sparksteps/ 127 | 128 | License 129 | ------- 130 | 131 | Apache License 2.0 132 | -------------------------------------------------------------------------------- /sparksteps/steps.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Create EMR steps and upload files.""" 3 | import os 4 | import tempfile 5 | import zipfile 6 | from urllib.parse import urlparse 7 | 8 | REMOTE_DIR = '/home/hadoop/' 9 | 10 | 11 | def get_basename(path): 12 | return os.path.basename(os.path.normpath(path)) 13 | 14 | 15 | def ls_recursive(dirname): 16 | """Recursively list files in a directory.""" 17 | for (dirpath, dirnames, filenames) in os.walk(os.path.expanduser(dirname)): 18 | for f in filenames: 19 | yield os.path.join(dirpath, f) 20 | 21 | 22 | def zip_to_s3(s3_resource, dirpath, bucket, key): 23 | """Zip folder and upload to S3.""" 24 | with tempfile.SpooledTemporaryFile() as tmp: 25 | with zipfile.ZipFile(tmp, 'w', zipfile.ZIP_DEFLATED) as archive: 26 | for fpath in ls_recursive(dirpath): 27 | archive.write(fpath, get_basename(fpath)) 28 | tmp.seek(0) # Reset file pointer 29 | response = s3_resource.Bucket(bucket).put_object(Key=key, Body=tmp) 30 | return response 31 | 32 | 33 | def parse_s3_path(s3_path): 34 | """Return bucket, path, and filename of an S3 path""" 35 | parsed = urlparse(s3_path, allow_fragments=False) 36 | bucket = parsed.netloc 37 | path, filename = parsed.path.rsplit('/', 1) 38 | path = path[1:] if path.startswith('/') else path 39 | return bucket, path, filename 40 | 41 | 42 | class CmdStep(object): 43 | on_failure = 'CANCEL_AND_WAIT' 44 | 45 | @property 46 | def step_name(self): 47 | raise NotImplementedError() 48 | 49 | @property 50 | def cmd(self): 51 | raise NotImplementedError() 52 | 53 | @property 54 | def step(self): 55 | return { 56 | 'Name': self.step_name, 57 | 'ActionOnFailure': self.on_failure, 58 | 'HadoopJarStep': { 59 | 'Jar': 'command-runner.jar', 60 | 'Args': self.cmd 61 | } 62 | } 63 | 64 | 65 | class CopyStep(CmdStep): 66 | def __init__(self, bucket, path, filename): 67 | self.bucket = bucket 68 | self.path = path 69 | self.filename = filename 70 | 71 | @property 72 | def step_name(self): 73 | return "Copy {}".format(self.filename) 74 | 75 | @property 76 | def cmd(self): 77 | return ['aws', 's3', 'cp', self.s3_uri, REMOTE_DIR] 78 | 79 | @property 80 | def key(self): 81 | return os.path.join(self.path, self.filename) 82 | 83 | @property 84 | def s3_uri(self): 85 | return os.path.join('s3://', self.bucket, self.key) 86 | 87 | 88 | class DebugStep(CmdStep): 89 | on_failure = 'TERMINATE_CLUSTER' 90 | 91 | @property 92 | def step_name(self): 93 | return "Setup - debug" 94 | 95 | @property 96 | def cmd(self): 97 | return ['state-pusher-script'] 98 | 99 | 100 | class SparkStep(CmdStep): 101 | def __init__(self, app_path, submit_args=None, app_args=None): 102 | self.app = get_basename(app_path) 103 | self.submit_args = submit_args or [] 104 | self.app_args = app_args or [] 105 | 106 | @property 107 | def step_name(self): 108 | return "Run {}".format(self.app) 109 | 110 | @property 111 | def cmd(self): 112 | return (['spark-submit'] + self.submit_args + [self.remote_app] + 113 | self.app_args) 114 | 115 | @property 116 | def remote_app(self): 117 | return os.path.join(REMOTE_DIR, self.app) 118 | 119 | 120 | class 
UnzipStep(CmdStep): 121 | def __init__(self, dirpath): 122 | self.dirpath = dirpath 123 | 124 | @property 125 | def step_name(self): 126 | return "Unzip {}".format(self.zipfile) 127 | 128 | @property 129 | def cmd(self): 130 | return ['unzip', '-o', self.remote_zipfile, '-d', self.remote_dirpath] 131 | 132 | @property 133 | def zipfile(self): 134 | return self.dirname + '.zip' 135 | 136 | @property 137 | def remote_zipfile(self): 138 | return os.path.join(REMOTE_DIR, self.zipfile) 139 | 140 | @property 141 | def dirname(self): 142 | return get_basename(self.dirpath) 143 | 144 | @property 145 | def remote_dirpath(self): 146 | return os.path.join(REMOTE_DIR, self.dirname) 147 | 148 | 149 | class S3DistCp(CmdStep): 150 | on_failure = 'CONTINUE' 151 | 152 | def __init__(self, s3_dist_cp): 153 | self.s3_dist_cp = s3_dist_cp 154 | 155 | @property 156 | def step_name(self): 157 | return "S3DistCp step" 158 | 159 | @property 160 | def cmd(self): 161 | return ['s3-dist-cp'] + self.s3_dist_cp 162 | 163 | 164 | def get_download_steps(s3_resource, bucket, bucket_path, src_path): 165 | """ 166 | Return list of step instances necessary to download file/directory resources onto the EMR master node. 167 | May upload local files and directories to S3 to make them available to EMR. 168 | """ 169 | steps = [] 170 | basename = get_basename(src_path) 171 | 172 | # Location where files will be copied to be made accessible by EMR 173 | default_dest_path = os.path.join(bucket_path, 'sources') 174 | 175 | if src_path.startswith('s3://'): 176 | # S3 file, simply add the Copy EMR step, 177 | # no intermediate S3 file is necessary as it's already on S3 178 | steps.append(CopyStep(*parse_s3_path(src_path))) 179 | elif os.path.isdir(src_path): 180 | # Directory, will zip and push to S3 first before adding EMR copy/unzip step 181 | basename = basename + '.zip' 182 | dest_path = os.path.join(default_dest_path, basename) 183 | zip_to_s3(s3_resource, src_path, bucket, key=dest_path) 184 | copy_step = CopyStep(bucket, default_dest_path, basename) 185 | steps.extend([copy_step, UnzipStep(src_path)]) 186 | elif os.path.isfile(src_path): 187 | # File, upload to S3 and add copy step 188 | dest_path = os.path.join(default_dest_path, basename) 189 | s3_resource.meta.client.upload_file(src_path, bucket, dest_path) 190 | copy_step = CopyStep(bucket, default_dest_path, basename) 191 | steps.append(copy_step) 192 | else: 193 | raise FileNotFoundError( 194 | '{} does not exist (does not reference a valid file or path).' 
195 | .format(src_path)) 196 | return steps 197 | 198 | 199 | def setup_steps(s3, bucket, bucket_path, app_path, submit_args=None, app_args=None, 200 | uploads=None, s3_dist_cp=None): 201 | cmd_steps = [] 202 | paths = uploads or [] 203 | paths.append(app_path) 204 | 205 | for src_path in paths: 206 | cmd_steps.extend(get_download_steps(s3, bucket, bucket_path, src_path)) 207 | 208 | cmd_steps.append(SparkStep(app_path, submit_args, app_args)) 209 | 210 | if s3_dist_cp is not None: 211 | cmd_steps.append(S3DistCp(s3_dist_cp)) 212 | 213 | return [s.step for s in cmd_steps] 214 | -------------------------------------------------------------------------------- /sparksteps/pricing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Get optimal pricing for EC2 instances.""" 3 | import json 4 | import datetime 5 | import itertools 6 | import logging 7 | import collections 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | SPOT_DEMAND_THRESHOLD_FACTOR = 0.8 12 | SPOT_PRICE_LOOKBACK = 12 # hours 13 | 14 | Zone = collections.namedtuple('Zone', 'name max min mean current') 15 | Spot = collections.namedtuple('Spot', 'availability_zone timestamp price') 16 | 17 | EC2_PRICE_FILTER_TEMPLATE = ''' 18 | [ 19 | {{"Field": "tenancy", "Value": "shared", "Type": "TERM_MATCH"}}, 20 | {{"Field": "operatingSystem", "Value": "{operating_sytem}", "Type": "TERM_MATCH"}}, 21 | {{"Field": "preInstalledSw", "Value": "NA", "Type": "TERM_MATCH"}}, 22 | {{"Field": "instanceType", "Value": "{instance_type}", "Type": "TERM_MATCH"}}, 23 | {{"Field": "location", "Value": "{region}", "Type": "TERM_MATCH"}}, 24 | {{"Field": "licenseModel", "Value": "No License required", "Type": "TERM_MATCH"}}, 25 | {{"Field": "usagetype", "Value": "BoxUsage:{instance_type}", "Type": "TERM_MATCH"}} 26 | ] 27 | ''' 28 | 29 | 30 | def get_demand_price(pricing_client, instance_type, region='US East (N. Virginia)', operating_system='Linux'): 31 | """ 32 | Retrieves the on-demand price for a particular EC2 instance type in the specified region. 33 | This function does not take reserved instance pricing into account. 34 | 35 | Args: 36 | pricing_client: Boto3 Pricing client. 37 | instance_type (str): The type of the instance. 38 | region: The region to get the price for, this must be the human-readable name of the region! 39 | operating_system: The operating system of the instance, this must be the human-readable name! 40 | """ 41 | if '-' in region: 42 | # TODO (rikheijdens): Perhaps we could map these using information from botocore/data/endpoints.json. 43 | raise ValueError('get_demand_price() requires the human-readable name of the region to be supplied.') 44 | 45 | filter_template = EC2_PRICE_FILTER_TEMPLATE.format( 46 | operating_sytem=operating_system, instance_type=instance_type, region=region) 47 | data = pricing_client.get_products(ServiceCode='AmazonEC2', Filters=json.loads(filter_template)) 48 | on_demand = json.loads(data['PriceList'][0])['terms']['OnDemand'] 49 | index_1 = list(on_demand)[0] 50 | index_2 = list(on_demand[index_1]['priceDimensions'])[0] 51 | return float(on_demand[index_1]['priceDimensions'][index_2]['pricePerUnit']['USD']) 52 | 53 | 54 | def get_availability_zone(ec2_client, subnet_id): 55 | """ 56 | Returns the availability zone associated with the provided `subnet_id`. 57 | 58 | Args: 59 | ec2_client: Boto3 EC2 client. 60 | subnet_id (str): The identifier of the subnet to look the associated AZ up for. 
61 | 62 | Returns: 63 | str: The availability zone of the associated subnet, or None if it could not be determined. 64 | """ 65 | response = ec2_client.describe_subnets(SubnetIds=[subnet_id]) 66 | subnets = response.get('Subnets', []) 67 | for s in subnets: 68 | if s['SubnetId'] == subnet_id: 69 | return s['AvailabilityZone'] 70 | # Could not determine the associated AZ. 71 | return None 72 | 73 | 74 | def get_spot_price_history(ec2_client, instance_type, lookback=1): 75 | """Return spot price history for the specified instance type. 76 | 77 | Args: 78 | ec2_client: EC2 client 79 | instance_type (str): get results by the specified instance type 80 | lookback (int): number of hours to look back for spot history 81 | 82 | Returns: 83 | list: spot price history records for the instance type. 84 | """ 85 | end = datetime.datetime.utcnow() 86 | start = end - datetime.timedelta(hours=lookback) 87 | 88 | response = ec2_client.describe_spot_price_history( 89 | StartTime=start, 90 | EndTime=end, 91 | InstanceTypes=[ 92 | instance_type, 93 | ], 94 | ProductDescriptions=[ 95 | 'Linux/UNIX (Amazon VPC)', 96 | 'Linux/UNIX', 97 | ], 98 | ) 99 | return response['SpotPriceHistory'] 100 | 101 | 102 | def price_by_zone(price_history): 103 | prices = [Spot(d['AvailabilityZone'], d['Timestamp'], float(d['SpotPrice'])) 104 | for d in price_history] 105 | g = itertools.groupby(sorted(prices), key=lambda x: x.availability_zone) 106 | result = {key: list(grp) for key, grp in g} 107 | return result 108 | 109 | 110 | def get_zone_profile(zone_history): 111 | zone_prices = {k: [x.price for x in v] for k, v in zone_history.items()} 112 | return [Zone(k, max(v), min(v), sum(v) / len(v), v[-1]) 113 | for k, v in zone_prices.items()] 114 | 115 | 116 | def determine_best_price(demand_price, aws_zone): 117 | """Calculate optimal bid price. 118 | 119 | Args: 120 | demand_price (float): on-demand cost of AWS instance 121 | aws_zone (Zone): AWS zone namedtuple ('name max min mean current') 122 | 123 | Returns: 124 | float: bid price 125 | bool: boolean to use spot pricing 126 | """ 127 | if aws_zone.current >= demand_price * SPOT_DEMAND_THRESHOLD_FACTOR: 128 | return demand_price, False 129 | # We always bid higher than the maximum current spot price for a particular instance type 130 | # in order to make it less likely that our clusters will be shut down. 131 | return min(1.2 * aws_zone.max, demand_price * SPOT_DEMAND_THRESHOLD_FACTOR), True 132 | 133 | 134 | def get_bid_price(ec2_client, pricing_client, instance_type, availability_zone=None): 135 | """Determine AWS bid price. 136 | 137 | Args: 138 | ec2_client: boto3 EC2 client pricing_client: boto3 Pricing client 139 | instance_type: EC2 instance type 140 | availability_zone: The availability zone the instance should be launched in, 141 | if not provided an AZ is automatically selected. 142 | 143 | Returns: 144 | float: bid price, bool: is_spot 145 | 146 | Examples: 147 | >>> import boto3 148 | >>> ec2 = boto3.client('ec2', region_name='us-east-1') >>> pricing = boto3.client('pricing', region_name='us-east-1') 149 | >>> print(get_bid_price(ec2, pricing, 'm3.2xlarge')) 150 | """ 151 | history = get_spot_price_history(ec2_client, instance_type, SPOT_PRICE_LOOKBACK) 152 | by_zone = price_by_zone(history) 153 | if availability_zone is not None and availability_zone not in by_zone: 154 | # Unable to determine the spot price because no information was available for the 155 | # desired AZ.
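# In that case, fall back to the on-demand price (rounded to two decimal places) and report that spot pricing should not be used.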
156 | logger.info( 157 | "Unable to determine the spot price for %s instances in %s because no " 158 | "zone information was available.", instance_type, availability_zone) 159 | return round(get_demand_price(pricing_client, instance_type), 2), False 160 | 161 | if availability_zone: 162 | # Consider only the AZ in which we expect to launch instances. 163 | zone_profile = get_zone_profile({availability_zone: by_zone[availability_zone]}) 164 | else: 165 | # Consider all AZ's. 166 | zone_profile = get_zone_profile(by_zone) 167 | best_zone = min(zone_profile, key=lambda x: x.max) 168 | demand_price = get_demand_price(pricing_client, instance_type) 169 | bid_price, is_spot = determine_best_price(demand_price, best_zone) 170 | bid_price_rounded = round(bid_price, 2) # AWS requires max 3 decimal places 171 | return bid_price_rounded, is_spot 172 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 16 | 17 | .PHONY: help 18 | help: 19 | @echo "Please use \`make ' where is one of" 20 | @echo " html to make standalone HTML files" 21 | @echo " dirhtml to make HTML files named index.html in directories" 22 | @echo " singlehtml to make a single large HTML file" 23 | @echo " pickle to make pickle files" 24 | @echo " json to make JSON files" 25 | @echo " htmlhelp to make HTML files and a HTML help project" 26 | @echo " qthelp to make HTML files and a qthelp project" 27 | @echo " applehelp to make an Apple Help Book" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " epub3 to make an epub3" 31 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 32 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 33 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 34 | @echo " text to make text files" 35 | @echo " man to make manual pages" 36 | @echo " texinfo to make Texinfo files" 37 | @echo " info to make Texinfo files and run them through makeinfo" 38 | @echo " gettext to make PO message catalogs" 39 | @echo " changes to make an overview of all changed/added/deprecated items" 40 | @echo " xml to make Docutils-native XML files" 41 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 42 | @echo " linkcheck to check all external links for integrity" 43 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 44 | @echo " coverage to run coverage check of the documentation (if enabled)" 45 | @echo " dummy to check syntax errors of document sources" 46 | 47 | .PHONY: clean 48 | clean: 49 | rm -rf $(BUILDDIR)/* 50 | 51 | .PHONY: html 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 
56 | 57 | .PHONY: dirhtml 58 | dirhtml: 59 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 60 | @echo 61 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 62 | 63 | .PHONY: singlehtml 64 | singlehtml: 65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 66 | @echo 67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 68 | 69 | .PHONY: pickle 70 | pickle: 71 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 72 | @echo 73 | @echo "Build finished; now you can process the pickle files." 74 | 75 | .PHONY: json 76 | json: 77 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 78 | @echo 79 | @echo "Build finished; now you can process the JSON files." 80 | 81 | .PHONY: htmlhelp 82 | htmlhelp: 83 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 84 | @echo 85 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 86 | ".hhp project file in $(BUILDDIR)/htmlhelp." 87 | 88 | .PHONY: qthelp 89 | qthelp: 90 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 91 | @echo 92 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 93 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 94 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/sparksteps.qhcp" 95 | @echo "To view the help file:" 96 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/sparksteps.qhc" 97 | 98 | .PHONY: applehelp 99 | applehelp: 100 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 101 | @echo 102 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 103 | @echo "N.B. You won't be able to view it unless you put it in" \ 104 | "~/Library/Documentation/Help or install it in your application" \ 105 | "bundle." 106 | 107 | .PHONY: devhelp 108 | devhelp: 109 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 110 | @echo 111 | @echo "Build finished." 112 | @echo "To view the help file:" 113 | @echo "# mkdir -p $$HOME/.local/share/devhelp/sparksteps" 114 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/sparksteps" 115 | @echo "# devhelp" 116 | 117 | .PHONY: epub 118 | epub: 119 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 120 | @echo 121 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 122 | 123 | .PHONY: epub3 124 | epub3: 125 | $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 126 | @echo 127 | @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." 128 | 129 | .PHONY: latex 130 | latex: 131 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 132 | @echo 133 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 134 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 135 | "(use \`make latexpdf' here to do that automatically)." 136 | 137 | .PHONY: latexpdf 138 | latexpdf: 139 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 140 | @echo "Running LaTeX files through pdflatex..." 141 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 142 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 143 | 144 | .PHONY: latexpdfja 145 | latexpdfja: 146 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 147 | @echo "Running LaTeX files through platex and dvipdfmx..." 148 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 149 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 
150 | 151 | .PHONY: text 152 | text: 153 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 154 | @echo 155 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 156 | 157 | .PHONY: man 158 | man: 159 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 160 | @echo 161 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 162 | 163 | .PHONY: texinfo 164 | texinfo: 165 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 166 | @echo 167 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 168 | @echo "Run \`make' in that directory to run these through makeinfo" \ 169 | "(use \`make info' here to do that automatically)." 170 | 171 | .PHONY: info 172 | info: 173 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 174 | @echo "Running Texinfo files through makeinfo..." 175 | make -C $(BUILDDIR)/texinfo info 176 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 177 | 178 | .PHONY: gettext 179 | gettext: 180 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 181 | @echo 182 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 183 | 184 | .PHONY: changes 185 | changes: 186 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 187 | @echo 188 | @echo "The overview file is in $(BUILDDIR)/changes." 189 | 190 | .PHONY: linkcheck 191 | linkcheck: 192 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 193 | @echo 194 | @echo "Link check complete; look for any errors in the above output " \ 195 | "or in $(BUILDDIR)/linkcheck/output.txt." 196 | 197 | .PHONY: doctest 198 | doctest: 199 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 200 | @echo "Testing of doctests in the sources finished, look at the " \ 201 | "results in $(BUILDDIR)/doctest/output.txt." 202 | 203 | .PHONY: coverage 204 | coverage: 205 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 206 | @echo "Testing of coverage in the sources finished, look at the " \ 207 | "results in $(BUILDDIR)/coverage/python.txt." 208 | 209 | .PHONY: xml 210 | xml: 211 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 212 | @echo 213 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 214 | 215 | .PHONY: pseudoxml 216 | pseudoxml: 217 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 218 | @echo 219 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 220 | 221 | .PHONY: dummy 222 | dummy: 223 | $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy 224 | @echo 225 | @echo "Build finished. Dummy builder generates no files." 226 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. 
singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. epub3 to make an epub3 31 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 32 | echo. text to make text files 33 | echo. man to make manual pages 34 | echo. texinfo to make Texinfo files 35 | echo. gettext to make PO message catalogs 36 | echo. changes to make an overview over all changed/added/deprecated items 37 | echo. xml to make Docutils-native XML files 38 | echo. pseudoxml to make pseudoxml-XML files for display purposes 39 | echo. linkcheck to check all external links for integrity 40 | echo. doctest to run all doctests embedded in the documentation if enabled 41 | echo. coverage to run coverage check of the documentation if enabled 42 | echo. dummy to check syntax errors of document sources 43 | goto end 44 | ) 45 | 46 | if "%1" == "clean" ( 47 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 48 | del /q /s %BUILDDIR%\* 49 | goto end 50 | ) 51 | 52 | 53 | REM Check if sphinx-build is available and fallback to Python version if any 54 | %SPHINXBUILD% 1>NUL 2>NUL 55 | if errorlevel 9009 goto sphinx_python 56 | goto sphinx_ok 57 | 58 | :sphinx_python 59 | 60 | set SPHINXBUILD=python -m sphinx.__init__ 61 | %SPHINXBUILD% 2> nul 62 | if errorlevel 9009 ( 63 | echo. 64 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 65 | echo.installed, then set the SPHINXBUILD environment variable to point 66 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 67 | echo.may add the Sphinx directory to PATH. 68 | echo. 69 | echo.If you don't have Sphinx installed, grab it from 70 | echo.http://sphinx-doc.org/ 71 | exit /b 1 72 | ) 73 | 74 | :sphinx_ok 75 | 76 | 77 | if "%1" == "html" ( 78 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 79 | if errorlevel 1 exit /b 1 80 | echo. 81 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 82 | goto end 83 | ) 84 | 85 | if "%1" == "dirhtml" ( 86 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 87 | if errorlevel 1 exit /b 1 88 | echo. 89 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 90 | goto end 91 | ) 92 | 93 | if "%1" == "singlehtml" ( 94 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 95 | if errorlevel 1 exit /b 1 96 | echo. 97 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 98 | goto end 99 | ) 100 | 101 | if "%1" == "pickle" ( 102 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 103 | if errorlevel 1 exit /b 1 104 | echo. 105 | echo.Build finished; now you can process the pickle files. 106 | goto end 107 | ) 108 | 109 | if "%1" == "json" ( 110 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 111 | if errorlevel 1 exit /b 1 112 | echo. 113 | echo.Build finished; now you can process the JSON files. 114 | goto end 115 | ) 116 | 117 | if "%1" == "htmlhelp" ( 118 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 119 | if errorlevel 1 exit /b 1 120 | echo. 121 | echo.Build finished; now you can run HTML Help Workshop with the ^ 122 | .hhp project file in %BUILDDIR%/htmlhelp. 
123 | goto end 124 | ) 125 | 126 | if "%1" == "qthelp" ( 127 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 128 | if errorlevel 1 exit /b 1 129 | echo. 130 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 131 | .qhcp project file in %BUILDDIR%/qthelp, like this: 132 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\sparksteps.qhcp 133 | echo.To view the help file: 134 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\sparksteps.ghc 135 | goto end 136 | ) 137 | 138 | if "%1" == "devhelp" ( 139 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 140 | if errorlevel 1 exit /b 1 141 | echo. 142 | echo.Build finished. 143 | goto end 144 | ) 145 | 146 | if "%1" == "epub" ( 147 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 148 | if errorlevel 1 exit /b 1 149 | echo. 150 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 151 | goto end 152 | ) 153 | 154 | if "%1" == "epub3" ( 155 | %SPHINXBUILD% -b epub3 %ALLSPHINXOPTS% %BUILDDIR%/epub3 156 | if errorlevel 1 exit /b 1 157 | echo. 158 | echo.Build finished. The epub3 file is in %BUILDDIR%/epub3. 159 | goto end 160 | ) 161 | 162 | if "%1" == "latex" ( 163 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 164 | if errorlevel 1 exit /b 1 165 | echo. 166 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 167 | goto end 168 | ) 169 | 170 | if "%1" == "latexpdf" ( 171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 172 | cd %BUILDDIR%/latex 173 | make all-pdf 174 | cd %~dp0 175 | echo. 176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 177 | goto end 178 | ) 179 | 180 | if "%1" == "latexpdfja" ( 181 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 182 | cd %BUILDDIR%/latex 183 | make all-pdf-ja 184 | cd %~dp0 185 | echo. 186 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 187 | goto end 188 | ) 189 | 190 | if "%1" == "text" ( 191 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 192 | if errorlevel 1 exit /b 1 193 | echo. 194 | echo.Build finished. The text files are in %BUILDDIR%/text. 195 | goto end 196 | ) 197 | 198 | if "%1" == "man" ( 199 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 200 | if errorlevel 1 exit /b 1 201 | echo. 202 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 203 | goto end 204 | ) 205 | 206 | if "%1" == "texinfo" ( 207 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 208 | if errorlevel 1 exit /b 1 209 | echo. 210 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 211 | goto end 212 | ) 213 | 214 | if "%1" == "gettext" ( 215 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 216 | if errorlevel 1 exit /b 1 217 | echo. 218 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 219 | goto end 220 | ) 221 | 222 | if "%1" == "changes" ( 223 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 224 | if errorlevel 1 exit /b 1 225 | echo. 226 | echo.The overview file is in %BUILDDIR%/changes. 227 | goto end 228 | ) 229 | 230 | if "%1" == "linkcheck" ( 231 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 232 | if errorlevel 1 exit /b 1 233 | echo. 234 | echo.Link check complete; look for any errors in the above output ^ 235 | or in %BUILDDIR%/linkcheck/output.txt. 236 | goto end 237 | ) 238 | 239 | if "%1" == "doctest" ( 240 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 241 | if errorlevel 1 exit /b 1 242 | echo. 
243 | echo.Testing of doctests in the sources finished, look at the ^ 244 | results in %BUILDDIR%/doctest/output.txt. 245 | goto end 246 | ) 247 | 248 | if "%1" == "coverage" ( 249 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage 250 | if errorlevel 1 exit /b 1 251 | echo. 252 | echo.Testing of coverage in the sources finished, look at the ^ 253 | results in %BUILDDIR%/coverage/python.txt. 254 | goto end 255 | ) 256 | 257 | if "%1" == "xml" ( 258 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 259 | if errorlevel 1 exit /b 1 260 | echo. 261 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 262 | goto end 263 | ) 264 | 265 | if "%1" == "pseudoxml" ( 266 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 267 | if errorlevel 1 exit /b 1 268 | echo. 269 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 270 | goto end 271 | ) 272 | 273 | if "%1" == "dummy" ( 274 | %SPHINXBUILD% -b dummy %ALLSPHINXOPTS% %BUILDDIR%/dummy 275 | if errorlevel 1 exit /b 1 276 | echo. 277 | echo.Build finished. Dummy builder generates no files. 278 | goto end 279 | ) 280 | 281 | :end 282 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # sparksteps documentation build configuration file, created by 5 | # sphinx-quickstart on Sun Jan 1 10:41:08 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | # import os 21 | # import sys 22 | # sys.path.insert(0, os.path.abspath('.')) 23 | from pkg_resources import get_distribution 24 | 25 | import sphinx_rtd_theme 26 | 27 | # -- General configuration ------------------------------------------------ 28 | 29 | # If your documentation needs a minimal Sphinx version, state it here. 30 | # 31 | # needs_sphinx = '1.0' 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 35 | # ones. 36 | extensions = [ 37 | 'sphinx.ext.autodoc', 38 | 'sphinx.ext.doctest', 39 | 'sphinx.ext.todo', 40 | 'sphinx.ext.coverage', 41 | 'sphinx.ext.viewcode', 42 | ] 43 | 44 | # Add any paths that contain templates here, relative to this directory. 45 | templates_path = ['_templates'] 46 | 47 | # The suffix(es) of source filenames. 48 | # You can specify multiple suffix as a list of string: 49 | # 50 | # source_suffix = ['.rst', '.md'] 51 | source_suffix = '.rst' 52 | 53 | # The encoding of source files. 54 | # 55 | # source_encoding = 'utf-8-sig' 56 | 57 | # The master toctree document. 58 | master_doc = 'index' 59 | 60 | # General information about the project. 
61 | project = 'SparkSteps' 62 | copyright = '2017, JW Player' 63 | author = 'Kamil Sindi' 64 | 65 | # The version info for the project you're documenting, acts as replacement for 66 | # |version| and |release|, also used in various other places throughout the 67 | # built documents. 68 | # 69 | # The full version, including alpha/beta/rc tags. 70 | release = get_distribution('sparksteps').version 71 | # The short X.Y version. 72 | version = '.'.join(release.split('.')[:2]) 73 | 74 | # The language for content autogenerated by Sphinx. Refer to documentation 75 | # for a list of supported languages. 76 | # 77 | # This is also used if you do content translation via gettext catalogs. 78 | # Usually you set "language" from the command line for these cases. 79 | language = None 80 | 81 | # There are two options for replacing |today|: either, you set today to some 82 | # non-false value, then it is used: 83 | # 84 | # today = '' 85 | # 86 | # Else, today_fmt is used as the format for a strftime call. 87 | # 88 | # today_fmt = '%B %d, %Y' 89 | 90 | # List of patterns, relative to source directory, that match files and 91 | # directories to ignore when looking for source files. 92 | # This patterns also effect to html_static_path and html_extra_path 93 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 94 | 95 | # The reST default role (used for this markup: `text`) to use for all 96 | # documents. 97 | # 98 | # default_role = None 99 | 100 | # If true, '()' will be appended to :func: etc. cross-reference text. 101 | # 102 | # add_function_parentheses = True 103 | 104 | # If true, the current module name will be prepended to all description 105 | # unit titles (such as .. function::). 106 | # 107 | # add_module_names = True 108 | 109 | # If true, sectionauthor and moduleauthor directives will be shown in the 110 | # output. They are ignored by default. 111 | # 112 | # show_authors = False 113 | 114 | # The name of the Pygments (syntax highlighting) style to use. 115 | pygments_style = 'sphinx' 116 | 117 | # A list of ignored prefixes for module index sorting. 118 | # modindex_common_prefix = [] 119 | 120 | # If true, keep warnings as "system message" paragraphs in the built documents. 121 | # keep_warnings = False 122 | 123 | # If true, `todo` and `todoList` produce output, else they produce nothing. 124 | todo_include_todos = True 125 | 126 | 127 | # -- Options for HTML output ---------------------------------------------- 128 | 129 | # The theme to use for HTML and HTML Help pages. See the documentation for 130 | # a list of builtin themes. 131 | # 132 | html_theme = 'sphinx_rtd_theme' 133 | 134 | # Theme options are theme-specific and customize the look and feel of a theme 135 | # further. For a list of options available for each theme, see the 136 | # documentation. 137 | # 138 | html_theme_options = { 139 | 'collapse_navigation': False, 140 | 'display_version': False, 141 | } 142 | 143 | # Add any paths that contain custom themes here, relative to this directory. 144 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 145 | 146 | # The name for this set of Sphinx documents. 147 | # " v documentation" by default. 148 | # 149 | # html_title = 'sparksteps v0.3.0' 150 | 151 | # A shorter title for the navigation bar. Default is the same as html_title. 152 | # 153 | # html_short_title = None 154 | 155 | # The name of an image file (relative to this directory) to place at the top 156 | # of the sidebar. 
157 | # 158 | # html_logo = None 159 | 160 | # The name of an image file (relative to this directory) to use as a favicon of 161 | # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 162 | # pixels large. 163 | # 164 | # html_favicon = None 165 | 166 | # Add any paths that contain custom static files (such as style sheets) here, 167 | # relative to this directory. They are copied after the builtin static files, 168 | # so a file named "default.css" will overwrite the builtin "default.css". 169 | html_static_path = ['_static'] 170 | 171 | # Add any extra paths that contain custom files (such as robots.txt or 172 | # .htaccess) here, relative to this directory. These files are copied 173 | # directly to the root of the documentation. 174 | # 175 | # html_extra_path = [] 176 | 177 | # If not None, a 'Last updated on:' timestamp is inserted at every page 178 | # bottom, using the given strftime format. 179 | # The empty string is equivalent to '%b %d, %Y'. 180 | # 181 | # html_last_updated_fmt = None 182 | 183 | # If true, SmartyPants will be used to convert quotes and dashes to 184 | # typographically correct entities. 185 | # 186 | # html_use_smartypants = True 187 | 188 | # Custom sidebar templates, maps document names to template names. 189 | # 190 | # html_sidebars = {} 191 | 192 | # Additional templates that should be rendered to pages, maps page names to 193 | # template names. 194 | # 195 | # html_additional_pages = {} 196 | 197 | # If false, no module index is generated. 198 | # 199 | # html_domain_indices = True 200 | 201 | # If false, no index is generated. 202 | # 203 | # html_use_index = True 204 | 205 | # If true, the index is split into individual pages for each letter. 206 | # 207 | # html_split_index = False 208 | 209 | # If true, links to the reST sources are added to the pages. 210 | # 211 | # html_show_sourcelink = True 212 | 213 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 214 | # 215 | # html_show_sphinx = True 216 | 217 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 218 | # 219 | # html_show_copyright = True 220 | 221 | # If true, an OpenSearch description file will be output, and all pages will 222 | # contain a tag referring to it. The value of this option must be the 223 | # base URL from which the finished HTML is served. 224 | # 225 | # html_use_opensearch = '' 226 | 227 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 228 | # html_file_suffix = None 229 | 230 | # Language to be used for generating the HTML full-text search index. 231 | # Sphinx supports the following languages: 232 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' 233 | # 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr', 'zh' 234 | # 235 | # html_search_language = 'en' 236 | 237 | # A dictionary with options for the search language support, empty by default. 238 | # 'ja' uses this config value. 239 | # 'zh' user can custom change `jieba` dictionary path. 240 | # 241 | # html_search_options = {'type': 'default'} 242 | 243 | # The name of a javascript file (relative to the configuration directory) that 244 | # implements a search results scorer. If empty, the default will be used. 245 | # 246 | # html_search_scorer = 'scorer.js' 247 | 248 | # Output file base name for HTML help builder. 
249 | htmlhelp_basename = 'sparkstepsdoc' 250 | 251 | # -- Options for LaTeX output --------------------------------------------- 252 | 253 | latex_elements = { 254 | # The paper size ('letterpaper' or 'a4paper'). 255 | # 256 | # 'papersize': 'letterpaper', 257 | 258 | # The font size ('10pt', '11pt' or '12pt'). 259 | # 260 | # 'pointsize': '10pt', 261 | 262 | # Additional stuff for the LaTeX preamble. 263 | # 264 | # 'preamble': '', 265 | 266 | # Latex figure (float) alignment 267 | # 268 | # 'figure_align': 'htbp', 269 | } 270 | 271 | # Grouping the document tree into LaTeX files. List of tuples 272 | # (source start file, target name, title, 273 | # author, documentclass [howto, manual, or own class]). 274 | latex_documents = [ 275 | (master_doc, 'sparksteps.tex', 'SparkSteps Documentation', 276 | 'Kamil Sindi', 'manual'), 277 | ] 278 | 279 | # The name of an image file (relative to this directory) to place at the top of 280 | # the title page. 281 | # 282 | # latex_logo = None 283 | 284 | # For "manual" documents, if this is true, then toplevel headings are parts, 285 | # not chapters. 286 | # 287 | # latex_use_parts = False 288 | 289 | # If true, show page references after internal links. 290 | # 291 | # latex_show_pagerefs = False 292 | 293 | # If true, show URL addresses after external links. 294 | # 295 | # latex_show_urls = False 296 | 297 | # Documents to append as an appendix to all manuals. 298 | # 299 | # latex_appendices = [] 300 | 301 | # It false, will not define \strong, \code, itleref, \crossref ... but only 302 | # \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clash with user added 303 | # packages. 304 | # 305 | # latex_keep_old_macro_names = True 306 | 307 | # If false, no module index is generated. 308 | # 309 | # latex_domain_indices = True 310 | 311 | 312 | # -- Options for manual page output --------------------------------------- 313 | 314 | # One entry per manual page. List of tuples 315 | # (source start file, name, description, authors, manual section). 316 | man_pages = [ 317 | (master_doc, 'SparkSteps', 'SparkSteps Documentation', 318 | [author], 1) 319 | ] 320 | 321 | # If true, show URL addresses after external links. 322 | # 323 | # man_show_urls = False 324 | 325 | 326 | # -- Options for Texinfo output ------------------------------------------- 327 | 328 | # Grouping the document tree into Texinfo files. List of tuples 329 | # (source start file, target name, title, author, 330 | # dir menu entry, description, category) 331 | texinfo_documents = [ 332 | (master_doc, 'SparkSteps', 'SparkSteps Documentation', 333 | author, 'SparkSteps', 'Workflow tool to launch Spark jobs on AWS EMR.', 334 | 'Miscellaneous'), 335 | ] 336 | 337 | # Documents to append as an appendix to all manuals. 338 | # 339 | # texinfo_appendices = [] 340 | 341 | # If false, no module index is generated. 342 | # 343 | # texinfo_domain_indices = True 344 | 345 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 346 | # 347 | # texinfo_show_urls = 'footnote' 348 | 349 | # If true, do not generate a @detailmenu in the "Top" node's menu. 350 | # 351 | # texinfo_no_detailmenu = False 352 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2016 JW Player 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /sparksteps/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """Create Spark cluster on EMR. 4 | 5 | Prompt parameters: 6 | app main spark script for submit spark (required) 7 | app-args: arguments passed to main spark script 8 | app-list: Applications to be installed on the EMR cluster (Default: Hadoop Spark) 9 | aws-region: AWS region name 10 | bid-price: specify bid price for task nodes 11 | bootstrap-script: include a bootstrap script (s3 path) 12 | cluster-id: job flow id of existing cluster to submit to 13 | debug: allow debugging of cluster 14 | defaults: cluster configurations of the form " key1=val1 key2=val2 ..." 15 | dynamic-pricing-master: use spot pricing for the master nodes. 16 | dynamic-pricing-core: use spot pricing for the core nodes. 17 | dynamic-pricing-task: use spot pricing for the task nodes. 18 | ebs-volume-size-core: size of the EBS volume to attach to core nodes in GiB. 19 | ebs-volume-type-core: type of the EBS volume to attach to core nodes (supported: [standard, gp2, io1]). 20 | ebs-volumes-per-core: the number of EBS volumes to attach per core node. 21 | ebs-optimized-core: whether to use EBS optimized volumes for core nodes. 22 | ebs-volume-size-task: size of the EBS volume to attach to task nodes in GiB. 23 | ebs-volume-type-task: type of the EBS volume to attach to task nodes. 24 | ebs-volumes-per-task: the number of EBS volumes to attach per task node. 25 | ebs-optimized-task: whether to use EBS optimized volumes for task nodes. 
26 | ec2-key: name of the Amazon EC2 key pair 27 | ec2-subnet-id: Amazon VPC subnet id 28 | help (-h): argparse help 29 | jobflow-role: Amazon EC2 instance profile name to use (Default: EMR_EC2_DefaultRole) 30 | service-role: AWS IAM service role to use for EMR (Default: EMR_DefaultRole) 31 | keep-alive: whether to keep the EMR cluster alive when there are no steps 32 | log-level (-l): logging level (default=INFO) 33 | instance-type-master: instance type of the master host (default='m4.large') 34 | instance-type-core: instance type of the core nodes, must be set when num-core > 0 35 | instance-type-task: instance type of the task nodes, must be set when num-task > 0 36 | maximize-resource-allocation: sets the maximizeResourceAllocation property for the cluster to true when supplied. 37 | name: specify cluster name 38 | num-core: number of core nodes 39 | num-task: number of task nodes 40 | release-label: EMR release label 41 | s3-bucket: name of s3 bucket to upload spark file (required) 42 | s3-path: path (key prefix) within s3-bucket to use when uploading spark file 43 | s3-dist-cp: s3-dist-cp step after spark job is done 44 | submit-args: arguments passed to spark-submit 45 | tags: EMR cluster tags of the form "key1=value1 key2=value2" 46 | uploads: files to upload to /home/hadoop/ in master instance 47 | wait: poll until all steps are complete (or error) 48 | 49 | Examples: 50 | sparksteps examples/episodes.py \ 51 | --s3-bucket $AWS_S3_BUCKET \ 52 | --aws-region us-east-1 \ 53 | --release-label emr-4.7.0 \ 54 | --uploads examples/lib examples/episodes.avro \ 55 | --submit-args="--jars /home/hadoop/lib/spark-avro_2.10-2.0.2-custom.jar" \ 56 | --app-args="--input /home/hadoop/episodes.avro" \ 57 | --num-core 1 \ 58 | --instance-type-core m4.large \ 59 | --debug 60 | 61 | """ 62 | from __future__ import print_function 63 | 64 | import json 65 | import shlex 66 | import logging 67 | import argparse 68 | 69 | import boto3 70 | 71 | from sparksteps import steps 72 | from sparksteps import cluster 73 | from sparksteps import pricing 74 | from sparksteps.cluster import DEFAULT_APP_LIST, DEFAULT_JOBFLOW_ROLE, DEFAULT_SERVICE_ROLE 75 | from sparksteps.poll import wait_for_step_complete 76 | 77 | logger = logging.getLogger(__name__) 78 | LOGFORMAT = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s' 79 | DEFAULT_SLEEP_INTERVAL_SECONDS = 150 80 | 81 | 82 | def create_parser(): 83 | parser = argparse.ArgumentParser( 84 | description=__doc__, 85 | formatter_class=argparse.RawDescriptionHelpFormatter 86 | ) 87 | 88 | parser.add_argument('app', metavar='FILE') 89 | parser.add_argument('--app-args', type=shlex.split) 90 | parser.add_argument('--app-list', nargs='*', default=DEFAULT_APP_LIST) 91 | parser.add_argument('--aws-region', required=True) 92 | parser.add_argument('--bid-price') 93 | parser.add_argument('--bootstrap-script') 94 | parser.add_argument('--cluster-id') 95 | parser.add_argument('--debug', action='store_true') 96 | parser.add_argument('--defaults', nargs='*') 97 | parser.add_argument('--ec2-key') 98 | parser.add_argument('--ec2-subnet-id') 99 | parser.add_argument('--jobflow-role', default=DEFAULT_JOBFLOW_ROLE) 100 | parser.add_argument('--service-role', default=DEFAULT_SERVICE_ROLE) 101 | parser.add_argument('--keep-alive', action='store_true') 102 | parser.add_argument('--log-level', '-l', type=str.upper, default='INFO') 103 | parser.add_argument('--name') 104 | parser.add_argument('--num-core', type=int) 105 | parser.add_argument('--num-task', type=int) 106 |
parser.add_argument('--release-label', required=True) 107 | parser.add_argument('--s3-bucket', required=True) 108 | parser.add_argument('--s3-path', default='sparksteps/') 109 | parser.add_argument('--s3-dist-cp', type=shlex.split) 110 | parser.add_argument('--submit-args', type=shlex.split) 111 | parser.add_argument('--tags', nargs='*') 112 | parser.add_argument('--uploads', nargs='*') 113 | parser.add_argument('--maximize-resource-allocation', action='store_true') 114 | # TODO: wrap lines below in a for loop? 115 | parser.add_argument('--instance-type-master', default='m4.large') 116 | parser.add_argument('--instance-type-core') 117 | parser.add_argument('--instance-type-task') 118 | parser.add_argument('--dynamic-pricing-master', action='store_true') 119 | parser.add_argument('--dynamic-pricing-core', action='store_true') 120 | parser.add_argument('--dynamic-pricing-task', action='store_true') 121 | 122 | # EBS configuration 123 | parser.add_argument('--ebs-volume-size-core', type=int, default=0) 124 | parser.add_argument('--ebs-volume-type-core', type=str, default='standard') 125 | parser.add_argument('--ebs-volumes-per-core', type=int, default=1) 126 | parser.add_argument('--ebs-optimized-core', action='store_true') 127 | 128 | parser.add_argument('--ebs-volume-size-task', type=int, default=0) 129 | parser.add_argument('--ebs-volume-type-task', type=str, default='standard') 130 | parser.add_argument('--ebs-volumes-per-task', type=int, default=1) 131 | parser.add_argument('--ebs-optimized-task', action='store_true') 132 | 133 | # Wait configuration 134 | parser.add_argument('--wait', type=int, nargs='?', default=False) 135 | 136 | # Deprecated arguments 137 | parser.add_argument('--master') 138 | parser.add_argument('--slave') 139 | parser.add_argument('--dynamic-pricing', action='store_true') 140 | 141 | return parser 142 | 143 | 144 | def parse_cli_args(parser, args=None): 145 | """ 146 | Utilizes `parser` to parse command line variables and logs. 147 | """ 148 | args = vars(parser.parse_args(args)) 149 | 150 | # Perform sanitization on any arguments 151 | if args['s3_path'] and args['s3_path'].startswith('/'): 152 | raise ValueError( 153 | f"Provided value for s3-path \"{args['s3_path']}\" cannot have leading \"/\" character.") 154 | 155 | if args['wait'] is None: 156 | args['wait'] = DEFAULT_SLEEP_INTERVAL_SECONDS 157 | 158 | return args 159 | 160 | 161 | def determine_prices(args, ec2, pricing_client): 162 | """ 163 | Checks `args` in order to determine whether spot pricing should be 164 | used for instance groups within the EMR cluster, and if this is the 165 | case attempts to determine the optimal bid price. 166 | """ 167 | # Check if we need to do anything 168 | pricing_properties = ( 169 | 'dynamic_pricing_master', 'dynamic_pricing_core', 'dynamic_pricing_task') 170 | if not any([x in args for x in pricing_properties]): 171 | return args 172 | 173 | availability_zone = None 174 | subnet_id = args.get('ec2_subnet_id') 175 | if subnet_id: 176 | # We need to determine the AZ associated with the provided EC2 subnet ID 177 | # in order to look up spot prices in the correct region. 178 | availability_zone = pricing.get_availability_zone(ec2, subnet_id) 179 | if not availability_zone: 180 | logger.info("Could not determine availability zone for subnet '%s'", subnet_id) 181 | 182 | # Mutate a copy of args. 183 | args = args.copy() 184 | 185 | # Determine bid prices for the instance types for which we want to 186 | # use bid pricing. 
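# Illustrative walk-through (hypothetical flag values): with --dynamic-pricing-core and --instance-type-core m4.2xlarge,
# price_property 'dynamic_pricing_core' maps to instance_type_key 'instance_type_core'; if the spot market is cheap
# enough, args['bid_price_core'] is set to the computed bid (as a string), otherwise no bid is set and the core
# instance group stays on demand.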
187 | for price_property in pricing_properties: 188 | if price_property not in args: 189 | continue 190 | 191 | if args[price_property]: 192 | instance_type_key = price_property.replace( 193 | 'dynamic_pricing', 'instance_type') 194 | instance_type = args[instance_type_key] 195 | instance_group = price_property.replace('dynamic_pricing_', '') 196 | # TODO (rikheijdens): optimize by caching instance prices 197 | # between instance groups? 198 | bid_price, is_spot = pricing.get_bid_price(ec2, pricing_client, instance_type, availability_zone) 199 | if is_spot: 200 | logger.info("Using spot pricing with a bid price of $%.2f" 201 | " for %s instances in the %s instance group.", 202 | bid_price, instance_type, 203 | instance_group) 204 | bid_key = price_property.replace('dynamic_pricing', 'bid_price') 205 | args[bid_key] = str(bid_price) 206 | else: 207 | logger.info("Spot price for %s in the %s instance group too high." 208 | " Using on-demand price of $%.2f", 209 | instance_type, instance_group, bid_price) 210 | return args 211 | 212 | 213 | def main(): 214 | args_dict = parse_cli_args(create_parser()) 215 | print("Args: ", args_dict) 216 | 217 | numeric_level = getattr(logging, args_dict['log_level'], None) 218 | logging.basicConfig(format=LOGFORMAT) 219 | logging.getLogger('sparksteps').setLevel(numeric_level) 220 | 221 | client = boto3.client('emr', region_name=args_dict['aws_region']) 222 | s3 = boto3.resource('s3') 223 | 224 | cluster_id = args_dict.get('cluster_id') 225 | if cluster_id is None: 226 | logger.info("Launching cluster...") 227 | ec2_client = boto3.client('ec2', region_name=args_dict['aws_region']) 228 | pricing_client = boto3.client('pricing', region_name=args_dict['aws_region']) 229 | args_dict = determine_prices(args_dict, ec2_client, pricing_client) 230 | cluster_config = cluster.emr_config(**args_dict) 231 | response = client.run_job_flow(**cluster_config) 232 | cluster_id = response['JobFlowId'] 233 | logger.info("Cluster ID: %s", cluster_id) 234 | 235 | emr_steps = steps.setup_steps(s3, 236 | args_dict['s3_bucket'], 237 | args_dict['s3_path'], 238 | args_dict['app'], 239 | args_dict['submit_args'], 240 | args_dict['app_args'], 241 | args_dict['uploads'], 242 | args_dict['s3_dist_cp']) 243 | 244 | response = client.add_job_flow_steps(JobFlowId=cluster_id, Steps=emr_steps) 245 | 246 | try: 247 | step_ids = json.dumps(response['StepIds']) 248 | except KeyError: 249 | step_ids = 'Invalid response' 250 | args_dict['wait'] = False 251 | logger.info("Step IDs: %s", step_ids) 252 | 253 | sleep_interval = args_dict.get('wait') 254 | if sleep_interval: 255 | last_step_id = response['StepIds'][-1] 256 | logger.info('Polling until step {last_step} is complete using a sleep interval of {interval} seconds...' 
257 | .format(last_step=last_step_id, interval=sleep_interval)) 258 | wait_for_step_complete(client, cluster_id, last_step_id, sleep_interval_s=int(sleep_interval)) 259 | -------------------------------------------------------------------------------- /tests/test_sparksteps.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Test SparkSteps.""" 3 | import shlex 4 | import os.path 5 | 6 | import boto3 7 | import moto 8 | 9 | from sparksteps.cluster import emr_config 10 | from sparksteps.steps import setup_steps, S3DistCp 11 | 12 | TEST_BUCKET = 'sparksteps-test' 13 | TEST_BUCKET_PATH = 'sparksteps/' 14 | AWS_REGION_NAME = 'us-east-1' 15 | 16 | DIR_PATH = os.path.dirname(os.path.realpath(__file__)) 17 | DATA_DIR = os.path.join(DIR_PATH, 'data') 18 | LIB_DIR = os.path.join(DATA_DIR, 'dir') 19 | EPISODES_APP = os.path.join(DATA_DIR, 'episodes.py') 20 | EPISODES_AVRO = os.path.join(DATA_DIR, 'episodes.avro') 21 | 22 | 23 | @moto.mock_emr 24 | def test_emr_cluster_config(): 25 | config = emr_config('emr-5.2.0', 26 | instance_type_master='m4.large', 27 | jobflow_role='MyCustomRole', 28 | service_role='MyServiceRole', 29 | keep_alive=False, 30 | instance_type_core='m4.2xlarge', 31 | instance_type_task='m4.2xlarge', 32 | num_core=1, 33 | num_task=1, 34 | bid_price_task='0.1', 35 | maximize_resource_allocation=True, 36 | name="Test SparkSteps", 37 | app_list=['hadoop', 'hive', 'spark']) 38 | assert config == {'Instances': 39 | {'InstanceGroups': [{'InstanceCount': 1, # NOQA: E127 40 | 'InstanceRole': 'MASTER', 41 | 'InstanceType': 'm4.large', 42 | 'Market': 'ON_DEMAND', 43 | 'Name': 'Master Node'}, 44 | {'InstanceCount': 1, 45 | 'InstanceRole': 'CORE', 46 | 'InstanceType': 'm4.2xlarge', 47 | 'Market': 'ON_DEMAND', 48 | 'Name': 'Core Nodes'}, 49 | {'BidPrice': '0.1', 50 | 'InstanceCount': 1, 51 | 'InstanceRole': 'TASK', 52 | 'InstanceType': 'm4.2xlarge', 53 | 'Market': 'SPOT', 54 | 'Name': 'Task Nodes'}], 55 | 'KeepJobFlowAliveWhenNoSteps': False, 56 | 'TerminationProtected': False 57 | }, 58 | 'Applications': [{'Name': 'Hadoop'}, {'Name': 'Hive'}, {'Name': 'Spark'}], 59 | 'Name': 'Test SparkSteps', 60 | 'JobFlowRole': 'MyCustomRole', 61 | 'ServiceRole': 'MyServiceRole', 62 | 'ReleaseLabel': 'emr-5.2.0', 63 | 'VisibleToAllUsers': True, 64 | 'Configurations': [{'Classification': 'spark', 65 | 'Properties': {'maximizeResourceAllocation': 'true'}}] 66 | } 67 | 68 | client = boto3.client('emr', region_name=AWS_REGION_NAME) 69 | client.run_job_flow(**config) 70 | 71 | 72 | @moto.mock_emr 73 | def test_emr_cluster_config_with_bootstrap(): 74 | config = emr_config('emr-5.2.0', 75 | instance_type_master='m4.large', 76 | keep_alive=False, 77 | instance_type_core='m4.2xlarge', 78 | instance_type_task='m4.2xlarge', 79 | num_core=1, 80 | num_task=1, 81 | bid_price_task='0.1', 82 | name="Test SparkSteps", 83 | bootstrap_script='s3://bucket/bootstrap-actions.sh') 84 | assert config == {'Instances': 85 | {'InstanceGroups': [{'InstanceCount': 1, # NOQA: E127 86 | 'InstanceRole': 'MASTER', 87 | 'InstanceType': 'm4.large', 88 | 'Market': 'ON_DEMAND', 89 | 'Name': 'Master Node'}, 90 | {'InstanceCount': 1, 91 | 'InstanceRole': 'CORE', 92 | 'InstanceType': 'm4.2xlarge', 93 | 'Market': 'ON_DEMAND', 94 | 'Name': 'Core Nodes'}, 95 | {'BidPrice': '0.1', 96 | 'InstanceCount': 1, 97 | 'InstanceRole': 'TASK', 98 | 'InstanceType': 'm4.2xlarge', 99 | 'Market': 'SPOT', 100 | 'Name': 'Task Nodes'}], 101 | 'KeepJobFlowAliveWhenNoSteps': False, 102 | 
'TerminationProtected': False 103 | }, 104 | 'Applications': [{'Name': 'Hadoop'}, {'Name': 'Spark'}], 105 | 'BootstrapActions': [{'Name': 'bootstrap', 106 | 'ScriptBootstrapAction': {'Path': 's3://bucket/bootstrap-actions.sh'}}], 107 | 'Name': 'Test SparkSteps', 108 | 'JobFlowRole': 'EMR_EC2_DefaultRole', 109 | 'ReleaseLabel': 'emr-5.2.0', 110 | 'VisibleToAllUsers': True, 111 | 'ServiceRole': 'EMR_DefaultRole'} 112 | 113 | client = boto3.client('emr', region_name=AWS_REGION_NAME) 114 | client.run_job_flow(**config) 115 | 116 | 117 | @moto.mock_emr 118 | def test_emr_cluster_config_with_defaults(): 119 | config = emr_config('emr-5.2.0', 120 | instance_type_master='m4.large', 121 | keep_alive=False, 122 | instance_type_core='m4.2xlarge', 123 | instance_type_task='m4.2xlarge', 124 | num_core=1, 125 | num_task=1, 126 | bid_price_task='0.1', 127 | name="Test SparkSteps", 128 | defaults=['spark-defaults', 'spark.speculation=false', 129 | 'yarn-site', 'yarn.nodemanager.vmem-check-enabled=true']) 130 | print(config['Configurations']) 131 | assert config == { 132 | 'Instances': { 133 | 'InstanceGroups': [{'InstanceCount': 1, # NOQA: E127 134 | 'InstanceRole': 'MASTER', 135 | 'InstanceType': 'm4.large', 136 | 'Market': 'ON_DEMAND', 137 | 'Name': 'Master Node'}, 138 | {'InstanceCount': 1, 139 | 'InstanceRole': 'CORE', 140 | 'InstanceType': 'm4.2xlarge', 141 | 'Market': 'ON_DEMAND', 142 | 'Name': 'Core Nodes'}, 143 | {'BidPrice': '0.1', 144 | 'InstanceCount': 1, 145 | 'InstanceRole': 'TASK', 146 | 'InstanceType': 'm4.2xlarge', 147 | 'Market': 'SPOT', 148 | 'Name': 'Task Nodes'}], 149 | 'KeepJobFlowAliveWhenNoSteps': False, 150 | 'TerminationProtected': False 151 | }, 152 | 'Applications': [{'Name': 'Hadoop'}, {'Name': 'Spark'}], 153 | 'Configurations': [ 154 | { 155 | 'Classification': 'spark-defaults', 156 | 'Properties': { 157 | 'spark.speculation': 'false' 158 | } 159 | }, 160 | { 161 | 'Classification': 'yarn-site', 162 | 'Properties': { 163 | 'yarn.nodemanager.vmem-check-enabled': 'true' 164 | } 165 | } 166 | ], 167 | 'Name': 'Test SparkSteps', 168 | 'JobFlowRole': 'EMR_EC2_DefaultRole', 169 | 'ReleaseLabel': 'emr-5.2.0', 170 | 'VisibleToAllUsers': True, 171 | 'ServiceRole': 'EMR_DefaultRole' 172 | } 173 | 174 | client = boto3.client('emr', region_name=AWS_REGION_NAME) 175 | client.run_job_flow(**config) 176 | 177 | 178 | def test_emr_spot_cluster(): 179 | config = emr_config('emr-5.2.0', 180 | instance_type_master='m4.large', 181 | keep_alive=False, 182 | instance_type_core='c3.8xlarge', 183 | instance_type_task='c3.8xlarge', 184 | num_core=2, 185 | num_task=4, 186 | bid_price_master='0.05', 187 | bid_price_core='0.25', 188 | bid_price_task='0.1', 189 | name="Test SparkSteps", 190 | bootstrap_script='s3://bucket/bootstrap-actions.sh') 191 | assert config == {'Instances': 192 | {'InstanceGroups': [{'InstanceCount': 1, # NOQA: E127 193 | 'InstanceRole': 'MASTER', 194 | 'InstanceType': 'm4.large', 195 | 'Market': 'SPOT', 196 | 'BidPrice': '0.05', 197 | 'Name': 'Master Node'}, 198 | {'BidPrice': '0.25', 199 | 'InstanceCount': 2, 200 | 'InstanceRole': 'CORE', 201 | 'InstanceType': 'c3.8xlarge', 202 | 'Market': 'SPOT', 203 | 'Name': 'Core Nodes'}, 204 | {'BidPrice': '0.1', 205 | 'InstanceCount': 4, 206 | 'InstanceRole': 'TASK', 207 | 'InstanceType': 'c3.8xlarge', 208 | 'Market': 'SPOT', 209 | 'Name': 'Task Nodes'}], 210 | 'KeepJobFlowAliveWhenNoSteps': False, 211 | 'TerminationProtected': False 212 | }, 213 | 'Applications': [{'Name': 'Hadoop'}, {'Name': 'Spark'}], 214 | 'BootstrapActions': [{'Name': 
'bootstrap', 215 | 'ScriptBootstrapAction': {'Path': 's3://bucket/bootstrap-actions.sh'}}], 216 | 'Name': 'Test SparkSteps', 217 | 'JobFlowRole': 'EMR_EC2_DefaultRole', 218 | 'ReleaseLabel': 'emr-5.2.0', 219 | 'VisibleToAllUsers': True, 220 | 'ServiceRole': 'EMR_DefaultRole'} 221 | 222 | 223 | def test_emr_ebs_storage(): 224 | config = emr_config('emr-5.2.0', 225 | instance_type_master='m4.large', 226 | keep_alive=False, 227 | instance_type_core='c3.8xlarge', 228 | instance_type_task='c3.8xlarge', 229 | ebs_volume_size_core=100, 230 | ebs_volume_type_core='gp2', 231 | ebs_volumes_per_core=2, 232 | ebs_volume_size_task=10, 233 | ebs_volume_type_task='io1', 234 | ebs_optimized_task=True, 235 | num_core=2, 236 | num_task=4, 237 | bid_price_master='0.05', 238 | bid_price_core='0.25', 239 | bid_price_task='0.1', 240 | name="Test SparkSteps", 241 | bootstrap_script='s3://bucket/bootstrap-actions.sh') 242 | assert config == {'Instances': 243 | {'InstanceGroups': [{'InstanceCount': 1, # NOQA: E127 244 | 'InstanceRole': 'MASTER', 245 | 'InstanceType': 'm4.large', 246 | 'Market': 'SPOT', 247 | 'BidPrice': '0.05', 248 | 'Name': 'Master Node'}, 249 | {'BidPrice': '0.25', 250 | 'InstanceCount': 2, 251 | 'InstanceRole': 'CORE', 252 | 'InstanceType': 'c3.8xlarge', 253 | 'Market': 'SPOT', 254 | 'Name': 'Core Nodes', 255 | 'EbsConfiguration': { 256 | 'EbsBlockDeviceConfigs': [{ 257 | 'VolumeSpecification': { 258 | 'VolumeType': 'gp2', 259 | 'SizeInGB': 100 260 | }, 261 | 'VolumesPerInstance': 2 262 | }], 263 | 'EbsOptimized': False 264 | }}, 265 | {'BidPrice': '0.1', 266 | 'InstanceCount': 4, 267 | 'InstanceRole': 'TASK', 268 | 'InstanceType': 'c3.8xlarge', 269 | 'Market': 'SPOT', 270 | 'Name': 'Task Nodes', 271 | 'EbsConfiguration': { 272 | 'EbsBlockDeviceConfigs': [{ 273 | 'VolumeSpecification': { 274 | 'VolumeType': 'io1', 275 | 'SizeInGB': 10 276 | }, 277 | 'VolumesPerInstance': 1 278 | }], 279 | 'EbsOptimized': True 280 | }}], 281 | 'KeepJobFlowAliveWhenNoSteps': False, 282 | 'TerminationProtected': False 283 | }, 284 | 'Applications': [{'Name': 'Hadoop'}, {'Name': 'Spark'}], 285 | 'BootstrapActions': [{'Name': 'bootstrap', 286 | 'ScriptBootstrapAction': {'Path': 's3://bucket/bootstrap-actions.sh'}}], 287 | 'Name': 'Test SparkSteps', 288 | 'JobFlowRole': 'EMR_EC2_DefaultRole', 289 | 'ReleaseLabel': 'emr-5.2.0', 290 | 'VisibleToAllUsers': True, 291 | 'ServiceRole': 'EMR_DefaultRole'} 292 | 293 | 294 | @moto.mock_s3 295 | def test_setup_steps(): 296 | s3 = boto3.resource('s3', region_name=AWS_REGION_NAME) 297 | s3.create_bucket(Bucket=TEST_BUCKET) 298 | steps = (setup_steps(s3, 299 | TEST_BUCKET, 300 | TEST_BUCKET_PATH, 301 | EPISODES_APP, 302 | submit_args="--jars /home/hadoop/dir/test.jar".split(), 303 | app_args="--input /home/hadoop/episodes.avro".split(), 304 | uploads=[LIB_DIR, EPISODES_AVRO]) 305 | ) 306 | assert steps == [ 307 | {'HadoopJarStep': {'Jar': 'command-runner.jar', 308 | 'Args': ['aws', 's3', 'cp', 309 | 's3://sparksteps-test/sparksteps/sources/dir.zip', 310 | '/home/hadoop/']}, 311 | 'ActionOnFailure': 'CANCEL_AND_WAIT', 312 | 'Name': 'Copy dir.zip'}, 313 | {'HadoopJarStep': {'Jar': 'command-runner.jar', 314 | 'Args': ['unzip', '-o', '/home/hadoop/dir.zip', 315 | '-d', '/home/hadoop/dir']}, 316 | 'ActionOnFailure': 'CANCEL_AND_WAIT', 317 | 'Name': 'Unzip dir.zip'}, 318 | {'HadoopJarStep': {'Jar': 'command-runner.jar', 319 | 'Args': ['aws', 's3', 'cp', 320 | 's3://sparksteps-test/sparksteps/sources/episodes.avro', 321 | '/home/hadoop/']}, 322 | 'ActionOnFailure': 'CANCEL_AND_WAIT', 
323 | 'Name': 'Copy episodes.avro'}, 324 | {'HadoopJarStep': {'Jar': 'command-runner.jar', 325 | 'Args': ['aws', 's3', 'cp', 326 | 's3://sparksteps-test/sparksteps/sources/episodes.py', 327 | '/home/hadoop/']}, 328 | 'ActionOnFailure': 'CANCEL_AND_WAIT', 'Name': 'Copy episodes.py'}, 329 | {'HadoopJarStep': {'Jar': 'command-runner.jar', 330 | 'Args': ['spark-submit', '--jars', 331 | '/home/hadoop/dir/test.jar', 332 | '/home/hadoop/episodes.py', '--input', 333 | '/home/hadoop/episodes.avro']}, 334 | 'ActionOnFailure': 'CANCEL_AND_WAIT', 335 | 'Name': 'Run episodes.py'}] 336 | 337 | 338 | @moto.mock_s3 339 | def test_setup_steps_non_existing_upload_file(): 340 | s3 = boto3.resource('s3', region_name=AWS_REGION_NAME) 341 | s3.create_bucket(Bucket=TEST_BUCKET) 342 | dne_file_path = os.path.join(DATA_DIR, 'does_not_exist.jar') 343 | try: 344 | setup_steps(s3, 345 | TEST_BUCKET, 346 | TEST_BUCKET_PATH, 347 | EPISODES_APP, 348 | submit_args="--jars /home/hadoop/dir/test.jar".split(), 349 | app_args="--input /home/hadoop/episodes.avro".split(), 350 | uploads=[dne_file_path]) 351 | except FileNotFoundError as e: 352 | assert str(e) == '{} does not exist (does not reference a valid file or path).'.format(dne_file_path) 353 | return 354 | assert False, 'Expected FileNotFoundError to be raised when `--uploads` parameter contains path to non-existing file or directory.' # NOQA: E501 355 | 356 | 357 | @moto.mock_s3 358 | def test_setup_steps_with_bucket_path(): 359 | s3 = boto3.resource('s3', region_name=AWS_REGION_NAME) 360 | s3.create_bucket(Bucket=TEST_BUCKET) 361 | steps = (setup_steps(s3, 362 | TEST_BUCKET, 363 | 'custom/path/prefix/', 364 | EPISODES_APP, 365 | submit_args="--jars /home/hadoop/dir/test.jar".split(), 366 | app_args="--input /home/hadoop/episodes.avro".split(), 367 | uploads=[LIB_DIR, EPISODES_AVRO, 's3://custom-bucket/custom/path/s3_file.py']) 368 | ) 369 | assert steps == [ 370 | {'HadoopJarStep': {'Jar': 'command-runner.jar', 371 | 'Args': ['aws', 's3', 'cp', 372 | 's3://sparksteps-test/custom/path/prefix/sources/dir.zip', 373 | '/home/hadoop/']}, 374 | 'ActionOnFailure': 'CANCEL_AND_WAIT', 375 | 'Name': 'Copy dir.zip'}, 376 | {'HadoopJarStep': {'Jar': 'command-runner.jar', 377 | 'Args': ['unzip', '-o', '/home/hadoop/dir.zip', 378 | '-d', '/home/hadoop/dir']}, 379 | 'ActionOnFailure': 'CANCEL_AND_WAIT', 380 | 'Name': 'Unzip dir.zip'}, 381 | {'HadoopJarStep': {'Jar': 'command-runner.jar', 382 | 'Args': ['aws', 's3', 'cp', 383 | 's3://sparksteps-test/custom/path/prefix/sources/episodes.avro', 384 | '/home/hadoop/']}, 385 | 'ActionOnFailure': 'CANCEL_AND_WAIT', 386 | 'Name': 'Copy episodes.avro'}, 387 | {'HadoopJarStep': {'Jar': 'command-runner.jar', 388 | 'Args': ['aws', 's3', 'cp', 389 | 's3://custom-bucket/custom/path/s3_file.py', 390 | '/home/hadoop/']}, 391 | 'ActionOnFailure': 'CANCEL_AND_WAIT', 'Name': 'Copy s3_file.py'}, 392 | {'HadoopJarStep': {'Jar': 'command-runner.jar', 393 | 'Args': ['aws', 's3', 'cp', 394 | 's3://sparksteps-test/custom/path/prefix/sources/episodes.py', 395 | '/home/hadoop/']}, 396 | 'ActionOnFailure': 'CANCEL_AND_WAIT', 'Name': 'Copy episodes.py'}, 397 | {'HadoopJarStep': {'Jar': 'command-runner.jar', 398 | 'Args': ['spark-submit', '--jars', 399 | '/home/hadoop/dir/test.jar', 400 | '/home/hadoop/episodes.py', '--input', 401 | '/home/hadoop/episodes.avro']}, 402 | 'ActionOnFailure': 'CANCEL_AND_WAIT', 403 | 'Name': 'Run episodes.py'}] 404 | 405 | 406 | def test_s3_dist_cp_step(): 407 | splitted = shlex.split( 408 | "--s3Endpoint=s3.amazonaws.com 
--src=s3://mybucket/logs/j-3GYXXXXXX9IOJ/node/ --dest=hdfs:///output --srcPattern=.*[a-zA-Z,]+") # NOQA: E501 409 | assert S3DistCp(splitted).step == { 410 | 'ActionOnFailure': 'CONTINUE', 411 | 'HadoopJarStep': { 412 | 'Args': ['s3-dist-cp', 413 | '--s3Endpoint=s3.amazonaws.com', 414 | '--src=s3://mybucket/logs/j-3GYXXXXXX9IOJ/node/', 415 | '--dest=hdfs:///output', 416 | '--srcPattern=.*[a-zA-Z,]+'], 417 | 'Jar': 'command-runner.jar'}, 418 | 'Name': 'S3DistCp step' 419 | } 420 | --------------------------------------------------------------------------------
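The tests above exercise the same building blocks that sparksteps/__main__.py wires together: emr_config() produces a run_job_flow payload, setup_steps() turns a local application, its uploads, and spark-submit arguments into EMR step definitions, and S3DistCp wraps an s3-dist-cp invocation. The sketch below shows one way those pieces could be driven directly from Python instead of the command line. It is a minimal illustration, not code from the repository: the region, bucket name, S3 prefix, local file paths, and the `from sparksteps.poll import wait_for_step_complete` import path are assumptions made for the example.

# example_driver.py -- hypothetical script mirroring the flow of sparksteps.__main__.main()
import shlex

import boto3

from sparksteps.cluster import emr_config
from sparksteps.steps import setup_steps, S3DistCp
from sparksteps.poll import wait_for_step_complete  # import path assumed for this sketch

emr = boto3.client('emr', region_name='us-east-1')      # region chosen for illustration
s3 = boto3.resource('s3', region_name='us-east-1')

# Build the run_job_flow payload; the keyword arguments mirror the test cases above.
config = emr_config('emr-5.2.0',
                    instance_type_master='m4.large',
                    instance_type_core='m4.2xlarge',
                    instance_type_task='m4.2xlarge',
                    num_core=1,
                    num_task=1,
                    bid_price_task='0.1',
                    keep_alive=False,
                    name='Example SparkSteps cluster')
cluster_id = emr.run_job_flow(**config)['JobFlowId']

# Upload the application and its dependencies, and build the copy/unzip/spark-submit steps.
# Bucket, prefix, and local paths below are placeholders.
emr_steps = setup_steps(s3,
                        'my-bucket',
                        'sparksteps/',
                        'episodes.py',
                        submit_args=shlex.split('--jars /home/hadoop/dir/test.jar'),
                        app_args=shlex.split('--input /home/hadoop/episodes.avro'),
                        uploads=['dir', 'episodes.avro'])

# Optionally append an s3-dist-cp step, built the same way test_s3_dist_cp_step() builds one.
emr_steps.append(S3DistCp(shlex.split(
    '--src=hdfs:///output --dest=s3://my-bucket/output')).step)

response = emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=emr_steps)

# Poll until the last submitted step finishes, as main() does when a wait interval is given.
wait_for_step_complete(emr, cluster_id, response['StepIds'][-1], sleep_interval_s=60)

Run against real AWS credentials this would launch a billable cluster (terminated after the steps finish, since keep_alive=False), which is why the tests above exercise emr_config() and setup_steps() against moto's mocked EMR and S3 endpoints instead.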