├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── config.yaml │ ├── feature_request.md │ └── issue_template_simple.md └── workflows │ └── static-checking.yml ├── .gitignore ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── LICENSE-LAMBDA ├── LICENSE-SUMMARY ├── MANIFEST.in ├── NOTICE ├── README.md ├── THIRD_PARTY ├── VERSION ├── aws_emr_launch ├── __init__.py ├── constructs │ ├── __init__.py │ ├── base.py │ ├── emr_constructs │ │ ├── __init__.py │ │ ├── cluster_configuration.py │ │ ├── emr_code.py │ │ └── emr_profile.py │ ├── iam_roles │ │ ├── __init__.py │ │ └── emr_roles.py │ ├── lambdas │ │ ├── __init__.py │ │ └── emr_lambdas.py │ ├── managed_configurations │ │ ├── __init__.py │ │ ├── autoscaling_configuration.py │ │ ├── instance_fleet_configuration.py │ │ └── instance_group_configuration.py │ ├── security_groups │ │ ├── __init__.py │ │ └── emr.py │ └── step_functions │ │ ├── __init__.py │ │ ├── emr_chains.py │ │ ├── emr_launch_function.py │ │ └── emr_tasks.py ├── control_plane │ ├── __init__.py │ ├── constructs │ │ ├── __init__.py │ │ ├── control_plane_stack.py │ │ └── lambdas │ │ │ ├── __init__.py │ │ │ └── apis.py │ └── lambda_sources │ │ ├── __init__.py │ │ └── apis │ │ └── get_list_apis.py └── lambda_sources │ ├── LICENSE │ ├── __init__.py │ ├── emr_utilities │ ├── check_cluster_status │ │ ├── __init__.py │ │ └── lambda_source.py │ ├── fail_if_cluster_running │ │ ├── __init__.py │ │ └── lambda_source.py │ ├── load_cluster_configuration │ │ ├── __init__.py │ │ └── lambda_source.py │ ├── override_cluster_configs │ │ ├── __init__.py │ │ └── lambda_source.py │ ├── override_step_args │ │ ├── __init__.py │ │ └── lambda_source.py │ ├── parse_json_string │ │ ├── __init__.py │ │ └── lambda_source.py │ ├── run_job_flow │ │ ├── __init__.py │ │ └── lambda_source.py │ └── update_cluster_tags │ │ ├── __init__.py │ │ └── lambda_source.py │ └── layers │ └── emr_config_utils │ └── requirements.txt ├── docs ├── AWS EMR Launch Threat Model.pdf 
├── emr_launch_usage.md ├── emr_launch_usage.pdf └── usage_diagram.png ├── examples ├── README.md ├── __init__.py ├── cluster_configurations │ ├── __init__.py │ ├── app.py │ ├── bootstrap_source │ │ └── test_bootstrap.sh │ ├── cdk.json │ └── jars │ │ ├── example_0.jar │ │ └── example_1.jar ├── control_plane │ ├── __init__.py │ ├── app.py │ └── cdk.json ├── deploy_all.sh ├── emr_launch_function │ ├── __init__.py │ ├── app.py │ └── cdk.json ├── emr_profiles │ ├── __init__.py │ ├── app.py │ └── cdk.json ├── environment_stack │ ├── __init__.py │ ├── app.py │ └── cdk.json ├── persistent_cluster_pipeline │ ├── __init__.py │ ├── app.py │ ├── cdk.json │ └── step_sources │ │ ├── phase_1 │ │ ├── test_step_0.sh │ │ ├── test_step_1.sh │ │ ├── test_step_2.sh │ │ ├── test_step_3.sh │ │ └── test_step_4.sh │ │ └── phase_2 │ │ ├── test_step_0.hql │ │ ├── test_step_1.hql │ │ ├── test_step_2.hql │ │ ├── test_step_3.hql │ │ └── test_step_4.hql ├── sns_triggered_pipeline │ ├── __init__.py │ ├── app.py │ ├── cdk.json │ ├── lambda_sources │ │ └── execute_pipeline.py │ └── step_sources │ │ ├── test_step_0.py │ │ ├── test_step_1.py │ │ ├── test_step_2.py │ │ ├── test_step_3.py │ │ └── test_step_4.py ├── spark_batch_orchestration │ ├── .gitignore │ ├── BatchSparkPipelineArchitecture.png │ ├── README.md │ ├── __init__.py │ ├── app.py │ ├── cdk.json │ ├── config.json │ ├── deploy.sh │ ├── infrastructure │ │ ├── __init__.py │ │ ├── emr_launch │ │ │ ├── README.md │ │ │ ├── bootstrap_actions │ │ │ │ └── install_boto3.sh │ │ │ ├── cluster_definition.py │ │ │ └── instance_group_config.py │ │ ├── emr_orchestration │ │ │ ├── __init__.py │ │ │ ├── stack.py │ │ │ └── steps │ │ │ │ ├── __init__.py │ │ │ │ ├── data_ingestion.py │ │ │ │ └── data_preparation.py │ │ ├── emr_trigger │ │ │ ├── __init__.py │ │ │ ├── lambda_source │ │ │ │ ├── __init__.py │ │ │ │ └── trigger.py │ │ │ └── stack.py │ │ └── job_summary │ │ │ ├── __init__.py │ │ │ ├── lambda_source │ │ │ ├── __init__.py │ │ │ ├── extracting.py │ │ 
│ ├── fetching.py │ │ │ ├── helpers.py │ │ │ ├── main.py │ │ │ ├── rendering.py │ │ │ └── summary.css │ │ │ └── stack.py │ ├── requirements.txt │ ├── sample_data │ │ ├── part-r-00000-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00001-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00002-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00003-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00004-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00005-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00006-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00007-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00008-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00009-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00010-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00011-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00012-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00013-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00014-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00015-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00016-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00017-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00018-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00019-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00020-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00021-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00022-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00023-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00024-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── 
part-r-00025-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00026-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00027-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00028-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00029-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00030-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00031-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00032-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00033-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00034-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00035-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00036-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00037-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00038-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00039-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00040-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00041-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00042-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00043-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00044-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00045-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00046-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00047-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00048-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00049-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00050-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00051-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── 
part-r-00052-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00053-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00054-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00055-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00056-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00057-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00058-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00059-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00060-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00061-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00062-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00063-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00064-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00065-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00066-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00067-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ ├── part-r-00068-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ │ └── part-r-00069-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet │ └── setup.py ├── terraform_pipeline │ ├── .gitignore │ ├── README.md │ ├── bin │ │ └── deploy.sh │ ├── emr_pipeline │ │ ├── emr-launch.tf │ │ ├── emr-orchestration.tf │ │ ├── emr_launch │ │ │ ├── .gitignore │ │ │ ├── README.md │ │ │ ├── infrastructure │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── cdk.json │ │ │ │ ├── cluster_definition.py │ │ │ │ ├── instance_group_config.py │ │ │ │ ├── main.py │ │ │ │ └── requirements.txt │ │ │ ├── main.tf │ │ │ ├── outputs.tf │ │ │ ├── utils │ │ │ │ ├── cdk_deploy.sh │ │ │ │ └── cdk_destroy.sh │ │ │ └── variables.tf │ │ ├── emr_step_function │ │ │ ├── lambda.zip │ │ │ ├── lambda │ │ │ │ ├── __init__.py │ │ │ │ └── lambda_parse_json.py 
│ │ │ ├── main.tf │ │ │ ├── outputs.tf │ │ │ ├── pipeline.json │ │ │ └── variables.tf │ │ ├── main.tf │ │ ├── outputs.tf │ │ ├── s3-buckets.tf │ │ ├── s3-spark-script.tf │ │ ├── spark_script.py │ │ └── variables.tf │ └── environments │ │ └── dev │ │ └── eu-west-1.tfvars └── transient_cluster_pipeline │ ├── __init__.py │ ├── app.py │ ├── cdk.json │ ├── pipeline.json │ └── step_sources │ ├── phase_1 │ ├── test_step_0.sh │ ├── test_step_1.sh │ ├── test_step_2.sh │ ├── test_step_3.sh │ ├── test_step_4.sh │ └── test_validation.sh │ └── phase_2 │ ├── test_step_0.hql │ ├── test_step_1.hql │ ├── test_step_2.hql │ ├── test_step_3.hql │ ├── test_step_4.hql │ └── test_validation.hql ├── extras └── airflow │ ├── README.md │ ├── airflow_dag.py │ └── aws_operators_plugin.py ├── fix.sh ├── pyproject.toml ├── requirements-1.3.x.txt ├── requirements-1.4.x.txt ├── requirements-1.5.x.txt ├── requirements-2.x.txt ├── requirements-dev.in ├── requirements-dev.txt ├── requirements-lambda-layer.txt ├── setup.cfg ├── setup.py ├── tests ├── __init__.py └── aws_emr_launch │ ├── __init__.py │ ├── constructs │ ├── __init__.py │ ├── emr_constructs │ │ ├── __init__.py │ │ ├── test_cluster_configuration.py │ │ └── test_emr_profile.py │ ├── managed_configurations │ │ ├── __init__.py │ │ ├── test_autoscaling_configuration.py │ │ ├── test_instance_fleet_configuration.py │ │ └── test_instance_group_configuration.py │ ├── step_functions │ │ ├── test_emr_chains.py │ │ ├── test_emr_launch_function.py │ │ └── test_emr_tasks.py │ ├── test_iam_roles.py │ └── test_security_groups.py │ └── control_plane │ ├── __init__.py │ ├── constructs │ ├── __init__.py │ └── test_lambdas.py │ ├── lambda_sources │ ├── __init__.py │ └── apis │ │ └── test_get_list_apis.py │ └── test_control_plane.py ├── tox.ini └── validate.sh /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | 
title: "[BUG] -" 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Additional context** 32 | Add any other context about the problem here. 33 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yaml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "[FEATURE] -" 5 | labels: enhancement, feature 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/issue_template_simple.md: -------------------------------------------------------------------------------- 1 | ### Specify a Type of Issue: 2 | 3 | BUG/FEATURE/OTHER 4 | 5 | 6 | ### Describe the Issue: 7 | 8 | A clear and concise description of what the issue is. 9 | 10 | ### To Reproduce: 11 | 12 | Provide steps to reproduce the issue, or use `N/A` 13 | 14 | ### Additional Context: 15 | 16 | Add any other context about the problem here. -------------------------------------------------------------------------------- /.github/workflows/static-checking.yml: -------------------------------------------------------------------------------- 1 | name: Static Checking 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | - release 8 | pull_request: 9 | branches: 10 | - master 11 | - release 12 | 13 | jobs: 14 | ChecksAndTests: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: [3.7] 19 | defaults: 20 | run: 21 | working-directory: . 22 | steps: 23 | - uses: actions/checkout@v2 24 | - name: Set up Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v1 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | - name: Install Requirements 29 | run: | 30 | python -m pip install --upgrade pip 31 | pip install -e . 32 | pip install -r requirements-dev.txt 33 | pip install -r requirements-2.x.txt 34 | - name: Mypy Check 35 | run: mypy . 36 | - name: Flake8 Check 37 | run: flake8 . 38 | - name: Black Check 39 | run: black --check . 40 | - name: ISort Check 41 | run: isort --check . 
42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | wheels/ 22 | share/python-wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .nox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # IPython 77 | profile_default/ 78 | ipython_config.py 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # Environments 90 | .env 91 | .venv 92 | env/ 93 | venv/ 94 | ENV/ 95 | env.bak/ 96 | venv.bak/ 97 | 98 | # Spyder project settings 99 | .spyderproject 100 | .spyproject 101 | 102 | # Rope project settings 103 | .ropeproject 104 | 105 | # mkdocs documentation 106 | /site 107 | 108 | # mypy 109 | .mypy_cache/ 110 | .dmypy.json 111 | 
dmypy.json 112 | 113 | # Pyre type checker 114 | .pyre/ 115 | 116 | # Visual Studio Code 117 | .vscode/ 118 | 119 | # JetBrains 120 | .idea/ 121 | 122 | # MacOS 123 | .DS_Store 124 | 125 | # Files generated by AWS Cloudformation package 126 | output/ 127 | 128 | # Output templates 129 | templates/cdk.out 130 | 131 | # cached word files 132 | artifacts/~$+*.doc 133 | cdk.out 134 | cdk.context.json 135 | 136 | examples/environment_vars.sh 137 | # aws_emr_launch/lambda_sources/layers/ 138 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. 
Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *master* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 
49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | 61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. 62 | -------------------------------------------------------------------------------- /LICENSE-LAMBDA: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
15 | -------------------------------------------------------------------------------- /LICENSE-SUMMARY: -------------------------------------------------------------------------------- 1 | This project is licensed under the terms of the Apache 2.0 license. See LICENSE. 2 | Included AWS Lambda functions are licensed under the MIT-0 license. See LICENSE-LAMBDA. 3 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include VERSION 2 | include LICENSE* 3 | include THIRD_PARTY 4 | include NOTICE 5 | include *.md 6 | include *.txt 7 | include *.in 8 | include *.sh 9 | include *.yaml 10 | include *.lock 11 | include tox.ini 12 | 13 | recursive-include aws_emr_launch * 14 | recursive-include scripts *.sh 15 | recursive-include examples *.py 16 | recursive-include examples *.md 17 | recursive-include examples *.sh 18 | recursive-include examples *.json 19 | recursive-include examples *.hql 20 | recursive-include examples *.jar 21 | recursive-include docs *.md 22 | 23 | 24 | exclude Config 25 | exclude codepipeline/cdk.json 26 | 27 | recursive-exclude tests * 28 | recursive-exclude examples/.env * 29 | recursive-exclude examples/*/cdk.out * 30 | recursive-exclude codepipeline * 31 | 32 | recursive-exclude examples *.css 33 | recursive-exclude examples *.parquet 34 | recursive-exclude examples *.png 35 | recursive-exclude examples *.tf 36 | recursive-exclude examples *.tfvars 37 | recursive-exclude examples *.txt 38 | recursive-exclude examples *.zip 39 | recursive-exclude extras *.md 40 | recursive-exclude extras *.py 41 | recursive-exclude docs *.pdf 42 | recursive-exclude docs *.png 43 | 44 | global-exclude *.pyc 45 | global-exclude __pycache__ 46 | global-exclude environment_vars.sh 47 | global-exclude cdk.context.json 48 | -------------------------------------------------------------------------------- /NOTICE: 
-------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /THIRD_PARTY: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/THIRD_PARTY -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 2.1.0.dev0 2 | -------------------------------------------------------------------------------- /aws_emr_launch/__init__.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import botocore 3 | import pkg_resources 4 | 5 | __product__ = "aws-emr-launch" 6 | __version__ = pkg_resources.get_distribution(__product__).version 7 | __package__ = f"{__product__}-{__version__}" 8 | 9 | 10 | def _get_botocore_config() -> botocore.config.Config: 11 | return botocore.config.Config( 12 | retries={"max_attempts": 5}, 13 | connect_timeout=10, 14 | max_pool_connections=10, 15 | user_agent_extra=f"{__product__}/{__version__}", 16 | ) 17 | 18 | 19 | def boto3_client(service_name: str) -> boto3.client: 20 | return boto3.Session().client(service_name=service_name, use_ssl=True, config=_get_botocore_config()) 21 | 22 | 23 | def boto3_resource(service_name: str) -> boto3.client: 24 | return boto3.Session().resource(service_name=service_name, use_ssl=True, config=_get_botocore_config()) 25 | -------------------------------------------------------------------------------- /aws_emr_launch/constructs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/aws_emr_launch/constructs/__init__.py 
-------------------------------------------------------------------------------- /aws_emr_launch/constructs/base.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import aws_cdk 4 | from logzero import logger 5 | 6 | import constructs 7 | from aws_emr_launch import __product__, __version__ 8 | 9 | 10 | def _tag_construct(construct: constructs.Construct) -> None: 11 | suppress_tags = os.environ.get("SUPPRESS_EMR_LAUNCH_DEPLOYMENT_TAGS", "").lower() in ("1", "t", "true", "y", "yes") 12 | 13 | if not suppress_tags: 14 | aws_cdk.Tags.of(construct).add("deployment:product:name", __product__) 15 | aws_cdk.Tags.of(construct).add("deployment:product:version", __version__) 16 | else: 17 | logger.info('Suppressing "deployment:product" tags for: %s', construct.node.id) 18 | 19 | 20 | class BaseConstruct(constructs.Construct): 21 | def __init__(self, scope: constructs.Construct, id: str): 22 | super().__init__(scope, id) 23 | _tag_construct(self) 24 | 25 | 26 | class BaseBuilder: 27 | @staticmethod 28 | def tag_construct(construct: constructs.Construct) -> None: 29 | _tag_construct(construct) 30 | -------------------------------------------------------------------------------- /aws_emr_launch/constructs/emr_constructs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/aws_emr_launch/constructs/emr_constructs/__init__.py -------------------------------------------------------------------------------- /aws_emr_launch/constructs/iam_roles/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/aws_emr_launch/constructs/iam_roles/__init__.py -------------------------------------------------------------------------------- 
from typing import Optional

from aws_cdk import aws_ec2 as ec2

import constructs
from aws_emr_launch.constructs.base import BaseConstruct


class EMRSecurityGroups(BaseConstruct):
    """Security groups for an EMR cluster: Master, Workers, and the EMR Service group."""

    def __init__(self, scope: constructs.Construct, id: str, *, vpc: Optional[ec2.IVpc] = None) -> None:
        super().__init__(scope, id)

        if vpc:
            # Master and Workers may reach out anywhere; the Service group is egress-locked
            self._master_group: ec2.ISecurityGroup = ec2.SecurityGroup(
                self, "MasterGroup", allow_all_outbound=True, vpc=vpc
            )
            self._workers_group: ec2.ISecurityGroup = ec2.SecurityGroup(
                self, "WorkersGroup", allow_all_outbound=True, vpc=vpc
            )
            self._service_group: ec2.ISecurityGroup = ec2.SecurityGroup(
                self, "ServiceGroup", allow_all_outbound=False, vpc=vpc
            )

            # Master SG rules: management from the Service group plus intra-cluster traffic
            self._master_group.add_ingress_rule(self._service_group, ec2.Port.tcp(8443))
            self._set_common_ingress_rules(self._master_group, self._workers_group)

            # Workers SG rules: same management port plus intra-cluster traffic
            self._workers_group.add_ingress_rule(self._service_group, ec2.Port.tcp(8443))
            self._set_common_ingress_rules(self._workers_group, self._master_group)

            # Service SG rules: egress only to the cluster on 8443, ingress from Master on 9443
            self._service_group.add_egress_rule(self._master_group, ec2.Port.tcp(8443))
            self._service_group.add_egress_rule(self._workers_group, ec2.Port.tcp(8443))
            self._service_group.add_ingress_rule(self._master_group, ec2.Port.tcp(9443))

    @staticmethod
    def _set_common_ingress_rules(primary: ec2.ISecurityGroup, secondary: ec2.ISecurityGroup) -> ec2.ISecurityGroup:
        """Allow all TCP/UDP/ICMP from the group itself and from the peer group."""
        all_traffic = (
            ec2.Port.tcp_range(0, 65535),
            ec2.Port.udp_range(0, 65535),
            ec2.Port.icmp_type(-1),
        )
        for port in all_traffic:
            primary.add_ingress_rule(primary, port)
        for port in all_traffic:
            primary.add_ingress_rule(secondary, port)
        return primary

    @staticmethod
    def from_security_group_ids(
        scope: constructs.Construct,
        id: str,
        master_group_id: str,
        workers_group_id: str,
        service_group_id: str,
        mutable: Optional[bool] = None,
    ) -> "EMRSecurityGroups":
        """Wrap pre-existing security groups by id; the Service group is always immutable."""
        security_groups = EMRSecurityGroups(scope, id)
        security_groups._master_group = ec2.SecurityGroup.from_security_group_id(
            security_groups, "MasterGroup", master_group_id, mutable=mutable
        )
        security_groups._workers_group = ec2.SecurityGroup.from_security_group_id(
            security_groups, "WorkersGroup", workers_group_id, mutable=mutable
        )
        security_groups._service_group = ec2.SecurityGroup.from_security_group_id(
            security_groups, "ServiceGroup", service_group_id, mutable=False
        )
        return security_groups

    @property
    def master_group(self) -> ec2.ISecurityGroup:
        return self._master_group

    @property
    def workers_group(self) -> ec2.ISecurityGroup:
        return self._workers_group

    @property
    def service_group(self) -> ec2.ISecurityGroup:
        return self._service_group
class ControlPlaneStack(aws_cdk.Stack):
    """CDK Stack hosting the EMR Launch control-plane API Lambdas."""

    def __init__(self, app: aws_cdk.App, name: str = "aws-emr-launch-control-plane", **kwargs: Any) -> None:
        super().__init__(app, name, **kwargs)
        # Tag the stack so deployed resources trace back to this product/version
        for tag_key, tag_value in (
            ("deployment:product:name", __product__),
            ("deployment:product:version", __version__),
        ):
            self.tags.set_tag(tag_key, tag_value)
        self._apis = apis.Apis(self, "Apis")

    @property
    def apis(self) -> apis.Apis:
        """The control-plane Apis construct."""
        return self._apis
import json
import logging
import os
from datetime import date, datetime
from typing import Any, Dict, Optional

import boto3
import botocore

logger = logging.getLogger()
logger.setLevel(logging.INFO)


def _get_botocore_config() -> botocore.config.Config:
    """Build a botocore Config that tags requests with the product/version user agent."""
    product = os.environ.get("AWS_EMR_LAUNCH_PRODUCT", "")
    version = os.environ.get("AWS_EMR_LAUNCH_VERSION", "")
    return botocore.config.Config(
        retries={"max_attempts": 5},
        connect_timeout=10,
        max_pool_connections=10,
        user_agent_extra=f"{product}/{version}",
    )


def _boto3_client(service_name: str) -> boto3.client:
    """Create a boto3 client with the shared botocore configuration."""
    return boto3.Session().client(service_name=service_name, use_ssl=True, config=_get_botocore_config())


# Clients are created at import time so warm Lambda invocations reuse them
emr = _boto3_client("emr")
events = _boto3_client("events")
sfn = _boto3_client("stepfunctions")


def json_serial(obj: object) -> str:
    """JSON serializer for datetime/date values (used as json.dumps default=)."""
    if isinstance(obj, (datetime, date)):
        return obj.isoformat()
    raise TypeError("Type %s not serializable" % type(obj))


def log_exception(e: Exception, event: Dict[str, Any]) -> None:
    """Log the failing event payload followed by the exception traceback."""
    logger.error(f"Error processing event {json.dumps(event)}")
    logger.exception(e)


def handler(event: Dict[str, Any], context: Optional[Dict[str, Any]]) -> None:
    """Poll an EMR cluster's state and complete a Step Functions task token.

    Sends task success when the cluster reaches ExpectedState, task failure when
    it terminates first, otherwise a heartbeat (and returns without touching the
    EventBridge rule). Once the token is consumed, the cluster's target is
    removed from the rule and the rule is disabled when no targets remain.

    Args:
        event: expects ClusterId, TaskToken, RuleName, and ExpectedState keys.
        context: Lambda context (unused).
    """
    logger.info(f"Lambda metadata: {json.dumps(event)} (type = {type(event)})")
    cluster_id = event["ClusterId"]
    task_token = event["TaskToken"]
    rule_name = event["RuleName"]
    expected_state = event["ExpectedState"]

    try:
        cluster_description = emr.describe_cluster(ClusterId=cluster_id)
        state = cluster_description["Cluster"]["Status"]["State"]

        if state == expected_state:
            success = True
        elif state in ["TERMINATING", "TERMINATED", "TERMINATED_WITH_ERRORS"]:
            # Cluster died before reaching the expected state
            success = False
        else:
            # Still transitioning: keep the Step Functions task alive and poll again later
            heartbeat = {
                "ClusterId": cluster_id,
                "TaskToken": task_token,
                "ClusterState": state,
                "ExpectedState": expected_state,
            }
            logger.info(f"Sending Task Heartbeat: {heartbeat}")
            sfn.send_task_heartbeat(taskToken=task_token)
            return

        cluster_description["ClusterId"] = cluster_id

        if success:
            logger.info(
                f"Sending Task Success, TaskToken: {task_token}, "
                f"Output: {json.dumps(cluster_description, default=json_serial)}"
            )
            sfn.send_task_success(taskToken=task_token, output=json.dumps(cluster_description, default=json_serial))
        else:
            logger.info(
                f"Sending Task Failure,TaskToken: {task_token}, "
                f"Output: {json.dumps(cluster_description, default=json_serial)}"
            )
            sfn.send_task_failure(
                taskToken=task_token,
                error="States.TaskFailed",
                cause=json.dumps(cluster_description, default=json_serial),
            )

        # Token is consumed; clear it so the except block below won't try to fail it again
        task_token = None

        logger.info(f"Removing Rule Targets: {cluster_id}")
        failed_targets = events.remove_targets(Rule=rule_name, Ids=[cluster_id])

        if failed_targets["FailedEntryCount"] > 0:
            failed_entries = failed_targets["FailedEntries"]
            raise Exception(f"Failed Removing Targets: {json.dumps(failed_entries)}")

        targets = events.list_targets_by_rule(Rule=rule_name)["Targets"]
        if len(targets) == 0:
            logger.info(f"Disabling Rule with no Targets: {rule_name}")
            events.disable_rule(Name=rule_name)

    except Exception as e:
        # Best-effort cleanup: fail the token (if still pending) and detach the target
        try:
            if task_token:
                logger.error(f"Sending TaskFailure: {task_token}")
                sfn.send_task_failure(taskToken=task_token, error="States.TaskFailed", cause=str(e))
            logger.error(f"Removing Rule Targets: {cluster_id}")
            events.remove_targets(Rule=rule_name, Ids=[cluster_id])
        except Exception as ee:
            logger.exception(ee)
        log_exception(e, event)
        raise e
import json
import logging
import os
from typing import Any, Dict, Optional, cast

import boto3
import botocore

logger = logging.getLogger()
logger.setLevel(logging.INFO)


def _get_botocore_config() -> botocore.config.Config:
    """Botocore Config with retries and a product/version user-agent suffix."""
    agent = "{}/{}".format(
        os.environ.get("AWS_EMR_LAUNCH_PRODUCT", ""),
        os.environ.get("AWS_EMR_LAUNCH_VERSION", ""),
    )
    return botocore.config.Config(
        retries={"max_attempts": 5},
        connect_timeout=10,
        max_pool_connections=10,
        user_agent_extra=agent,
    )


def _boto3_client(service_name: str) -> boto3.client:
    """Create a boto3 client with the shared botocore configuration."""
    return boto3.Session().client(service_name=service_name, use_ssl=True, config=_get_botocore_config())


# Created at import time so warm Lambda invocations reuse the client
emr = _boto3_client("emr")


class ClusterRunningError(Exception):
    """Raised when a cluster with the requested name is already active."""

    pass


def parse_bool(v: str) -> bool:
    """Interpret common truthy strings ('yes', 'true', 't', '1') as True."""
    truthy = ("yes", "true", "t", "1")
    return str(v).lower() in truthy


def handler(event: Dict[str, Any], context: Optional[Dict[str, Any]]) -> Dict[str, Any]:
    """Pass event["Input"] through, unless a same-named cluster is already active.

    When the (possibly overridden) FailIfClusterRunning flag is set and an EMR
    cluster with the same Name is in an active state, raises ClusterRunningError.
    """
    try:
        logger.info(f"Lambda metadata: {json.dumps(event)} (type = {type(event)})")
        default_flag = parse_bool(event.get("DefaultFailIfClusterRunning", False))

        # This will work for {"JobInput": {"FailIfClusterRunning": true}} or {"FailIfClusterRunning": true}
        fail_if_cluster_running = parse_bool(
            event.get("ExecutionInput", event).get("FailIfClusterRunning", default_flag)
        )

        if not fail_if_cluster_running:
            return cast(Dict[str, Any], event["Input"])

        # check if job flow already exists
        cluster_name = event.get("Input", {}).get("Name", "")
        logger.info(f'Checking if job flow "{cluster_name}" is running already')
        active = emr.list_clusters(ClusterStates=["STARTING", "BOOTSTRAPPING", "RUNNING", "WAITING"])

        for cluster in active["Clusters"]:
            if cluster["Name"] == cluster_name:
                logger.info(f"Job flow {cluster_name} is already running: terminate? {fail_if_cluster_running}")
                raise ClusterRunningError(
                    f"Found running Cluster with name {cluster_name}. "
                    f"ClusterId: {cluster['Id']}. FailIfClusterRunning is {fail_if_cluster_running}"
                )

        return cast(Dict[str, Any], event["Input"])

    except Exception as e:
        logger.error(f"Error processing event {json.dumps(event)}")
        logger.exception(e)
        raise e
import json
import logging
import os
from typing import Any, Dict, Optional

import boto3
import botocore
from dictor import dictor

logger = logging.getLogger()
logger.setLevel(logging.INFO)


def _get_botocore_config() -> botocore.config.Config:
    """Build a botocore Config that tags requests with the product/version user agent."""
    product = os.environ.get("AWS_EMR_LAUNCH_PRODUCT", "")
    version = os.environ.get("AWS_EMR_LAUNCH_VERSION", "")
    return botocore.config.Config(
        retries={"max_attempts": 5},
        connect_timeout=10,
        max_pool_connections=10,
        user_agent_extra=f"{product}/{version}",
    )


def _boto3_client(service_name: str) -> boto3.client:
    """Create a boto3 client with the shared botocore configuration."""
    return boto3.Session().client(service_name=service_name, use_ssl=True, config=_get_botocore_config())


# Created at import time so warm Lambda invocations reuse the client
emr = _boto3_client("emr")


class InvalidOverrideError(Exception):
    """Raised when an override is not allowed, out of bounds, or targets a missing path."""

    pass


def handler(event: Dict[str, Any], context: Optional[Dict[str, Any]]) -> Dict[str, Any]:
    """Apply caller-supplied overrides to the cluster configuration in event["Input"].

    Every override key must appear in event["AllowedClusterConfigOverrides"], which
    maps it to {"JsonPath": ..., "Minimum": ..., "Maximum": ...}; the numeric
    bounds are enforced when present. Returns the updated configuration dict.

    Raises:
        InvalidOverrideError: override not declared, path not found, or value out of bounds.
    """
    logger.info(f"Lambda metadata: {json.dumps(event)} (type = {type(event)})")
    # This will work with ClusterConfigurationOverrides or ClusterConfigOverrides
    overrides = event.get("ExecutionInput", {}).get("ClusterConfigurationOverrides", None)
    if overrides is None:
        overrides = event.get("ExecutionInput", {}).get("ClusterConfigOverrides", {})

    allowed_overrides = event.get("AllowedClusterConfigOverrides", None)
    cluster_config: Dict[str, Any] = event.get("Input", {})

    if overrides and not allowed_overrides:
        raise InvalidOverrideError("Cluster configuration overrides are not allowed")

    try:
        for path, new_value in overrides.items():
            minimum = None
            maximum = None

            # Translate the friendly override key into its configured JsonPath + bounds
            new_path = allowed_overrides.get(path, None)
            if new_path is None:
                raise InvalidOverrideError(f'Value "{path}" is not an allowed cluster configuration override')
            else:
                path = new_path["JsonPath"]
                minimum = new_path.get("Minimum", None)
                maximum = new_path.get("Maximum", None)

            # Split "a.b.c" into parent path "a.b" and final key "c"
            path_parts = path.split(".")
            update_key = path_parts[-1]
            key_path = ".".join(path_parts[0:-1])

            # A purely numeric final segment addresses a list element by index
            update_key = int(update_key) if update_key.isdigit() else update_key
            update_attr = cluster_config if key_path == "" else dictor(cluster_config, key_path)

            # NOTE(review): if the resolved parent is a list (numeric update_key),
            # .get() below raises AttributeError rather than InvalidOverrideError — confirm intended.
            if update_attr is None or update_attr.get(update_key, None) is None:
                raise InvalidOverrideError(f'The update path "{path}" was not found in the cluster configuration')

            logger.info(f'Path: "{key_path}" CurrentValue: "{update_attr[update_key]}" NewValue: "{new_value}"')
            if (minimum or maximum) and (isinstance(new_value, int) or isinstance(new_value, float)):
                if minimum and new_value < minimum:
                    raise InvalidOverrideError(
                        f"The Override Value ({new_value}) " f"is less than the Minimum allowed ({minimum})"
                    )
                if maximum and new_value > maximum:
                    raise InvalidOverrideError(
                        f"The Override Value ({new_value}) " f"is greater than the Maximum allowed ({maximum})"
                    )

            update_attr[update_key] = new_value

        return cluster_config

    except Exception as e:
        logger.error(f"Error processing event {json.dumps(event)}")
        logger.exception(e)
        raise e
import json
import logging
from typing import Any, Dict, List, Optional

logger = logging.getLogger()
logger.setLevel(logging.INFO)


def handler(event: Dict[str, Any], context: Optional[Dict[str, Any]]) -> List[str]:
    """Replace step arguments with caller-supplied overrides for the named step.

    Looks up event["StepName"] in the overrides map and substitutes each matching
    argument; arguments without an override pass through unchanged.
    """
    logger.info(f"Lambda metadata: {json.dumps(event)} (type = {type(event)})")
    execution_input = event.get("ExecutionInput", {})
    # This will work with StepArgumentOverrides or StepArgOverrides
    overrides = execution_input.get("StepArgumentOverrides", None)
    if overrides is None:
        overrides = execution_input.get("StepArgOverrides", {})

    try:
        replacements = overrides.get(event.get("StepName", ""), {})
        overridden_args: List[str] = [replacements.get(arg, arg) for arg in event.get("Args", [])]
        logger.info(f"Overridden Args: {overridden_args}")
        return overridden_args

    except Exception as e:
        logger.error(f"Error processing event {json.dumps(event)}")
        logger.exception(e)
        raise e
import json
import logging
from typing import Any, Dict, Optional

logger = logging.getLogger()
logger.setLevel(logging.INFO)

# NOTE: this module previously created a boto3 EMR client (plus botocore config
# helpers) at import time that nothing in this file ever used. It has been
# removed: the dead client added Lambda cold-start latency and required
# credentials/network access at import for no benefit.


def handler(event: Dict[str, Any], context: Optional[Dict[str, Any]]) -> Any:
    """Merge ExecutionInput Tags over the cluster configuration's Tags.

    Tags are EMR-style lists of {"Key": ..., "Value": ...} dicts. On key
    collisions the ExecutionInput value wins; the order of pre-existing tags is
    preserved, with new keys appended.

    Returns:
        The cluster configuration dict with the merged "Tags" list.
    """
    logger.info(f"Lambda metadata: {json.dumps(event)} (type = {type(event)})")
    new_tags = event.get("ExecutionInput", {}).get("Tags", [])
    cluster_config = event.get("Input", {})
    current_tags = cluster_config.get("Tags", [])

    try:
        # Index both lists by Key; updating lets new entries override current ones
        merged = {tag["Key"]: tag["Value"] for tag in current_tags}
        merged.update({tag["Key"]: tag["Value"] for tag in new_tags})

        cluster_config["Tags"] = [{"Key": k, "Value": v} for k, v in merged.items()]
        return cluster_config

    except Exception as e:
        logger.error(f"Error processing event {json.dumps(event)}")
        logger.exception(e)
        raise e
Each Step Function is configured to load specific Profile and Configuration Metadata, ensuring that Users authorized to execute the Step Function are restricted to this Cluster Definition. 5 | 6 | Two types of EMR Launch are supported: with and without Secrets. Secrets are configuration parameters like database/metastore credentials, Kerberos parameters, etc. Rather than storing the Secrets in the Step Function definition, these are kept in Secrets Manager and loaded dynamically when the Cluster is launched. 7 | 8 | ![Usage workflow image](usage_diagram.png) 9 | 10 | ## Launching a Cluster (without Secrets) 11 | 1. The EMR Launch Step Function is executed. The IAM User/Role must be authorized to execute the Step Function 12 | 2. The Step Function utilizes a Lambda Function to load Profile and Configuration Metadata from the Parameter Store 13 | - This is a dedicated Lambda Function with an Execution Role granted access to only these specific Parameter Store values 14 | - The Step Function Execution Role is granted execute on only this specific Lambda Function 15 | 3. Metadata is combined and passed to a Step Function EMR Integration Task 16 | 4. The EMR Integration Task launches the EMR Cluster 17 | - The Step Function Execution Role is granted PassRole to only the specific IAM Role/Instance Profile defined in the Profile Metadata 18 | 19 | ## Launching a Cluster (with Secrets) 20 | 1. The EMR Launch Step Function is executed. The IAM User/Role must be authorized to execute the Step Function 21 | 2. The Step Function utilizes a Lambda Function to load Profile and Configuration Metadata from the Parameter Store 22 | - This is a dedicated Lambda Function with an Execution Role granted access to only these specific Parameter Store values 23 | - The Step Function Execution Role is granted execute on only this specific Lambda Function 24 | 3. Metadata is combined and passed to a Cluster Launcher Lambda Function 25 | 4.
The Cluster Launcher Lambda Function loads Secrets from Secrets Manager and combines them with the Cluster Definition. 26 | - This is a dedicated Lambda Function with an Execution Role granted access to the Secrets. 27 | 5. The Cluster Launcher Lambda Function launches the EMR Cluster 28 | - The Lambda Execution Role is granted PassRole to only the specific IAM Role/Instance Profile defined in the Profile Metadata 29 | 30 | ## Potential Threats 31 | 1. Users/Roles can create clusters with Profiles/Configurations they are not authorized for 32 | 2. Modifying Profile metadata could allow Users to create clusters with alternate IAM Roles/Instance Profiles -------------------------------------------------------------------------------- /docs/emr_launch_usage.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/docs/emr_launch_usage.pdf -------------------------------------------------------------------------------- /docs/usage_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/docs/usage_diagram.png -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # EMR Launch Examples 2 | 3 | ## Examples 4 | The examples require an environment with the following (the Buckets can be separate or the same): 5 | 6 | 1. A VPC with at least one Private Subnet 7 | 2. An S3 Bucket used for EMR Artifacts (Bootstrap scripts, Step scripts, etc) 8 | 3. An S3 Bucket used for EMR Logs 9 | 4. An S3 Bucket used for EMR input/output data 10 | 5. A SecretsManager Secret with a Kerberos Attributes example 11 | 6.
A SecretsManager Secret with a secret Cluster Configuration example 12 | 13 | To get up and running quickly the `environment_stack` will deploy these into your account. The resources 14 | deployed by this stack are then used in the other examples. 15 | 16 | ### Lambda Layer packages 17 | The Lambda Layer packages are required to deploy the examples. If these haven't been installed 18 | see the **Development** section of the top-level README.md. 19 | 20 | ### Deploying the Examples 21 | Create and activate a virtualenv for the examples: 22 | ```bash 23 | cd examples/ 24 | python3 -m venv .env 25 | source .env/bin/activate 26 | ``` 27 | 28 | Install the `aws-emr-launch` library and dependencies: 29 | ```bash 30 | pip install -e .. 31 | ``` 32 | 33 | You can use the `deploy_all.sh` script to deploy all the example projects. Or deploy manually 34 | in the following order: 35 | 1. `control_plane` 36 | 2. `environment_stack` 37 | 3. `emr_profiles` 38 | 4. `cluster_configurations` 39 | 5. `emr_launch_function` 40 | 6. `transient_cluster_pipeline` 41 | 7. `persistent_cluster_pipeline` 42 | 8. `sns_triggered_pipeline` 43 | 44 | Deployment of the `control_plane` is optional. It provides some Lambda functions you can use 45 | to investigate the environment. 46 | 47 | To deploy the `control_plane`: 48 | ```bash 49 | cd control_plane/ 50 | cdk deploy 51 | ``` 52 | 53 | Deployment of the `environment_stack` only needs to be done once to prepare the resources used 54 | by the other examples. 55 | 56 | To deploy the `environment_stack`: 57 | ```bash 58 | cd environment_stack/ 59 | cdk deploy 60 | ``` 61 | 62 | Each of the other examples is deployed in the same way: 63 | 1. `cd` into the directory 64 | 2.
#!/usr/bin/env python3
"""Example CDK app defining two EMR Cluster Configurations (basic and high-memory).

Relies on the resources deployed by the EnvironmentStack example (VPC, artifacts
bucket, and SecretsManager secrets) identified via NAMING_PREFIX.
"""

import os

import aws_cdk
from aws_cdk import aws_ec2 as ec2
from aws_cdk import aws_s3 as s3
from aws_cdk import aws_secretsmanager as secretsmanager

from aws_emr_launch.constructs.emr_constructs import emr_code
from aws_emr_launch.constructs.managed_configurations import instance_group_configuration

# Shared account/region-scoped prefix matching the EnvironmentStack resource names
NAMING_PREFIX = f"emr-launch-{aws_cdk.Aws.ACCOUNT_ID}-{aws_cdk.Aws.REGION}"

app = aws_cdk.App()
stack = aws_cdk.Stack(
    app,
    "ClusterConfigurationsStack",
    env=aws_cdk.Environment(account=os.environ["CDK_DEFAULT_ACCOUNT"], region=os.environ["CDK_DEFAULT_REGION"]),
)

vpc = ec2.Vpc.from_lookup(stack, "Vpc", vpc_name="EmrLaunchExamplesEnvStack/EmrLaunchVpc")
artifacts_bucket = s3.Bucket.from_bucket_name(stack, "ArtifactsBucket", f"{NAMING_PREFIX}-artifacts")

# This prepares the project's bootstrap_source/ folder for deployment
# We use the Artifacts bucket configured and authorized on the EMR Profile
bootstrap_code = emr_code.Code.from_path(
    path="./bootstrap_source",
    deployment_bucket=artifacts_bucket,
    deployment_prefix="emr_launch_testing/bootstrap_source",
)

# Define a Bootstrap Action using the bootstrap_source/ folder's deployment location
bootstrap = emr_code.EMRBootstrapAction(
    name="bootstrap-1", path=f"{bootstrap_code.s3_path}/test_bootstrap.sh", args=["Arg1", "Arg2"], code=bootstrap_code
)


# Cluster Configurations that use InstanceGroups are deployed to a Private subnet
subnet = vpc.private_subnets[0]

# Load a SecretsManager Secret with secure RDS Metastore credentials
# NOTE(review): `secret` is currently unused because secret_configurations below
# is commented out — uncomment both together to enable the secure metastore.
secret_name = f"{NAMING_PREFIX}-external-metastore"
secret = secretsmanager.Secret.from_secret_partial_arn(
    stack,
    "Secret",
    f"arn:{aws_cdk.Aws.PARTITION}:secretsmanager:{aws_cdk.Aws.REGION}:{aws_cdk.Aws.ACCOUNT_ID}:secret:{secret_name}",
)


# Create a basic Cluster Configuration using InstanceGroups, the Subnet and Bootstrap
# Action defined above, the EMR Profile we loaded, and defaults defined in
# the InstanceGroupConfiguration
basic_cluster_config = instance_group_configuration.ManagedScalingConfiguration(
    stack,
    "BasicClusterConfiguration",
    configuration_name="basic-instance-group-cluster",
    subnet=subnet,
    bootstrap_actions=[bootstrap],
    step_concurrency_level=2,
    # secret_configurations={"hive-site": secret},
)

basic_cluster_config.add_spark_package("com.amazon.deequ:deequ:1.0.2")

# Deploy local ./jars and register them as Spark jars on the cluster
basic_cluster_config.add_spark_jars(
    emr_code.Code.from_path(
        path="./jars", deployment_bucket=artifacts_bucket, deployment_prefix="emr_launch_testing/jars"
    ),
    emr_code.Code.files_in_path("./jars", "*.jar"),
)


# Here we create another Cluster Configuration using the same subnet, bootstrap, and
# EMR Profile while customizing the default Instance Type and Instance Count
high_mem_cluster_config = instance_group_configuration.InstanceGroupConfiguration(
    stack,
    "HighMemClusterConfiguration",
    configuration_name="high-mem-instance-group-cluster",
    subnet=subnet,
    bootstrap_actions=[bootstrap],
    step_concurrency_level=5,
    core_instance_type="r5.2xlarge",
    core_instance_count=2,
)

app.synth()
#!/usr/bin/env python3
"""CDK entry point that deploys the EMR Launch control-plane stack.

No aws_cdk.Environment is supplied, so the stack is environment-agnostic and
resolves account/region at deploy time.
"""

import aws_cdk

from aws_emr_launch import control_plane

cdk_app = aws_cdk.App()

# Single stack hosting the control-plane resources
control_plane.ControlPlaneStack(cdk_app, "EMRLaunchControlPlaneStack")

cdk_app.synth()
#!/usr/bin/env python3
"""CDK app: builds an EMR Launch Function (Step Functions state machine).

Combines the EMR Profile stored by the emr_profiles example with the Cluster
Configuration stored by the cluster_configurations example.
"""

import os

import aws_cdk

from aws_emr_launch.constructs.emr_constructs import cluster_configuration, emr_profile
from aws_emr_launch.constructs.step_functions import emr_launch_function

deployment_env = aws_cdk.Environment(
    account=os.environ["CDK_DEFAULT_ACCOUNT"], region=os.environ["CDK_DEFAULT_REGION"]
)

app = aws_cdk.App()
stack = aws_cdk.Stack(
    app,
    "EmrLaunchFunctionStack",
    env=deployment_env,
)

# Previously stored SSE-KMS EMR Profile (created by the emr_profiles example)
sse_kms_profile = emr_profile.EMRProfile.from_stored_profile(stack, "EMRProfile", "sse-kms-profile")

# Previously stored basic Cluster Configuration (cluster_configurations example)
cluster_config = cluster_configuration.ClusterConfiguration.from_stored_configuration(
    stack, "ClusterConfiguration", "basic-instance-group-cluster"
)

# State machine that launches a "basic-cluster" with that configuration.
# By default it fails fast if a same-named cluster is already running, tags
# the cluster, allows override_interface parameters to be overridden, and
# waits for the cluster to reach a started state before succeeding.
launch_function = emr_launch_function.EMRLaunchFunction(
    stack,
    "EMRLaunchFunction",
    launch_function_name="launch-basic-cluster",
    cluster_configuration=cluster_config,
    emr_profile=sse_kms_profile,
    cluster_name="basic-cluster",
    default_fail_if_cluster_running=True,
    cluster_tags=[aws_cdk.Tag("Key1", "Value1"), aws_cdk.Tag("Key2", "Value2")],
    wait_for_cluster_start=True,
)

app.synth()
#!/usr/bin/env python3
"""CDK app: defines two example EMR Profiles (SSE-S3 and SSE-KMS).

Loads the VPC, buckets, and Kerberos secret created by
EmrLaunchExamplesEnvStack (see examples/environment_stack).
"""

import os

import aws_cdk
from aws_cdk import aws_ec2 as ec2
from aws_cdk import aws_kms as kms
from aws_cdk import aws_s3 as s3
from aws_cdk import aws_secretsmanager as secretsmanager

from aws_emr_launch.constructs.emr_constructs import emr_profile

NAMING_PREFIX = f"emr-launch-{aws_cdk.Aws.ACCOUNT_ID}-{aws_cdk.Aws.REGION}"

deployment_env = aws_cdk.Environment(
    account=os.environ["CDK_DEFAULT_ACCOUNT"], region=os.environ["CDK_DEFAULT_REGION"]
)

app = aws_cdk.App()
stack = aws_cdk.Stack(
    app,
    "EmrProfilesStack",
    env=deployment_env,
)

# Preexisting environment resources
vpc = ec2.Vpc.from_lookup(stack, "Vpc", vpc_name="EmrLaunchExamplesEnvStack/EmrLaunchVpc")
artifacts_bucket = s3.Bucket.from_bucket_name(stack, "ArtifactsBucket", f"{NAMING_PREFIX}-artifacts")
logs_bucket = s3.Bucket.from_bucket_name(stack, "LogsBucket", f"{NAMING_PREFIX}-logs")
data_bucket = s3.Bucket.from_bucket_name(stack, "DataBucket", f"{NAMING_PREFIX}-data")

# Reference the Kerberos attributes secret; kept so the commented-out
# set_local_kdc call below can be re-enabled easily
secret_name = f"{NAMING_PREFIX}-kerberos-attributes"
kerberos_attributes_secret = secretsmanager.Secret.from_secret_partial_arn(
    stack,
    "KerberosAttributesSecret",
    f"arn:{aws_cdk.Aws.PARTITION}:secretsmanager:{aws_cdk.Aws.REGION}:{aws_cdk.Aws.ACCOUNT_ID}:secret:{secret_name}",
)

# Profile with the default S3 Server Side (SSE-S3) encryption, granting
# access to the Logs and Artifacts buckets
sse_s3_profile = emr_profile.EMRProfile(
    stack,
    "SSES3Profile",
    profile_name="sse-s3-profile",
    vpc=vpc,
    logs_bucket=logs_bucket,
    artifacts_bucket=artifacts_bucket,
)
sse_s3_profile.authorize_input_bucket(data_bucket)
sse_s3_profile.authorize_output_bucket(data_bucket)

# KMS key used for at-rest encryption both in S3 and on local disks
kms_key = kms.Key(stack, "AtRestKMSKey")

# A second profile configured for SSE-KMS
sse_kms_profile = emr_profile.EMRProfile(
    stack,
    "SSEKMSProfile",
    profile_name="sse-kms-profile",
    vpc=vpc,
    logs_bucket=logs_bucket,
    artifacts_bucket=artifacts_bucket,
)

# Authorize the Data bucket and switch at-rest encryption to the KMS key
sse_kms_profile.authorize_input_bucket(data_bucket)
sse_kms_profile.authorize_output_bucket(data_bucket)
sse_kms_profile.set_s3_encryption(emr_profile.S3EncryptionMode.SSE_KMS, encryption_key=kms_key)
sse_kms_profile.set_local_disk_encryption(kms_key, ebs_encryption=True)
# sse_kms_profile.set_local_kdc(kerberos_attributes_secret)

app.synth()
#!/usr/bin/env python3
"""CDK app: provisions the shared resources used by the other examples.

Creates a small VPC, three S3 buckets (logs / artifacts / data), and two
Secrets Manager secrets (external metastore credentials and Kerberos
attributes). All resources use the account/region-scoped NAMING_PREFIX.
"""

import json
import os

import aws_cdk
from aws_cdk import aws_ec2 as ec2
from aws_cdk import aws_s3 as s3
from aws_cdk import aws_secretsmanager as secretsmanager

NAMING_PREFIX = f"emr-launch-{aws_cdk.Aws.ACCOUNT_ID}-{aws_cdk.Aws.REGION}"

app = aws_cdk.App()
stack = aws_cdk.Stack(
    app,
    "EmrLaunchExamplesEnvStack",
    env=aws_cdk.Environment(account=os.environ["CDK_DEFAULT_ACCOUNT"], region=os.environ["CDK_DEFAULT_REGION"]),
)

vpc = ec2.Vpc(stack, "EmrLaunchVpc", cidr="10.0.0.0/24", max_azs=2)


def _private_bucket(construct_id: str, suffix: str) -> s3.Bucket:
    """Create a public-access-blocked bucket named {NAMING_PREFIX}-{suffix},
    destroyed together with the stack (these are disposable example resources)."""
    return s3.Bucket(
        stack,
        construct_id,
        bucket_name=f"{NAMING_PREFIX}-{suffix}",
        block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
        removal_policy=aws_cdk.RemovalPolicy.DESTROY,
    )


logs_bucket = _private_bucket("EmrLaunchLogsBucket", "logs")
artifacts_bucket = _private_bucket("EmrLaunchArtifactsBucket", "artifacts")
data_bucket = _private_bucket("EmrLaunchDataBucket", "data")

# Secret holding external Hive metastore connection settings; the password
# value is generated by Secrets Manager
metastore_template = {
    "javax.jdo.option.ConnectionURL": "jdbc",
    "javax.jdo.option.ConnectionDriverName": "mariaDB",
    "javax.jdo.option.ConnectionUserName": "user",
}
external_metastore_secret = secretsmanager.Secret(
    stack,
    "EmrLaunchExternalMetastoreSecret",
    secret_name=f"{NAMING_PREFIX}-external-metastore",
    generate_secret_string=secretsmanager.SecretStringGenerator(
        secret_string_template=json.dumps(metastore_template),
        generate_string_key="javax.jdo.option.ConnectionPassword",
    ),
)

# Secret holding Kerberos attributes; the KDC admin password is generated
kerberos_template = {
    "Realm": "EC2.INTERNAL",
}
kerberos_attributes_secret = secretsmanager.Secret(
    stack,
    "EmrLaunchKerberosAttributesSecret",
    secret_name=f"{NAMING_PREFIX}-kerberos-attributes",
    generate_secret_string=secretsmanager.SecretStringGenerator(
        secret_string_template=json.dumps(kerberos_template),
        generate_string_key="KdcAdminPassword",
    ),
)

app.synth()
#!/usr/bin/env python3
"""CDK app: a two-phase Step Functions pipeline against a persistent EMR cluster.

Phase 1 runs shell-script steps (script-runner), Phase 2 runs Hive SQL steps
(command-runner); each phase is an sfn.Parallel of AddStep tasks, with SNS
notifications on success/failure. The cluster's ClusterId is supplied in the
execution input at "$.ClusterId".
"""

import os

import aws_cdk
from aws_cdk import aws_s3 as s3
from aws_cdk import aws_sns as sns
from aws_cdk import aws_stepfunctions as sfn

from aws_emr_launch.constructs.emr_constructs import emr_code
from aws_emr_launch.constructs.step_functions import emr_chains, emr_tasks

NAMING_PREFIX = f"emr-launch-{aws_cdk.Aws.ACCOUNT_ID}-{aws_cdk.Aws.REGION}"

app = aws_cdk.App()
stack = aws_cdk.Stack(
    app,
    "PersistentPipelineStack",
    env=aws_cdk.Environment(account=os.environ["CDK_DEFAULT_ACCOUNT"], region=os.environ["CDK_DEFAULT_REGION"]),
)

# SNS Topics for Success/Failures messages from our Pipeline
success_topic = sns.Topic(stack, "SuccessTopic")
failure_topic = sns.Topic(stack, "FailureTopic")

# The bucket to deploy Step artifacts to
artifacts_bucket = s3.Bucket.from_bucket_name(stack, "ArtifactsBucket", f"{NAMING_PREFIX}-artifacts")

# Prepare the scripts executed by our Steps for deployment.
# This uses the Artifacts bucket defined in the Cluster Configuration used by
# our Launch Function.
step_code = emr_code.Code.from_path(
    path="./step_sources", deployment_bucket=artifacts_bucket, deployment_prefix="persistent_pipeline/step_sources"
)

# A Chain to receive Failure messages; shared by both phases
fail = emr_chains.Fail(
    stack,
    "FailChain",
    message=sfn.TaskInput.from_json_path_at("$.Error"),
    subject="Pipeline Failure",
    topic=failure_topic,
)

# Parallel task for the Phase 1 Steps, catching any failure into the Fail chain
phase_1 = sfn.Parallel(stack, "Phase1", result_path="$.Result.Phase1")
phase_1.add_catch(fail, errors=["States.ALL"], result_path="$.Error")

# One AddStep branch per shell script; the number of Steps that actually run
# concurrently is governed by the cluster's step_concurrency_level
for file in emr_code.Code.files_in_path("./step_sources", "test_step_*.sh"):
    step_task = emr_tasks.AddStepBuilder.build(
        stack,
        f"Phase1_{file}",
        emr_step=emr_code.EMRStep(
            name=f"Phase 1 - {file}",
            jar="s3://us-west-2.elasticmapreduce/libs/script-runner/script-runner.jar",
            args=[f"{step_code.s3_path}/{file}", "Arg1", "Arg2"],
            code=step_code,
        ),
        cluster_id=sfn.TaskInput.from_json_path_at("$.ClusterId").value,
    )
    phase_1.branch(step_task)

# Parallel task for the Phase 2 (Hive SQL) Steps, with the same failure catch
phase_2 = sfn.Parallel(stack, "Phase2", result_path="$.Result.Phase2")
phase_2.add_catch(fail, errors=["States.ALL"], result_path="$.Error")

for file in emr_code.Code.files_in_path("./step_sources", "test_step_*.hql"):
    step_task = emr_tasks.AddStepBuilder.build(
        stack,
        f"Phase2_{file}",
        emr_step=emr_code.EMRStep(
            name=f"Phase 2 - {file}",
            jar="command-runner.jar",
            args=[
                "hive-script",
                "--run-hive-script",
                "--args",
                "-f",
                f"{step_code.s3_path}/{file}",
                # BUG FIX: the original read `"-d" "ARG1=Arg1",` — implicit
                # string concatenation collapsed the flag and its value into a
                # single "-dARG1=Arg1" argument, unlike the ARG2 pair below.
                "-d",
                "ARG1=Arg1",
                "-d",
                "ARG2=Arg2",
            ],
            code=step_code,
        ),
        cluster_id=sfn.TaskInput.from_json_path_at("$.ClusterId").value,
    )
    phase_2.branch(step_task)

# A Chain for Success notification when the pipeline completes
success = emr_chains.Success(
    stack,
    "SuccessChain",
    message=sfn.TaskInput.from_json_path_at("$.Result"),
    subject="Pipeline Succeeded",
    topic=success_topic,
)

# Assemble the Pipeline: Phase1 -> Phase2 -> Success
definition = sfn.Chain.start(phase_1).next(phase_2).next(success)

state_machine = sfn.StateMachine(
    stack, "PersistentPipeline", state_machine_name="persistent-multi-phase-pipeline", definition=definition
)

app.synth()
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | echo "hello world" && echo "Args: $@" && sleep 30 && exit 0 5 | -------------------------------------------------------------------------------- /examples/persistent_cluster_pipeline/step_sources/phase_2/test_step_0.hql: -------------------------------------------------------------------------------- 1 | SHOW databases; 2 | -------------------------------------------------------------------------------- /examples/persistent_cluster_pipeline/step_sources/phase_2/test_step_1.hql: -------------------------------------------------------------------------------- 1 | SHOW databases; 2 | -------------------------------------------------------------------------------- /examples/persistent_cluster_pipeline/step_sources/phase_2/test_step_2.hql: -------------------------------------------------------------------------------- 1 | SHOW databases; 2 | -------------------------------------------------------------------------------- /examples/persistent_cluster_pipeline/step_sources/phase_2/test_step_3.hql: -------------------------------------------------------------------------------- 1 | SHOW databases; 2 | -------------------------------------------------------------------------------- /examples/persistent_cluster_pipeline/step_sources/phase_2/test_step_4.hql: -------------------------------------------------------------------------------- 1 | SHOW databases; 2 | -------------------------------------------------------------------------------- /examples/sns_triggered_pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/sns_triggered_pipeline/__init__.py -------------------------------------------------------------------------------- /examples/sns_triggered_pipeline/cdk.json: 
import json
import logging
import os
import traceback
from typing import Any, Dict, Optional

import boto3

sfn = boto3.client("stepfunctions")

LOGGER = logging.getLogger()
LOGGER.setLevel(logging.INFO)


def handler(event: Dict[str, Any], context: Optional[Dict[str, Any]]) -> None:
    """SNS-triggered Lambda entry point: start the target pipeline execution.

    Starts the Step Functions state machine identified by the PIPELINE_ARN
    environment variable with a fixed input overriding the cluster name.

    Args:
        event: The Lambda invocation event (logged for diagnostics only).
        context: The Lambda context object (unused).

    Raises:
        Exception: Re-raises any failure from StartExecution so the
            invocation is marked as failed (and retried per the trigger's
            retry policy).
    """
    LOGGER.info("Lambda metadata: %s (type = %s)", json.dumps(event), type(event))

    try:
        # Falls back to "" when PIPELINE_ARN is unset; StartExecution will
        # then fail with a validation error that is logged below.
        pipeline_arn = os.environ.get("PIPELINE_ARN", "")
        pipeline_input = json.dumps(
            {"ClusterConfigurationOverrides": {"ClusterName": "sns-triggered-pipeline"}, "Tags": []}
        )
        sfn.start_execution(stateMachineArn=pipeline_arn, input=pipeline_input)

        LOGGER.info(f'Started StateMachine {pipeline_arn} with input "{pipeline_input}"')

    except Exception as e:
        # BUG FIX: the original message claimed "Failed parsing JSON" (a
        # copy-paste artifact); this block starts an execution, not a parse.
        trc = traceback.format_exc()
        LOGGER.error("Failed starting StateMachine for event %s: %s\n\n%s", str(event), str(e), trc)
        # Bare `raise` preserves the original traceback (vs `raise e`).
        raise
spark = SparkSession.builder.appName("TestStepApp").getOrCreate() 4 | spark.sql("SHOW databases") 5 | -------------------------------------------------------------------------------- /examples/sns_triggered_pipeline/step_sources/test_step_2.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | spark = SparkSession.builder.appName("TestStepApp").getOrCreate() 4 | spark.sql("SHOW databases") 5 | -------------------------------------------------------------------------------- /examples/sns_triggered_pipeline/step_sources/test_step_3.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | spark = SparkSession.builder.appName("TestStepApp").getOrCreate() 4 | spark.sql("SHOW databases") 5 | -------------------------------------------------------------------------------- /examples/sns_triggered_pipeline/step_sources/test_step_4.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | spark = SparkSession.builder.appName("TestStepApp").getOrCreate() 4 | spark.sql("SHOW databases") 5 | -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/.gitignore: -------------------------------------------------------------------------------- 1 | cdk.out 2 | .env 3 | *.pyc 4 | infrastructure/infrastructure.egg-info -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/BatchSparkPipelineArchitecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/BatchSparkPipelineArchitecture.png -------------------------------------------------------------------------------- 
/examples/spark_batch_orchestration/README.md: -------------------------------------------------------------------------------- 1 | # Solution Architecture 2 | 3 | ![alt text](https://github.com/awslabs/aws-emr-launch/blob/43a28f00f98399d838a8e403631fadef6d01750b/examples/spark_batch_orchestration/BatchSparkPipelineArchitecture.png?raw=true) 4 | 5 | # Welcome to your CDK Python project! 6 | 7 | Initial Configuration 8 | 9 | 1. Update config.json as needed, including the S3 bucket name 10 | 11 | 2. Add your spark script to emr_orchestration/steps/ 12 | 13 | 3. Update or duplicate the pyspark_step in emr_orchestration/stack.py [Line 95] according to your script's needs 14 | 15 | 4. Update Triggering Logic in the 'should_lambda_trigger_pipeline' function in emr_trigger/lambda_source/trigger.py 16 | **Currently set to start a batch for every 1 megabyte of data for demo purposes** 17 | 18 | deploy.sh with build=true will create an ecr repository in your account, if it does not yet exist, and push your docker image to that repository 19 | Then it will execute the CDK command to deploy all infrastructure referenced in app.py 20 | 21 | The `cdk.json` file tells the CDK Toolkit how to execute your app. 22 | 23 | This project is set up like a standard Python project. The initialization 24 | process also creates a virtualenv within this project, stored under the .env 25 | directory. To create the virtualenv it assumes that there is a `python3` 26 | (or `python` for Windows) executable in your path with access to the `venv` 27 | package. If for any reason the automatic creation of the virtualenv fails, 28 | you can create the virtualenv manually. 
29 | 30 | To manually create a virtualenv on MacOS and Linux: 31 | 32 | ``` 33 | $ python3 -m venv .env 34 | $ source .env/bin/activate 35 | ``` 36 | 37 | After the init process completes and the virtualenv is created, you can use the following 38 | step to test deployment 39 | 40 | To add additional dependencies, for example other CDK libraries, just add 41 | them to your `requirements.txt` or `setup.py` file and rerun the `pip install -r requirements.txt` 42 | command. 43 | 44 | ## Useful CDK commands 45 | 46 | * `bash deploy.sh ls` list all stacks in the app 47 | * `bash deploy.sh synth` emits the synthesized CloudFormation template 48 | * `bash deploy.sh deploy` build and deploy this stack to your default AWS account/region 49 | * `bash deploy.sh diff` compare deployed stack with current state 50 | * `bash deploy.sh docs` open CDK documentation 51 | 52 | ## Run the demo 53 | 54 | Steps to run the Data Pipelines: 55 | 1. Upload the parquet files stored under /sample_data folder in S3, into the "input" S3 Bucket as it follows: 56 | ``` 57 | /demo-pipeline-s3inputbucket-srcbucketa467747d-unbfunfcy9dm/ 58 | sample_data/ 59 | file_slot=1/ 60 | part-r-00....parquet 61 | part-r-00....parquet 62 | ``` 63 | 2. Go to EMR console and check that a cluster is being created, including 2 steps 64 | 3. When the 2 steps are completed, open the logs "stdout" to check results, and check for output data in "output" S3 bucket. 
#!/usr/bin/env python3
"""CDK app: wires the Spark batch-orchestration demo stacks together.

Reads config.json, then deploys: an S3 input bucket stack (with SNS
object-created notifications), the EMR cluster definition, the orchestration
Step Function, the SNS-driven trigger, and a job-summary stack.
"""

import json
import os
from typing import Any, Dict, List

import aws_cdk
import constructs
from aws_cdk import aws_s3
from aws_cdk import aws_s3_notifications as s3n
from aws_cdk import aws_sns as sns
from infrastructure.emr_launch.cluster_definition import EMRClusterDefinition
from infrastructure.emr_orchestration.stack import StepFunctionStack
from infrastructure.emr_trigger.stack import EmrTriggerStack
from infrastructure.job_summary.stack import JobSummaryStack

# Load config.json sitting next to this file
project_dir = os.path.dirname(os.path.abspath(__file__))
config_file = os.path.join(project_dir, "config.json")

with open(config_file) as json_file:
    config = json.load(json_file)

print(config)

app = aws_cdk.App()
stack_id = config["stack-id"]  # validates the key is present (value unused below)
cluster_name = config["emr"]["CLUSTER_NAME"]


def emr_launch(config: Dict[str, Any], input_buckets: List[str]) -> EMRClusterDefinition:
    """Build the EMRClusterDefinition stack from the 'emr' section of config.

    Args:
        config: The "emr" sub-dict of config.json; every key listed in
            environment_variables must be present (KeyError otherwise).
        input_buckets: ARNs of the S3 buckets the cluster reads from.

    Returns:
        The constructed EMRClusterDefinition stack.
    """
    environment_variables = [
        "CLUSTER_NAME",
        "MASTER_INSTANCE_TYPE",
        "CORE_INSTANCE_TYPE",
        "CORE_INSTANCE_COUNT",
        "CORE_INSTANCE_MARKET",
        "TASK_INSTANCE_TYPE",
        "TASK_INSTANCE_COUNT",
        "TASK_INSTANCE_MARKET",
        "RELEASE_LABEL",
        "APPLICATIONS",
        "CONFIGURATION",
    ]

    clean_config: Dict[str, Any] = {"INPUT_BUCKETS": input_buckets}
    for v in environment_variables:
        clean_config[v] = config[v]

    return EMRClusterDefinition(app, id=config["CLUSTER_NAME"] + "-EMRLaunch", config=clean_config)


class S3InputStack(aws_cdk.Stack):
    """Stack with the input bucket and an SNS topic notified on new objects."""

    # BUG FIX: the original annotated scope as `aws_cdk.Construct`, which does
    # not exist in CDK v2 (this app uses v2-style `import aws_cdk`); the
    # annotation is evaluated at class-definition time and would raise
    # AttributeError. CDK v2 constructs come from the `constructs` package.
    def __init__(self, scope: constructs.Construct, id: str, **kwargs):
        super().__init__(scope, id, **kwargs)

        # Disposable demo bucket, destroyed with the stack
        src_bucket = aws_s3.Bucket(self, id="src-bucket", removal_policy=aws_cdk.RemovalPolicy.DESTROY)

        # Publish every OBJECT_CREATED event to SNS; the EMR trigger Lambda
        # subscribes to this topic
        new_files_topic = sns.Topic(self, "NewFileEventNotification")
        src_bucket.add_event_notification(aws_s3.EventType.OBJECT_CREATED, s3n.SnsDestination(new_files_topic))

        self.input_bucket_sns = new_files_topic
        self.input_bucket_arn = src_bucket.bucket_arn
        print("Input bucket: " + self.input_bucket_arn)
        print("Input bucket SNS: " + self.input_bucket_sns.topic_arn)


# To create an input s3 bucket and sns topic
s3_stack = S3InputStack(app, id=cluster_name + "-S3InputBucket")

emr_cluster_stack = emr_launch(config["emr"], input_buckets=[s3_stack.input_bucket_arn])

emr_orchestration_stack = StepFunctionStack(
    app,
    id=cluster_name + "-EMROrchestration",
    emr_launch_stack=emr_cluster_stack,
    artifact_bucket=emr_cluster_stack.artifact_bucket,
    output_bucket=emr_cluster_stack.output_bucket,
)

emr_trigger_stack = EmrTriggerStack(
    app,
    id=cluster_name + "-EMRTrigger",
    target_step_function_arn=emr_orchestration_stack.state_machine.state_machine_arn,
    source_bucket_sns=s3_stack.input_bucket_sns,
    dynamo_table=emr_orchestration_stack.dynamo_table,
)

job_summary_stack = JobSummaryStack(
    app,
    id=cluster_name + "-JobSummary",
    orchestration_sfn_name=emr_orchestration_stack.state_machine.state_machine_name,
    launch_sfn_name=emr_cluster_stack.launch_function.state_machine.state_machine_name,
    log_bucket_arn=emr_cluster_stack.emr_profile.logs_bucket.bucket_arn,
    destination_bucket_name=emr_cluster_stack.emr_profile.logs_bucket.bucket_name,
    success_sns_topic_arn=emr_orchestration_stack.success_topic.topic_arn,
    failure_sns_topic_arn=emr_orchestration_stack.failure_topic.topic_arn,
)

app.synth()
bootstrap "aws://unknown-account/unknown-region" 10 | 11 | cdk $cmd --all 12 | -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/infrastructure/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/infrastructure/__init__.py -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/infrastructure/emr_launch/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## cluster_definition.py 3 | To be updated or extended if an EMR cluster requires additional permissions or configurations 4 | 5 | EMRClusterDefinition 6 | CDK Stack to define a Standard EMR Cluster 7 | 8 | EMRClusterDefinition.emr_resource_config() 9 | # Everything related to cluster sizing and hardware/software configuration. For example: 10 | 11 | master_instance_type: Optional[str] = 'm5.2xlarge', 12 | master_instance_market: Optional[InstanceMarketType] = InstanceMarketType.ON_DEMAND, 13 | core_instance_type: Optional[str] = 'm5.xlarge', 14 | core_instance_market: Optional[InstanceMarketType] = InstanceMarketType.ON_DEMAND, 15 | core_instance_count: Optional[int] = 2, 16 | applications: Optional[List[str]] = None, 17 | bootstrap_actions: Optional[List[emr_code.EMRBootstrapAction]] = None, 18 | configurations: Optional[List[dict]] = None, 19 | use_glue_catalog: Optional[bool] = True, 20 | step_concurrency_level: Optional[int] = 1, 21 | description: Optional[str] = None, 22 | secret_configurations: Optional[Dict[str, secretsmanager.Secret]] = None): 23 | 24 | EMRClusterDefinition.security_profile_config() 25 | # Everything related to the cluster's security configuration. 
# Everything related to checking for running clusters and launching new clusters.
Security, resource, and launch function configs to SSM Parameter Store
"""Spark job: load the files recorded for a batch in DynamoDB, union them per
partition, and write each result to S3 as parquet.

Heavy third-party imports (boto3, pyspark) are deferred into the functions
that need them so the pure helpers stay importable for unit tests.
"""
import argparse
import functools
import sys


def union_all(dfs):
    """Union a non-empty list of DataFrames, aligning columns on the first."""
    return functools.reduce(lambda df1, df2: df1.union(df2.select(df1.columns)), dfs)


def parse_arguments(args):
    """Parse the required CLI arguments; exits (SystemExit) if any is missing."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch-metadata-table-name", required=True)
    parser.add_argument("--batch-id", required=True)
    parser.add_argument("--output-bucket", required=True)
    parser.add_argument("--region", required=True)
    return parser.parse_args(args=args)


def get_batch_file_metadata(table_name, batch_id, region):
    """Return all DynamoDB items for ``batch_id``, following pagination.

    Fixes vs. the original: ``data`` is a list, so paginated items are
    accumulated with ``extend`` (lists have no ``update`` method), and the
    follow-up query repeats the original KeyConditions — DynamoDB rejects a
    continuation query that omits them.
    """
    import boto3  # deferred: not needed for unit-testing the helpers above

    dynamodb = boto3.resource("dynamodb", region_name=region)
    table = dynamodb.Table(table_name)
    key_conditions = {"BatchId": {"AttributeValueList": [batch_id], "ComparisonOperator": "EQ"}}
    response = table.query(KeyConditions=key_conditions)
    data = response["Items"]
    while "LastEvaluatedKey" in response:
        response = table.query(KeyConditions=key_conditions, ExclusiveStartKey=response["LastEvaluatedKey"])
        data.extend(response["Items"])
    return data


def load_file_path(spark, bucket, prefix, file_partition, file_slot):
    """Read one S3 object into a DataFrame, tagging partition/slot columns."""
    import pyspark.sql.functions as func

    s3path = "s3://" + bucket + "/" + prefix
    df = (
        spark.read.load(s3path)
        .withColumn("file_partition", func.lit(file_partition))
        .withColumn("file_slot", func.lit(file_slot))
    )
    return df


def load_and_union_data(spark, batch_metadata):
    """Group batch files by FilePartition and union each group's DataFrames."""
    distinct_partitions = {x["FilePartition"] for x in batch_metadata}
    partition_dfs = {}
    for partition in distinct_partitions:
        dfs = [
            load_file_path(
                spark,
                bucket=x["FileBucket"],
                prefix=x["Name"],
                file_partition=x["FilePartition"],
                file_slot=x["FileSlot"],
            )
            for x in batch_metadata
            if x["FilePartition"] == partition
        ]
        partition_dfs[partition] = union_all(dfs)
    return partition_dfs


def write_results(df, table_name, output_bucket, partition_cols=None):
    """Append ``df`` as parquet under s3://output_bucket/table_name.

    ``partition_cols`` defaults to None instead of a shared mutable list.
    """
    cols = partition_cols if partition_cols is not None else []
    df.write.mode("append").partitionBy(*cols).parquet(f"s3://{output_bucket}/{table_name}")


def main(args, spark):
    """Load batch metadata, union the referenced files, and persist to S3."""
    arguments = parse_arguments(args)
    batch_metadata = get_batch_file_metadata(
        table_name=arguments.batch_metadata_table_name, batch_id=arguments.batch_id, region=arguments.region
    )

    input_data = load_and_union_data(spark, batch_metadata)

    # NOTE(review): the original wrote only the FIRST dataset (loop with an
    # unconditional break); that behavior is preserved here — confirm intent.
    for dataset, df in input_data.items():
        write_results(df, table_name=dataset, output_bucket=arguments.output_bucket, partition_cols=["file_slot"])
        break


if __name__ == "__main__":
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("data-ingestion").getOrCreate()
    sc = spark.sparkContext
    main(sys.argv[1:], spark)
    sc.stop()
def get_batch_file_metadata(table_name, batch_id, region):
    """Return all DynamoDB items for ``batch_id``, following pagination.

    Fixes vs. the original: ``data`` is a list, so paginated items are
    accumulated with ``extend`` (lists have no ``update`` method), and the
    follow-up query repeats the original KeyConditions — DynamoDB rejects a
    continuation query that omits them.
    """
    dynamodb = boto3.resource("dynamodb", region_name=region)
    table = dynamodb.Table(table_name)
    key_conditions = {"BatchId": {"AttributeValueList": [batch_id], "ComparisonOperator": "EQ"}}
    response = table.query(KeyConditions=key_conditions)
    data = response["Items"]
    while "LastEvaluatedKey" in response:
        response = table.query(KeyConditions=key_conditions, ExclusiveStartKey=response["LastEvaluatedKey"])
        data.extend(response["Items"])
    return data


def load_partition(spark, bucket, partition):
    """Read every object under s3://bucket/partition/ into one DataFrame."""
    s3path = "s3://" + bucket + "/" + partition + "/*"
    df = spark.read.load(s3path)
    return df


def load_and_union_data(spark, batch_metadata, input_bucket):
    """Build one unioned DataFrame per distinct FilePartition value."""
    distinct_partitions = {x["FilePartition"] for x in batch_metadata}
    partition_dfs = {}
    for partition in distinct_partitions:
        dfs = [
            load_partition(spark, bucket=input_bucket, partition=partition)
            for x in batch_metadata
            if x["FilePartition"] == partition
        ]
        partition_dfs[partition] = union_all(dfs)
    return partition_dfs


def main(args, spark):
    """Load the batch's partitions, union them, add a timestamp column, show."""
    arguments = parse_arguments(args)

    # Load metadata to process.
    batch_metadata = get_batch_file_metadata(
        table_name=arguments.batch_metadata_table_name, batch_id=arguments.batch_id, region=arguments.region
    )

    input_data = load_and_union_data(spark, batch_metadata, arguments.input_bucket)

    # Union the per-partition frames into a single input DataFrame.
    input_df = union_all(list(input_data.values()))

    # Stamp each row with the processing time.
    input_df = input_df.withColumn("current_ts", F.current_timestamp())

    input_df.printSchema()
    input_df.show()


if __name__ == "__main__":
    spark = SparkSession.builder.appName("data-preparation").getOrCreate()
    sc = spark.sparkContext
    main(sys.argv[1:], spark)
    sc.stop()
class EmrTriggerStack(aws_cdk.Stack):
    """Deploys the Lambda that starts the EMR orchestration Step Function
    whenever the source bucket's SNS topic announces a new object."""

    def __init__(
        self,
        scope: constructs.Construct,
        id: str,
        target_step_function_arn: str,
        source_bucket_sns: sns.Topic,
        dynamo_table: dynamo.Table,
        **kwargs,
    ):
        super().__init__(scope, id, **kwargs)

        # Allow the trigger Lambda to start/inspect the target state machine.
        sfn_policy = iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["states:StartExecution", "states:ListExecutions"],
            resources=[target_step_function_arn],
        )

        # Read/write access to the batch-metadata DynamoDB table.
        table_policy = iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=[
                "dynamodb:BatchGet*",
                "dynamodb:DescribeStream",
                "dynamodb:DescribeTable",
                "dynamodb:Get*",
                "dynamodb:Query",
                "dynamodb:Scan",
                "dynamodb:BatchWrite*",
                "dynamodb:CreateTable",
                "dynamodb:Delete*",
                "dynamodb:Update*",
                "dynamodb:PutItem",
            ],
            resources=[dynamo_table.table_arn],
        )

        # SNS Triggered Pipeline: subscribe the Lambda to the bucket's topic.
        aws_lambda.Function(
            self,
            "SNSTriggeredLambda",
            code=aws_lambda.Code.from_asset("infrastructure/emr_trigger/lambda_source/"),
            handler="trigger.handler",
            runtime=aws_lambda.Runtime.PYTHON_3_7,
            timeout=aws_cdk.Duration.minutes(1),
            environment={"PIPELINE_ARN": target_step_function_arn, "TABLE_NAME": dynamo_table.table_name},
            initial_policy=[sfn_policy, table_policy],
            events=[sources.SnsEventSource(source_bucket_sns)],
        )
"""Builds a JSON-serializable summary of a Step Functions execution, including
nested executions and EMR cluster/step details.

Fixes vs. the original: ``extract_region_from_arn`` was imported from
``fetching``, where it is not defined (it lives in ``helpers``) — that import
raised ImportError at cold start.  AWS-calling helpers are now imported inside
the functions that use them, so the pure event-parsing logic is unit-testable
without the sibling modules or AWS credentials.
"""
import json


def extract_sfn_execution_info(execution_arn):
    """Describe one SFN execution and attach its per-step breakdown."""
    from fetching import get_sfn_execution_events, get_sfn_execution_info

    info = get_sfn_execution_info(execution_arn)
    events = get_sfn_execution_events(execution_arn)

    info["steps"] = extract_sfn_steps(events)

    return info


def extract_sfn_steps(events):
    """Group execution-history events by task state and summarize each group.

    Events are expected in history order; each ``TaskStateEntered`` opens a
    group closed by the matching ``TaskStateExited`` (or, for aborted
    executions, a ``TaskStateAborted`` attributed to the last-entered step,
    since aborted events carry no state name).
    """
    groups = {}

    # Find step boundaries
    last_entered_step = None
    for e in events:
        if e["type"] == "TaskStateEntered":
            step_name = e["stateEnteredEventDetails"]["name"]
            groups[step_name] = {"fromId": e["id"]}
            last_entered_step = step_name
        if e["type"] == "TaskStateExited":
            step_name = e["stateExitedEventDetails"]["name"]
            groups[step_name]["toId"] = e["id"]
        if e["type"] == "TaskStateAborted":
            step_name = last_entered_step
            groups[step_name]["toId"] = e["id"]

    # Group events by step
    for group in groups.values():
        group["events"] = [e for e in events if group["fromId"] <= e["id"] <= group["toId"]]

    # Keep original (entry) order; sorted() consumes the items view directly
    # instead of first materializing an intermediate list.
    sorted_groups = sorted(groups.items(), key=lambda kv: kv[1]["fromId"])

    return [extract_sfn_step_info(k, v) for k, v in sorted_groups]


def extract_sfn_step_info(step_name, group):
    """Summarize one step's event group: timing, status, nested resources."""
    events = group["events"]
    events_by_id = {e["id"]: e for e in events}

    start_time = events_by_id[group["fromId"]]["timestamp"]
    end_time = events_by_id[group["toId"]]["timestamp"]
    duration = end_time - start_time

    status = "Succeeded" if any(e["type"].endswith("Succeeded") for e in events) else "Failed"

    step = {
        "stepName": step_name,
        "startTime": start_time.isoformat(),
        "endTime": end_time.isoformat(),
        "duration": str(duration),
        "status": status,
    }

    for e in events:
        if e["type"] == "TaskSubmitted":
            details = e["taskSubmittedEventDetails"]

            if details["resourceType"] == "states" and details["resource"].startswith("startExecution"):
                if "sfnExecutionInfo" in step.keys():
                    raise Exception("Step already has 'sfnExecutionInfo'")
                output = json.loads(details["output"])
                step["sfnExecutionInfo"] = extract_sfn_execution_info(output["ExecutionArn"])

            if details["resourceType"] == "elasticmapreduce" and details["resource"].startswith("createCluster"):
                if "emrClusterInfo" in step.keys():
                    raise Exception("Step already has 'emrClusterInfo'")
                output = json.loads(details["output"])
                step["emrClusterInfo"] = extract_emr_cluster_info(output)

    return step


def extract_emr_cluster_info(output):
    """Describe an EMR cluster created during the execution, with its steps."""
    # Fixed import: extract_region_from_arn is defined in helpers, not fetching.
    from fetching import get_emr_cluster_info, get_emr_cluster_steps
    from helpers import extract_region_from_arn

    cluster_id = output["ClusterId"]
    cluster_arn = output["ClusterArn"]
    region = extract_region_from_arn(cluster_arn)
    cluster_link = (
        f"https://{region}.console.aws.amazon.com/elasticmapreduce/home?region={region}#cluster-details:{cluster_id}"
    )

    info = get_emr_cluster_info(cluster_id)
    steps = sorted(
        [extract_emr_step_info(s, info) for s in get_emr_cluster_steps(cluster_id)],
        key=lambda s: s["startTime"],
    )

    return {
        "emrClusterId": cluster_id,
        "emrClusterArn": cluster_arn,
        "emrClusterName": info["emrClusterName"],
        "emrClusterLink": cluster_link,
        "status": info["status"],
        "steps": steps,
    }


def extract_emr_step_info(step, cluster_info):
    """Summarize one EMR step; on failure, capture the error and log path."""
    from fetching import download_logs

    timeline = step["Status"]["Timeline"]

    info = {
        "stepId": step["Id"],
        "stepName": step["Name"],
        "createTime": timeline["CreationDateTime"].isoformat(),
        "startTime": timeline["StartDateTime"].isoformat(),
        "endTime": timeline["EndDateTime"].isoformat(),
        "duration": str(timeline["EndDateTime"] - timeline["StartDateTime"]),
        "status": step["Status"]["State"],
    }

    if "FailureDetails" in step["Status"]:
        failure = step["Status"]["FailureDetails"]
        info["error"] = failure.get("Reason") or failure.get("Message")
        info["logPath"] = failure["LogFile"].replace("stderr.gz", "")  # File name is not always included
        # NOTE(review): the downloaded log content is discarded; presumably it
        # was meant to be attached to ``info`` — confirm before changing.
        download_logs(info["logPath"] + "stderr.gz")

    return info
"""Small string helpers shared by the job-summary Lambda."""
import re

# Pre-compiled once at import time; both patterns are reused per invocation.
_ARN_REGION_RE = re.compile(r"arn:[\w\-]+:\w+:([\w\-]+):.*")
_S3_URI_RE = re.compile(r"s3.?://([\w\-]+)/(.+)$")


def extract_region_from_arn(arn):
    """Return the region component (fourth field) of an AWS ARN."""
    return _ARN_REGION_RE.match(arn).group(1)


def parse_s3_uri(s3_uri):
    """Split an s3:// (also s3a:// / s3n://) URI into (bucket_name, object_key)."""
    bucket_name, object_key = _S3_URI_RE.match(s3_uri).groups()
    return bucket_name, object_key


def make_s3_console_link(bucket_name, object_key):
    """Return an AWS console URL that opens the given S3 object."""
    return f"https://s3.console.aws.amazon.com/s3/object/{bucket_name}?prefix={object_key}"
sns_client = boto3.client("sns")


def lambda_handler(event, context):
    """Summarize the Step Functions execution named in ``event``, store the
    HTML summary in S3, and notify via SNS when a topic is configured.

    Destination bucket/key and topic ARNs come from the event when present,
    otherwise from environment variables.
    """
    print(f"INPUT: {json.dumps(event)}")

    execution_arn = event["sfnExecutionArn"]

    bucket_name = event.get("destinationBucketName") or os.environ["DESTINATION_BUCKET_NAME"]
    object_key = event.get("destinationObjectKey") or f'job-summary/{execution_arn.split(":")[-1]}/summary.html'

    success_topic = event.get("successSnsTopicArn") or os.environ.get("SUCCESS_SNS_TOPIC_ARN")
    failure_topic = event.get("failureSnsTopicArn") or os.environ.get("FAILURE_SNS_TOPIC_ARN")

    # Collect the information about execution
    info = extract_sfn_execution_info(execution_arn)
    print(f"SUMMARY: {json.dumps(info)}")

    # Store execution summary to S3
    save_execution_info(info, bucket_name, object_key)
    s3_console_link = make_s3_console_link(bucket_name, object_key)

    # Send a notification if SNS Topic ARN is provided
    topic_arn = success_topic if info["status"] == "SUCCEEDED" else failure_topic
    if topic_arn:
        send_notification(topic_arn, execution_arn, info["status"], s3_console_link)


def save_execution_info(info, bucket_name, object_key):
    """Render ``info`` to HTML and upload it to s3://bucket_name/object_key."""
    html_file = io.BytesIO(render_html_page(info).encode("UTF-8"))

    print(f"Storing summary to s3://{bucket_name}/{object_key}")
    s3_client.upload_fileobj(html_file, bucket_name, object_key)

    print(f"AWS Console link: {make_s3_console_link(bucket_name, object_key)}")


def send_notification(sns_topic_arn, execution_arn, status, s3_console_link):
    """Publish a short status message pointing at the stored summary."""
    sns_client.publish(
        TopicArn=sns_topic_arn,
        Subject=f"SFN Execution Summary ({status})",
        Message=f"Summary for SFN execution {execution_arn} is stored in S3 bucket: {s3_console_link}",
    )
class JobSummaryStack(aws_cdk.Stack):
    """Wires an EventBridge rule for terminal orchestration-SFN states to a
    Lambda that writes an HTML job summary and sends SNS notifications."""

    def __init__(
        self,
        scope: constructs.Construct,
        id: str,
        orchestration_sfn_name: str,
        launch_sfn_name: str,
        log_bucket_arn: str,
        destination_bucket_name: str,
        success_sns_topic_arn: str,
        failure_sns_topic_arn: str,
        **kwargs,
    ):
        super().__init__(scope, id, **kwargs)

        aws_account = os.environ["CDK_DEFAULT_ACCOUNT"]
        aws_region = os.environ["CDK_DEFAULT_REGION"]

        # Read access to both state machines' execution histories.
        sfn_read_policy = iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=[
                "states:DescribeExecution",
                "states:GetExecutionHistory",
            ],
            resources=[
                f"arn:aws:states:{aws_region}:{aws_account}:execution:{orchestration_sfn_name}:*",
                f"arn:aws:states:{aws_region}:{aws_account}:execution:{launch_sfn_name}:*",
            ],
        )

        # Inspect EMR clusters/steps referenced by the execution history.
        emr_read_policy = iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=[
                "elasticmapreduce:DescribeCluster",
                "elasticmapreduce:ListSteps",
            ],
            resources=[
                f"arn:aws:elasticmapreduce:{aws_region}:{aws_account}:cluster/*",
            ],
        )

        # Read step logs and write the HTML summary into the log bucket.
        log_bucket_policy = iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=[
                "s3:ListBucket",
                "s3:GetObject",
                "s3:PutObject",
            ],
            resources=[
                log_bucket_arn,
                f"{log_bucket_arn}/*",
            ],
        )

        # Publish success/failure notifications.
        publish_policy = iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=[
                "SNS:Publish",
            ],
            resources=[
                success_sns_topic_arn,
                failure_sns_topic_arn,
            ],
        )

        job_summary_lambda = aws_lambda.Function(
            self,
            "EmrLaunchJobSummaryLambda",
            code=aws_lambda.Code.from_asset("infrastructure/job_summary/lambda_source/"),
            handler="main.lambda_handler",
            runtime=aws_lambda.Runtime.PYTHON_3_7,
            timeout=aws_cdk.Duration.minutes(1),
            environment={
                "DESTINATION_BUCKET_NAME": destination_bucket_name,
                "SUCCESS_SNS_TOPIC_ARN": success_sns_topic_arn,
                "FAILURE_SNS_TOPIC_ARN": failure_sns_topic_arn,
            },
            initial_policy=[sfn_read_policy, emr_read_policy, log_bucket_policy, publish_policy],
        )

        job_summary_event_rule = events.Rule(
            self,
            "EmrLaunchJobSummaryEventRule",
            description="Triggers the creation of SFN execution summary",
            event_pattern=events.EventPattern(
                source=["aws.states"],
                detail_type=["Step Functions Execution Status Change"],
                detail={
                    "status": ["SUCCEEDED", "FAILED", "TIMED_OUT", "ABORTED"],
                    "stateMachineArn": [
                        f"arn:aws:states:{aws_region}:{aws_account}:stateMachine:{orchestration_sfn_name}",
                    ],
                },
            ),
        )

        job_summary_event_rule.add_target(
            event_targets.LambdaFunction(
                job_summary_lambda,
                event=events.RuleTargetInput.from_object(
                    {"sfnExecutionArn": events.EventField.from_path("$.detail.executionArn")}
                ),
            )
        )
https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00001-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00002-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00002-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00003-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00003-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00004-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00004-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00005-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00005-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00006-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00006-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00007-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00007-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00008-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00008-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00009-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00009-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00010-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00010-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00011-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00011-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00012-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00012-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00013-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00013-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00014-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00014-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00015-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00015-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00016-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00016-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00017-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00017-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00018-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00018-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00019-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00019-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00020-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00020-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00021-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00021-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00022-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00022-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00023-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00023-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00024-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00024-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00025-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00025-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00026-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00026-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00027-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00027-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00028-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00028-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00029-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00029-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00030-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00030-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00031-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00031-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00032-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00032-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00033-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00033-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00034-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00034-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00035-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00035-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00036-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00036-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00037-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00037-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00038-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00038-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00039-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00039-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00040-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00040-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00041-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00041-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00042-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00042-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00043-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00043-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00044-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00044-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00045-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00045-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00046-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00046-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00047-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00047-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00048-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00048-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00049-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00049-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00050-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00050-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00051-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00051-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00052-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00052-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00053-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00053-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00054-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00054-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00055-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00055-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00056-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00056-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00057-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00057-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00058-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00058-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00059-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00059-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00060-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00060-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00061-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00061-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00062-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00062-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00063-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00063-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00064-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00064-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00065-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00065-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00066-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00066-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00067-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00067-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00068-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00068-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/sample_data/part-r-00069-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/spark_batch_orchestration/sample_data/part-r-00069-ddaee723-f3f6-4f25-a34b-3312172aa6d7.snappy.parquet -------------------------------------------------------------------------------- /examples/spark_batch_orchestration/setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md") as fp: 4 | long_description = fp.read() 5 | 6 | 7 | setuptools.setup( 8 | name="infrastructure", 9 | version="0.0.1", 10 | description="A CDK Python app for building EMR Pipelines running spark jobs", 11 | long_description=long_description, 12 | long_description_content_type="text/markdown", 13 | author="Kevin Soucy - Amazon Web Services", 14 | package_dir={"": "infrastructure"}, 15 | packages=setuptools.find_packages(where="infrastructure"), 16 | install_requires=[ 17 | "aws-cdk.core==1.94.1", 18 | ], 19 | python_requires=">=3.6", 20 | classifiers=[ 21 | "Development Status :: 4 - Beta", 22 | "Intended Audience :: Developers", 23 | "License :: OSI Approved :: Apache Software License", 24 | "Programming Language :: JavaScript", 25 | "Programming Language :: Python :: 3 :: Only", 26 | "Programming Language :: Python :: 3.6", 27 | "Programming Language :: Python :: 3.7", 28 | "Programming Language :: Python :: 3.8", 29 | "Topic :: Software Development :: Code Generators", 30 | "Topic :: Utilities", 31 | "Typing :: Typed", 32 | ], 33 | ) 34 | -------------------------------------------------------------------------------- /examples/terraform_pipeline/.gitignore: -------------------------------------------------------------------------------- 1 | emr_pipeline/.terraform/* 2 | emr_step_function/lambda.zip 3 | emr_pipeline/terraform.tfstate 4 | emr_pipeline/terraform.tfstate.backup 5 | -------------------------------------------------------------------------------- /examples/terraform_pipeline/README.md: 
-------------------------------------------------------------------------------- 1 | ### Getting Started 2 | This repository contains a fully integrated Terraform module for the EMR Launch framework. It still relies 3 | on aws's open source emr-launch library under the hood, but allows users who want to continue using Terraform for the 4 | bulk of their infrastructure, to realize the benefits of aws-emr-launch [https://github.com/awslabs/aws-emr-launch] 5 | 6 | The Launch Function is responsible for provisioning EMR clusters and managing security, resource, and application configurations. 7 | Any user or service who needs an EMR cluster can trigger this launch step function and receive the cluster id when it is ready. 8 | 9 | An example pipeline is provided for running a simple spark application with this EMR Launch framework. 10 | 11 | See the emr_pipeline/emr_launch/infrastructure/README.md for more details on cluster configurations 12 | 13 | #### First time setup for Terraform (module tested with v0.12.29) 14 | 1. Download and install Terraform v0.12.29 15 | https://releases.hashicorp.com/terraform/0.12.29/ 16 | 17 | 2. Also helpful for managing different versions of terraform is tfswitch for OSX 18 | https://github.com/warrensbox/terraform-switcher 19 | brew install warrensbox/tap/tfswitch 20 | 21 | 3. For more information on getting started with Terraform, read the official docs here: 22 | https://www.terraform.io/intro/index.html 23 | 24 | #### First time set up for cdk 25 | 1. Install Node Package Manager - https://www.npmjs.com/get-npm 26 | 27 | 2. Install CDK per machine / client 28 | 29 | npm install -g aws-cdk 30 | 31 | ### Deploy a demo pipeline, test, and clean up 32 | 33 | 1. Add an environment to environments/&lt;stage&gt;/&lt;region&gt;.tfvars, or update the dev environment by adding your own VPC and subnet IDs 34 | 35 | 2.
Run the following commands: 36 | 37 | Command syntax 38 | bash bin/deploy.sh &lt;command&gt; &lt;stage&gt; &lt;region&gt; 39 | 40 | # Plan 41 | bash bin/deploy.sh plan dev eu-west-1 42 | 43 | # Deploy 44 | bash bin/deploy.sh apply dev eu-west-1 45 | 46 | # Test: 47 | Once deployed, trigger the deployed pipeline Step Function to test end-to-end or 48 | trigger just the deployed Launch function to spin up a cluster 49 | 50 | # Clean up 51 | bash bin/deploy.sh destroy dev eu-west-1 52 | 53 | 54 | #### Add your own spark application code: 55 | - Option 1: Add your spark code to spark_script.py, and deploy the pipeline as-is 56 | - Option 2: Replace the variable spark_script with the name of your pyspark script and deploy the pipeline 57 | - To edit spark settings like shuffle.partitions or executor.cores, add these directly to the spark-submit command: 58 | - Example - LINE76: emr_pipeline/emr_step_function/pipeline.json 59 | "--conf", "spark.yarn.maxAppAttempts=1", 60 | -------------------------------------------------------------------------------- /examples/terraform_pipeline/bin/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 4 | 5 | cmd=$1 6 | stage=$2 7 | region=$3 8 | 9 | export aws_account_id=$(aws sts get-caller-identity --query Account --output text) 10 | 11 | cd $DIR 12 | cd ..
13 | 14 | python3 -m venv .env 15 | source .env/bin/activate 16 | 17 | cd $DIR 18 | pip install -r ../emr_pipeline/emr_launch/infrastructure/requirements.txt 19 | cdk bootstrap aws://$aws_account_id/$region 20 | 21 | cd ../emr_pipeline 22 | terraform init 23 | terraform $cmd -var-file=../environments/$stage/$region.tfvars 24 | -------------------------------------------------------------------------------- /examples/terraform_pipeline/emr_pipeline/emr-launch.tf: -------------------------------------------------------------------------------- 1 | 2 | ##################################################################################### 3 | # EMR Cluster Configuration - resources, security, permissions 4 | ##################################################################################### 5 | 6 | module "launch_function" { 7 | 8 | source = "./emr_launch" 9 | emr_cluster_name = var.emr_cluster_name 10 | master_instance_type = var.master_instance_type 11 | core_instance_type = var.core_instance_type 12 | core_instance_count = var.core_instance_count 13 | release_label = var.release_label 14 | applications = var.applications 15 | configuration = var.configuration 16 | 17 | task_instance_type = var.task_instance_type 18 | task_instance_market = var.task_instance_market 19 | task_instance_count = var.task_instance_count 20 | 21 | stage = var.environment 22 | aws_region = var.aws_region 23 | 24 | vpc_id = var.vpc_id 25 | subnet_id = var.subnet_id 26 | 27 | log_bucket = aws_s3_bucket.log_bucket.arn 28 | artifact_bucket = aws_s3_bucket.artifact_bucket.arn 29 | 30 | // Any bucket from which EMR must read data - goes here: 31 | input_buckets = [aws_s3_bucket.demo_bucket.arn] 32 | 33 | // Any bucket to which EMR must write data - goes here: 34 | output_buckets = [aws_s3_bucket.demo_bucket.arn] 35 | 36 | // Any KMS key needed for decrypting data on-read goes here 37 | input_kms_arns = [] 38 | 39 | // Any KMS key needed for encrypting data on-write goes here 40 | output_kms_arns
= [] 41 | } 42 | 43 | // Pass EMR Launch Function ARN to Orchestration Pipeline via Locals 44 | locals { 45 | cdk_output_file = module.launch_function.cdk_outputs_file 46 | launch_function_arn = module.launch_function.launch_function_arn 47 | instance_role_name = module.launch_function.instance_role_name 48 | } 49 | 50 | // Print Launch Function Outputs 51 | resource "null_resource" "print_my_outputs" { 52 | depends_on = [module.launch_function] 53 | provisioner "local-exec" { 54 | command = "terraform output" 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /examples/terraform_pipeline/emr_pipeline/emr-orchestration.tf: -------------------------------------------------------------------------------- 1 | 2 | // Define EMR Pipeline as Step Function - triggering EMR Launch ARN as first pipeline step 3 | module "orchestration" { 4 | aws_region = var.aws_region 5 | stage = var.environment 6 | source = "./emr_step_function" 7 | emr-launch-arn = fileexists(module.launch_function.cdk_outputs_file) ? 
local.launch_function_arn : "fakearn" 8 | cluster-name = var.emr_cluster_name 9 | account_id = local.account_id 10 | output_s3_path = "s3://${aws_s3_bucket.demo_bucket.id}/output/" 11 | module_depends_on = [module.launch_function.launch_function_arn, null_resource.print_my_outputs] 12 | sfn_orchestrate_name = "${var.emr_cluster_name}-sfn-pipeline" 13 | sfn_orchestrate_definition_file = "${path.module}/emr_step_function/pipeline.json" 14 | artifacts_bucket = aws_s3_bucket.artifact_bucket.id 15 | artifact_path = aws_s3_bucket_object.emr_pyspark_script.key 16 | } -------------------------------------------------------------------------------- /examples/terraform_pipeline/emr_pipeline/emr_launch/.gitignore: -------------------------------------------------------------------------------- 1 | *.zip 2 | data/* 3 | infrastructure/cdk.context.json -------------------------------------------------------------------------------- /examples/terraform_pipeline/emr_pipeline/emr_launch/README.md: -------------------------------------------------------------------------------- 1 | Terraform wrapper around aws's open source emr-launch library 2 | * https://github.com/awslabs/aws-emr-launch * 3 | 4 | 5 | All CDK related code lives in ./infrastructure including emr-launch usage 6 | 7 | ./utils contains three scripts for managing the integration between CDK and Terraform 8 | 9 | 10 | Example Usage: 11 | 12 | module "launch_function" { 13 | 14 | source = "./emr_launch" 15 | emr_cluster_name = var.emr_cluster_name 16 | security_config = var.security_config // Must exist in ./infrastructure/emr_configs/security_configs/default.py 17 | resource_config = var.resource_config // Must exist in ./infrastructure/emr_configs/resource_configs/default.py 18 | launch_function_config = var.launch_function_config // Must exist in ./infrastructure/emr_configs/launch_step_functions/default.py 19 | 20 | stage = var.environment 21 | aws_region = var.aws_region 22 | vpc_id = var.vpc_id 23 | subnet_id = 
var.subnet_id 24 | 25 | log_bucket = aws_s3_bucket.demo_bucket.id 26 | artifact_bucket = aws_s3_bucket.demo_bucket.id 27 | input_buckets = [aws_s3_bucket.demo_bucket.id] 28 | output_buckets = [aws_s3_bucket.demo_bucket.id] 29 | } 30 | 31 | 32 | -------------------------------------------------------------------------------- /examples/terraform_pipeline/emr_pipeline/emr_launch/infrastructure/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## cluster_definition.py 3 | To be updated or extended if an EMR cluster requires additional permissions or configurations 4 | 5 | EMRClusterDefinition 6 | CDK Stack to define a Standard EMR Cluster 7 | 8 | EMRClusterDefinition.emr_resource_config() 9 | # Everything related to cluster sizing and hardware/software configuration. For example: 10 | 11 | master_instance_type: Optional[str] = 'm5.2xlarge', 12 | master_instance_market: Optional[InstanceMarketType] = InstanceMarketType.ON_DEMAND, 13 | core_instance_type: Optional[str] = 'm5.xlarge', 14 | core_instance_market: Optional[InstanceMarketType] = InstanceMarketType.ON_DEMAND, 15 | core_instance_count: Optional[int] = 2, 16 | applications: Optional[List[str]] = None, 17 | bootstrap_actions: Optional[List[emr_code.EMRBootstrapAction]] = None, 18 | configurations: Optional[List[dict]] = None, 19 | use_glue_catalog: Optional[bool] = True, 20 | step_concurrency_level: Optional[int] = 1, 21 | description: Optional[str] = None, 22 | secret_configurations: Optional[Dict[str, secretsmanager.Secret]] = None): 23 | 24 | EMRClusterDefinition.security_profile_config() 25 | # Everything related to the cluster's security configuration. 
For example: 26 | vpc: Optional[ec2.Vpc] = None, 27 | artifacts_bucket: Optional[s3.Bucket] = None, 28 | artifacts_path: Optional[str] = None, 29 | logs_bucket: Optional[s3.Bucket] = None, 30 | logs_path: Optional[str] = 'elasticmapreduce/', 31 | mutable_instance_role: bool = True, 32 | mutable_security_groups: bool = True, 33 | description: Optional[str] = None) -> None: 34 | 35 | EMRClusterDefinition.launch_function_config() 36 | # Everything related to checking for running clusters and lauching new clusters. 37 | To be extended if the default step function for launching a cluster does not meet your needs. 38 | For example, if you want to use a permanent cluster or transient cluster. Implement that here. 39 | 40 | 41 | ## Development 42 | This repository uses the [AWS CDK](https://aws.amazon.com/cdk/) and the Professional Services developed 43 | **AWS EMR Launch** plugin for the CDK to define EMR Clusters and Step Functions. 44 | 45 | It is recommended that a Python3 `venv` be used for all CDK builds and deployments. 46 | 47 | To get up and running quickly: 48 | 49 | 1. Install the [CDK CLI](https://docs.aws.amazon.com/cdk/latest/guide/getting_started.html) 50 | ```bash 51 | npm install -g aws-cdk 52 | ``` 53 | 54 | 2. Use your mechanism of choice to create and activate a Python3 `venv`: 55 | ```bash 56 | python3 -m venv ~/.env 57 | source ~/.env/bin/activate 58 | ``` 59 | 60 | 3. 
Install the CDK and Boto3 minimum requirements: 61 | ```bash 62 | pip install -r requirements.txt 63 | ``` 64 | 65 | ## Deployment of cluster_definition.py 66 | * terraform apply 67 | 68 | Deploys: 69 | SSM Parameter Store: 70 | Security, resource, and launch function configs to SSM Paramter Store 71 | https://eu-west-1.console.aws.amazon.com/systems-manager/parameters/?region=eu-west-1&tab=Table 72 | Lambdas to Orchestrate Cluster Provisioning 73 | https://eu-west-1.console.aws.amazon.com/lambda/home?region=eu-west-1#/functions 74 | Step Function to Orchestrate Cluster Provisioning 75 | -------------------------------------------------------------------------------- /examples/terraform_pipeline/emr_pipeline/emr_launch/infrastructure/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/terraform_pipeline/emr_pipeline/emr_launch/infrastructure/__init__.py -------------------------------------------------------------------------------- /examples/terraform_pipeline/emr_pipeline/emr_launch/infrastructure/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 main.py" 3 | } -------------------------------------------------------------------------------- /examples/terraform_pipeline/emr_pipeline/emr_launch/infrastructure/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import json 4 | import logging 5 | import os 6 | 7 | import aws_cdk 8 | from cluster_definition import EMRClusterDefinition 9 | 10 | environment_variables = [ 11 | "DEPLOYMENT_STAGE", 12 | "CDK_DEPLOY_ACCOUNT", 13 | "CDK_DEPLOY_REGION", 14 | "VPC_ID", 15 | "SUBNET_ID", 16 | "CLUSTER_NAME", 17 | "MASTER_INSTANCE_TYPE", 18 | "CORE_INSTANCE_TYPE", 19 | "CORE_INSTANCE_COUNT", 20 | "CORE_INSTANCE_MARKET", 21 | 
"CORE_INSTANCE_EBS_SIZE", 22 | "CORE_INSTANCE_EBS_TYPE", 23 | "CORE_INSTANCE_EBS_IOPS", 24 | "TASK_INSTANCE_EBS_SIZE", 25 | "TASK_INSTANCE_TYPE", 26 | "TASK_INSTANCE_COUNT", 27 | "TASK_INSTANCE_MARKET", 28 | "RELEASE_LABEL", 29 | "LOG_BUCKET", 30 | "ARTIFACT_BUCKET", 31 | "INPUT_BUCKETS", 32 | "OUTPUT_BUCKETS", 33 | "INPUT_KMS_ARNS", 34 | "OUTPUT_KMS_ARNS", 35 | "APPLICATIONS", 36 | "CONFIGURATION", 37 | ] 38 | 39 | list_vars = [ 40 | "INPUT_BUCKETS", 41 | "OUTPUT_BUCKETS", 42 | "INPUT_KMS_ARNS", 43 | "OUTPUT_KMS_ARNS", 44 | "APPLICATIONS", 45 | ] 46 | 47 | int_vars = ["CORE_INSTANCE_COUNT", "TASK_INSTANCE_COUNT", "TASK_INSTANCE_EBS_SIZE", "CORE_INSTANCE_EBS_SIZE"] 48 | 49 | json_vars = ["CONFIGURATION"] 50 | 51 | app = aws_cdk.App() 52 | 53 | config = {"CLUSTER_NAME": app.node.try_get_context("cluster-name")} 54 | 55 | for v in environment_variables: 56 | if v in list_vars: 57 | val = [x for x in os.environ[v].split(",") if x != ""] 58 | if len(val) > 0: 59 | config[v] = val 60 | else: 61 | config[v] = [] 62 | elif v in int_vars: 63 | config[v] = int(os.environ[v]) 64 | elif v in json_vars: 65 | config[v] = json.loads(os.environ[v]) 66 | else: 67 | config[v] = os.environ[v] 68 | 69 | if config["CORE_INSTANCE_EBS_SIZE"] > 0: 70 | assert ( 71 | config["CORE_INSTANCE_EBS_IOPS"] <= config["CORE_INSTANCE_EBS_SIZE"] * 50 72 | ), "CORE_INSTANCE_EBS_IOPS must be <= CORE_INSTANCE_EBS_SIZE (GB) * 50" 73 | 74 | print(config) 75 | logging.info(config) 76 | 77 | env = aws_cdk.Environment( 78 | account=config["CDK_DEPLOY_ACCOUNT"], 79 | region=config["CDK_DEPLOY_REGION"], 80 | ) 81 | 82 | emr_cluster_stack = EMRClusterDefinition(app, id=config["CLUSTER_NAME"] + "-stack", env=env, config=config) 83 | 84 | app.synth() 85 | -------------------------------------------------------------------------------- /examples/terraform_pipeline/emr_pipeline/emr_launch/infrastructure/requirements.txt: -------------------------------------------------------------------------------- 1 | 
aws-cdk.core>=1.29.0,<1.36.0 2 | aws-cdk.aws-iam>=1.29.0,<1.36.0 3 | aws-cdk.aws-s3>=1.29.0,<1.36.0 4 | aws-cdk.aws-s3-deployment>=1.29.0,<1.36.0 5 | aws-cdk.aws-kms>=1.29.0,<1.36.0 6 | aws-cdk.aws-ec2>=1.29.0,<1.36.0 7 | aws-cdk.aws-emr>=1.29.0,<1.36.0 8 | aws-cdk.aws-sns>=1.29.0,<1.36.0 9 | aws-cdk.aws-sqs>=1.29.0,<1.36.0 10 | aws-cdk.aws-ssm>=1.29.0,<1.36.0 11 | aws-cdk.aws-secretsmanager>=1.29.0,<1.36.0 12 | aws-cdk.aws-lambda>=1.29.0,<1.36.0 13 | aws-cdk.aws-lambda-event-sources>=1.29.0,<1.36.0 14 | aws-cdk.aws-stepfunctions>=1.29.0,<1.36.0 15 | aws-cdk.aws-stepfunctions-tasks>=1.29.0,<1.36.0 16 | aws-cdk.aws-events>=1.29.0,<1.36.0 17 | aws-cdk.aws-events-targets>=1.29.0,<1.36.0 18 | aws-cdk.aws-codecommit>=1.29.0,<1.36.0 19 | aws-cdk.aws-codepipeline>=1.29.0,<1.36.0 20 | aws-cdk.aws-codepipeline_actions>=1.29.0,<1.36.0 21 | aws-cdk.aws-codedeploy>=1.29.0,<1.36.0 22 | aws-cdk.aws-ecr>=1.29.0,<1.36.0 23 | aws-cdk.aws-ecr_assets>=1.29.0,<1.36.0 24 | boto3>=1.12.23 25 | logzero~=1.5.0 26 | aws_emr_launch==1.4.0 27 | -------------------------------------------------------------------------------- /examples/terraform_pipeline/emr_pipeline/emr_launch/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | backend "local" {} 3 | required_version = "~> 0.12" 4 | } 5 | 6 | provider "aws" { 7 | region = var.aws_region 8 | } 9 | 10 | data "archive_file" "infrastructure" { 11 | type = "zip" 12 | source_dir = "${path.module}/infrastructure" 13 | output_path = "${path.module}/data/infrastructure.zip" 14 | } 15 | 16 | data "archive_file" "utils" { 17 | type = "zip" 18 | source_dir = "${path.module}/utils" 19 | output_path = "${path.module}/data/utils.zip" 20 | } 21 | 22 | resource "null_resource" "launch_function" { 23 | 24 | triggers = { 25 | always_run = timestamp() 26 | } 27 | 28 | provisioner "local-exec" { 29 | command = "bash ${path.module}/utils/cdk_deploy.sh" 30 | on_failure = fail 31 | environment = { 32 | 
CDK_DEPLOY_ACCOUNT = data.aws_caller_identity.current.account_id 33 | CDK_DEPLOY_REGION = var.aws_region 34 | DEPLOYMENT_STAGE = var.stage 35 | 36 | VPC_ID = var.vpc_id 37 | SUBNET_ID = var.subnet_id 38 | CLUSTER_NAME = var.emr_cluster_name 39 | MASTER_INSTANCE_TYPE = var.master_instance_type 40 | CORE_INSTANCE_TYPE = var.core_instance_type 41 | CORE_INSTANCE_COUNT = var.core_instance_count 42 | CORE_INSTANCE_MARKET = var.core_instance_market 43 | RELEASE_LABEL = var.release_label 44 | LOG_BUCKET = var.log_bucket 45 | ARTIFACT_BUCKET = var.artifact_bucket 46 | INPUT_BUCKETS = join(",", var.input_buckets) 47 | OUTPUT_BUCKETS = join(",", var.output_buckets) 48 | INPUT_KMS_ARNS = join(",", var.input_kms_arns) 49 | OUTPUT_KMS_ARNS = join(",", var.output_kms_arns) 50 | APPLICATIONS = join(",", var.applications) 51 | CONFIGURATION = jsonencode(var.configuration) 52 | TASK_INSTANCE_TYPE = var.task_instance_type 53 | TASK_INSTANCE_MARKET = var.task_instance_market 54 | TASK_INSTANCE_COUNT = var.task_instance_count 55 | CORE_INSTANCE_EBS_SIZE = var.core_instance_ebs_size 56 | CORE_INSTANCE_EBS_TYPE = var.core_instance_ebs_type 57 | CORE_INSTANCE_EBS_IOPS = var.core_instance_ebs_iops 58 | TASK_INSTANCE_EBS_SIZE = var.task_instance_ebs_size 59 | } 60 | } 61 | } 62 | 63 | resource "null_resource" "destroy_launch_function" { 64 | 65 | provisioner "local-exec" { 66 | command = "bash ${path.module}/utils/cdk_destroy.sh" 67 | on_failure = fail 68 | when = destroy 69 | environment = { 70 | STACK_NAME = "${var.emr_cluster_name}-stack" 71 | CDK_DEPLOY_REGION = var.aws_region 72 | } 73 | } 74 | } 75 | 76 | locals { 77 | parserscript = "${path.module}/utils/parse-outputs.py" 78 | input = "${path.module}/data/my-outputs.json" 79 | } 80 | 81 | data "local_file" "cdk-outputs" { 82 | filename = local.input 83 | depends_on = [null_resource.launch_function] 84 | } 85 | 86 | locals { 87 | launch_arn_string = 
jsondecode(data.local_file.cdk-outputs.content)["${var.emr_cluster_name}-stack"]["LaunchFunctionARN"] 88 | instance_role_string = jsondecode(data.local_file.cdk-outputs.content)["${var.emr_cluster_name}-stack"]["InstanceRoleName"] 89 | } 90 | -------------------------------------------------------------------------------- /examples/terraform_pipeline/emr_pipeline/emr_launch/outputs.tf: -------------------------------------------------------------------------------- 1 | 2 | output "cdk_outputs_file" { 3 | value = local.input 4 | } 5 | 6 | output "launch_function_arn" { 7 | value = local.launch_arn_string 8 | } 9 | 10 | output "instance_role_name" { 11 | value = local.instance_role_string 12 | } 13 | 14 | output "cluster-name" { 15 | value = var.emr_cluster_name 16 | } -------------------------------------------------------------------------------- /examples/terraform_pipeline/emr_pipeline/emr_launch/utils/cdk_deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 5 | 6 | cd $DIR 7 | cd ../ 8 | rm -r -f ./data 9 | cd ./infrastructure 10 | echo "Deploying emr launch functions with CDK" 11 | export AWS_REGION=$CDK_DEPLOY_REGION 12 | cdk deploy "*" --require-approval never --output ../data/cdk.out --outputs-file ../data/my-outputs.json \ 13 | --context cluster-name=$CLUSTER_NAME 14 | 15 | 16 | rm ./cdk.context.json 17 | cd .. 
18 | rm -r ./data/cdk.out 19 | -------------------------------------------------------------------------------- /examples/terraform_pipeline/emr_pipeline/emr_launch/utils/cdk_destroy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 5 | 6 | cd $DIR 7 | cd ../ 8 | rm -r -f ./data 9 | echo "Destroying emr launch functions with aws cli delete stack" 10 | export AWS_REGION=$CDK_DEPLOY_REGION 11 | aws cloudformation delete-stack --stack-name $STACK_NAME 12 | -------------------------------------------------------------------------------- /examples/terraform_pipeline/emr_pipeline/emr_launch/variables.tf: -------------------------------------------------------------------------------- 1 | variable "aws_region" {} 2 | 3 | variable "stage" {} 4 | 5 | data "aws_caller_identity" "current" {} 6 | 7 | variable "emr_cluster_name" { 8 | description = "Unique name for the EMR Cluster's running this batch job" 9 | type = string 10 | } 11 | 12 | variable "vpc_id" { 13 | description = "VPC to deploy EMR cluster into" 14 | type = string 15 | } 16 | 17 | variable "subnet_id" { 18 | description = "Subnet Id to deploy EMR cluster into" 19 | type = string 20 | } 21 | 22 | variable "master_instance_type" { 23 | type = string 24 | } 25 | 26 | variable "core_instance_type" { 27 | type = string 28 | } 29 | 30 | variable "core_instance_market" { 31 | description = "ON_DEMAND or SPOT" 32 | type = string 33 | default = "ON_DEMAND" 34 | } 35 | 36 | variable "core_instance_count" { 37 | type = number 38 | } 39 | 40 | variable "release_label" { 41 | type = string 42 | } 43 | 44 | variable "log_bucket" { 45 | description = "The S3 bucket name EMR will write logs to" 46 | } 47 | 48 | variable "artifact_bucket" { 49 | description = "The S3 bucket name EMR will read Artifacts from" 50 | } 51 | 52 | variable "input_buckets" { 53 | description = "A list of S3 
Bucket names that EMR will read data from" 54 | type = list(string) 55 | } 56 | 57 | variable "output_buckets" { 58 | description = "A list of S3 Bucket names that EMR will write data to" 59 | type = list(string) 60 | } 61 | 62 | variable "input_kms_arns" { 63 | description = "A list of KMS Key ARNs used for decrypting data being read into the cluster" 64 | type = list(string) 65 | default = [] 66 | } 67 | 68 | variable "output_kms_arns" { 69 | description = "A list of KMS Key ARNs used for encrypting data being written by the cluster" 70 | type = list(string) 71 | default = [] 72 | } 73 | 74 | variable "applications" { 75 | description = "Applications to install on EMR" 76 | default = ["Hadoop", "Hive", "Spark", "Ganglia"] 77 | type = list(string) 78 | } 79 | 80 | variable "configuration" { 81 | default = [ 82 | { 83 | "Classification": "spark", 84 | "Properties": { 85 | "maximizeResourceAllocation": "true" 86 | } 87 | } 88 | ] 89 | } 90 | 91 | variable "task_instance_type" { 92 | default = "m5.xlarge" 93 | } 94 | 95 | variable "task_instance_count" { 96 | default = 0 97 | type = number 98 | } 99 | 100 | variable "task_instance_market" { 101 | description = "ON_DEMAND or SPOT" 102 | type = string 103 | default = "SPOT" 104 | } 105 | 106 | variable "core_instance_ebs_size" { 107 | description = "GB of Core EBS Storage" 108 | default = 0 109 | type = number 110 | } 111 | 112 | variable "core_instance_ebs_type" { 113 | description = "should be one of 'gp2', 'io1', 'io2', 'st1', 'sc1'" 114 | default = "io1" 115 | type = string 116 | } 117 | 118 | variable "core_instance_ebs_iops" { 119 | description = "Provisioned IOPS for the Core node EBS volumes (used when core_instance_ebs_type is 'io1' or 'io2')" 120 | default = 10000 121 | type = number 122 | } 123 | 124 | variable "task_instance_ebs_size" { 125 | description = "EBS Volume size to add to task nodes, used when spark spills or caches to disk - recommended 64-500GB" 126 | default = 64 127 | type = number 128 | } 129 | 130 |
-------------------------------------------------------------------------------- /examples/terraform_pipeline/emr_pipeline/emr_step_function/lambda.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/terraform_pipeline/emr_pipeline/emr_step_function/lambda.zip -------------------------------------------------------------------------------- /examples/terraform_pipeline/emr_pipeline/emr_step_function/lambda/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/examples/terraform_pipeline/emr_pipeline/emr_step_function/lambda/__init__.py -------------------------------------------------------------------------------- /examples/terraform_pipeline/emr_pipeline/emr_step_function/lambda/lambda_parse_json.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | 4 | logger = logging.getLogger() 5 | logger.setLevel(logging.INFO) 6 | 7 | 8 | def handler(event, context): 9 | logger.info(f"Lambda metadata: {json.dumps(event)} (type = {type(event)})") 10 | json_string = event.get("LaunchOutput", {}).get("LaunchOutput", {}) 11 | 12 | try: 13 | parsed = json.loads(json_string)["LaunchClusterResult"] 14 | return parsed 15 | 16 | except Exception as e: 17 | logger.error(f"Error processing event {json.dumps(event)}") 18 | logger.exception(e) 19 | raise e 20 | -------------------------------------------------------------------------------- /examples/terraform_pipeline/emr_pipeline/emr_step_function/main.tf: -------------------------------------------------------------------------------- 1 | 2 | provider "aws" { 3 | region = var.aws_region 4 | } 5 | 6 | resource "aws_sns_topic" "emr_success" { 7 | name = "${var.cluster-name}-emr-success" 8 | } 9 
| 10 | resource "aws_sns_topic" "emr_failure" { 11 | name = "${var.cluster-name}-emr-failure" 12 | } 13 | 14 | 15 | data "aws_iam_policy_document" "emr_sfn_role_policy" { 16 | statement { 17 | effect = "Allow" 18 | actions = [ 19 | "events:DescribeRule", 20 | "xray:PutTelemetryRecords", 21 | "events:PutRule", 22 | "states:StopExecution", 23 | "elasticmapreduce:DescribeCluster", 24 | "xray:GetSamplingTargets", 25 | "elasticmapreduce:SetTerminationProtection", 26 | "xray:PutTraceSegments", 27 | "events:PutTargets", 28 | "elasticmapreduce:DescribeStep", 29 | "sns:Publish", 30 | "states:DescribeExecution", 31 | "xray:GetSamplingRules", 32 | "states:StartExecution", 33 | "elasticmapreduce:AddJobFlowSteps", 34 | "elasticmapreduce:TerminateJobFlows", 35 | "elasticmapreduce:CancelSteps", 36 | "lambda:InvokeFunction" 37 | ] 38 | resources = [ 39 | "*" 40 | ] 41 | } 42 | } 43 | 44 | resource "aws_iam_role" "sfn_service_role" { 45 | name = "${var.cluster-name}-sfn_service_role" 46 | tags = { 47 | env = var.stage 48 | region = var.aws_region 49 | } 50 | assume_role_policy = <> cluster_checker >> step_adder >> step_checker >> cluster_remover 116 | -------------------------------------------------------------------------------- /fix.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -ex 4 | 5 | isort . 6 | black . 
7 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 120 3 | target-version = ["py37", "py38"] 4 | exclude = ''' 5 | /( 6 | \.eggs 7 | | \.git 8 | | \.hg 9 | | \.mypy_cache 10 | | \.tox 11 | | \.venv 12 | | \.env 13 | | _build 14 | | buck-out 15 | | build 16 | | dist 17 | | cdk.out 18 | | emr_config_utils 19 | )/ 20 | ''' 21 | 22 | [tool.isort] 23 | multi_line_output = 3 24 | include_trailing_comma = true 25 | force_grid_wrap = 0 26 | use_parentheses = true 27 | ensure_newline_before_comments = true 28 | line_length = 120 29 | src_paths = ["aws_emr_launch", "tests"] 30 | py_version = 37 31 | skip_gitignore = false 32 | skip =["cdk.out", ".venv", "emr_config_utils"] -------------------------------------------------------------------------------- /requirements-1.3.x.txt: -------------------------------------------------------------------------------- 1 | aws-cdk.core>=1.29.0,<1.36.0 2 | aws-cdk.aws-iam>=1.29.0,<1.36.0 3 | aws-cdk.aws-s3>=1.29.0,<1.36.0 4 | aws-cdk.aws-s3-deployment>=1.29.0,<1.36.0 5 | aws-cdk.aws-kms>=1.29.0,<1.36.0 6 | aws-cdk.aws-ec2>=1.29.0,<1.36.0 7 | aws-cdk.aws-emr>=1.29.0,<1.36.0 8 | aws-cdk.aws-sns>=1.29.0,<1.36.0 9 | aws-cdk.aws-sqs>=1.29.0,<1.36.0 10 | aws-cdk.aws-ssm>=1.29.0,<1.36.0 11 | aws-cdk.aws-secretsmanager>=1.29.0,<1.36.0 12 | aws-cdk.aws-lambda>=1.29.0,<1.36.0 13 | aws-cdk.aws-lambda-event-sources>=1.29.0,<1.36.0 14 | aws-cdk.aws-stepfunctions>=1.29.0,<1.36.0 15 | aws-cdk.aws-stepfunctions-tasks>=1.29.0,<1.36.0 16 | aws-cdk.aws-events>=1.29.0,<1.36.0 17 | aws-cdk.aws-events-targets>=1.29.0,<1.36.0 18 | boto3>=1.12.23 19 | logzero~=1.5.0 20 | -------------------------------------------------------------------------------- /requirements-1.4.x.txt: -------------------------------------------------------------------------------- 1 | aws-cdk.core>=1.36.0,<1.46.0 2 | 
aws-cdk.aws-iam>=1.36.0,<1.46.0 3 | aws-cdk.aws-s3>=1.36.0,<1.46.0 4 | aws-cdk.aws-s3-deployment>=1.36.0,<1.46.0 5 | aws-cdk.aws-kms>=1.36.0,<1.46.0 6 | aws-cdk.aws-ec2>=1.36.0,<1.46.0 7 | aws-cdk.aws-emr>=1.36.0,<1.46.0 8 | aws-cdk.aws-sns>=1.36.0,<1.46.0 9 | aws-cdk.aws-sqs>=1.36.0,<1.46.0 10 | aws-cdk.aws-ssm>=1.36.0,<1.46.0 11 | aws-cdk.aws-secretsmanager>=1.36.0,<1.46.0 12 | aws-cdk.aws-lambda>=1.36.0,<1.46.0 13 | aws-cdk.aws-lambda-event-sources>=1.36.0,<1.46.0 14 | aws-cdk.aws-stepfunctions>=1.36.0,<1.46.0 15 | aws-cdk.aws-stepfunctions-tasks>=1.36.0,<1.46.0 16 | aws-cdk.aws-events>=1.36.0,<1.46.0 17 | aws-cdk.aws-events-targets>=1.36.0,<1.46.0 18 | boto3>=1.12.23 19 | logzero~=1.5.0 20 | -------------------------------------------------------------------------------- /requirements-1.5.x.txt: -------------------------------------------------------------------------------- 1 | aws-cdk.core>=1.64.1 2 | aws-cdk.aws-iam>=1.64.1 3 | aws-cdk.aws-s3>=1.64.1 4 | aws-cdk.aws-s3-deployment>=1.64.1 5 | aws-cdk.aws-kms>=1.64.1 6 | aws-cdk.aws-ec2>=1.64.1 7 | aws-cdk.aws-emr>=1.64.1 8 | aws-cdk.aws-sns>=1.64.1 9 | aws-cdk.aws-sqs>=1.64.1 10 | aws-cdk.aws-ssm>=1.64.1 11 | aws-cdk.aws-secretsmanager>=1.64.1 12 | aws-cdk.aws-lambda>=1.64.1 13 | aws-cdk.aws-lambda-event-sources>=1.64.1 14 | aws-cdk.aws-stepfunctions>=1.64.1 15 | aws-cdk.aws-stepfunctions-tasks>=1.64.1 16 | aws-cdk.aws-events>=1.64.1 17 | aws-cdk.aws-events-targets>=1.64.1 18 | aws-cdk.assertions>=1.64.1 19 | boto3>=1.12.23 20 | logzero~=1.5.0 21 | -------------------------------------------------------------------------------- /requirements-2.x.txt: -------------------------------------------------------------------------------- 1 | aws-cdk-lib>=2.20.0 2 | aws-cdk.aws-lambda-python-alpha>=2.20.0a0 3 | boto3>=1.12.23 4 | logzero~=1.5.0 5 | -------------------------------------------------------------------------------- /requirements-dev.in: 
-------------------------------------------------------------------------------- 1 | pip-tools 2 | pyroma 3 | check-manifest 4 | pip-tools 5 | isort 6 | black 7 | flake8 8 | mypy 9 | moto 10 | pyyaml 11 | pydocstyle 12 | pylint 13 | pytest-cov 14 | pytest-flake8 15 | pytest-isort 16 | pytest-mypy 17 | pytest-pydocstyle 18 | pytest-xdist 19 | pytest 20 | tox 21 | twine 22 | wheel 23 | types-setuptools 24 | zest-releaser[recommended] 25 | -------------------------------------------------------------------------------- /requirements-lambda-layer.txt: -------------------------------------------------------------------------------- 1 | boto3==1.12.23 2 | dictor==0.1.3 3 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | license_files = 3 | LICENSE 4 | NOTICE 5 | VERSION 6 | 7 | [flake8] 8 | max-line-length = 120 9 | extend-ignore = E203, W503 10 | exclude = 11 | .git, 12 | __pycache__, 13 | docs/source/conf.py, 14 | old, 15 | build, 16 | dist, 17 | .venv, 18 | cdk.out, 19 | emr_config_utils 20 | 21 | [mypy] 22 | python_version = 3.7 23 | strict = True 24 | ignore_missing_imports = True 25 | allow_untyped_decorators = True 26 | exclude = extras/|cdk.out/|terraform_pipeline/|spark_batch_orchestration/|emr_config_utils|.venv/ 27 | 28 | [zest.releaser] 29 | release = no 30 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("VERSION", "r") as version_file: 4 | version = version_file.read().strip() 5 | 6 | with open("README.md") as fp: 7 | long_description = fp.read() 8 | 9 | setuptools.setup( 10 | name="aws-emr-launch", 11 | version=version, 12 | description="AWS EMR Launch modules", 13 | long_description=long_description, 14 | 
long_description_content_type="text/markdown", 15 | keywords=["aws", "cdk"], 16 | author="Chauncy McCaughey", 17 | author_email="chamcca@amazon.com", 18 | url="https://github.com/awslabs/aws-emr-launch/", 19 | package_dir={"aws_emr_launch": "aws_emr_launch"}, 20 | packages=setuptools.find_packages(exclude=("tests",)), 21 | install_requires=[ 22 | "logzero~=1.5.0", 23 | ], 24 | include_package_data=True, 25 | python_requires=">=3.6", 26 | classifiers=[ 27 | "Development Status :: 5 - Production/Stable", 28 | "Intended Audience :: Developers", 29 | "License :: OSI Approved :: Apache Software License", 30 | "Programming Language :: JavaScript", 31 | "Programming Language :: Python :: 3 :: Only", 32 | "Programming Language :: Python :: 3.6", 33 | "Programming Language :: Python :: 3.7", 34 | "Programming Language :: Python :: 3.8", 35 | "Topic :: Software Development :: Code Generators", 36 | "Topic :: Utilities", 37 | "Typing :: Typed", 38 | ], 39 | ) 40 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/tests/__init__.py -------------------------------------------------------------------------------- /tests/aws_emr_launch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/tests/aws_emr_launch/__init__.py -------------------------------------------------------------------------------- /tests/aws_emr_launch/constructs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/aws-emr-launch/26075673df875498bac11ba258a39b66be68e8f9/tests/aws_emr_launch/constructs/__init__.py 
# tests/aws_emr_launch/constructs/emr_constructs/test_cluster_configuration.py
"""Unit tests for the ClusterConfiguration EMR construct.

Each test builds a construct against a shared CDK stack and compares the
CloudFormation-resolved JSON document against an expected configuration.
"""
import copy
from typing import Any, Dict

import aws_cdk
from aws_cdk import aws_ec2 as ec2
from aws_cdk import aws_s3 as s3

from aws_emr_launch.constructs.emr_constructs import cluster_configuration, emr_code

# Shared scaffolding: constructs created in different tests must use unique
# construct ids because they all attach to this single stack.
app = aws_cdk.App()
stack = aws_cdk.Stack(app, "test-stack")
vpc = ec2.Vpc(stack, "test-vpc")

# Expected serialized form of a ClusterConfiguration built with defaults.
default_config: Dict[str, Any] = {
    "ConfigurationName": "test-cluster",
    "Namespace": "default",
    "ClusterConfiguration": {
        "Applications": [{"Name": "Hadoop"}, {"Name": "Hive"}, {"Name": "Spark"}],
        "BootstrapActions": [],
        "Configurations": [
            {
                "Classification": "hive-site",
                "Properties": {
                    "hive.metastore.client.factory.class": (
                        "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
                    )
                },
            },
            {
                "Classification": "spark-hive-site",
                "Properties": {
                    "hive.metastore.client.factory.class": (
                        "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
                    )
                },
            },
        ],
        "Instances": {
            "KeepJobFlowAliveWhenNoSteps": True,
            "TerminationProtected": False,
        },
        "Name": "test-cluster",
        "ReleaseLabel": "emr-5.29.0",
        "StepConcurrencyLevel": 1,
        "Tags": [],
        "VisibleToAllUsers": True,
    },
    "OverrideInterfaces": {
        "default": {
            "ClusterName": {"JsonPath": "Name", "Default": "test-cluster"},
            "ReleaseLabel": {"JsonPath": "ReleaseLabel", "Default": "emr-5.29.0"},
            "StepConcurrencyLevel": {"JsonPath": "StepConcurrencyLevel", "Default": 1},
        }
    },
    "ConfigurationArtifacts": [],
}


def test_default_configuration() -> None:
    """A configuration built with only a name serializes to the defaults."""
    cluster_config = cluster_configuration.ClusterConfiguration(
        stack, "test-instance-group-config", configuration_name="test-cluster"
    )

    expected = copy.deepcopy(default_config)

    # Debug prints removed: pytest's assertion introspection shows the diff.
    assert stack.resolve(cluster_config.to_json()) == expected


def test_disabling_glue_metastore() -> None:
    """With use_glue_catalog=False the Glue metastore properties are omitted."""
    cluster_config = cluster_configuration.ClusterConfiguration(
        stack, "test-disable-glue-metastore", configuration_name="test-cluster", use_glue_catalog=False
    )

    expected = copy.deepcopy(default_config)
    expected["ClusterConfiguration"]["Configurations"] = [
        {"Classification": "hive-site", "Properties": {}},
        {"Classification": "spark-hive-site", "Properties": {}},
    ]

    assert stack.resolve(cluster_config.to_json()) == expected


def test_bootstrap_action_config() -> None:
    """Bootstrap actions appear in the cluster configuration and register
    their S3 deployment location as a configuration artifact."""
    bucket = s3.Bucket(stack, "test-bucket")
    bootstrap_code = emr_code.Code.from_path(path="./examples", deployment_bucket=bucket, deployment_prefix="prefix")
    bootstrap_action = emr_code.EMRBootstrapAction(
        name="Bootstrap", path=f"{bootstrap_code.s3_path}/bootstrap_action", args=["Arg1", "Arg2"], code=bootstrap_code
    )

    cluster_config = cluster_configuration.ClusterConfiguration(
        stack, "test-bootstrap-action-config", configuration_name="test-cluster", bootstrap_actions=[bootstrap_action]
    )

    expected = copy.deepcopy(default_config)
    expected["ClusterConfiguration"]["BootstrapActions"] = [
        {
            "Name": "Bootstrap",
            "ScriptBootstrapAction": {
                # The bucket name is a CloudFormation Ref token until resolved.
                "Path": {"Fn::Join": ["", ["s3://", {"Ref": "testbucketE6E05ABE"}, "/prefix/bootstrap_action"]]},
                "Args": ["Arg1", "Arg2"],
            },
        }
    ]
    expected["ConfigurationArtifacts"] = [{"Bucket": {"Ref": "testbucketE6E05ABE"}, "Path": "prefix/*"}]

    assert stack.resolve(cluster_config.to_json()) == expected
{ 28 | "Classification": "spark-hive-site", 29 | "Properties": { 30 | "hive.metastore.client.factory.class": ( 31 | "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory" 32 | ) 33 | }, 34 | }, 35 | ], 36 | "Instances": { 37 | "Ec2SubnetId": {"Ref": "testvpcPrivateSubnet1Subnet865FB50A"}, 38 | "InstanceGroups": [ 39 | { 40 | "Name": "Master", 41 | "InstanceRole": "MASTER", 42 | "InstanceType": "m5.2xlarge", 43 | "Market": "ON_DEMAND", 44 | "InstanceCount": 1, 45 | "EbsConfiguration": { 46 | "EbsBlockDeviceConfigs": [ 47 | {"VolumeSpecification": {"SizeInGB": 500, "VolumeType": "st1"}, "VolumesPerInstance": 1} 48 | ], 49 | "EbsOptimized": True, 50 | }, 51 | }, 52 | { 53 | "Name": "Core", 54 | "InstanceRole": "CORE", 55 | "InstanceType": "m5.xlarge", 56 | "Market": "ON_DEMAND", 57 | "InstanceCount": 2, 58 | "EbsConfiguration": { 59 | "EbsBlockDeviceConfigs": [ 60 | {"VolumeSpecification": {"SizeInGB": 500, "VolumeType": "st1"}, "VolumesPerInstance": 1} 61 | ], 62 | "EbsOptimized": True, 63 | }, 64 | }, 65 | ], 66 | "KeepJobFlowAliveWhenNoSteps": True, 67 | "TerminationProtected": False, 68 | }, 69 | "Name": "test-cluster", 70 | "ReleaseLabel": "emr-5.29.0", 71 | "StepConcurrencyLevel": 1, 72 | "Tags": [], 73 | "VisibleToAllUsers": True, 74 | }, 75 | "OverrideInterfaces": { 76 | "default": { 77 | "ClusterName": {"JsonPath": "Name", "Default": "test-cluster"}, 78 | "ReleaseLabel": {"JsonPath": "ReleaseLabel", "Default": "emr-5.29.0"}, 79 | "StepConcurrencyLevel": {"JsonPath": "StepConcurrencyLevel", "Default": 1}, 80 | "MasterInstanceType": {"JsonPath": "Instances.InstanceGroups.0.InstanceType", "Default": "m5.2xlarge"}, 81 | "MasterInstanceMarket": {"JsonPath": "Instances.InstanceGroups.0.Market", "Default": "ON_DEMAND"}, 82 | "CoreInstanceCount": {"JsonPath": "Instances.InstanceGroups.1.InstanceCount", "Default": 2}, 83 | "CoreInstanceType": {"JsonPath": "Instances.InstanceGroups.1.InstanceType", "Default": "m5.xlarge"}, 84 | "CoreInstanceMarket": 
# tests/aws_emr_launch/constructs/test_iam_roles.py
"""Unit test for the EMRRoles construct."""
import aws_cdk
from aws_cdk import aws_s3 as s3

from aws_emr_launch.constructs.iam_roles.emr_roles import EMRRoles


def test_emr_roles() -> None:
    """EMRRoles exposes the service, instance, and autoscaling roles.

    NOTE(review): this test was previously named ``test_emr_security_groups``,
    an apparent copy/paste from the security-groups test module; renamed to
    match what it actually exercises.
    """
    app = aws_cdk.App()
    stack = aws_cdk.Stack(app, "test-stack")
    artifacts_bucket = s3.Bucket(stack, "test-artifacts-bucket")
    logs_bucket = s3.Bucket(stack, "test-logs-bucket")

    emr_roles = EMRRoles(
        stack,
        "test-emr-components",
        role_name_prefix="TestCluster",
        artifacts_bucket=artifacts_bucket,
        logs_bucket=logs_bucket,
    )

    assert emr_roles.service_role
    assert emr_roles.instance_role
    assert emr_roles.autoscaling_role
# tests/aws_emr_launch/control_plane/constructs/test_lambdas.py
"""Unit test for the control-plane ``Apis`` Lambda construct."""
import aws_cdk

from aws_emr_launch.control_plane.constructs.lambdas.apis import Apis


def test_emr_lambdas() -> None:
    """Constructing Apis exposes every get/list Lambda handler attribute."""
    cdk_app = aws_cdk.App()
    lambdas_stack = aws_cdk.Stack(cdk_app, "test-lambdas-stack")
    apis = Apis(lambdas_stack, "test-apis")

    assert apis.get_profile
    assert apis.get_profiles
    assert apis.get_configuration
    assert apis.get_configurations
    assert apis.get_function
    assert apis.get_functions
# tests/aws_emr_launch/control_plane/lambda_sources/apis/test_get_list_apis.py
"""Unit tests for the control-plane get/list Lambda handlers.

Runs each handler against moto's mocked SSM parameter store, both empty
and seeded with a single parameter.
"""
import json
import logging
import unittest

import boto3
from moto import mock_ssm

from aws_emr_launch.control_plane.lambda_sources.apis import get_list_apis
from aws_emr_launch.control_plane.lambda_sources.apis.get_list_apis import (
    ClusterConfigurationNotFoundError,
    EMRLaunchFunctionNotFoundError,
    EMRProfileNotFoundError,
)

# Turn the LOGGER off for the tests (WARNING is the documented name; WARN is
# an undocumented alias of the same level).
get_list_apis.LOGGER.setLevel(logging.WARNING)


class TestControlPlaneApis(unittest.TestCase):
    """Exercises each handler against an empty or seeded mock SSM store.

    Fixed: ``assertEquals`` (deprecated alias, removed in Python 3.12)
    replaced with ``assertEqual`` throughout.
    """

    @mock_ssm
    def test_get_profiles_handler(self) -> None:
        # No parameters stored -> empty profile list.
        event = {"Namespace": "test"}

        self.assertEqual(get_list_apis.get_profiles_handler(event, None), {"EMRProfiles": []})

    @mock_ssm
    def test_get_profile(self) -> None:
        profile = {"Profile": "TestProfile", "Key": "Value"}

        ssm = boto3.client("ssm")
        ssm.put_parameter(Name=f"{get_list_apis.PROFILES_SSM_PARAMETER_PREFIX}/default/test", Value=json.dumps(profile))

        event = {"ProfileName": "test"}

        self.assertEqual(get_list_apis.get_profile_handler(event, None), profile)

    @mock_ssm
    def test_profile_not_found(self) -> None:
        event = {"ProfileName": "test"}

        with self.assertRaises(EMRProfileNotFoundError):
            get_list_apis.get_profile_handler(event, None)

    @mock_ssm
    def test_get_configurations_handler(self) -> None:
        event = {"Namespace": "test"}

        self.assertEqual(get_list_apis.get_configurations_handler(event, None), {"ClusterConfigurations": []})

    @mock_ssm
    def test_get_configuration(self) -> None:
        configuration = {"Configuration": "TestConfiguration", "Key": "Value"}

        ssm = boto3.client("ssm")
        ssm.put_parameter(
            Name=f"{get_list_apis.CONFIGURATIONS_SSM_PARAMETER_PREFIX}/default/test", Value=json.dumps(configuration)
        )

        event = {"ConfigurationName": "test"}

        self.assertEqual(get_list_apis.get_configuration_handler(event, None), configuration)

    @mock_ssm
    def test_configuration_not_found(self) -> None:
        event = {"ConfigurationName": "test"}

        with self.assertRaises(ClusterConfigurationNotFoundError):
            get_list_apis.get_configuration_handler(event, None)

    @mock_ssm
    def test_get_functions_handler(self) -> None:
        event = {"Namespace": "test"}

        self.assertEqual(get_list_apis.get_functions_handler(event, None), {"EMRLaunchFunctions": []})

    @mock_ssm
    def test_get_function(self) -> None:
        function = {"Function": "TestFunction", "Key": "Value"}

        ssm = boto3.client("ssm")
        ssm.put_parameter(
            Name=f"{get_list_apis.FUNCTIONS_SSM_PARAMETER_PREFIX}/default/test", Value=json.dumps(function)
        )

        event = {"FunctionName": "test"}

        self.assertEqual(get_list_apis.get_function_handler(event, None), function)

    @mock_ssm
    def test_function_not_found(self) -> None:
        event = {"FunctionName": "test"}

        with self.assertRaises(EMRLaunchFunctionNotFoundError):
            get_list_apis.get_function_handler(event, None)
test_control_plane_stack() -> None: 7 | stack = ControlPlaneStack(aws_cdk.App()) 8 | 9 | assert stack.apis 10 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [testenv] 2 | # Passing down some AWS environment variables 3 | passenv = AWS_PROFILE AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY AWS_DEFAULT_REGION 4 | 5 | [pytest] 6 | addopts = 7 | --flake8 8 | --isort 9 | --color=auto 10 | --cov-report=html 11 | --cov-report=term 12 | --cov=aws_emr_launch 13 | --cov=control_plane 14 | -W ignore::DeprecationWarning 15 | norecursedirs = examples layers control_plane/cdk.out 16 | flake8-ignore = 17 | __init__.py ALL 18 | tests/* ALL 19 | isort_ignore = 20 | tests/*.py 21 | -------------------------------------------------------------------------------- /validate.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -ex 4 | 5 | isort --check . 6 | black --check . 7 | mypy --no-incremental . 8 | flake8 . 9 | --------------------------------------------------------------------------------